Commit

fixed addresses - uses NLTK instead of geograpy
0x4f53 committed Nov 13, 2023
1 parent 695e84b commit 5b8da2b
Showing 1 changed file with 22 additions and 5 deletions.
27 changes: 22 additions & 5 deletions text_utils.py
@@ -101,11 +101,28 @@ def read_pdf(pdf):

# python -m spacy download en_core_web_sm
def regional_pii(text):
    from geotext import GeoText
    place_entity = GeoText(text)

    final_output = list(set(place_entity.cities + place_entity.countries))
    return final_output
    import nltk
    from nltk import word_tokenize, pos_tag, ne_chunk
    from nltk.corpus import stopwords

    # nltk.data.find() raises LookupError when a resource is missing instead of
    # returning a falsy value, so check inside try/except before downloading.
    # pos_tag and stopwords need their own resources as well.
    for path, name in [('tokenizers/punkt', 'punkt'),
                       ('taggers/averaged_perceptron_tagger', 'averaged_perceptron_tagger'),
                       ('chunkers/maxent_ne_chunker', 'maxent_ne_chunker'),
                       ('corpora/words', 'words'),
                       ('corpora/stopwords', 'stopwords')]:
        try:
            nltk.data.find(path)
        except LookupError:
            nltk.download(name)

    stop_words = set(stopwords.words('english'))

    # Tokenize, POS-tag, and chunk the text into named entities
    words = word_tokenize(text)
    tagged_words = pos_tag(words)
    named_entities = ne_chunk(tagged_words)

    locations = []

    # Keep chunks labelled as places, dropping stopwords from multi-word names
    for entity in named_entities:
        if isinstance(entity, nltk.tree.Tree):
            if entity.label() in ['GPE', 'GSP', 'LOCATION', 'FACILITY']:
                location_name = ' '.join([word for word, tag in entity.leaves() if word.lower() not in stop_words])
                locations.append(location_name)

    return list(set(locations))
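
Below is a minimal usage sketch (an illustration, not part of this commit): it assumes text_utils.py is importable and that the required NLTK resources can be downloaded on the first call.

# hypothetical caller, e.g. example.py
from text_utils import regional_pii

sample = "Parcels were routed through Berlin and a depot in California."
print(regional_pii(sample))
# typically prints place names such as ['Berlin', 'California'];
# exact results depend on the NLTK chunker's output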

def keywords_classify_pii(rules, intelligible_text_list):
    keys = rules.keys()
