Source code for pewanalytics.text.ner
from builtins import str
from builtins import object
import nltk
import spacy
import re
from collections import defaultdict
from nltk.corpus import stopwords
from nltk import word_tokenize, pos_tag, ne_chunk
from pewtils import decode_text
[docs]class NamedEntityExtractor(object):
"""
A wrapper around NLTK and SpaCy for named entity extraction. May be expanded to include more libraries in the \
future.
:param method: Specify the library to use when extracting methods. Options are 'nltk', 'spacy', 'all'. If \
'all' is selected, both libraries will be used and the union will be returned. (Default='spacy')
:type method: str
Usage::
from pewanalytics.text.ner import NamedEntityExtractor
import nltk
nltk.download("inaugural")
fileid = nltk.corpus.inaugural.fileids()[0]
text = nltk.corpus.inaugural.raw(fileid)
>>> ner = NamedEntityExtractor(method="nltk")
>>> ner.extract(text)
{
'ORGANIZATION': [
'Parent', 'Invisible Hand', 'Great Author', 'House', 'Constitution', 'Senate',
'Human Race', 'Representatives'
],
'PERSON': ['Almighty Being'],
'GPE': ['Heaven', 'United States', 'American']
}
>>> ner = NamedEntityExtractor(method="spacy")
>>> ner.extract(text)
{
'ORGANIZATION': ['House of Representatives', 'Senate', 'Parent of the Human Race'],
'DATE': ['present month', 'every day', '14th day', 'years'],
'ORDINAL': ['first', 'fifth'],
'GPE': ['United States'],
'NORP': ['republican', 'American'],
'LAW': ['Constitution']
}
>>> ner = NamedEntityExtractor(method="all")
>>> ner.extract(text)
{
'ORGANIZATION': [
'Representatives', 'Great Author', 'House', 'Parent', 'House of Representatives',
'Parent of the Human Race', 'Invisible Hand', 'Human Race', 'Senate', 'Constitution'
],
'PERSON': ['Almighty Being'],
'GPE': ['Heaven', 'United States', 'American'],
'DATE': ['every day', 'present month', '14th day', 'years'],
'ORDINAL': ['first', 'fifth'],
'NORP': ['republican', 'American'],
'LAW': ['Constitution']
}
"""
def __init__(self, method="spacy"):
if method not in ["nltk", "spacy", "all"]:
raise Exception("Available methods are: 'nltk', 'spacy', 'all'")
self.method = method
self.type_map = {
"ORG": "ORGANIZATION",
"PER": "PERSON",
"LOC": "LOCATION",
"FAC": "FACILITY",
"VEH": "VEHICLE",
"WEA": "WEAPON",
"GSP": "GPE",
}
[docs] def extract(self, text):
"""
:param text: a string from which to extract named entities
:type text: str
:return: dictionary of entities organized by their category
:rtype: dict
"""
try:
text = str(text)
except Exception as e:
text = decode_text(text)
roots = defaultdict(list)
if self.method in ["nltk", "all"]:
try:
tree = ne_chunk(pos_tag(word_tokenize(text)))
except LookupError:
nltk.download("maxent_ne_chunker")
nltk.download("words")
tree = ne_chunk(pos_tag(word_tokenize(text)), binary=True)
for branch in tree:
if type(branch) is nltk.Tree:
leaf = [" ".join(x[0] for x in branch.leaves())]
key = self.type_map.get(branch.label(), branch.label())
roots[key].extend(leaf)
if self.method in ["spacy", "all"]:
# SpaCy
try:
nlp = spacy.load("en_core_web_sm")
except OSError:
spacy.cli.download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm")
for entity in nlp(text).ents:
entity_text = re.sub(
r"^({})\s".format("|".join(stopwords.words("english"))),
"",
entity.text,
)
key = self.type_map.get(entity.label_, entity.label_)
roots[key].append(entity_text)
return {self.type_map.get(k, k): list(set(v)) for k, v in roots.items()}