Source code for pewanalytics.text.ner

from builtins import str
from builtins import object
import nltk
import spacy
import re
from collections import defaultdict
from nltk.corpus import stopwords
from nltk import word_tokenize, pos_tag, ne_chunk
from pewtils import decode_text


class NamedEntityExtractor(object):

    """
    A wrapper around NLTK and SpaCy for named entity extraction. May be expanded to include more libraries in the \
    future.

    :param method: Specify the library to use when extracting methods. Options are 'nltk', 'spacy', 'all'. If \
    'all' is selected, both libraries will be used and the union will be returned. (Default='spacy')
    :type method: str

    Usage::

        from pewanalytics.text.ner import NamedEntityExtractor
        import nltk

        nltk.download("inaugural")
        fileid = nltk.corpus.inaugural.fileids()[0]
        text = nltk.corpus.inaugural.raw(fileid)

        >>> ner = NamedEntityExtractor(method="nltk")
        >>> ner.extract(text)
        {
            'ORGANIZATION': [
                'Parent', 'Invisible Hand', 'Great Author', 'House',
                'Constitution', 'Senate', 'Human Race', 'Representatives'
            ],
            'PERSON': ['Almighty Being'],
            'GPE': ['Heaven', 'United States', 'American']
        }

        >>> ner = NamedEntityExtractor(method="spacy")
        >>> ner.extract(text)
        {
            'ORGANIZATION': ['House of Representatives', 'Senate', 'Parent of the Human Race'],
            'DATE': ['present month', 'every day', '14th day', 'years'],
            'ORDINAL': ['first', 'fifth'],
            'GPE': ['United States'],
            'NORP': ['republican', 'American'],
            'LAW': ['Constitution']
        }

        >>> ner = NamedEntityExtractor(method="all")
        >>> ner.extract(text)
        {
            'ORGANIZATION': [
                'Representatives', 'Great Author', 'House', 'Parent',
                'House of Representatives', 'Parent of the Human Race',
                'Invisible Hand', 'Human Race', 'Senate', 'Constitution'
            ],
            'PERSON': ['Almighty Being'],
            'GPE': ['Heaven', 'United States', 'American'],
            'DATE': ['every day', 'present month', '14th day', 'years'],
            'ORDINAL': ['first', 'fifth'],
            'NORP': ['republican', 'American'],
            'LAW': ['Constitution']
        }

    """

    def __init__(self, method="spacy"):
        """
        Validate the requested backend and set up the label-normalization map.

        :param method: Extraction backend: 'nltk', 'spacy', or 'all' (default 'spacy')
        :type method: str
        :raises ValueError: if ``method`` is not one of the supported options
        """
        if method not in ["nltk", "spacy", "all"]:
            # ValueError is more precise than a bare Exception, and remains
            # backward-compatible: any caller catching Exception still catches it.
            raise ValueError("Available methods are: 'nltk', 'spacy', 'all'")
        self.method = method
        # Maps the short ACE-style labels some backends emit onto the longer
        # canonical names, so both libraries report a consistent vocabulary.
        self.type_map = {
            "ORG": "ORGANIZATION",
            "PER": "PERSON",
            "LOC": "LOCATION",
            "FAC": "FACILITY",
            "VEH": "VEHICLE",
            "WEA": "WEAPON",
            "GSP": "GPE",
        }
[docs] def extract(self, text): """ :param text: a string from which to extract named entities :type text: str :return: dictionary of entities organized by their category :rtype: dict """ try: text = str(text) except Exception as e: text = decode_text(text) roots = defaultdict(list) if self.method in ["nltk", "all"]: try: tree = ne_chunk(pos_tag(word_tokenize(text))) except LookupError: nltk.download("maxent_ne_chunker") nltk.download("words") tree = ne_chunk(pos_tag(word_tokenize(text)), binary=True) for branch in tree: if type(branch) is nltk.Tree: leaf = [" ".join(x[0] for x in branch.leaves())] key = self.type_map.get(branch.label(), branch.label()) roots[key].extend(leaf) if self.method in ["spacy", "all"]: # SpaCy try: nlp = spacy.load("en_core_web_sm") except OSError: spacy.cli.download("en_core_web_sm") nlp = spacy.load("en_core_web_sm") for entity in nlp(text).ents: entity_text = re.sub( r"^({})\s".format("|".join(stopwords.words("english"))), "", entity.text, ) key = self.type_map.get(entity.label_, entity.label_) roots[key].append(entity_text) return {self.type_map.get(k, k): list(set(v)) for k, v in roots.items()}