# Source code for pewanalytics.text.dates

from builtins import object
import re
import datetime

from dateutil.parser import parse
from calendar import IllegalMonthError


class DateFinder(object):

    """
    A helper class to search for dates in text using a permissive regular expression and the parser from
    :py:mod:`dateutil`. Verifies that :py:mod:`dateutil` did not auto-fill missing values in the date. Time
    information (``H:MM`` / ``HH:MM``) is always cleared out before scanning, and you can also pass a list of
    additional regular expression patterns (as strings) to define other text that should be cleared out first.

    :param preprocessing_patterns: Optional list of additional patterns to clear out prior to searching for dates.
    :type preprocessing_patterns: list

    Usage::

        import datetime
        from pewanalytics.text.dates import DateFinder

        text = "January 1, 2018 and 02/01/2019 and Mar. 1st 2020"
        low_bound = datetime.datetime(2017, 1, 1)
        high_bound = datetime.datetime(2021, 1, 1)

        >>> finder = DateFinder()
        >>> dates = finder.find_dates(text, low_bound, high_bound)
        >>> dates
        [
            (datetime.datetime(2018, 1, 1, 0, 0), 'January 1, 2018 '),
            (datetime.datetime(2019, 2, 1, 0, 0), '02/01/2019 '),
            (datetime.datetime(2020, 3, 1, 0, 0), 'Mar. 1st 2020')
        ]

    """

    def __init__(self, preprocessing_patterns=None):

        # A generally permissive date regex, also fairly prone to false positives.
        # The flags are passed to re.compile() rather than written inline: an inline "(?ix)" that is not at the
        # very start of the pattern is rejected by Python 3.11+.
        self.date_regex = re.compile(
            r"""(?=(                            # capture inside a lookahead so overlapping candidates are found
            \b                                  # match a word boundary
            (?:                                 # match the following three times:
             (?:                                # either
              \d+                               # a number,
              (?:\.|st|nd|rd|th|,)*             # followed by a dot, st, nd, rd, a comma, or th (optional)
              |                                 # or a month name
              (?:(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*)
             )
             [\s./-]*                           # followed by a date separator or whitespace (optional)
            ){3}                                # do this three times
            \b))""",
            re.IGNORECASE | re.VERBOSE,
        )

        # Always remove times, because those trip up dates
        self.preprocessing_patterns = [re.compile(r"((?:\d\d|\d):[0-9][0-9])")]
        # Add in any additional patterns to clear out, as provided by the user
        if preprocessing_patterns:
            self.preprocessing_patterns.extend(
                re.compile(p) for p in preprocessing_patterns
            )

    @staticmethod
    def _strip_pattern(pattern, text):

        """
        Delete every match of one compiled pattern from the text.

        If the pattern has exactly one capturing group, only the text captured by that group is deleted; otherwise
        the whole match is deleted.

        :param pattern: A compiled regular expression
        :param text: The string to clean
        :type text: str
        :return: The text with the matched spans removed
        :rtype: str
        """

        single_group = pattern.groups == 1
        kept, cursor = [], 0
        for match in pattern.finditer(text):
            start, end = match.span(1) if single_group else match.span()
            if end <= cursor:
                # Either the group did not take part in the match (span is (-1, -1)) or the span lies entirely
                # inside text that has already been removed.
                continue
            # Clip spans that overlap an earlier removal (possible when the group sits inside a lookaround)
            start = max(start, cursor)
            kept.append(text[cursor:start])
            cursor = end
        kept.append(text[cursor:])
        return "".join(kept)

    def _preprocess(self, text):

        """
        Return the text with every match of the preprocessing patterns (times, plus any user-supplied patterns)
        removed. Patterns are applied one after another, in the order they were registered.

        :param text: A string to be cleaned that contains a date
        :type text: str
        :return: A cleaned string without time info and other user-specified boilerplate
        :rtype: str
        """

        # Spans are removed by position rather than by literal string replacement, so that deleting one match
        # (e.g. "1:11") can never eat part of another one (e.g. "21:11") and leave stray digits behind.
        for pattern in self.preprocessing_patterns:
            text = self._strip_pattern(pattern, text)

        return text

    def find_dates(self, text, cutoff_date_start, cutoff_date_end):

        """
        Return all of the dates (in text form and as datetime) in the text variable that fall within the specified
        window of dates (inclusive). Duplicates are removed and results are returned in order of first appearance.

        :param text: The text to scan for dates
        :type text: str
        :param cutoff_date_start: No dates will be returned if they fall before this date. Must be comparable \
            with the naive `datetime.datetime` values produced by the parser.
        :type cutoff_date_start: `datetime.datetime`
        :param cutoff_date_end: No dates will be returned if they fall after this date. Must be comparable \
            with the naive `datetime.datetime` values produced by the parser.
        :type cutoff_date_end: `datetime.datetime`
        :return: A list of tuples containing (datetime object, raw date text)
        :rtype: list
        """

        # Start by stripping out time info, which can confuse the date regex and will lead to text snippets that
        # can't be parsed into dates.
        text = self._preprocess(text)

        # The dateutil parser fills in missing date components with the current day's date, or with a default
        # date you can pass it. This is annoying, because partial dates could show up in bulk, leading to odd
        # clumps of dates on the same date that you parsed. Our solution is to parse each candidate twice with two
        # different default dates. If the two results are the same, then the default date was not used.
        # The contents of these dates don't matter much. However, they cannot have a month, day OR year in common,
        # otherwise this won't work.
        default_date_1 = datetime.datetime(day=1, year=2020, month=9)
        default_date_2 = datetime.datetime(day=2, year=1999, month=6)

        final_dates, seen = [], set()

        # Every regex hit is only a plausible date; many of these will be false positives.
        for date_raw_text in self.date_regex.findall(text):

            try:
                datetime_1 = parse(date_raw_text, fuzzy=True, default=default_date_1)
                datetime_2 = parse(date_raw_text, fuzzy=True, default=default_date_2)
            except (ValueError, OverflowError, IllegalMonthError):
                # Not parseable as a date (OverflowError covers absurdly long digit runs); skip the candidate
                continue

            if (
                datetime_1 == datetime_2
                and cutoff_date_start <= datetime_1 <= cutoff_date_end
            ):
                date = (datetime_1, date_raw_text)
                if date not in seen:
                    seen.add(date)
                    final_dates.append(date)

        return final_dates