from builtins import object
import re
import datetime
from dateutil.parser import parse
from calendar import IllegalMonthError
class DateFinder(object):
    """
    A helper class to search for dates in text using a series of regular expressions and a parser from \
    :py:mod:`dateutil`. Verifies that :py:mod:`dateutil` did not auto-fill missing values in the date. Time \
    information will be automatically cleared out, but you can also pass a list of additional regular expression \
    patterns (as strings) to define other patterns that should be cleared out before scanning for dates.

    :param preprocessing_patterns: Optional list of additional patterns to clear out prior to searching for dates. \
    A pattern with exactly one capture group clears only the captured text; any other pattern clears its whole match.
    :type preprocessing_patterns: list

    Usage::

        from pewanalytics.text.dates import DateFinder

        text = "January 1, 2018 and 02/01/2019 and Mar. 1st 2020"
        low_bound = datetime.datetime(2017, 1, 1)
        high_bound = datetime.datetime(2021, 1, 1)

        >>> finder = DateFinder()
        >>> dates = finder.find_dates(text, low_bound, high_bound)
        >>> dates
        [
            (datetime.datetime(2018, 1, 1, 0, 0), 'January 1, 2018 '),
            (datetime.datetime(2019, 2, 1, 0, 0), '02/01/2019 '),
            (datetime.datetime(2020, 3, 1, 0, 0), 'Mar. 1st 2020')
        ]

    """

    # The dateutil parser silently fills in missing date components from a default date. Every candidate is
    # parsed once with each of these defaults; the two results only agree when the text supplied the day, the
    # month AND the year itself. For that to work the two defaults must not share a day, a month or a year.
    _DEFAULT_DATE_1 = datetime.datetime(day=1, year=2020, month=9)
    _DEFAULT_DATE_2 = datetime.datetime(day=2, year=1999, month=6)

    def __init__(self, preprocessing_patterns=None):

        # A generally permissive date regex, also fairly prone to false positives. The whole expression sits in
        # a capturing lookahead so that overlapping candidates are all reported by ``findall``.
        # The flags are passed to ``re.compile`` instead of being embedded as ``(?ix)`` in the middle of the
        # pattern: inline global flags that are not at the very start of the expression are rejected by newer
        # versions of the ``re`` module.
        self.date_regex = re.compile(
            r"""(?=(
            \b                          # match a word boundary
            (?:                         # match the following three times:
                (?:                     # either
                    \d+                 # a number,
                    (?:\.|st|nd|rd|th|,)*   # followed by a dot, st, nd, rd, a comma, or th (optional)
                    |                   # or a month name
                    (?:(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*)
                )
                [\s./-]*                # followed by a date separator or whitespace (optional)
            ){3}                        # do this three times
            \b))""",
            re.IGNORECASE | re.VERBOSE,
        )

        # Always remove times, because those trip up dates
        self.preprocessing_patterns = [re.compile(r"((?:\d\d|\d):[0-9][0-9])")]

        # Add in any additional patterns to clear out, as provided by the user
        if preprocessing_patterns:
            self.preprocessing_patterns.extend(
                re.compile(p) for p in preprocessing_patterns
            )

    @staticmethod
    def _strip_match(match):
        """
        Replacement callback for ``re.sub``: return what should remain of a single preprocessing match.

        :param match: A match produced by one of the preprocessing patterns
        :return: The matched text with the unwanted part removed
        :rtype: str
        """

        # Patterns without groups, or with several of them, are cleared out entirely.
        if match.re.groups != 1:
            return ""

        whole = match.group(0)
        group_start, group_end = match.span(1)
        # The group did not take part in the match (span is -1, -1) or lies outside of it (lookaround):
        # there is nothing inside the matched text to remove.
        if group_start < match.start() or group_end > match.end():
            return whole

        offset = match.start()
        return whole[: group_start - offset] + whole[group_end - offset :]

    @staticmethod
    def _as_datetime(value, end_of_day=False):
        """
        Widen a plain ``datetime.date`` to a ``datetime.datetime`` so it can be compared with parsed dates.

        :param value: A cutoff value passed to :py:meth:`find_dates`
        :param end_of_day: Use the last instant of the day instead of midnight (keeps an end cutoff inclusive)
        :type end_of_day: bool
        :return: A datetime for plain dates; any other value is returned unchanged
        """

        if isinstance(value, datetime.date) and not isinstance(value, datetime.datetime):
            clock = datetime.time.max if end_of_day else datetime.time.min
            return datetime.datetime.combine(value, clock)
        return value

    def _preprocess(self, text):
        """
        Return the text without any references to "parts" or specific times.

        :param text: A string to be cleaned that contains a date
        :type text: str
        :return: A cleaned string without additional time info and other boilerplate
        :rtype: str
        """

        # Substituting match by match (rather than collecting the matched strings and calling ``str.replace``
        # on each) keeps a short match such as "1:23" from eating the tail of a longer one such as "11:23".
        for pattern in self.preprocessing_patterns:
            text = pattern.sub(self._strip_match, text)
        return text

    def find_dates(self, text, cutoff_date_start, cutoff_date_end):
        """
        Return all of the dates (in text form and as datetime) in the text variable that fall within the specified \
        window of dates (inclusive). Each (datetime, raw text) pair is returned once, in the order in which it \
        first appears in the text.

        :param text: The text to scan for dates
        :type text: str
        :param cutoff_date_start: No dates will be returned if they fall before this date
        :type cutoff_date_start: `datetime.date` or `datetime.datetime`
        :param cutoff_date_end: No dates will be returned if they fall after this date
        :type cutoff_date_end: `datetime.date` or `datetime.datetime`
        :return: A list of tuples containing (datetime object, raw date text)
        :rtype: list
        """

        if not text:
            return []

        # Parsed dates are datetimes, which cannot be compared with plain dates.
        window_start = self._as_datetime(cutoff_date_start)
        window_end = self._as_datetime(cutoff_date_end, end_of_day=True)

        # Start by stripping out time info, which can confuse the date regex and will lead to text snippets that
        # can't be parsed into dates.
        text = self._preprocess(text)

        found, seen = [], set()
        # Now go through all of the plausible dates in the text. Many of these will be false positives.
        for candidate in self.date_regex.findall(text):
            try:
                first = parse(candidate, fuzzy=True, default=self._DEFAULT_DATE_1)
                second = parse(candidate, fuzzy=True, default=self._DEFAULT_DATE_2)
            except (ValueError, OverflowError, IllegalMonthError):
                # Not a date after all (or a number too large to be one)
                continue

            # If the two parses differ, the parser had to borrow a component from the default date, which
            # means that the candidate was only a partial date.
            if first != second or not (window_start <= first <= window_end):
                continue

            hit = (first, candidate)
            if hit not in seen:
                seen.add(hit)
                found.append(hit)

        return found