Last active
September 28, 2021 15:42
-
-
Save fmalina/9dcfb1e0d7b23e0d67ff41617507b88b to your computer and use it in GitHub Desktop.
Dates extractor with tests to extract dates from HTML and text (regex based parser)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" Extract dates from HTML | |
test using: python date_extract.py -v | |
""" | |
import calendar | |
import re | |
from datetime import date | |
from django.template.defaultfilters import striptags | |
# Only the first _SCAN_LIMIT characters of the input are examined.
_SCAN_LIMIT = 1200

# Lower-case month names, January first. calendar.month_name follows the
# current locale, so these are English names only under an English/C locale.
_MONTH_NAMES = [calendar.month_name[i].lower() for i in range(1, 12 + 1)]
_MONTH_NUMBER = {name: i for i, name in enumerate(_MONTH_NAMES, start=1)}

# [day] month year, day-first, separated by '.', '/' or a space.
# Accepted years: 1995-1999 and 2000-2015, written with 2 or 4 digits.
_DATE_RE = re.compile(
    # Never start inside a number or in the middle of a longer dotted or
    # slashed number ("1985.04.03" must not be read starting at "04.03").
    r"(?<!\d)(?<!\d[./])"
    # Optional day of month, optional ordinal suffix, then a separator.
    r"(?:(?P<day>0?[1-9]|[12][0-9]|3[01]) ?(?:st|nd|rd|th)?[./ ])?"
    # Month as a number or a name, optional comma, then a separator.
    r"(?P<mth>0?[1-9]|1[012]|"
    + "|".join(re.escape(name) for name in _MONTH_NAMES)
    + r") ?,?[./ ]"
    r"(?P<year>(?:19)?9[5-9]|(?:20)?(?:0[0-9]|1[0-5]))"
    # The year must not run on into more digits ("1060" is not 2010), nor be
    # the middle part of a longer numeric date ("02/11/1960" is not Feb 2011).
    r"(?![./]?\d)"
)


def get_date(html):
    """Return the first date found in ``html``, or ``''`` if there is none.

    Tags are stripped from the first 1200 characters of ``html``, whitespace
    runs are collapsed to single spaces and the text is lower-cased. The
    result is then searched for a date of the form ``[day] month year``.
    Numeric dates are read day-first. The month may be a number or a month
    name; when the day is missing it defaults to 1. Only years 1995-2015 are
    recognised, written with two or four digits. For a date range written
    with a shared month ("8-9 May 2014") the last day is returned; for a range
    of two complete dates the first one is returned.

    Only the first match is considered: if it is not a real calendar date
    (for example 31 April) the function returns ``''`` without looking
    further.

    :param html: HTML or plain text to scan; ``None`` and ``''`` yield ``''``.
    :returns: a ``datetime.date``, or the empty string when no date is found.

    >>> pass_test = '''
    ... Dates 8-9 May 2014 lorem ipsum...
    ... Date 30 January 2015 lorem...
    ... Dates 11-12 October 2014
    ... Published date 4 January 2012...
    ... 12 March 2009... date
    ... 18 th - 20 th October 1999 in ...
    ... Dates of ...: 5 th - 6 th April 2000
    ... Dates of ...: 19 th -20 th June, 2001
    ... Date published: March 2015
    ... Dates of ...: 30.10.00- 2.11.00
    ... Dates of ...: 02/11/1999 - 05/11/1999
    ... Dates of ...: 27/01/03 - 29/01/03'''.strip().split('\\n')
    >>> for test in pass_test:
    ...     get_date(test)
    ...
    datetime.date(2014, 5, 9)
    datetime.date(2015, 1, 30)
    datetime.date(2014, 10, 12)
    datetime.date(2012, 1, 4)
    datetime.date(2009, 3, 12)
    datetime.date(1999, 10, 20)
    datetime.date(2000, 4, 6)
    datetime.date(2001, 6, 20)
    datetime.date(2015, 3, 1)
    datetime.date(2000, 10, 30)
    datetime.date(1999, 11, 2)
    datetime.date(2003, 1, 27)
    >>> fail_test = ['02/11/1960', '55/60/1999', '0.0.2099', 'January 1960']
    >>> fail_test = ['bad date: %s ...' % d for d in fail_test]
    >>> for test in fail_test:
    ...     get_date(test)
    ...
    ''
    ''
    ''
    ''
    """
    if not html:
        return ''

    # Strip tags, normalize spaces, lowercase body.
    text = re.sub(r'\s+', ' ', striptags(html[:_SCAN_LIMIT])).lower()
    match = _DATE_RE.search(text)
    if match is None:
        return ''

    month_text = match.group('mth')
    if month_text.isdigit():
        month = int(month_text)
    else:
        month = _MONTH_NUMBER[month_text]

    # A missing day ("March 2015") means the first of the month.
    day = int(match.group('day') or 1)

    # Expand two-digit years: 00-15 -> 2000-2015, 95-99 -> 1995-1999.
    year = int(match.group('year'))
    if year < 50:
        year += 2000
    elif year < 100:
        year += 1900

    try:
        return date(year, month, day)
    except ValueError:
        # Matched the pattern but is not a real date, e.g. 31 April.
        return ''
if __name__ == "__main__":
    # Executed as a script: run the doctests embedded in this module
    # (the module docstring suggests passing -v for a verbose report).
    from doctest import testmod

    testmod()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment