Skip to content

Instantly share code, notes, and snippets.

@ufukhurriyetoglu
Forked from ryukinix/dateparser_lark.py
Created August 21, 2019 14:34
Show Gist options
  • Save ufukhurriyetoglu/e8e28f1997c5d310f50ef2adac57aa32 to your computer and use it in GitHub Desktop.
Save ufukhurriyetoglu/e8e28f1997c5d310f50ef2adac57aa32 to your computer and use it in GitHub Desktop.
An EBNF grammar based in the Lark Parser designed to parse multivariate date formats. (PT_BR)
#!/usr/bin/env python3
# coding: utf-8
#
# Copyright © Neoway Business Solutions
#
# @project: Diário Oficial
# @author: Manoel Vilela
# @email: [email protected]
#
"""
An EBNF grammar, based on the Lark parser, designed to parse
multiple date formats. The grammar was written with Portuguese in mind.
"""
from lark import exceptions as LarkExceptions
from lark import Lark, InlineTransformer # pip install lark-parser
from datetime import datetime
# strftime pattern used by parse() as the default output format (dd/mm/yyyy).
default_date_format = "%d/%m/%Y"
# Lark grammar describing the accepted date layouts:
#   * `date` is day/month/year separated by "de", "/", "-", "." or nothing;
#   * `day` and `year` are bare INT tokens;
#   * `month` is either a Portuguese month name/abbreviation or a number
#     1-12 (with an optional leading zero for 1-9);
#   * every month alternative is aliased (`-> jan`, `-> fev`, ...) to a
#     three-letter label that is also a key of `month_dict`;
#   * whitespace is ignored between tokens.
# The literals are lowercase; parse() lowercases its input before parsing.
# NOTE(review): the leading `|` right after `month_name:` and `month_number:`
# looks like it introduces an empty alternative, which would let a month match
# nothing -- confirm against the Lark grammar reference.
# NOTE(review): common.WORD and common.DIGIT are imported but not referenced
# by any rule in this grammar.
grammar = '''\
?date: day "de" month "de" year
| day "/" month "/" year
| day "-" month "-" year
| day "." month "." year
| day month year
day: INT
year: INT
month: month_name | month_number
month_name:
| ("janeiro" | "jan") -> jan
| ("fevereiro" | "fev") -> fev
| ("março" | "mar") -> mar
| ("abril" | "abr") -> abr
| ("maio" | "mai") -> mai
| ("junho" | "jun") -> jun
| ("julho" | "jul") -> jul
| ("agosto" | "ago") -> ago
| ("setembro" | "set") -> set
| ("outubro" | "out") -> out
| ("novembro" | "nov") -> nov
| ("dezembro" | "dez") -> dez
month_number:
| ["0"] "1" -> jan
| ["0"] "2" -> fev
| ["0"] "3" -> mar
| ["0"] "4" -> abr
| ["0"] "5" -> mai
| ["0"] "6" -> jun
| ["0"] "7" -> jul
| ["0"] "8" -> ago
| ["0"] "9" -> set
| "10" -> out
| "11" -> nov
| "12" -> dez
%import common.WORD
%import common.INT
%import common.DIGIT
%import common.WS
%ignore WS
'''
# Month alias label (as emitted by the grammar) -> calendar month number.
# The labels are listed in calendar order, so numbering them from 1 yields
# jan=1 ... dez=12.
month_dict = {
    label: number
    for number, label in enumerate(
        (
            "jan", "fev", "mar", "abr", "mai", "jun",
            "jul", "ago", "set", "out", "nov", "dez",
        ),
        start=1,
    )
}
# Module-level parser, built once at import time from `grammar`, using the
# `date` rule as the entry point.
parser = Lark(grammar, start='date')
class NaturalDateTree(InlineTransformer):
    """Transform a parse tree built from ``grammar`` into a ``datetime``.

    Each method/attribute is the callback for the grammar rule of the same
    name; because this is an inline transformer, a rule's children are
    passed as positional arguments.
    """

    # `day` and `year` each wrap a single INT token: convert it to an int.
    day = int
    year = int

    def month(self, tree):
        """Return the calendar month number for a matched month.

        ``tree`` is the aliased subtree produced by ``month_name`` or
        ``month_number``; its ``data`` attribute is the alias label
        (``jan``, ``fev``, ...), which is looked up in ``month_dict``.
        """
        return month_dict[tree.data]

    def date(self, day, month, year):
        """Build a ``datetime`` from the converted day, month and year.

        Years below 100 are treated as two-digit years and expanded around
        a pivot of 30: 0-29 become 2000-2029 and 30-99 become 1930-1999.
        Years of 100 or more are used unchanged.

        Returns:
            A ``datetime`` at midnight of the given day, or ``None`` when
            the components do not form a valid date (e.g. 31 February).
        """
        if year < 30:
            year += 2000
        elif year < 100:
            # Includes 30 itself, so no two-digit year is left unexpanded.
            year += 1900
        try:
            return datetime(year, month, day)
        except ValueError:
            return None
def parse_date(expr):
    """Parse *expr* with the module-level parser and transform the tree.

    Returns whatever ``NaturalDateTree`` produces for the ``date`` rule:
    a ``datetime``, or ``None`` for an impossible calendar date. Parsing
    errors raised by Lark are not handled here.
    """
    transformer = NaturalDateTree()
    syntax_tree = parser.parse(expr)
    return transformer.transform(syntax_tree)
def parse(date_string, date_format=default_date_format):
    """Parse a free-form date string and re-format it.

    The input is lowercased before parsing because the grammar only
    contains lowercase month names.

    Args:
        date_string: Text holding a single date, e.g. ``"1 de fevereiro de
            2018"`` or ``"02-08-2018"``.
        date_format: ``strftime`` pattern for the result; defaults to
            ``default_date_format``.

    Returns:
        The formatted date string, or ``None`` when the text cannot be
        parsed or describes an impossible date.
    """
    try:
        parsed_date = parse_date(date_string.lower())
    except (LarkExceptions.UnexpectedInput, LarkExceptions.ParseError):
        # UnexpectedInput covers unexpected characters, tokens and premature
        # end of input; ParseError covers parsers that report an incomplete
        # parse that way. All of them mean "not a date we understand".
        return None
    if parsed_date is None:
        return None
    return parsed_date.strftime(date_format)
def run_tests():
    """Run ``parse`` over a table of sample inputs and assert the outputs.

    Each case pairs a raw input string with the expected formatted date,
    or ``None`` when the input should be rejected.
    """
    cases = (
        ('02-08-2018', '02/08/2018'),
        ('1.03.2018', '01/03/2018'),
        ('1 de fevereiro de 2018', "01/02/2018"),
        ('02 fevereiro 2018', "02/02/2018"),
        ('28FEV2017', '28/02/2017'),
        ('31fev2019', None),
        ('shitty string', None),
        ('10 Mai 17', '10/05/2017'),
        ('08 Jun 98', '08/06/1998'),
    )
    for raw_text, expected in cases:
        parsed = parse(raw_text)
        assert parsed == expected, f"Expected: {expected!r}, but: {parsed!r}"
# Run the built-in checks only when the file is executed as a script.
if __name__ == "__main__":
    run_tests()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment