Last active
June 3, 2019 11:14
-
-
Save LaBlazer/5f8b0d0cfd97110820104ed252928aa3 to your computer and use it in GitHub Desktop.
Classifies date formats and outputs them in standard format (day/month/year)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Kopirajt LBLZR_ lmao | |
import os.path, codecs, sys | |
# Input files, processed in order; a file that does not exist contributes nothing.
date_files = [f"dates{number}.txt" for number in range(1, 5)]
# Single output file that receives the normalised dates of all inputs.
output_file = "dates_out.txt"
def save_list(listt, filename):
    """Write every item of *listt* to *filename*, one per line.

    The file is created or overwritten, encoded as UTF-8, and each item is
    formatted with ``str.format`` and terminated by ``\\r\\n``.

    Args:
        listt: Iterable of items to write.
        filename: Path of the file to (over)write.
    """
    # Mode "w" already truncates, so no explicit truncate() is needed.
    # newline="" disables newline translation so the explicit "\r\n" is
    # written unchanged on every platform.
    with open(filename, "w", encoding="utf-8", newline="") as fp:
        fp.writelines("{}\r\n".format(item) for item in listt)
def load_list(filename):
    """Read *filename* as UTF-8 and return its lines, whitespace-stripped.

    Args:
        filename: Path of the file to read.

    Returns:
        A list with one stripped string per line (blank lines are kept as
        empty strings), or an empty list when *filename* is not an existing
        regular file.
    """
    if not os.path.isfile(filename):
        return []
    try:
        # newline="" keeps the raw line endings so splitlines() below
        # decides where lines end.
        with open(filename, "r", encoding="utf-8", newline="") as fp:
            text = fp.read()
    except FileNotFoundError:
        # The file vanished between the check and the open; treat it the
        # same as a file that never existed.
        return []
    # Strip surrounding whitespace (including line endings) from each line.
    return [line.strip() for line in text.splitlines()]
def _split_date(date, delimiters=(".", "/")):
    """Split one date string into a tuple of its three integer components."""
    normalized = date
    for delimiter in delimiters:
        normalized = normalized.replace(delimiter, " ")
    parts = normalized.split()
    if len(parts) != 3:
        raise ValueError(f"Expected three date components, got {date!r}")
    return tuple(int(part) for part in parts)


def _classify_columns(rows):
    """Return the role ("day", "month" or "year") of each of the three columns."""
    if not rows:
        return ["unknown"] * 3
    guesses = []
    for column in zip(*rows):
        largest = max(column)
        if largest > 31:
            guesses.append("year")   # too large for a day or a month
        elif largest > 12:
            guesses.append("day")    # too large for a month
        else:
            guesses.append("month")
    # Trust only roles claimed by exactly one column; the remaining columns
    # receive the missing roles in day, month, year order.
    roles = [guess if guesses.count(guess) == 1 else None for guess in guesses]
    missing = [role for role in ("day", "month", "year") if role not in roles]
    if missing:
        print("Warning: date format is ambiguous, assuming day/month/year "
              "order for the unresolved columns")
    fill = iter(missing)
    return [role if role is not None else next(fill) for role in roles]


def process_dates(dates):
    """Detect the component order of *dates* and rewrite them as day/month/year.

    Each date is split on ".", "/" or whitespace into three integers. The
    role of every column is decided from its largest value across all dates:
    above 31 means year, 13 to 31 means day, anything else month. When that
    does not single out one column per role (for example every day is 12 or
    less, or the years have two digits), the undecided columns are assumed to
    follow day, month, year order and a warning is printed.

    Args:
        dates: Iterable of date strings; blank entries are skipped.

    Returns:
        A list of "day/month/year" strings without leading zeros, in input
        order.

    Raises:
        ValueError: If a non-blank entry does not consist of exactly three
            integer components.
    """
    print("Classifying date format")
    rows = [_split_date(date) for date in dates if date.strip()]
    roles = _classify_columns(rows)
    print(f"Date format: {roles[0]}/{roles[1]}/{roles[2]}")
    print("Changing to standard format (day/month/year)...")
    if not rows:
        return []
    # type_lookup[k] is the source column holding the k-th output component
    # (0 = day, 1 = month, 2 = year).
    type_lookup = [roles.index(role) for role in ("day", "month", "year")]
    print(f"Lookup table: {type_lookup}")
    return [
        f"{row[type_lookup[0]]}/{row[type_lookup[1]]}/{row[type_lookup[2]]}"
        for row in rows
    ]
# Convert every input file in turn and gather the results in one list.
dates_out = []
for filename in date_files:
    print(f"Processing file {filename}")
    dates_out += process_dates(load_list(filename))

# Write the combined, normalised dates to the single output file.
print(f"Saving dates to {output_file}")
save_list(dates_out, output_file)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment