Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save inconvergent/68925fec3c1a82488473b2cc4f652899 to your computer and use it in GitHub Desktop.
Save inconvergent/68925fec3c1a82488473b2cc4f652899 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3.11
import re
import pandas as pd
# Regex options passed to every re.compile call in this file: case-insensitive,
# multi-line anchors, '.' matching newlines, and Unicode character classes.
flags = re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE
def strip_dot(s):
    """Return `s` with every '.' character removed."""
    return s.replace('.', '')
def norm(s):
    """Normalize a raw cell value into a single-line, dot-free string.

    Strings are used as-is and ints/floats are converted with ``str``; any
    other value (e.g. ``None``) becomes ''. The text 'nan' (which is what
    ``str`` yields for a float NaN) is treated as empty.

    The text is then stripped, its lines are joined with single spaces,
    lowercase English ordinal words 'first'..'seventh' are replaced by their
    digit, and every '.' is removed.

    Returns:
        The normalized string (possibly empty).
    """
    if isinstance(s, str):
        text = s
    elif isinstance(s, (float, int)):
        text = str(s)
    else:
        text = ''
    if text == 'nan':
        text = ''

    text = ' '.join(text.strip().splitlines())

    # 'forth' is kept because it was the spelling originally handled; the
    # correct spelling 'fourth' was previously left unnormalized.
    ordinal_digits = (
        ('first', '1'),
        ('second', '2'),
        ('third', '3'),
        ('forth', '4'),
        ('fourth', '4'),
        ('fifth', '5'),
        ('sixth', '6'),
        ('seventh', '7'),
    )
    for word, digit in ordinal_digits:
        text = text.replace(word, digit)

    # Dots are dropped last, so 'U.1' becomes 'U1'.
    return text.replace('.', '')
# Patterns for floor/level labels. All are compiled with the shared `flags`.
#
# NOTE: inside a character class `[...]` the characters `|` and `.` are
# literals, not alternation / wildcard. The classes below therefore list only
# the characters that are meant to match, and multi-character alternatives
# use a non-capturing group `(?:a|b)`.

# 'U' or 'P', at most one word/whitespace character, then a digit.
# Group 1 is the letter, group 2 the digit.
re_u_or_p = re.compile(r'([UP])[\w\s]?([0-9])', flags)
# 'plan', an optional whitespace or dot, then a digit (group 2).
re_plan = re.compile(r'(plan)[\s.]?([0-9])', flags)
# 'pu', at most one word/whitespace character, then a digit (group 2).
re_pu = re.compile(r'(pu)[\w\s]?([0-9])', flags)
# A digit (group 1), optional whitespace/dots, then 'etasje', 'etg' or 'et'.
re_xetg = re.compile(r'([0-9])[\s.]*(?:etasje|etg|et)', flags)
# 'etg', an optional whitespace or dot, then a digit (group 1).
re_etgx = re.compile(r'etg[\s.]?([0-9])', flags)
# 'flor'/'floor', an optional whitespace or dot, then a digit (group 1).
re_floorx = re.compile(r'floo?r[\s.]?([0-9])', flags)
# A digit (group 1), one whitespace or dot, then 'flor'/'floor'.
re_xfloor = re.compile(r'([0-9])[\s.]floo?r', flags)
def check_UP(s):
    """Return all re_u_or_p matches found anywhere in `s`, or None.

    Each match is a (letter, digit) tuple as produced by ``findall`` on a
    pattern with two groups. An empty result is reported as None.
    """
    pairs = re_u_or_p.findall(s)
    return pairs or None
def check_plan(s):
    """Return ['P<digit>'] if `s` starts with a re_plan match, else None."""
    hit = re_plan.match(s)
    return None if hit is None else ['P' + hit.group(2)]
def check_PU(s):
    """Return ['U<digit>'] if `s` starts with a re_pu match, else None."""
    hit = re_pu.match(s)
    return None if hit is None else ['U' + hit.group(2)]
def check_xetg(s):
    """Return ['P<digit>'] if `s` starts with a re_xetg match, else None."""
    hit = re_xetg.match(s)
    return None if hit is None else ['P' + hit.group(1)]
def check_etgx(s):
    """Return ['P<digit>'] if `s` starts with a re_etgx match, else None."""
    hit = re_etgx.match(s)
    return None if hit is None else ['P' + hit.group(1)]
def tx(r):
    """Normalize `r` and return the first non-None checker result, or None.

    Checkers are tried in a fixed order (PU, U/P, plan, etg-x, x-etg) and
    evaluation stops at the first one that reports a result.
    """
    text = norm(r)
    checkers = (check_PU, check_UP, check_plan, check_etgx, check_xetg)
    # Generator keeps evaluation lazy: later checkers are not called once
    # an earlier one has produced a result.
    results = (checker(text) for checker in checkers)
    return next((found for found in results if found is not None), None)
# Digit characters that tx_mark looks for in a normalized value.
numbers = {'1', '2', '3', '4', '5', '6', '7'}
# Symbols that, together with a digit, make tx_mark flag a value.
warnsym = {'-', ','}
def tx_mark(r):
    """Return 'x' if normalized `r` has both a digit and a warning symbol.

    The digit must be in `numbers` and the symbol in `warnsym`; otherwise
    the empty string is returned.
    """
    chars = set(norm(r))
    if chars & numbers and chars & warnsym:
        return 'x'
    return ''
def row_itr(df):
    """Yield output rows for `df`, one per label that tx() finds in 'raw'.

    For a row where tx() finds nothing, a copy is yielded with 'hrs3' set to
    the row's 'hrs'. Otherwise one copy is yielded per label with:
      - 'raw3': the upper-cased label (tuple labels are concatenated),
      - 'hrs3': 'hrs' for string labels, 'hrs' divided by the number of
        labels for tuple labels,
      - 'split': 's' when more than one label was found, else ''.
    """
    for _, row in df.iterrows():
        found = tx(row['raw'])

        # No label recognised: pass the row through with its hours unchanged.
        if not found:
            passthrough = row.copy()
            passthrough['hrs3'] = row['hrs']
            yield passthrough
            continue

        labels = list(found)
        count = len(labels)
        split_flag = 's' if count > 1 else ''
        for label in labels:
            out = row.copy()
            if isinstance(label, str):
                out['raw3'] = label.upper()
                out['hrs3'] = row['hrs']
            elif isinstance(label, tuple):
                out['raw3'] = f'{label[0]}{label[1]}'.upper()
                # Hours are shared equally between the labels of one row.
                out['hrs3'] = row['hrs'] / count
            out['split'] = split_flag
            yield out
def main():
    """Read the input workbook, expand rows via row_itr, and write the result.

    Prints the sum of 'hrs' before and the sum of 'hrs3' after expansion so
    the two totals can be compared, then prints each output row's 'raw' value
    next to its normalized 'raw3' label, and finally writes the expanded
    frame to 'out-<input file name>'.
    """
    # Input workbook; alternatives seen in this script: "dat3.xlsx", "dat-ex.xlsx".
    fn = "dat-4.xlsx"
    df = pd.read_excel(fn)
    ins = df.shape
    initial_sum = df['hrs'].sum()
    print(initial_sum)

    df2 = pd.DataFrame(list(row_itr(df)))
    final_sum = df2['hrs3'].sum()
    print(final_sum)

    # 'raw3' only exists on the expanded frame, so report from df2 rather
    # than df. Rows without a label have no/NaN 'raw3'; .get() plus norm()
    # turn both into ''.
    for _, row in df2.iterrows():
        print( row['raw'],' | ', norm(row.get('raw3')))

    print('fn: ', fn , ' in:', ins, 'out:', df2.shape)
    df2.to_excel(f'out-{fn}')
# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment