Last active
April 3, 2020 04:35
-
-
Save saurabhwahile/efbcdbf12be8c5df48873a1404f07b79 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
from datetime import datetime, timedelta | |
import re | |
from io import BytesIO | |
import subprocess | |
import base64 | |
import imaplib | |
import email | |
import camelot | |
import tempfile | |
import pandas as pd | |
STATEMENT_PASSWORD = '<password>' | |
""" | |
Pass a file buffer like object ex: open('file.pdf', 'wb') and the password to the statement file | |
""" | |
def parse_ecas(f_buffer, password=None): | |
f_temp = tempfile.TemporaryFile('w+b', suffix='.pdf', delete=False) | |
f_temp.write(f_buffer.read()) | |
f_temp.close() | |
tables = camelot.read_pdf(f_temp.name, password=password, flavor='stream', pages='all') | |
os.unlink(f_temp.name) | |
funds_list = [] | |
data = None | |
for table in tables: | |
if len(table.df.columns)!=7: | |
continue | |
for i, row in table.df.iterrows(): | |
if not pd.isnull(row[0]) and re.match('(\d*)\/(\d*)',row[0]): | |
if data: | |
funds_list.append(data) | |
data = { | |
"folio_no" : row[0], | |
"scheme_name":row[1], | |
"unit_balance":row[2], | |
"dt":row[3], | |
"nav":row[4], | |
"market_value":row[5], | |
"registrar":row[6] | |
} | |
else: | |
if data: | |
data['scheme_name']+= ' '+ row[1] if not pd.isnull(row[1]) else '' | |
funds_list.append(data) | |
df = pd.DataFrame(funds_list) | |
df['dt'] = pd.to_datetime(df['dt']) | |
df['market_value'] = df['market_value'].str.replace(',','').astype(float) | |
df['nav'] = df['nav'].str.replace(',','').astype(float) | |
df['unit_balance'] = df['unit_balance'].str.replace(',','').astype(float) | |
return df | |
def parse_ecas_transactions(f_buffer, password=None): | |
f_temp = tempfile.TemporaryFile('w+b', suffix='.pdf', delete=False) | |
f_temp.write(f_buffer.read()) | |
f_temp.close() | |
tables = camelot.read_pdf(f_temp.name, password=password, flavor='stream', pages='all') | |
os.unlink(f_temp.name) | |
data = [] | |
state = 0 | |
for t in tables: | |
for i, row in t.df.iterrows(): | |
re_folio_no = re.match('Folio No:(.*)', row[0]) | |
re_transaction_dt = re.match('(\d*-[a-zA-Z]*-\d*)', row[0]) | |
if re_transaction_dt and not (pd.isnull(row[3]) or row[3]==''): | |
transaction_dt = re_transaction_dt[1] | |
re_fund_name = re.match('(.*)\(Advisor:(.*)', fund_name) | |
if re_fund_name: | |
fund_name = re_fund_name[1] | |
data.append({ | |
'folio_no': folio_no.replace(' ', ''), | |
'fund_name': fund_name, | |
'dt': row[0], | |
'type': row[1], | |
'amount': row[3], | |
'units': row[4], | |
'nav': row[5], | |
'unit_balance': row[6] | |
}) | |
state = 2 | |
if state==1: | |
fund_name += row[0] | |
if re_folio_no: | |
folio_no = re_folio_no[1] | |
fund_name = '' | |
state = 1 | |
df = pd.DataFrame(data) | |
df['dt'] = pd.to_datetime(df['dt']) | |
df['amount'] = df['amount'].str.replace(',','').astype(float) | |
df['units'] = df['units'].str.replace(',','').astype(float) | |
df['nav'] = df['nav'].str.replace(',','').astype(float) | |
df['unit_balance'] = df['unit_balance'].str.replace(',','').astype(float) | |
return df | |
""" | |
Download the tika-app here https://tika.apache.org/download.html | |
""" | |
def read_pdf_tika(file_buffer, password): | |
p = subprocess.Popen(f'java -jar C:\\bin\\tika-app.jar --password={password} --text', shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, stdin=subprocess.PIPE) | |
return p.communicate(file_buffer.read())[0] | |
""" | |
Checks if the statement is a summary statement or if its a detailed statement | |
""" | |
def is_ecas_summary(file_buffer, password): | |
text = read_pdf_tika(file_buffer, password) | |
detailed_statement_date = None | |
for line in text.decode('utf-8',errors='ignore').split('\n'): | |
re_date_summary = re.match('(.*)As on (\d*-[a-zA-Z]*-\d*)(.*)', line) | |
if re_date_summary: | |
print(f'Summary | {re_date_summary[2]}') | |
return True | |
re_date_detailed = re.match('(.*)(\d*-[a-zA-Z]*-\d*) To (\d*-[a-zA-Z]*-\d*)(.*)', line) | |
if re_date_detailed: | |
detailed_statement_date = re_date_detailed[0] | |
print(f'Detailed Statement | {detailed_statement_date}') | |
return False | |
""" | |
Scan your inbox for E-Cas mails | |
""" | |
def scan_ecas_statement(FROM_EMAIL, FROM_PWD, SMTP_SERVER, SMTP_PORT): | |
mail = imaplib.IMAP4_SSL(SMTP_SERVER) | |
mail.login(FROM_EMAIL,FROM_PWD) | |
mail.select('inbox') | |
todays_date = datetime.strftime(datetime.now()-timedelta(days=1), "%d-%b-%Y") | |
tomorrows_date = datetime.strftime(datetime.now()+timedelta(days=1), "%d-%b-%Y") | |
typ, data = mail.search(None, 'SUBJECT "Consolidated Account Statement - CAMS Mailback Request" (SINCE "'+todays_date+'" BEFORE "'+tomorrows_date+'")') | |
mail_ids = data[0].decode() | |
id_list = mail_ids.split() | |
if len(id_list) == 0: | |
print('No Mails Received') | |
return | |
id_list.sort(reverse=True) | |
for i in id_list: | |
typ, data = mail.fetch(str(i), '(RFC822)' ) | |
for response_part in data: | |
if isinstance(response_part, tuple): | |
msg = email.message_from_string(response_part[1].decode()) | |
print(i) | |
for part in msg.walk(): | |
if part.get_content_maintype() == 'multipart': | |
continue | |
if 'application/octet-stream' not in part.get('Content-Type'): | |
continue | |
statement_file = BytesIO(part.get_payload(decode=True)) | |
if is_ecas_summary(statement_file, STATEMENT_PASSWORD): | |
statement_file.seek(0) | |
df = parse_ecas(statement_file, STATEMENT_PASSWORD) | |
else: | |
statement_file.seek(0) | |
df = parse_ecas_transactions(statement_file, STATEMENT_PASSWORD) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment