saurabhwahile · April 3, 2020 04:35
diff --git a/parse_ecas.py b/parse_ecas.py
 import os
 import sys
 from datetime import datetime, timedelta
 import re
 from io import BytesIO
 import subprocess
 import base64
 import imaplib
 import email

 import camelot
 import tempfile
 import pandas as pd

 STATEMENT_PASSWORD = '<password>'

 """
 Pass a file buffer like object ex: open('file.pdf', 'wb') and the password to the statement file
 """
 def parse_ecas(f_buffer, password=None):
    f_temp = tempfile.TemporaryFile('w+b', suffix='.pdf', delete=False)
    f_temp.write(f_buffer.read())
    f_temp.close()
    tables = camelot.read_pdf(f_temp.name, password=password, flavor='stream', pages='all')
    os.unlink(f_temp.name)
    
    funds_list = []
    data = None
    for table in tables:
        if len(table.df.columns)!=7:
            continue
            
        for i, row in table.df.iterrows():
            if not pd.isnull(row[0]) and re.match('(\d*)\/(\d*)',row[0]):
                if data:
                    funds_list.append(data)
                data = {
                    "folio_no" : row[0],
                    "scheme_name":row[1],
                    "unit_balance":row[2],
                    "dt":row[3],
                    "nav":row[4],
                    "market_value":row[5],
                    "registrar":row[6]
                }
            else:
                if data:
                    data['scheme_name']+= ' '+ row[1] if not pd.isnull(row[1]) else ''

    funds_list.append(data)
    df = pd.DataFrame(funds_list)
    
    df['dt'] = pd.to_datetime(df['dt'])
    df['market_value'] = df['market_value'].str.replace(',','').astype(float)
    df['nav'] = df['nav'].str.replace(',','').astype(float)
    df['unit_balance'] = df['unit_balance'].str.replace(',','').astype(float)
    return df
 
 def parse_ecas_transactions(f_buffer, password=None):
    f_temp = tempfile.TemporaryFile('w+b', suffix='.pdf', delete=False)
    f_temp.write(f_buffer.read())
    f_temp.close()
    tables = camelot.read_pdf(f_temp.name, password=password, flavor='stream', pages='all')
    os.unlink(f_temp.name)
    
    data = []
    state = 0 
    for t in tables:
        for i, row in t.df.iterrows():
            re_folio_no = re.match('Folio No:(.*)', row[0])
            re_transaction_dt = re.match('(\d*-[a-zA-Z]*-\d*)', row[0])

            if re_transaction_dt and not (pd.isnull(row[3]) or row[3]==''):
                transaction_dt = re_transaction_dt[1]
                re_fund_name = re.match('(.*)\(Advisor:(.*)', fund_name)
                if re_fund_name:
                    fund_name = re_fund_name[1]
                data.append({
                    'folio_no': folio_no.replace(' ', ''),
                    'fund_name': fund_name,
                    'dt': row[0],
                    'type': row[1],
                    'amount': row[3],
                    'units': row[4],
                    'nav': row[5],
                    'unit_balance': row[6]

                })
                state = 2

            if state==1:
                fund_name += row[0]

            if re_folio_no:
                folio_no = re_folio_no[1]
                fund_name = ''
                state = 1
                
    df = pd.DataFrame(data)
    df['dt'] = pd.to_datetime(df['dt'])
    df['amount'] = df['amount'].str.replace(',','').astype(float)
    df['units'] = df['units'].str.replace(',','').astype(float)
    df['nav'] = df['nav'].str.replace(',','').astype(float)
    df['unit_balance'] = df['unit_balance'].str.replace(',','').astype(float)
    
    return df

 """
 Download the tika-app here https://tika.apache.org/download.html
 """
 def read_pdf_tika(file_buffer, password):
    p = subprocess.Popen(f'java -jar C:\\bin\\tika-app.jar --password={password} --text', shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, stdin=subprocess.PIPE)
    return p.communicate(file_buffer.read())[0]
 
 """
 Checks if the statement is a summary statement or if its a detailed statement
 """
 def is_ecas_summary(file_buffer, password):
    text = read_pdf_tika(file_buffer, password)
    detailed_statement_date = None
    for line in text.decode('utf-8',errors='ignore').split('\n'):
        re_date_summary = re.match('(.*)As on (\d*-[a-zA-Z]*-\d*)(.*)', line)
        if re_date_summary:
            print(f'Summary | {re_date_summary[2]}')
            return True
        re_date_detailed = re.match('(.*)(\d*-[a-zA-Z]*-\d*) To (\d*-[a-zA-Z]*-\d*)(.*)', line)
        if re_date_detailed:
            detailed_statement_date = re_date_detailed[0]
    print(f'Detailed Statement | {detailed_statement_date}')
    return False
  
 """
 Scan your inbox for E-Cas mails
 """
 def scan_ecas_statement(FROM_EMAIL, FROM_PWD, SMTP_SERVER, SMTP_PORT):
    mail = imaplib.IMAP4_SSL(SMTP_SERVER)
    mail.login(FROM_EMAIL,FROM_PWD)
    mail.select('inbox')

    todays_date = datetime.strftime(datetime.now()-timedelta(days=1), "%d-%b-%Y")
    tomorrows_date = datetime.strftime(datetime.now()+timedelta(days=1), "%d-%b-%Y")
    
    typ, data = mail.search(None, 'SUBJECT "Consolidated Account Statement - CAMS Mailback Request" (SINCE "'+todays_date+'" BEFORE "'+tomorrows_date+'")')
    mail_ids = data[0].decode()

    id_list = mail_ids.split()
    if len(id_list) == 0:
        print('No Mails Received')
        return

    id_list.sort(reverse=True)

    for i in id_list:
        typ, data = mail.fetch(str(i), '(RFC822)' )
        for response_part in data:
            if isinstance(response_part, tuple):
                msg = email.message_from_string(response_part[1].decode())
                print(i)
                for part in msg.walk():
                    if part.get_content_maintype() == 'multipart':
                        continue
                    if 'application/octet-stream' not in part.get('Content-Type'):
                        continue
                    statement_file = BytesIO(part.get_payload(decode=True))
                    if is_ecas_summary(statement_file, STATEMENT_PASSWORD):
                        statement_file.seek(0)
                        df = parse_ecas(statement_file, STATEMENT_PASSWORD)
                    else:
                        statement_file.seek(0)
                        df = parse_ecas_transactions(statement_file, STATEMENT_PASSWORD)
	import os
	import sys
	from datetime import datetime, timedelta
	import re
	from io import BytesIO
	import subprocess
	import base64
	import imaplib
	import email

	import camelot
	import tempfile
	import pandas as pd

	STATEMENT_PASSWORD = '<password>'

	"""
	Pass a file buffer like object ex: open('file.pdf', 'wb') and the password to the statement file
	"""
	def parse_ecas(f_buffer, password=None):
	f_temp = tempfile.TemporaryFile('w+b', suffix='.pdf', delete=False)
	f_temp.write(f_buffer.read())
	f_temp.close()
	tables = camelot.read_pdf(f_temp.name, password=password, flavor='stream', pages='all')
	os.unlink(f_temp.name)

	funds_list = []
	data = None
	for table in tables:
	if len(table.df.columns)!=7:
	continue

	for i, row in table.df.iterrows():
	if not pd.isnull(row[0]) and re.match('(\d)\/(\d)',row[0]):
	if data:
	funds_list.append(data)
	data = {
	"folio_no" : row[0],
	"scheme_name":row[1],
	"unit_balance":row[2],
	"dt":row[3],
	"nav":row[4],
	"market_value":row[5],
	"registrar":row[6]
	}
	else:
	if data:
	data['scheme_name']+= ' '+ row[1] if not pd.isnull(row[1]) else ''

	funds_list.append(data)
	df = pd.DataFrame(funds_list)

	df['dt'] = pd.to_datetime(df['dt'])
	df['market_value'] = df['market_value'].str.replace(',','').astype(float)
	df['nav'] = df['nav'].str.replace(',','').astype(float)
	df['unit_balance'] = df['unit_balance'].str.replace(',','').astype(float)
	return df

	def parse_ecas_transactions(f_buffer, password=None):
	f_temp = tempfile.TemporaryFile('w+b', suffix='.pdf', delete=False)
	f_temp.write(f_buffer.read())
	f_temp.close()
	tables = camelot.read_pdf(f_temp.name, password=password, flavor='stream', pages='all')
	os.unlink(f_temp.name)

	data = []
	state = 0
	for t in tables:
	for i, row in t.df.iterrows():
	re_folio_no = re.match('Folio No:(.*)', row[0])
	re_transaction_dt = re.match('(\d-[a-zA-Z]-\d*)', row[0])

	if re_transaction_dt and not (pd.isnull(row[3]) or row[3]==''):
	transaction_dt = re_transaction_dt[1]
	re_fund_name = re.match('(.)\(Advisor:(.)', fund_name)
	if re_fund_name:
	fund_name = re_fund_name[1]
	data.append({
	'folio_no': folio_no.replace(' ', ''),
	'fund_name': fund_name,
	'dt': row[0],
	'type': row[1],
	'amount': row[3],
	'units': row[4],
	'nav': row[5],
	'unit_balance': row[6]

	})
	state = 2

	if state==1:
	fund_name += row[0]

	if re_folio_no:
	folio_no = re_folio_no[1]
	fund_name = ''
	state = 1

	df = pd.DataFrame(data)
	df['dt'] = pd.to_datetime(df['dt'])
	df['amount'] = df['amount'].str.replace(',','').astype(float)
	df['units'] = df['units'].str.replace(',','').astype(float)
	df['nav'] = df['nav'].str.replace(',','').astype(float)
	df['unit_balance'] = df['unit_balance'].str.replace(',','').astype(float)

	return df

	"""
	Download the tika-app here https://tika.apache.org/download.html
	"""
	def read_pdf_tika(file_buffer, password):
	p = subprocess.Popen(f'java -jar C:\\bin\\tika-app.jar --password={password} --text', shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, stdin=subprocess.PIPE)
	return p.communicate(file_buffer.read())[0]

	"""
	Checks if the statement is a summary statement or if its a detailed statement
	"""
	def is_ecas_summary(file_buffer, password):
	text = read_pdf_tika(file_buffer, password)
	detailed_statement_date = None
	for line in text.decode('utf-8',errors='ignore').split('\n'):
	re_date_summary = re.match('(.)As on (\d-[a-zA-Z]-\d)(.*)', line)
	if re_date_summary:
	print(f'Summary \| {re_date_summary[2]}')
	return True
	re_date_detailed = re.match('(.)(\d-[a-zA-Z]-\d) To (\d-[a-zA-Z]-\d)(.)', line)
	if re_date_detailed:
	detailed_statement_date = re_date_detailed[0]
	print(f'Detailed Statement \| {detailed_statement_date}')
	return False

	"""
	Scan your inbox for E-Cas mails
	"""
	def scan_ecas_statement(FROM_EMAIL, FROM_PWD, SMTP_SERVER, SMTP_PORT):
	mail = imaplib.IMAP4_SSL(SMTP_SERVER)
	mail.login(FROM_EMAIL,FROM_PWD)
	mail.select('inbox')

	todays_date = datetime.strftime(datetime.now()-timedelta(days=1), "%d-%b-%Y")
	tomorrows_date = datetime.strftime(datetime.now()+timedelta(days=1), "%d-%b-%Y")

	typ, data = mail.search(None, 'SUBJECT "Consolidated Account Statement - CAMS Mailback Request" (SINCE "'+todays_date+'" BEFORE "'+tomorrows_date+'")')
	mail_ids = data[0].decode()

	id_list = mail_ids.split()
	if len(id_list) == 0:
	print('No Mails Received')
	return

	id_list.sort(reverse=True)

	for i in id_list:
	typ, data = mail.fetch(str(i), '(RFC822)' )
	for response_part in data:
	if isinstance(response_part, tuple):
	msg = email.message_from_string(response_part[1].decode())
	print(i)
	for part in msg.walk():
	if part.get_content_maintype() == 'multipart':
	continue
	if 'application/octet-stream' not in part.get('Content-Type'):
	continue
	statement_file = BytesIO(part.get_payload(decode=True))
	if is_ecas_summary(statement_file, STATEMENT_PASSWORD):
	statement_file.seek(0)
	df = parse_ecas(statement_file, STATEMENT_PASSWORD)
	else:
	statement_file.seek(0)
	df = parse_ecas_transactions(statement_file, STATEMENT_PASSWORD)