Skip to content

Instantly share code, notes, and snippets.

@aakashb95
Last active January 2, 2021 15:20
Show Gist options
  • Save aakashb95/ccf89cc012f2c1447a035e7ca3621543 to your computer and use it in GitHub Desktop.
Save aakashb95/ccf89cc012f2c1447a035e7ca3621543 to your computer and use it in GitHub Desktop.
Whatsapp group chat to excel/csv. Run python whatsapp_extract.py <filename>.txt
# Export the required chat from WhatsApp as a text file, without media.
# This script converts a group chat to tabular format; it works when every member of the group is saved in your contacts.
#todo: add regex for numbers as authors
import pandas as pd
import os
import numpy as np
import re
import sys
def starts_with_date(s):
    """Tell whether ``s`` opens with a ``<digits>/<digits>/<digits>, <digits>:<digits>`` stamp.

    Args:
        s: A single line of text.

    Returns:
        True if the stamp appears at the very start of ``s``, otherwise False.
    """
    timestamp_prefix = r"^([0-9]+\/[0-9]+\/[0-9]+, [0-9]+:[0-9]+)"
    return re.match(timestamp_prefix, s) is not None
def starts_with_author(s):
    """Return True when ``s`` begins with a sender label followed by a colon.

    Recognized labels:
      * a single word, e.g. ``"Alice:"``
      * two whitespace-separated words, e.g. ``"Alice Smith:"``
      * a phone number, e.g. ``"+91 98765 43210:"`` or ``"+1 (555) 123-4567:"``

    Args:
        s: The part of a chat line that follows the timestamp separator.

    Returns:
        True if one of the label forms above is found at the start of ``s``,
        otherwise False.
    """
    name_pattern = r"^([\w]+):|([\w]+[\s]+[\w]+):"
    # Numbers as authors: optional "+", optional "(", a digit, then any run of
    # digits, spaces, hyphens or parentheses up to the colon.
    phone_pattern = r"\+?\(?\d[\d\s\-()]*:"
    # re.match anchors at the start of the string, so every alternative in
    # both patterns is only tried at position 0.
    return bool(re.match(name_pattern, s) or re.match(phone_pattern, s))
def date_time(s):
    """Break a ``"<date>, <time>"`` stamp into its two parts.

    Args:
        s: Text whose first two ``", "``-separated fields are the date and
            the time.

    Returns:
        A ``(date, time)`` tuple of strings.

    Raises:
        IndexError: If ``s`` contains no ``", "`` separator.
    """
    # Only the first two fields are used, so splitting further is unnecessary.
    fields = s.split(", ", 2)
    return fields[0], fields[1]
def get_data(line):
    """Split one timestamped chat line into ``(date, time, author, message)``.

    The line is expected to look like ``"<date>, <time> - <rest>"``, where
    ``<rest>`` is either ``"<author>: <message>"`` or a message with no
    author label.

    Args:
        line: A chat line that begins with a timestamp.

    Returns:
        A ``(date, time, author, message)`` tuple. ``author`` is None when
        ``<rest>`` does not start with a recognized sender label.

    Raises:
        IndexError: Propagated from ``date_time`` when the part before
            ``" - "`` has no ``", "`` separator.
    """
    # Cut at the FIRST " - " only, so dashes inside the message text survive.
    stamp, _, message = line.partition(" - ")
    date, time = date_time(stamp)

    author = None
    if starts_with_author(message):
        # Cut at the FIRST ": " only, so colons inside the message survive.
        name, sep, text = message.partition(": ")
        if sep:
            author, message = name, text
        elif message.endswith(":"):
            # A label with an empty body, e.g. "Alice:" after the caller has
            # stripped trailing whitespace from the line.
            author, message = message[:-1], ""
        # Otherwise there is no ": " delimiter to split on, so the text is
        # kept whole as the message and author stays None.
    return date, time, author, message
def extract(file_name):
    """Parse an exported chat text file into a table of messages.

    Every line that starts with a timestamp begins a new message; lines that
    do not are treated as continuations of the current message and are joined
    to it with single spaces.

    Args:
        file_name: Path of the UTF-8 encoded chat export.

    Returns:
        A pandas DataFrame with the columns ``Date``, ``Time``, ``Author`` and
        ``Message``, one row per message in file order.
    """
    rows = []
    buffer = []  # text pieces of the message currently being assembled
    date, time, author = None, None, None

    with open(file_name, encoding="utf-8") as f:
        # The first line of the file is skipped unread
        # (presumably the export's header notice -- TODO confirm).
        f.readline()
        for raw_line in f:
            line = raw_line.strip()
            if starts_with_date(line):
                # A new message starts here: flush the one collected so far.
                if buffer:
                    rows.append([date, time, author, " ".join(buffer)])
                    buffer.clear()
                date, time, author, message = get_data(line)
                buffer.append(message)
            else:
                buffer.append(line)

    # Flush the final message: no later timestamped line will trigger it.
    if buffer:
        rows.append([date, time, author, " ".join(buffer)])

    return pd.DataFrame(rows, columns=["Date", "Time", "Author", "Message"])
if __name__ == "__main__":
    # Command-line entry point: convert the chat export named by the first
    # argument and write the result next to it as "<stem>_extract.xls".
    if len(sys.argv) < 2:
        sys.exit(f"usage: python {sys.argv[0]} <filename>.txt")
    file_name = sys.argv[1]
    # splitext removes only the final extension, so paths that contain other
    # dots (e.g. "./chat.txt" or "my.group.txt") keep their full stem.
    fname = os.path.splitext(file_name)[0]
    df = extract(file_name)
    # Alternative CSV output:
    # output_file = f"{fname}_extract.csv"
    # df.to_csv(output_file, index=False, encoding="utf-8")
    # NOTE(review): writing ".xls" depends on the installed pandas having an
    # Excel writer engine for that format -- confirm, or switch to ".xlsx".
    output_file = f"{fname}_extract.xls"
    df.to_excel(output_file, index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment