Last active
January 2, 2021 15:20
-
-
Save aakashb95/ccf89cc012f2c1447a035e7ca3621543 to your computer and use it in GitHub Desktop.
Convert a WhatsApp group chat export to Excel/CSV. Run: python whatsapp_extract.py <filename>.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Export the required chat as text, without media.
# Supports converting group chats to a tabular format when every group member is saved in your contacts.
#todo: add regex for numbers as authors | |
import pandas as pd | |
import os | |
import numpy as np | |
import re | |
import sys | |
def starts_with_date(s):
    """Report whether ``s`` begins with a timestamp prefix.

    The prefix has the shape ``digits/digits/digits, digits:digits`` and
    must sit at the very start of the string; anything after it is ignored.

    Args:
        s: The line of text to inspect.

    Returns:
        ``True`` when the prefix is present, ``False`` otherwise.
    """
    timestamp_prefix = r"^([0-9]+\/[0-9]+\/[0-9]+, [0-9]+:[0-9]+)"
    return re.match(timestamp_prefix, s) is not None
def starts_with_author(s):
    """Report whether ``s`` begins with an ``Author:`` prefix.

    An author is either a phone number or a name made of one or more
    whitespace-separated words. The colon that ends the author must be
    followed by whitespace or the end of the string, so text such as
    ``https://...`` is not mistaken for an author.

    Args:
        s: The message text that follows the timestamp separator.

    Returns:
        ``True`` when an author prefix is present, ``False`` otherwise.
    """
    pattern = (
        r"^(?:"
        # Phone number: optional "+", then digits mixed with spaces,
        # parentheses and hyphens, ending on a digit.
        # NOTE(review): the optional U+202A/U+202C wrappers and the U+2011
        # hyphen are allowed on the assumption that some exports decorate
        # unsaved numbers with them -- confirm against a real export.
        r"\u202a?\+?[\d\s()\-\u2011]*\d\u202c?"
        # Name: any number of words; ".", "'" and "-" are allowed inside a
        # word so initials and hyphenated or apostrophised names match.
        r"|[\w.'\-]+(?:\s+[\w.'\-]+)*"
        r"):(?:\s|$)"
    )
    return re.match(pattern, s) is not None
def date_time(s):
    """Break a ``"<date>, <time>"`` stamp into its two components.

    The string is cut at every ``", "``; the first piece is returned as the
    date and the second as the time. Any further pieces are ignored.

    Args:
        s: The timestamp text, e.g. ``"1/2/21, 10:15"``.

    Returns:
        A ``(date, time)`` tuple of strings.

    Raises:
        IndexError: If ``s`` contains no ``", "`` separator.
    """
    pieces = s.split(", ")
    return pieces[0], pieces[1]
def get_data(line):
    """Parse one timestamped chat line into its fields.

    The line is expected to look like ``"<date>, <time> - <rest>"`` where
    ``<rest>`` is either ``"<author>: <message>"`` or a message with no
    author prefix.

    Args:
        line: A chat line that begins with a timestamp.

    Returns:
        A ``(date, time, author, message)`` tuple. ``author`` is ``None``
        when the text after the timestamp has no author prefix.
    """
    # Cut only at the first " - ": later occurrences belong to the message
    # text and must be kept verbatim.
    stamp, _, message = line.partition(" - ")
    date, time = date_time(stamp)

    author = None
    if starts_with_author(message):
        # The recognised author never contains ":", so the first colon is
        # the one that ends it; colons further on are part of the message.
        author, _, message = message.partition(":")
        if message.startswith(" "):
            message = message[1:]
    return date, time, author, message
def extract(file_name):
    """Read an exported chat text file into a DataFrame.

    Each line that starts with a timestamp opens a new message. Lines that
    do not are treated as continuations of the current message and joined
    to it with single spaces.

    Args:
        file_name: Path of the UTF-8 text file to read.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Date``, ``Time``,
        ``Author`` and ``Message``, one row per message.
    """
    rows = []
    buffer = []
    date, time, author = None, None, None

    with open(file_name, encoding="utf-8") as f:
        # The first line is always discarded.
        # NOTE(review): presumably the export's banner line -- confirm.
        f.readline()
        for raw_line in f:
            line = raw_line.strip()
            if starts_with_date(line):
                # A new message begins: emit the one collected so far.
                if buffer:
                    rows.append([date, time, author, " ".join(buffer)])
                date, time, author, message = get_data(line)
                buffer = [message]
            else:
                buffer.append(line)

    # The final message has no following timestamp line to trigger its
    # flush, so emit it here; otherwise it would be dropped.
    if buffer:
        rows.append([date, time, author, " ".join(buffer)])

    return pd.DataFrame(rows, columns=["Date", "Time", "Author", "Message"])
if __name__ == "__main__":
    # Usage: python whatsapp_extract.py <filename>.txt
    if len(sys.argv) < 2:
        sys.exit("Usage: python whatsapp_extract.py <filename>.txt")
    file_name = sys.argv[1]

    # Strip only the final extension so dots elsewhere in the path
    # (e.g. "./chats/my.chat.txt") do not truncate the output name.
    fname, _ = os.path.splitext(file_name)

    df = extract(file_name)

    # To write CSV instead of Excel, use:
    #   df.to_csv(f"{fname}_extract.csv", index=False, encoding="utf-8")
    # NOTE(review): writing ".xls" depends on the installed pandas still
    # shipping a writer for that format; switch to ".xlsx" if it fails.
    output_file = f"{fname}_extract.xls"
    df.to_excel(output_file, index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment