Skip to content

Instantly share code, notes, and snippets.

@DonGuillotine
Created December 9, 2024 09:39
Show Gist options
  • Save DonGuillotine/fbee3dce17523f48a604957ab5ec8871 to your computer and use it in GitHub Desktop.
import re
from collections import Counter
from datetime import datetime
import pandas as pd
import matplotlib.pyplot as plt
from typing import Dict, List, Tuple
import seaborn as sns
class WhatsAppAnalyzer:
    """Parse an exported WhatsApp chat text file and derive activity statistics.

    The parser expects entries shaped like
    ``M/D/YY, H:MM AM - Sender: text`` (see ``self.pattern``). Typical use is
    ``load_data()`` followed by ``generate_analytics()`` and, optionally,
    ``plot_analytics()``.
    """

    # Timestamp prefix that opens every export entry. System notices
    # (e.g. "Alice added Bob") carry this prefix but no "sender: text" part,
    # so they match this expression without matching ``self.pattern``.
    _ENTRY_PREFIX = r'\d{1,2}/\d{1,2}/\d{2},\s\d{1,2}:\d{2}\s[AP]M\s-\s'

    # Column layout of the parsed frame. Passing it explicitly means an empty
    # chat still yields a frame with the columns the analytics code indexes.
    _COLUMNS = [
        'date', 'time', 'hour', 'sender', 'content',
        'is_media', 'message_length', 'day_of_week',
    ]

    def __init__(self, file_path: str):
        """Remember the export path; nothing is read until ``load_data()``."""
        self.file_path = file_path
        self.messages: List[str] = []
        # Groups: date, time, sender, message text.
        self.pattern = r'(\d{1,2}/\d{1,2}/\d{2}),\s(\d{1,2}:\d{2}\s[AP]M)\s-\s([^:]+):\s(.+)'

    def load_data(self) -> None:
        """Read the chat file and store one string per user message.

        Lines that do not start a new entry are treated as continuations of
        the current message and are joined to it with a single space.
        Timestamped lines without a ``sender: text`` part (system notices)
        end the current message and are discarded, together with any
        continuation lines that follow them. ``self.messages`` is replaced on
        every call, so calling this method twice does not duplicate entries.

        Raises:
            OSError: if the file cannot be opened.
            UnicodeDecodeError: if the file is not valid UTF-8.
        """
        message_re = re.compile(self.pattern)
        entry_re = re.compile(self._ENTRY_PREFIX)

        collected = []
        parts = None  # pieces of the message being assembled; None when no message is open

        with open(self.file_path, 'r', encoding='utf-8') as file:
            for line in file:
                if message_re.match(line):
                    if parts is not None:
                        collected.append(' '.join(parts))
                    parts = [line.strip()]
                elif entry_re.match(line):
                    # System notice: it closes the open message but is not a
                    # message itself, so it must not be glued onto one.
                    if parts is not None:
                        collected.append(' '.join(parts))
                    parts = None
                elif parts is not None:
                    parts.append(line.strip())
                # else: stray text before the first message - ignored

        if parts is not None:
            collected.append(' '.join(parts))
        self.messages = collected

    def parse_messages(self) -> pd.DataFrame:
        """Convert the loaded messages into a DataFrame, one row per message.

        Returns:
            A frame with the columns listed in ``_COLUMNS``. ``content`` is
            whitespace-stripped, and both ``is_media`` and ``message_length``
            are computed from that stripped text. The frame has these columns
            even when no message could be parsed.
        """
        message_re = re.compile(self.pattern)
        rows = []
        for msg in self.messages:
            match = message_re.match(msg)
            if not match:
                continue
            date, time, sender, content = match.groups()
            try:
                stamp = datetime.strptime(f"{date} {time}", "%m/%d/%y %I:%M %p")
            except ValueError:
                # Best effort: entries whose timestamp is not a valid
                # month/day/year 12-hour time are skipped, not fatal.
                continue
            # Strip once so trailing padding from joined blank lines does not
            # defeat the media check or inflate the length.
            content = content.strip()
            rows.append({
                'date': stamp.date(),
                'time': stamp.time(),
                'hour': stamp.hour,
                'sender': sender.strip(),
                'content': content,
                'is_media': content == '<Media omitted>',
                'message_length': len(content),
                'day_of_week': stamp.strftime('%A'),
            })
        return pd.DataFrame(rows, columns=self._COLUMNS)

    def generate_analytics(self, year: int = 2024) -> Tuple[Dict, pd.DataFrame]:
        """Summarise the non-media messages sent during one calendar year.

        Args:
            year: Calendar year to analyse. Defaults to 2024.

        Returns:
            A ``(analytics, frame)`` pair. ``frame`` holds the rows of the
            selected year with media placeholders removed; ``analytics`` is a
            dict with the keys ``total_messages``, ``total_participants``,
            ``date_range``, ``most_active_users``, ``avg_message_length``,
            ``messages_per_day``, ``busiest_days`` and ``busiest_hours``.
            When no message qualifies, ``date_range`` is ``'N/A'`` and both
            averages are ``0.0``.
        """
        df = self.parse_messages()

        if df.empty:
            df_year = df
        else:
            first_day = datetime(year, 1, 1).date()
            last_day = datetime(year, 12, 31).date()
            in_year = (df['date'] >= first_day) & (df['date'] <= last_day)
            df_year = df[in_year & ~df['is_media'].astype(bool)]

        if df_year.empty:
            date_range = 'N/A'
            avg_length = 0.0
            per_day = 0.0
        else:
            date_range = f"{df_year['date'].min()} to {df_year['date'].max()}"
            avg_length = df_year['message_length'].mean()
            per_day = df_year.groupby('date').size().mean()

        analytics = {
            'total_messages': len(df_year),
            'total_participants': df_year['sender'].nunique(),
            'date_range': date_range,
            'most_active_users': df_year['sender'].value_counts().head(10).to_dict(),
            'avg_message_length': avg_length,
            'messages_per_day': per_day,
            'busiest_days': df_year['day_of_week'].value_counts().to_dict(),
            'busiest_hours': df_year['hour'].value_counts().sort_index().to_dict(),
        }
        return analytics, df_year

    def plot_analytics(self, df: pd.DataFrame,
                       output_path: str = 'whatsapp_analytics.png') -> None:
        """Draw a 2x2 grid of activity charts and save it as an image.

        Args:
            df: Frame with at least the ``sender``, ``hour``, ``day_of_week``
                and ``date`` columns, as returned by ``generate_analytics()``.
            output_path: Destination image file. Defaults to
                ``'whatsapp_analytics.png'``.
        """
        figure = plt.figure(figsize=(20, 15))
        try:
            # 1. Messages by user
            plt.subplot(2, 2, 1)
            user_counts = df['sender'].value_counts().head(10)
            plt.bar(user_counts.index, user_counts.values)
            plt.title('Top 10 Most Active Users')
            plt.xlabel('Users')
            plt.ylabel('Number of Messages')
            plt.xticks(rotation=45, ha='right')

            # 2. Activity by hour
            plt.subplot(2, 2, 2)
            hourly_activity = df['hour'].value_counts().sort_index()
            plt.plot(hourly_activity.index, hourly_activity.values)
            plt.title('Activity by Hour')
            plt.xlabel('Hour of Day')
            plt.ylabel('Number of Messages')
            plt.grid(True)

            # 3. Activity by day of week
            plt.subplot(2, 2, 3)
            day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                         'Friday', 'Saturday', 'Sunday']
            # fill_value=0 so a weekday without messages is a zero-height bar
            # rather than a NaN.
            daily_activity = df['day_of_week'].value_counts().reindex(
                day_order, fill_value=0)
            plt.bar(daily_activity.index, daily_activity.values)
            plt.title('Activity by Day of Week')
            plt.xticks(rotation=45, ha='right')
            plt.ylabel('Number of Messages')

            # 4. Messages over time
            plt.subplot(2, 2, 4)
            daily_msgs = df.groupby('date').size()
            plt.plot(daily_msgs.index, daily_msgs.values)
            plt.title('Messages Over Time')
            plt.xticks(rotation=45, ha='right')
            plt.ylabel('Number of Messages')
            plt.grid(True)

            plt.tight_layout()
            plt.savefig(output_path)
        finally:
            # Release the figure even when plotting or saving fails.
            plt.close(figure)
def main(file_path: str = 'chat.txt') -> None:
    """Analyse a WhatsApp chat export, print a summary and save the charts.

    Args:
        file_path: Path of the exported chat text file. Defaults to
            ``'chat.txt'`` in the current working directory.
    """
    analyzer = WhatsAppAnalyzer(file_path)
    analyzer.load_data()
    analytics, df = analyzer.generate_analytics()

    # Print analytics
    print("\n=== WhatsApp Chat Analytics ===")
    print(f"\nTotal Messages: {analytics['total_messages']}")
    print(f"Total Participants: {analytics['total_participants']}")
    print(f"Date Range: {analytics['date_range']}")
    print(f"Average Messages per Day: {analytics['messages_per_day']:.2f}")
    print(f"Average Message Length: {analytics['avg_message_length']:.2f} characters")

    print("\nMost Active Users:")
    for user, count in analytics['most_active_users'].items():
        print(f"{user}: {count} messages")

    print("\nBusiest Days of Week:")
    for day, count in analytics['busiest_days'].items():
        print(f"{day}: {count} messages")

    # Generate plots
    analyzer.plot_analytics(df)
    print("\nVisualizations have been saved as 'whatsapp_analytics.png'")
# Run the analysis only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment