Created
December 9, 2024 09:39
-
-
Save DonGuillotine/fbee3dce17523f48a604957ab5ec8871 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
from collections import Counter | |
from datetime import datetime | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
from typing import Dict, List, Tuple | |
import seaborn as sns | |
class WhatsAppAnalyzer:
    """Parse an exported WhatsApp chat text file and summarise its activity.

    Entries are expected to look like ``M/D/YY, H:MM AM - Sender: text``
    (12-hour clock, two-digit year), as encoded in ``self.pattern``.
    Typical use: ``load_data()`` followed by ``generate_analytics()`` and,
    optionally, ``plot_analytics()`` on the returned frame.
    """

    # Timestamp prefix shared by every chat entry, including system notices
    # such as "X joined ..." that carry no "Sender: " part.
    _ENTRY_START = re.compile(
        r'\d{1,2}/\d{1,2}/\d{2},\s\d{1,2}:\d{2}\s[AP]M\s-\s'
    )

    # Column order of the frame built by parse_messages(); passing it to the
    # DataFrame constructor keeps the columns present even with zero rows.
    _COLUMNS = [
        'date', 'time', 'hour', 'sender', 'content',
        'is_media', 'message_length', 'day_of_week',
    ]

    def __init__(self, file_path: str):
        """Remember the chat file location; nothing is read until load_data().

        Args:
            file_path: Path to the exported chat text file.
        """
        self.file_path = file_path
        self.messages = []
        # Groups: date, time, sender, message text.
        self.pattern = r'(\d{1,2}/\d{1,2}/\d{2}),\s(\d{1,2}:\d{2}\s[AP]M)\s-\s([^:]+):\s(.+)'

    def load_data(self) -> None:
        """Read the chat file and store one string per chat entry.

        A line opening with a timestamp starts a new entry; any other line is
        a continuation and is joined to the current entry with a single
        space. System notices start their own entry, so they are no longer
        glued onto the previous sender's text (parse_messages() later drops
        them because they lack a ``Sender: `` part). Lines that appear before
        the first entry are ignored.

        ``self.messages`` is replaced, not extended, so calling this method
        again does not duplicate entries.

        Raises:
            OSError: If the file cannot be opened.
            UnicodeDecodeError: If the file is not valid UTF-8.
        """
        entries = []
        parts = []  # stripped lines of the entry being assembled

        # utf-8-sig reads plain UTF-8 as well, and discards a leading BOM
        # that would otherwise keep the first line from matching.
        with open(self.file_path, 'r', encoding='utf-8-sig') as file:
            for line in file:
                # The second test keeps a caller-customised self.pattern
                # working as an entry delimiter.
                starts_entry = (
                    self._ENTRY_START.match(line)
                    or re.match(self.pattern, line)
                )
                if starts_entry:
                    if parts:
                        entries.append(' '.join(parts))
                    parts = [line.strip()]
                elif parts:
                    parts.append(line.strip())

        if parts:
            entries.append(' '.join(parts))
        self.messages = entries

    def parse_messages(self) -> pd.DataFrame:
        """Convert the loaded entries into a DataFrame, one row per message.

        Entries that do not match ``self.pattern`` (for example system
        notices) or whose timestamp cannot be parsed as month/day/year with a
        12-hour clock are skipped on a best-effort basis.

        Returns:
            A DataFrame with the columns ``date``, ``time``, ``hour``,
            ``sender``, ``content``, ``is_media``, ``message_length`` and
            ``day_of_week``. The columns are present even when no entry
            could be parsed.
        """
        rows = []
        for msg in self.messages:
            match = re.match(self.pattern, msg)
            if match is None:
                continue
            date, time, sender, content = match.groups()
            try:
                stamp = datetime.strptime(f"{date} {time}", "%m/%d/%y %I:%M %p")
            except ValueError:
                # Shape matched but the value is not a real date/time
                # (e.g. month 13); skip this entry only.
                continue

            # Strip once so the stored text, the media check and the length
            # all describe the same string.
            content = content.strip()
            rows.append({
                'date': stamp.date(),
                'time': stamp.time(),
                'hour': stamp.hour,
                'sender': sender.strip(),
                'content': content,
                'is_media': content == '<Media omitted>',
                'message_length': len(content),
                'day_of_week': stamp.strftime('%A'),
            })
        return pd.DataFrame(rows, columns=self._COLUMNS)

    def generate_analytics(self, year: int = 2024) -> Tuple[Dict, pd.DataFrame]:
        """Compute summary statistics for one calendar year of text messages.

        Media placeholders are excluded from every figure.

        Args:
            year: Calendar year to analyse. Defaults to 2024.

        Returns:
            A ``(analytics, frame)`` tuple: ``analytics`` is a dict of the
            summary values and ``frame`` holds the filtered rows they were
            computed from. Averages are NaN when no row matches.
        """
        df = self.parse_messages()
        first_day = datetime(year, 1, 1).date()
        last_day = datetime(year, 12, 31).date()

        in_year = (df['date'] >= first_day) & (df['date'] <= last_day)
        is_text = ~df['is_media'].astype(bool)
        df_year = df[in_year & is_text]

        analytics = {
            'total_messages': len(df_year),
            'total_participants': len(df_year['sender'].unique()),
            'date_range': f"{df_year['date'].min()} to {df_year['date'].max()}",
            'most_active_users': df_year['sender'].value_counts().head(10).to_dict(),
            'avg_message_length': df_year['message_length'].mean(),
            'messages_per_day': df_year.groupby('date').size().mean(),
            'busiest_days': df_year['day_of_week'].value_counts().to_dict(),
            'busiest_hours': df_year['hour'].value_counts().sort_index().to_dict(),
        }
        return analytics, df_year

    def plot_analytics(self, df: pd.DataFrame,
                       output_path: str = 'whatsapp_analytics.png') -> None:
        """Draw a 2x2 grid of activity charts and save it as an image.

        Args:
            df: Frame shaped like the one returned by generate_analytics().
            output_path: Destination image file. Defaults to
                ``whatsapp_analytics.png``.
        """
        fig = plt.figure(figsize=(20, 15))
        try:
            # 1. Messages by User
            plt.subplot(2, 2, 1)
            user_counts = df['sender'].value_counts().head(10)
            plt.bar(user_counts.index, user_counts.values)
            plt.title('Top 10 Most Active Users')
            plt.xlabel('Users')
            plt.ylabel('Number of Messages')
            plt.xticks(rotation=45, ha='right')

            # 2. Activity by Hour
            plt.subplot(2, 2, 2)
            hourly_activity = df['hour'].value_counts().sort_index()
            plt.plot(hourly_activity.index, hourly_activity.values)
            plt.title('Activity by Hour')
            plt.xlabel('Hour of Day')
            plt.ylabel('Number of Messages')
            plt.grid(True)

            # 3. Activity by Day of Week
            plt.subplot(2, 2, 3)
            day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                         'Friday', 'Saturday', 'Sunday']
            # fill_value=0 gives weekdays with no messages a zero-height bar
            # instead of a NaN height.
            daily_activity = df['day_of_week'].value_counts().reindex(
                day_order, fill_value=0
            )
            plt.bar(daily_activity.index, daily_activity.values)
            plt.title('Activity by Day of Week')
            plt.xticks(rotation=45, ha='right')
            plt.ylabel('Number of Messages')

            # 4. Messages Over Time
            plt.subplot(2, 2, 4)
            daily_msgs = df.groupby('date').size()
            plt.plot(daily_msgs.index, daily_msgs.values)
            plt.title('Messages Over Time')
            plt.xticks(rotation=45, ha='right')
            plt.ylabel('Number of Messages')
            plt.grid(True)

            fig.tight_layout()
            fig.savefig(output_path)
        finally:
            # Release the figure even if plotting or saving failed.
            plt.close(fig)
def main():
    """Analyse ``chat.txt``, print a text report and save the charts."""
    chat = WhatsAppAnalyzer('chat.txt')
    chat.load_data()
    stats, frame = chat.generate_analytics()

    # Headline figures
    print("\n=== WhatsApp Chat Analytics ===")
    print(f"\nTotal Messages: {stats['total_messages']}")
    print(f"Total Participants: {stats['total_participants']}")
    print(f"Date Range: {stats['date_range']}")
    print(f"Average Messages per Day: {stats['messages_per_day']:.2f}")
    print(f"Average Message Length: {stats['avg_message_length']:.2f} characters")

    # Per-user and per-weekday breakdowns share one line format.
    breakdowns = (
        ("\nMost Active Users:", 'most_active_users'),
        ("\nBusiest Days of Week:", 'busiest_days'),
    )
    for heading, key in breakdowns:
        print(heading)
        for label, count in stats[key].items():
            print(f"{label}: {count} messages")

    # Charts are written to disk by the analyzer
    chat.plot_analytics(frame)
    print("\nVisualizations have been saved as 'whatsapp_analytics.png'")


# Run the report only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment