Last active
March 8, 2025 01:38
-
-
Save JD-P/fc473872bbff4b48b5235cbe4aaeba1d to your computer and use it in GitHub Desktop.
Public Single Page Twitter Archive Exporter
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# The vast majority of this code was written by Mistral-large and
# DeepSeek R1 and is therefore public domain in the United States.
# But just in case, this script is public domain as set out in the
# Creative Commons Zero 1.0 Universal Public Domain Notice
# https://creativecommons.org/publicdomain/zero/1.0/
import argparse
import html
import json
from datetime import datetime, timezone
def parse_arguments(argv=None):
    """Parse the command-line options of the archive exporter.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads the process command line, which
            is what the script did before this parameter existed.

    Returns:
        An ``argparse.Namespace`` with the attributes ``tweets_file``,
        ``note_tweets_file``, ``name``, ``twitter_username`` and the
        boolean ``exclude_retweets``.
    """
    parser = argparse.ArgumentParser(description="Generate HTML pages for Twitter archive.")
    parser.add_argument("tweets_file", help="Path to the tweets.js file.")
    parser.add_argument("note_tweets_file", help="Path to the note-tweets.js file.")
    parser.add_argument("name", help="Your name.")
    parser.add_argument("twitter_username", help="Your Twitter username.")
    parser.add_argument("--exclude-retweets", action="store_true", help="Exclude retweets from the output.")
    # Passing argv through lets callers (and tests) supply arguments
    # without touching sys.argv.
    return parser.parse_args(argv)
def extract_json_from_file(file_path):
    """Load the JSON array embedded in an archive ``.js`` file.

    Everything before the first ``[`` in the file is discarded and the
    remainder is decoded with ``json.loads``.

    Args:
        file_path: Path to a UTF-8 text file containing a JSON array,
            optionally preceded by other text (presumably a JavaScript
            assignment prefix - confirm against the archive format).

    Returns:
        The decoded JSON value.

    Raises:
        ValueError: If the file contains no ``[`` at all, or (as
            ``json.JSONDecodeError``, a ``ValueError`` subclass) if the
            text from the first ``[`` onward is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    start_index = content.find('[')
    # str.find returns -1 when nothing matches; slicing with -1 would
    # silently parse only the last character of the file.
    if start_index == -1:
        raise ValueError(f"No JSON array found in {file_path!r}")
    return json.loads(content[start_index:])
def parse_tweet_time(tweet_time):
    """Convert a tweet ``created_at`` string into a naive ``datetime``.

    The expected layout is ``'%a %b %d %H:%M:%S +0000 %Y'``; the literal
    ``+0000`` offset is matched as text, so the result carries no tzinfo.
    """
    created_at_layout = '%a %b %d %H:%M:%S +0000 %Y'
    parsed = datetime.strptime(tweet_time, created_at_layout)
    return parsed
def parse_note_tweet_time(note_tweet_time):
    """Convert a note-tweet ``createdAt`` string into a naive ``datetime``.

    The expected layout is ``'%Y-%m-%dT%H:%M:%S.%fZ'``; the trailing ``Z``
    is matched as literal text, so the result carries no tzinfo.
    """
    note_layout = '%Y-%m-%dT%H:%M:%S.%fZ'
    parsed = datetime.strptime(note_tweet_time, note_layout)
    return parsed
def convert_newlines_to_br(text):
    """Return *text* with every newline character turned into ``<br>``."""
    lines = text.split('\n')
    return '<br>'.join(lines)
def replace_urls(text, urls):
    """Replace shortened URLs in *text* with HTML anchors.

    Args:
        text: The text in which to substitute links.
        urls: Iterable of dicts. Entries that provide both ``url`` (the
            string to look for in *text*) and ``expanded_url`` (the link
            target) are applied; any other entry is skipped. The visible
            link text is ``display_url`` when present, otherwise the
            expanded URL.

    Returns:
        *text* with each matching ``url`` replaced by an
        ``<a href=... target="_blank">`` element.
    """
    for url in urls:
        short_url = url.get('url')
        expanded_url = url.get('expanded_url')
        # An empty search string would make str.replace insert the anchor
        # between every character, so treat it like a missing key.
        if not short_url or not expanded_url:
            continue
        # display_url is optional; fall back instead of raising KeyError.
        display_url = url.get('display_url') or expanded_url
        # Escape both values: a quote or angle bracket in a URL must not be
        # able to break out of the attribute or inject markup.
        anchor = (
            f'<a href="{html.escape(expanded_url, quote=True)}" target="_blank">'
            f'{html.escape(display_url)}</a>'
        )
        text = text.replace(short_url, anchor)
    return text
def filter_edited_tweets(tweets):
    """Drop superseded revisions of edited tweets.

    Each entry's edit chain is read from
    ``edit_info['initial']['editTweetIds']`` or, failing that, from
    ``edit_info['edit']['editControlInitial']['editTweetIds']``. When the
    chain lists more than one id, the entry is kept only if its own
    ``id_str`` is the numerically largest id in the chain. Entries with no
    chain, or a chain of one, are always kept.

    Returns:
        A list of the kept entries, keyed by ``id_str`` so that a repeated
        id keeps its first position but its last-seen entry.
    """
    kept_by_id = {}
    for entry in tweets:
        data = entry['tweet']
        own_id = data['id_str']
        info = data['edit_info'] if 'edit_info' in data else {}
        if 'initial' in info:
            chain = info['initial']['editTweetIds']
        elif 'edit' in info:
            chain = info['edit']['editControlInitial']['editTweetIds']
        else:
            chain = []
        # Ids are numeric strings, so compare them as integers.
        if len(chain) > 1 and max(chain, key=int) != own_id:
            continue
        kept_by_id[own_id] = entry
    return list(kept_by_id.values())
def generate_html_content(tweets, note_tweets, name, twitter_username, page_title, back_link=None):
    """Render a list of tweets as one standalone HTML page.

    Args:
        tweets: Iterable of archive entries; each is read through
            ``entry['tweet']`` (``full_text``, ``created_at``, ``id_str``,
            ``entities``, ``favorite_count``, ``retweet_count``).
        note_tweets: Iterable of note-tweet entries, read through
            ``entry['noteTweet']`` (``createdAt`` and ``core``). When a
            tweet's text contains an ellipsis character, the note tweet
            with the same creation time supplies the full text.
        name: Display name shown in every tweet header (HTML-escaped).
        twitter_username: Handle used to build each tweet's permalink.
        page_title: Text for ``<title>`` and the ``<h1>`` (HTML-escaped).
        back_link: Optional ``(href, label)`` pair rendered as a link
            under the heading.

    Returns:
        The complete HTML document as a string.

    Note:
        The tweet text itself is inserted without escaping.
        NOTE(review): this presumably relies on the archive's text being
        entity-encoded already - confirm before feeding other data in.
    """
    parts = [f"""
    <html>
    <head>
        <meta charset="UTF-8">
        <title>{html.escape(page_title)}</title>
        <style>
            body {{
                width: 600px;
                margin: 0 auto;
                font-family: Helvetica, Arial, sans-serif;
            }}
            .tweet {{
                border: 1px solid #ccc;
                padding: 10px;
                margin-bottom: 10px;
                border-radius: 5px;
            }}
            .tweet-header {{
                font-weight: bold;
                margin-bottom: 5px;
            }}
            .tweet-link {{
                text-decoration: none;
            }}
            .tweet-link:hover {{
                opacity: 50%;
            }}
            .tweet-time {{
                color: #555;
                margin-left: 10px;
            }}
            .tweet-stats {{
                color: #555;
                margin-top: 10px;
            }}
        </style>
    </head>
    <body>
        <h1>{html.escape(page_title)}</h1>
    """]
    if back_link:
        parts.append(f'<p><a href="{html.escape(back_link[0])}">{html.escape(back_link[1])}</a></p>')

    # Creation time -> note tweet. Built on first use so that archives
    # without truncated tweets never parse the note-tweet timestamps.
    notes_by_time = None

    for tweet in tweets:
        tweet_data = tweet['tweet']
        tweet_text = tweet_data['full_text']
        created_at = parse_tweet_time(tweet_data['created_at'])
        tweet_time = created_at.strftime('%Y-%m-%d %H:%M UTC')
        # created_at is a naive datetime holding UTC; mark it as UTC so the
        # epoch value does not depend on the local timezone of the machine.
        tweet_timestamp = int(created_at.replace(tzinfo=timezone.utc).timestamp())
        tweet_id = tweet_data['id_str']
        tweet_url = f"https://twitter.com/{twitter_username}/status/{tweet_id}"

        if '…' in tweet_text:
            if notes_by_time is None:
                notes_by_time = {}
                for note in note_tweets:
                    # setdefault keeps the first note for a given time,
                    # matching a first-match linear search.
                    notes_by_time.setdefault(
                        parse_note_tweet_time(note['noteTweet']['createdAt']), note
                    )
            matching_note_tweet = notes_by_time.get(created_at)
            if matching_note_tweet:
                note_core = matching_note_tweet['noteTweet']['core']
                mentions = tweet_data['entities'].get('user_mentions', [])
                mention_handles = ' '.join(f"@{mention['screen_name']}" for mention in mentions)
                note_text = replace_urls(note_core['text'], note_core.get('urls', []))
                tweet_text = f"{mention_handles} {note_text}"

        tweet_urls = tweet_data['entities'].get('urls', [])
        tweet_text = replace_urls(tweet_text, tweet_urls)
        tweet_text = convert_newlines_to_br(tweet_text)
        favorite_count = tweet_data['favorite_count']
        retweet_count = tweet_data['retweet_count']

        parts.append(f"""
        <div class="tweet">
            <div class="tweet-header" id={tweet_timestamp}>
                <a class="tweet-link" href="#{tweet_id}" id={tweet_id}>🔗</a>
                {html.escape(name)}
                <span class="tweet-time">
                    <a href="{html.escape(tweet_url)}">{tweet_time}</a>
                </span>
            </div>
            <p>{tweet_text}</p>
            <div class="tweet-stats">
                Likes: {favorite_count} | Retweets: {retweet_count}
            </div>
        </div>
        """)

    parts.append("""<p>Want your own Twitter archive? <a href="https://gist.github.com/JD-P/fc473872bbff4b48b5235cbe4aaeba1d">Modify this script</a>.</p>""")
    parts.append("""<p xmlns:cc="http://creativecommons.org/ns#" xmlns:dct="http://purl.org/dc/terms/"><span property="dct:title">Twitter Archive</span> by <a rel="cc:attributionURL dct:creator" property="cc:attributionName" href="https://jdpressman.com/">John David Pressman</a> is marked with <a href="https://creativecommons.org/publicdomain/zero/1.0/?ref=chooser-v1" target="_blank" rel="license noopener noreferrer" style="display:inline-block;">CC0 1.0<img style="height:22px!important;margin-left:3px;vertical-align:text-bottom;" src="https://mirrors.creativecommons.org/presskit/icons/cc.svg?ref=chooser-v1" alt=""><img style="height:22px!important;margin-left:3px;vertical-align:text-bottom;" src="https://mirrors.creativecommons.org/presskit/icons/zero.svg?ref=chooser-v1" alt=""></a></p>""")
    parts.append("""
    </body>
    </html>
    """)
    return ''.join(parts)
def generate_index_page(name, sorted_month_keys, grouped_tweets):
    """Build the archive index page that links to every monthly page.

    Args:
        name: Display name used in the title and heading (HTML-escaped).
        sorted_month_keys: ``(year, month)`` pairs in the order the links
            should appear.
        grouped_tweets: Mapping from each ``(year, month)`` pair to the
            collection of tweets for that month; only its length is used.

    Returns:
        The complete HTML document as a string.
    """
    safe_name = html.escape(name)
    page_head = f"""
    <html>
    <head>
        <meta charset="UTF-8">
        <title>{safe_name}'s Tweet Archive</title>
        <style>
            body {{
                width: 600px;
                margin: 0 auto;
                font-family: Helvetica, Arial, sans-serif;
            }}
            h1 {{
                margin-bottom: 20px;
            }}
            ul {{
                list-style-type: none;
                padding: 0;
            }}
            li {{
                margin: 5px 0;
            }}
            a {{
                text-decoration: none;
                color: #1da1f2;
            }}
            a:hover {{
                text-decoration: underline;
            }}
        </style>
    </head>
    <body>
        <h1>{safe_name}'s Tweet Archive</h1>
        <p><a href="tweets.html">All tweets in one page</a></p>
        <h2>Tweets by month:</h2>
        <ul>
    """

    # One list item per month, linking to the file name main() writes.
    month_items = []
    for month_key in sorted_month_keys:
        year, month = month_key
        label = datetime(year, month, 1).strftime('%B %Y')
        target = f"tweets_{year}_{month:02d}.html"
        total = len(grouped_tweets[month_key])
        month_items.append(
            f'<li><a href="{html.escape(target)}">{html.escape(label)}</a> ({total} tweets)</li>'
        )

    page_tail = """
        </ul>
        <p>Want your own Twitter archive? <a href="https://gist.github.com/JD-P/fc473872bbff4b48b5235cbe4aaeba1d">Modify this script</a>.</p>
        <p xmlns:cc="http://creativecommons.org/ns#" xmlns:dct="http://purl.org/dc/terms/"><span property="dct:title">Twitter Archive</span> by <a rel="cc:attributionURL dct:creator" property="cc:attributionName" href="https://jdpressman.com/">John David Pressman</a> is marked with <a href="https://creativecommons.org/publicdomain/zero/1.0/?ref=chooser-v1" target="_blank" rel="license noopener noreferrer" style="display:inline-block;">CC0 1.0<img style="height:22px!important;margin-left:3px;vertical-align:text-bottom;" src="https://mirrors.creativecommons.org/presskit/icons/cc.svg?ref=chooser-v1" alt=""><img style="height:22px!important;margin-left:3px;vertical-align:text-bottom;" src="https://mirrors.creativecommons.org/presskit/icons/zero.svg?ref=chooser-v1" alt=""></a></p>
    </body>
    </html>
    """
    return page_head + ''.join(month_items) + page_tail
def main():
    """Run the exporter: read the archive files and write the HTML pages.

    Writes ``tweets.html`` (every tweet), one ``tweets_YYYY_MM.html`` per
    month that has tweets, and ``tweets_index.html`` into the current
    working directory, then prints a confirmation line.
    """
    args = parse_arguments()
    tweets = extract_json_from_file(args.tweets_file)
    note_tweets = extract_json_from_file(args.note_tweets_file)

    def created_at_of(entry):
        return parse_tweet_time(entry['tweet']['created_at'])

    # Drop superseded edits, then order chronologically (oldest first).
    tweets = sorted(filter_edited_tweets(tweets), key=created_at_of)

    # Retweets are recognised by their "RT @" text prefix.
    if args.exclude_retweets:
        tweets = [
            entry for entry in tweets
            if not entry['tweet']['full_text'].startswith("RT @")
        ]

    # Single page containing every tweet.
    all_tweets_html = generate_html_content(
        tweets=tweets,
        note_tweets=note_tweets,
        name=args.name,
        twitter_username=args.twitter_username,
        page_title=f"{args.name}'s Tweets",
        back_link=("tweets_index.html", "View tweets by month"),
    )
    with open("tweets.html", "w", encoding='utf-8') as out:
        out.write(all_tweets_html)

    # Bucket tweets by (year, month) of their creation time.
    grouped_tweets = {}
    for entry in tweets:
        stamp = created_at_of(entry)
        grouped_tweets.setdefault((stamp.year, stamp.month), []).append(entry)

    # Newest month first; one page per month.
    sorted_month_keys = sorted(grouped_tweets, reverse=True)
    for year, month in sorted_month_keys:
        month_label = datetime(year, month, 1).strftime('%B %Y')
        monthly_html = generate_html_content(
            tweets=grouped_tweets[(year, month)],
            note_tweets=note_tweets,
            name=args.name,
            twitter_username=args.twitter_username,
            page_title=f"{args.name}'s Tweets - {month_label}",
            back_link=("tweets_index.html", "Back to Archive Index"),
        )
        with open(f"tweets_{year}_{month:02d}.html", "w", encoding='utf-8') as out:
            out.write(monthly_html)

    # Index page linking to the monthly pages.
    index_html = generate_index_page(args.name, sorted_month_keys, grouped_tweets)
    with open("tweets_index.html", "w", encoding='utf-8') as out:
        out.write(index_html)

    print("HTML files 'tweets.html', monthly archives, and 'tweets_index.html' have been created.")
# Run the exporter only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment