@JD-P
Last active March 8, 2025 01:38
Public Single Page Twitter Archive Exporter
# The vast majority of this code was written by Mistral-large and
# DeepSeek R1 and is therefore public domain in the United States.
# But just in case, this script is public domain as set out in the
# Creative Commons Zero 1.0 Universal Public Domain Notice
# https://creativecommons.org/publicdomain/zero/1.0/
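# Usage sketch (the script filename and the data/ paths below are placeholders,
# not part of the gist; tweets.js and note-tweets.js come from the "data"
# directory of a downloaded Twitter archive):
#
#     python export_archive.py data/tweets.js data/note-tweets.js \
#         "Your Name" your_handle --exclude-retweets
#
# Writes tweets.html, tweets_index.html, and one tweets_YYYY_MM.html page per
# month into the current directory.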
import argparse
import json
from datetime import datetime
import html
def parse_arguments():
    parser = argparse.ArgumentParser(description="Generate HTML pages for Twitter archive.")
    parser.add_argument("tweets_file", help="Path to the tweets.js file.")
    parser.add_argument("note_tweets_file", help="Path to the note-tweets.js file.")
    parser.add_argument("name", help="Your name.")
    parser.add_argument("twitter_username", help="Your Twitter username.")
    parser.add_argument("--exclude-retweets", action="store_true", help="Exclude retweets from the output.")
    return parser.parse_args()
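# The archive's tweets.js / note-tweets.js files are JavaScript assignments
# (e.g. "window.YTD.tweets.part0 = [...]"); drop everything before the first
# '[' and parse the remainder as JSON.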
def extract_json_from_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    start_index = content.find('[')
    json_data = content[start_index:]
    return json.loads(json_data)
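# tweets.js timestamps use Twitter's classic format ("Sat Mar 08 01:38:00 +0000 2025"),
# while note-tweets.js timestamps are ISO 8601 with fractional seconds.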
def parse_tweet_time(tweet_time):
    return datetime.strptime(tweet_time, '%a %b %d %H:%M:%S +0000 %Y')
def parse_note_tweet_time(note_tweet_time):
    return datetime.strptime(note_tweet_time, '%Y-%m-%dT%H:%M:%S.%fZ')
def convert_newlines_to_br(text):
    return text.replace('\n', '<br>')
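# Swap t.co short links in the tweet text for anchors that point at the
# expanded URL and show the display URL.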
def replace_urls(text, urls):
    for url in urls:
        if 'url' in url and 'expanded_url' in url:
            display = url.get('display_url', url['expanded_url'])
            text = text.replace(url['url'], f'<a href="{url["expanded_url"]}" target="_blank">{display}</a>')
    return text
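# Edited tweets appear once per revision in the archive; keep only the latest
# revision (the highest ID in the edit chain) and drop the earlier drafts.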
def filter_edited_tweets(tweets):
    tweet_map = {}
    for tweet in tweets:
        tweet_data = tweet['tweet']
        tweet_id = tweet_data['id_str']
        edit_ids = []
        if 'edit_info' in tweet_data:
            if 'initial' in tweet_data['edit_info']:
                edit_ids = tweet_data['edit_info']['initial']['editTweetIds']
            elif 'edit' in tweet_data['edit_info']:
                edit_ids = tweet_data['edit_info']['edit']['editControlInitial']['editTweetIds']
        if len(edit_ids) > 1:
            max_edit_id = max(edit_ids, key=int)
            if max_edit_id == tweet_id:
                tweet_map[tweet_id] = tweet
            else:
                continue
        else:
            tweet_map[tweet_id] = tweet
    return list(tweet_map.values())
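# Render one standalone HTML page for a list of tweets. Long tweets are
# truncated with an ellipsis in tweets.js, so their full text is recovered
# from note-tweets.js, matched by creation timestamp.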
def generate_html_content(tweets, note_tweets, name, twitter_username, page_title, back_link=None):
    html_content = f"""
    <html>
    <head>
        <meta charset="UTF-8">
        <title>{html.escape(page_title)}</title>
        <style>
            body {{
                width: 600px;
                margin: 0 auto;
                font-family: Helvetica, Arial, sans-serif;
            }}
            .tweet {{
                border: 1px solid #ccc;
                padding: 10px;
                margin-bottom: 10px;
                border-radius: 5px;
            }}
            .tweet-header {{
                font-weight: bold;
                margin-bottom: 5px;
            }}
            .tweet-link {{
                text-decoration: none;
            }}
            .tweet-link:hover {{
                opacity: 50%;
            }}
            .tweet-time {{
                color: #555;
                margin-left: 10px;
            }}
            .tweet-stats {{
                color: #555;
                margin-top: 10px;
            }}
        </style>
    </head>
    <body>
        <h1>{html.escape(page_title)}</h1>
    """
    if back_link:
        html_content += f'<p><a href="{html.escape(back_link[0])}">{html.escape(back_link[1])}</a></p>'
    for tweet in tweets:
        tweet_data = tweet['tweet']
        tweet_text = tweet_data['full_text']
        tweet_time = parse_tweet_time(tweet_data['created_at']).strftime('%Y-%m-%d %H:%M UTC')
        tweet_timestamp = int(parse_tweet_time(tweet_data['created_at']).timestamp())
        tweet_id = tweet_data['id_str']
        tweet_url = f"https://twitter.com/{twitter_username}/status/{tweet_id}"
        # Truncated long tweets contain an ellipsis; pull the full text from the
        # note tweet with the same creation timestamp.
        if '…' in tweet_text:
            matching_note_tweet = next(
                (note for note in note_tweets
                 if parse_note_tweet_time(note['noteTweet']['createdAt']) == parse_tweet_time(tweet_data['created_at'])),
                None
            )
            if matching_note_tweet:
                note_text = matching_note_tweet['noteTweet']['core']['text']
                mentions = tweet_data['entities'].get('user_mentions', [])
                mention_handles = ' '.join([f"@{mention['screen_name']}" for mention in mentions])
                note_urls = matching_note_tweet['noteTweet']['core'].get('urls', [])
                note_text = replace_urls(note_text, note_urls)
                tweet_text = f"{mention_handles} {note_text}"
        tweet_urls = tweet_data['entities'].get('urls', [])
        tweet_text = replace_urls(tweet_text, tweet_urls)
        tweet_text = convert_newlines_to_br(tweet_text)
        favorite_count = tweet_data['favorite_count']
        retweet_count = tweet_data['retweet_count']
        html_content += f"""
        <div class="tweet">
            <div class="tweet-header" id="{tweet_timestamp}">
                <a class="tweet-link" href="#{tweet_id}" id="{tweet_id}">🔗</a>
                {html.escape(name)}
                <span class="tweet-time">
                    <a href="{tweet_url}">{tweet_time}</a>
                </span>
            </div>
            <p>{tweet_text}</p>
            <div class="tweet-stats">
                Likes: {favorite_count} | Retweets: {retweet_count}
            </div>
        </div>
        """
    html_content += """<p>Want your own Twitter archive? <a href="https://gist.github.com/JD-P/fc473872bbff4b48b5235cbe4aaeba1d">Modify this script</a>.</p>"""
    html_content += """<p xmlns:cc="http://creativecommons.org/ns#" xmlns:dct="http://purl.org/dc/terms/"><span property="dct:title">Twitter Archive</span> by <a rel="cc:attributionURL dct:creator" property="cc:attributionName" href="https://jdpressman.com/">John David Pressman</a> is marked with <a href="https://creativecommons.org/publicdomain/zero/1.0/?ref=chooser-v1" target="_blank" rel="license noopener noreferrer" style="display:inline-block;">CC0 1.0<img style="height:22px!important;margin-left:3px;vertical-align:text-bottom;" src="https://mirrors.creativecommons.org/presskit/icons/cc.svg?ref=chooser-v1" alt=""><img style="height:22px!important;margin-left:3px;vertical-align:text-bottom;" src="https://mirrors.creativecommons.org/presskit/icons/zero.svg?ref=chooser-v1" alt=""></a></p>"""
    html_content += """
    </body>
    </html>
    """
    return html_content
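# Render the index page that links to the all-in-one page and to each monthly
# page, with per-month tweet counts.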
def generate_index_page(name, sorted_month_keys, grouped_tweets):
    html_content = f"""
    <html>
    <head>
        <meta charset="UTF-8">
        <title>{html.escape(name)}'s Tweet Archive</title>
        <style>
            body {{
                width: 600px;
                margin: 0 auto;
                font-family: Helvetica, Arial, sans-serif;
            }}
            h1 {{
                margin-bottom: 20px;
            }}
            ul {{
                list-style-type: none;
                padding: 0;
            }}
            li {{
                margin: 5px 0;
            }}
            a {{
                text-decoration: none;
                color: #1da1f2;
            }}
            a:hover {{
                text-decoration: underline;
            }}
        </style>
    </head>
    <body>
        <h1>{html.escape(name)}'s Tweet Archive</h1>
        <p><a href="tweets.html">All tweets in one page</a></p>
        <h2>Tweets by month:</h2>
        <ul>
    """
    for key in sorted_month_keys:
        year, month = key
        month_name = datetime(year, month, 1).strftime('%B %Y')
        filename = f"tweets_{year}_{month:02d}.html"
        count = len(grouped_tweets[key])
        html_content += f'<li><a href="{html.escape(filename)}">{html.escape(month_name)}</a> ({count} tweets)</li>'
    html_content += """
        </ul>
        <p>Want your own Twitter archive? <a href="https://gist.github.com/JD-P/fc473872bbff4b48b5235cbe4aaeba1d">Modify this script</a>.</p>
        <p xmlns:cc="http://creativecommons.org/ns#" xmlns:dct="http://purl.org/dc/terms/"><span property="dct:title">Twitter Archive</span> by <a rel="cc:attributionURL dct:creator" property="cc:attributionName" href="https://jdpressman.com/">John David Pressman</a> is marked with <a href="https://creativecommons.org/publicdomain/zero/1.0/?ref=chooser-v1" target="_blank" rel="license noopener noreferrer" style="display:inline-block;">CC0 1.0<img style="height:22px!important;margin-left:3px;vertical-align:text-bottom;" src="https://mirrors.creativecommons.org/presskit/icons/cc.svg?ref=chooser-v1" alt=""><img style="height:22px!important;margin-left:3px;vertical-align:text-bottom;" src="https://mirrors.creativecommons.org/presskit/icons/zero.svg?ref=chooser-v1" alt=""></a></p>
    </body>
    </html>
    """
    return html_content
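# Entry point: load and filter the archive, then write the single-page
# archive, the per-month pages, and the index.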
def main():
    args = parse_arguments()
    tweets = extract_json_from_file(args.tweets_file)
    note_tweets = extract_json_from_file(args.note_tweets_file)
    # Filter and sort tweets
    tweets = filter_edited_tweets(tweets)
    tweets.sort(key=lambda tweet: parse_tweet_time(tweet['tweet']['created_at']))
    # Apply retweet filter
    if args.exclude_retweets:
        tweets = [tweet for tweet in tweets if not tweet['tweet']['full_text'].startswith("RT @")]
    # Generate main page
    main_html = generate_html_content(
        tweets=tweets,
        note_tweets=note_tweets,
        name=args.name,
        twitter_username=args.twitter_username,
        page_title=f"{args.name}'s Tweets",
        back_link=("tweets_index.html", "View tweets by month")
    )
    with open("tweets.html", "w", encoding='utf-8') as f:
        f.write(main_html)
    # Group tweets by month
    grouped_tweets = {}
    for tweet in tweets:
        tweet_data = tweet['tweet']
        created_at = parse_tweet_time(tweet_data['created_at'])
        year = created_at.year
        month = created_at.month
        key = (year, month)
        if key not in grouped_tweets:
            grouped_tweets[key] = []
        grouped_tweets[key].append(tweet)
    # Generate monthly pages
    sorted_month_keys = sorted(grouped_tweets.keys(), reverse=True, key=lambda k: (k[0], k[1]))
    for key in sorted_month_keys:
        year, month = key
        month_tweets = grouped_tweets[key]
        month_title = f"{args.name}'s Tweets - {datetime(year, month, 1).strftime('%B %Y')}"
        filename = f"tweets_{year}_{month:02d}.html"
        monthly_html = generate_html_content(
            tweets=month_tweets,
            note_tweets=note_tweets,
            name=args.name,
            twitter_username=args.twitter_username,
            page_title=month_title,
            back_link=("tweets_index.html", "Back to Archive Index")
        )
        with open(filename, "w", encoding='utf-8') as f:
            f.write(monthly_html)
    # Generate index page
    index_html = generate_index_page(args.name, sorted_month_keys, grouped_tweets)
    with open("tweets_index.html", "w", encoding='utf-8') as f:
        f.write(index_html)
    print("HTML files 'tweets.html', monthly archives, and 'tweets_index.html' have been created.")

if __name__ == "__main__":
    main()