JD-P · March 8, 2025 01:38
diff --git a/single_page_twitter_archive.py b/single_page_twitter_archive.py
 # The vast majority of this code was written by Mistral-large and 
 # DeepSeek R1 and is therefore public domain in the United States.
 # But just in case, this script is public domain as set out in the 
 # Creative Commons Zero 1.0 Universal Public Domain Notice
 # https://creativecommons.org/publicdomain/zero/1.0/

 import argparse
 import json
 from datetime import datetime
 import html

 def parse_arguments():
    """Parse the command-line options for the archive generator.

    Returns:
        argparse.Namespace carrying ``tweets_file``, ``note_tweets_file``,
        ``name``, ``twitter_username`` and the boolean ``exclude_retweets``.
    """
    cli = argparse.ArgumentParser(description="Generate HTML pages for Twitter archive.")

    # Positional arguments, listed in the order they must be given.
    positionals = (
        ("tweets_file", "Path to the tweets.js file."),
        ("note_tweets_file", "Path to the note-tweets.js file."),
        ("name", "Your name."),
        ("twitter_username", "Your Twitter username."),
    )
    for arg_name, help_text in positionals:
        cli.add_argument(arg_name, help=help_text)

    cli.add_argument(
        "--exclude-retweets",
        action="store_true",
        help="Exclude retweets from the output.",
    )
    return cli.parse_args()

 def extract_json_from_file(file_path):
    """Load the JSON array embedded in a Twitter archive data file.

    Everything before the first ``[`` is discarded and the remainder is
    parsed as JSON (the ``.js`` inputs presumably prefix the array with a
    JavaScript assignment -- confirm against a real archive).

    Args:
        file_path: Path of the UTF-8 text file to read.

    Returns:
        The decoded JSON value.

    Raises:
        ValueError: If the file contains no ``[`` at all, or if the text
            from the first ``[`` onward is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
        OSError: If the file cannot be opened.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    start_index = content.find('[')
    if start_index == -1:
        # Without this guard the slice below would become content[-1:],
        # i.e. only the last character, which can silently parse as JSON.
        raise ValueError(f"No JSON array found in {file_path!r}")
    return json.loads(content[start_index:])

 def parse_tweet_time(tweet_time):
    """Convert a tweet ``created_at`` string into a naive ``datetime``.

    The accepted layout is fixed, including the literal ``+0000`` offset,
    e.g. ``'Sat Mar 08 01:38:00 +0000 2025'``.
    """
    layout = '%a %b %d %H:%M:%S +0000 %Y'
    parsed = datetime.strptime(tweet_time, layout)
    return parsed

 def parse_note_tweet_time(note_tweet_time):
    """Convert a note-tweet ``createdAt`` string into a naive ``datetime``.

    The accepted layout is ISO-like with fractional seconds and a literal
    trailing ``Z``, e.g. ``'2025-03-08T01:38:00.000Z'``.
    """
    layout = '%Y-%m-%dT%H:%M:%S.%fZ'
    parsed = datetime.strptime(note_tweet_time, layout)
    return parsed

 def convert_newlines_to_br(text):
    """Return *text* with every newline character turned into ``<br>``."""
    lines = text.split('\n')
    return '<br>'.join(lines)

 def replace_urls(text, urls):
    """Replace shortened URLs in *text* with HTML links to their targets.

    Args:
        text: The text in which to substitute.
        urls: Iterable of dicts. Entries lacking ``'url'`` or
            ``'expanded_url'`` are skipped. ``'display_url'`` is used as
            the link label when present, otherwise the expanded URL is.

    Returns:
        *text* with every occurrence of each ``url['url']`` replaced by an
        ``<a>`` element opening in a new tab.
    """
    for url in urls:
        if 'url' not in url or 'expanded_url' not in url:
            continue
        target = url['expanded_url']
        # display_url is not covered by the guard above, so do not index it.
        label = url.get('display_url', target)
        # Escape both values: they land inside an attribute and element body.
        anchor = f'<a href="{html.escape(target)}" target="_blank">{html.escape(label)}</a>'
        text = text.replace(url['url'], anchor)
    return text

 def filter_edited_tweets(tweets):
    """Keep only the newest revision of every edited tweet.

    Each entry's ``edit_info`` (if any) lists the ids of all revisions under
    ``initial.editTweetIds`` or ``edit.editControlInitial.editTweetIds``.
    When more than one revision is listed, the entry is kept only if its own
    ``id_str`` is the numerically largest id in that list. Entries sharing
    an ``id_str`` collapse to the last one seen.

    Returns:
        A list of the surviving entries, in first-seen id order.
    """
    kept = {}
    for entry in tweets:
        data = entry['tweet']
        this_id = data['id_str']
        edit_info = data.get('edit_info', {})

        if 'initial' in edit_info:
            revisions = edit_info['initial']['editTweetIds']
        elif 'edit' in edit_info:
            revisions = edit_info['edit']['editControlInitial']['editTweetIds']
        else:
            revisions = []

        # A newer revision exists elsewhere, so this one is superseded.
        if len(revisions) > 1 and max(revisions, key=int) != this_id:
            continue
        kept[this_id] = entry

    return list(kept.values())

 def generate_html_content(tweets, note_tweets, name, twitter_username, page_title, back_link=None):
    """Render one archive page (header, every tweet, footer) as an HTML string.

    Tweets whose text contains an ellipsis character are looked up in
    *note_tweets* by identical creation time; on a match the note's full text
    (prefixed with the tweet's @mentions) replaces the truncated text.

    Args:
        tweets: Iterable of archive entries; each is a dict whose ``'tweet'``
            value supplies ``full_text``, ``created_at``, ``id_str``,
            ``entities``, ``favorite_count`` and ``retweet_count``.
        note_tweets: Iterable of dicts whose ``'noteTweet'`` value supplies
            ``createdAt`` and ``core`` (``text`` plus optional ``urls``).
        name: Display name shown in each tweet header (HTML-escaped).
        twitter_username: Handle used to build the link to the original tweet.
        page_title: Text for ``<title>`` and ``<h1>`` (HTML-escaped).
        back_link: Optional ``(href, label)`` pair rendered above the tweets.

    Returns:
        The complete HTML document as a ``str``.
    """
    safe_title = html.escape(page_title)
    safe_name = html.escape(name)
    # created_at values are parsed as naive datetimes expressed in UTC, so
    # epoch seconds are computed against a naive UTC epoch. A naive
    # .timestamp() would interpret them in the machine's local timezone.
    epoch = datetime(1970, 1, 1)

    parts = [f"""
    <html>
    <head>
        <meta charset="UTF-8">
        <title>{safe_title}</title>
        <style>
            body {{
                width: 600px;
                margin: 0 auto;
                font-family: Helvetica, Arial, sans-serif;
            }}
            .tweet {{
                border: 1px solid #ccc;
                padding: 10px;
                margin-bottom: 10px;
                border-radius: 5px;
            }}
            .tweet-header {{
                font-weight: bold;
                margin-bottom: 5px;
            }}
            .tweet-link {{
                text-decoration: none;
            }}
            .tweet-link:hover {{
                opacity: 50%;
            }}
            .tweet-time {{
                color: #555;
                margin-left: 10px;
            }}
            .tweet-stats {{
                color: #555;
                margin-top: 10px;
            }}
        </style>
    </head>
    <body>
        <h1>{safe_title}</h1>
    """]

    if back_link:
        parts.append(f'<p><a href="{html.escape(back_link[0])}">{html.escape(back_link[1])}</a></p>')

    # Creation time -> first note tweet with that time. Built on first use so
    # pages without truncated tweets never parse the note timestamps.
    note_index = None

    for tweet in tweets:
        tweet_data = tweet['tweet']
        # NOTE(review): tweet and note text are emitted without escaping;
        # presumably the archive already stores them HTML-escaped -- confirm.
        tweet_text = tweet_data['full_text']

        created_at = parse_tweet_time(tweet_data['created_at'])
        tweet_time = created_at.strftime('%Y-%m-%d %H:%M UTC')
        tweet_timestamp = int((created_at - epoch).total_seconds())
        tweet_id = tweet_data['id_str']
        tweet_url = f"https://twitter.com/{twitter_username}/status/{tweet_id}"

        if '…' in tweet_text:
            if note_index is None:
                note_index = {}
                for note in note_tweets:
                    note_time = parse_note_tweet_time(note['noteTweet']['createdAt'])
                    note_index.setdefault(note_time, note)
            matching_note_tweet = note_index.get(created_at)
            if matching_note_tweet:
                note_core = matching_note_tweet['noteTweet']['core']
                mentions = tweet_data['entities'].get('user_mentions', [])
                mention_handles = ' '.join(f"@{mention['screen_name']}" for mention in mentions)
                note_text = replace_urls(note_core['text'], note_core.get('urls', []))
                tweet_text = f"{mention_handles} {note_text}"

        tweet_urls = tweet_data['entities'].get('urls', [])
        tweet_text = replace_urls(tweet_text, tweet_urls)
        tweet_text = convert_newlines_to_br(tweet_text)

        favorite_count = tweet_data['favorite_count']
        retweet_count = tweet_data['retweet_count']

        parts.append(f"""
        <div class="tweet">
            <div class="tweet-header" id="{tweet_timestamp}">
                <a class="tweet-link" href="#{tweet_id}" id="{tweet_id}">🔗</a>
                {safe_name}
                <span class="tweet-time">
                    <a href="{html.escape(tweet_url)}">{tweet_time}</a>
                </span>
            </div>
            <p>{tweet_text}</p>
            <div class="tweet-stats">
                Likes: {favorite_count} | Retweets: {retweet_count}
            </div>
        </div>
        """)

    parts.append("""<p>Want your own Twitter archive? <a href="https://gist.github.com/JD-P/fc473872bbff4b48b5235cbe4aaeba1d">Modify this script</a>.</p>""")

    parts.append("""<p xmlns:cc="http://creativecommons.org/ns#" xmlns:dct="http://purl.org/dc/terms/"><span property="dct:title">Twitter Archive</span> by <a rel="cc:attributionURL dct:creator" property="cc:attributionName" href="https://jdpressman.com/">John David Pressman</a> is marked with <a href="https://creativecommons.org/publicdomain/zero/1.0/?ref=chooser-v1" target="_blank" rel="license noopener noreferrer" style="display:inline-block;">CC0 1.0<img style="height:22px!important;margin-left:3px;vertical-align:text-bottom;" src="https://mirrors.creativecommons.org/presskit/icons/cc.svg?ref=chooser-v1" alt=""><img style="height:22px!important;margin-left:3px;vertical-align:text-bottom;" src="https://mirrors.creativecommons.org/presskit/icons/zero.svg?ref=chooser-v1" alt=""></a></p>""")

    parts.append("""
    </body>
    </html>
    """)

    return ''.join(parts)

 def generate_index_page(name, sorted_month_keys, grouped_tweets):
    """Render the archive index: a link to the all-tweets page plus one
    link per month with its tweet count.

    Args:
        name: Display name used in the title and heading (HTML-escaped).
        sorted_month_keys: Iterable of ``(year, month)`` keys in the order
            the months should be listed.
        grouped_tweets: Mapping from each ``(year, month)`` key to the
            collection of tweets in that month; only its length is used.

    Returns:
        The complete HTML document as a ``str``.
    """
    safe_name = html.escape(name)
    parts = [f"""
    <html>
    <head>
        <meta charset="UTF-8">
        <title>{safe_name}'s Tweet Archive</title>
        <style>
            body {{
                width: 600px;
                margin: 0 auto;
                font-family: Helvetica, Arial, sans-serif;
            }}
            h1 {{
                margin-bottom: 20px;
            }}
            ul {{
                list-style-type: none;
                padding: 0;
            }}
            li {{
                margin: 5px 0;
            }}
            a {{
                text-decoration: none;
                color: #1da1f2;
            }}
            a:hover {{
                text-decoration: underline;
            }}
        </style>
    </head>
    <body>
        <h1>{safe_name}'s Tweet Archive</h1>
        <p><a href="tweets.html">All tweets in one page</a></p>
        <h2>Tweets by month:</h2>
        <ul>
    """]

    for key in sorted_month_keys:
        year, month = key
        month_name = datetime(year, month, 1).strftime('%B %Y')
        filename = f"tweets_{year}_{month:02d}.html"
        count = len(grouped_tweets[key])
        # Avoid the ungrammatical "1 tweets" for single-tweet months.
        noun = "tweet" if count == 1 else "tweets"
        parts.append(f'<li><a href="{html.escape(filename)}">{html.escape(month_name)}</a> ({count} {noun})</li>')

    parts.append("""
        </ul>
        <p>Want your own Twitter archive? <a href="https://gist.github.com/JD-P/fc473872bbff4b48b5235cbe4aaeba1d">Modify this script</a>.</p>
        <p xmlns:cc="http://creativecommons.org/ns#" xmlns:dct="http://purl.org/dc/terms/"><span property="dct:title">Twitter Archive</span> by <a rel="cc:attributionURL dct:creator" property="cc:attributionName" href="https://jdpressman.com/">John David Pressman</a> is marked with <a href="https://creativecommons.org/publicdomain/zero/1.0/?ref=chooser-v1" target="_blank" rel="license noopener noreferrer" style="display:inline-block;">CC0 1.0<img style="height:22px!important;margin-left:3px;vertical-align:text-bottom;" src="https://mirrors.creativecommons.org/presskit/icons/cc.svg?ref=chooser-v1" alt=""><img style="height:22px!important;margin-left:3px;vertical-align:text-bottom;" src="https://mirrors.creativecommons.org/presskit/icons/zero.svg?ref=chooser-v1" alt=""></a></p>
    </body>
    </html>
    """)
    return ''.join(parts)

 def main():
    """Build the whole archive in the current directory.

    Reads both input files, drops superseded tweet revisions, sorts the
    tweets by creation time, optionally removes retweets, then writes
    ``tweets.html``, one ``tweets_YYYY_MM.html`` per month and
    ``tweets_index.html``.
    """
    args = parse_arguments()
    tweets = extract_json_from_file(args.tweets_file)
    note_tweets = extract_json_from_file(args.note_tweets_file)

    def created_at(entry):
        return parse_tweet_time(entry['tweet']['created_at'])

    def render(page_tweets, title, back_label):
        # Every tweet page shares the same inputs apart from these three.
        return generate_html_content(
            tweets=page_tweets,
            note_tweets=note_tweets,
            name=args.name,
            twitter_username=args.twitter_username,
            page_title=title,
            back_link=("tweets_index.html", back_label),
        )

    def save(path, markup):
        with open(path, "w", encoding='utf-8') as out:
            out.write(markup)

    # Newest revision only, oldest tweet first.
    tweets = filter_edited_tweets(tweets)
    tweets.sort(key=created_at)

    if args.exclude_retweets:
        tweets = [entry for entry in tweets if not entry['tweet']['full_text'].startswith("RT @")]

    # Single page holding every tweet.
    save("tweets.html", render(tweets, f"{args.name}'s Tweets", "View tweets by month"))

    # Bucket tweets under their (year, month) of creation.
    grouped_tweets = {}
    for entry in tweets:
        stamp = created_at(entry)
        grouped_tweets.setdefault((stamp.year, stamp.month), []).append(entry)

    # One page per month, most recent month first.
    sorted_month_keys = sorted(grouped_tweets, reverse=True)
    for year, month in sorted_month_keys:
        month_title = f"{args.name}'s Tweets - {datetime(year, month, 1).strftime('%B %Y')}"
        monthly_html = render(grouped_tweets[(year, month)], month_title, "Back to Archive Index")
        save(f"tweets_{year}_{month:02d}.html", monthly_html)

    save("tweets_index.html", generate_index_page(args.name, sorted_month_keys, grouped_tweets))

    print("HTML files 'tweets.html', monthly archives, and 'tweets_index.html' have been created.")

 # Run the generator only when this file is executed as a script,
 # not when it is imported as a module.
 if __name__ == "__main__":
    main()
	# The vast majority of this code was written by Mistral-large and
	# DeepSeek R1 and is therefore public domain in the United States.
	# But just in case, this script is public domain as set out in the
	# Creative Commons Zero 1.0 Universal Public Domain Notice
	# https://creativecommons.org/publicdomain/zero/1.0/

	import argparse
	import json
	from datetime import datetime
	import html

	def parse_arguments():
	    """Parse the command-line options for the archive generator.

	    Returns:
	        argparse.Namespace carrying ``tweets_file``, ``note_tweets_file``,
	        ``name``, ``twitter_username`` and the boolean ``exclude_retweets``.
	    """
	    parser = argparse.ArgumentParser(description="Generate HTML pages for Twitter archive.")
	    parser.add_argument("tweets_file", help="Path to the tweets.js file.")
	    parser.add_argument("note_tweets_file", help="Path to the note-tweets.js file.")
	    parser.add_argument("name", help="Your name.")
	    parser.add_argument("twitter_username", help="Your Twitter username.")
	    parser.add_argument("--exclude-retweets", action="store_true", help="Exclude retweets from the output.")
	    return parser.parse_args()

	def extract_json_from_file(file_path):
	    """Load the JSON array embedded in a Twitter archive data file.

	    Everything before the first ``[`` is discarded and the remainder is
	    parsed as JSON (the ``.js`` inputs presumably prefix the array with a
	    JavaScript assignment -- confirm against a real archive).

	    Args:
	        file_path: Path of the UTF-8 text file to read.

	    Returns:
	        The decoded JSON value.

	    Raises:
	        ValueError: If the file contains no ``[`` at all, or if the text
	            from the first ``[`` onward is not valid JSON
	            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
	        OSError: If the file cannot be opened.
	    """
	    with open(file_path, 'r', encoding='utf-8') as file:
	        content = file.read()

	    start_index = content.find('[')
	    if start_index == -1:
	        # Without this guard the slice below would become content[-1:],
	        # i.e. only the last character, which can silently parse as JSON.
	        raise ValueError(f"No JSON array found in {file_path!r}")
	    return json.loads(content[start_index:])

	def parse_tweet_time(tweet_time):
	    """Convert a tweet ``created_at`` string into a naive ``datetime``.

	    The accepted layout is fixed, including the literal ``+0000`` offset,
	    e.g. ``'Sat Mar 08 01:38:00 +0000 2025'``.
	    """
	    return datetime.strptime(tweet_time, '%a %b %d %H:%M:%S +0000 %Y')

	def parse_note_tweet_time(note_tweet_time):
	    """Convert a note-tweet ``createdAt`` string into a naive ``datetime``.

	    The accepted layout is ISO-like with fractional seconds and a literal
	    trailing ``Z``, e.g. ``'2025-03-08T01:38:00.000Z'``.
	    """
	    return datetime.strptime(note_tweet_time, '%Y-%m-%dT%H:%M:%S.%fZ')

	def convert_newlines_to_br(text):
	    """Return *text* with every newline character turned into ``<br>``."""
	    return text.replace('\n', '<br>')

	def replace_urls(text, urls):
	    """Replace shortened URLs in *text* with HTML links to their targets.

	    Args:
	        text: The text in which to substitute.
	        urls: Iterable of dicts. Entries lacking ``'url'`` or
	            ``'expanded_url'`` are skipped. ``'display_url'`` is used as
	            the link label when present, otherwise the expanded URL is.

	    Returns:
	        *text* with every occurrence of each ``url['url']`` replaced by an
	        ``<a>`` element opening in a new tab.
	    """
	    for url in urls:
	        if 'url' not in url or 'expanded_url' not in url:
	            continue
	        target = url['expanded_url']
	        # display_url is not covered by the guard above, so do not index it.
	        label = url.get('display_url', target)
	        # Escape both values: they land inside an attribute and element body.
	        anchor = f'<a href="{html.escape(target)}" target="_blank">{html.escape(label)}</a>'
	        text = text.replace(url['url'], anchor)
	    return text

	def filter_edited_tweets(tweets):
	    """Keep only the newest revision of every edited tweet.

	    Each entry's ``edit_info`` (if any) lists the ids of all revisions under
	    ``initial.editTweetIds`` or ``edit.editControlInitial.editTweetIds``.
	    When more than one revision is listed, the entry is kept only if its own
	    ``id_str`` is the numerically largest id in that list. Entries sharing
	    an ``id_str`` collapse to the last one seen.

	    Returns:
	        A list of the surviving entries, in first-seen id order.
	    """
	    tweet_map = {}
	    for tweet in tweets:
	        tweet_data = tweet['tweet']
	        tweet_id = tweet_data['id_str']

	        edit_ids = []
	        if 'edit_info' in tweet_data:
	            if 'initial' in tweet_data['edit_info']:
	                edit_ids = tweet_data['edit_info']['initial']['editTweetIds']
	            elif 'edit' in tweet_data['edit_info']:
	                edit_ids = tweet_data['edit_info']['edit']['editControlInitial']['editTweetIds']

	        # A newer revision exists elsewhere, so this one is superseded.
	        if len(edit_ids) > 1 and max(edit_ids, key=int) != tweet_id:
	            continue
	        tweet_map[tweet_id] = tweet

	    return list(tweet_map.values())

	def generate_html_content(tweets, note_tweets, name, twitter_username, page_title, back_link=None):
	    """Render one archive page (header, every tweet, footer) as an HTML string.

	    Tweets whose text contains an ellipsis character are looked up in
	    *note_tweets* by identical creation time; on a match the note's full text
	    (prefixed with the tweet's @mentions) replaces the truncated text.

	    Args:
	        tweets: Iterable of archive entries; each is a dict whose ``'tweet'``
	            value supplies ``full_text``, ``created_at``, ``id_str``,
	            ``entities``, ``favorite_count`` and ``retweet_count``.
	        note_tweets: Iterable of dicts whose ``'noteTweet'`` value supplies
	            ``createdAt`` and ``core`` (``text`` plus optional ``urls``).
	        name: Display name shown in each tweet header (HTML-escaped).
	        twitter_username: Handle used to build the link to the original tweet.
	        page_title: Text for ``<title>`` and ``<h1>`` (HTML-escaped).
	        back_link: Optional ``(href, label)`` pair rendered above the tweets.

	    Returns:
	        The complete HTML document as a ``str``.
	    """
	    safe_title = html.escape(page_title)
	    safe_name = html.escape(name)
	    # created_at values are parsed as naive datetimes expressed in UTC, so
	    # epoch seconds are computed against a naive UTC epoch. A naive
	    # .timestamp() would interpret them in the machine's local timezone.
	    epoch = datetime(1970, 1, 1)

	    parts = [f"""
	    <html>
	    <head>
	        <meta charset="UTF-8">
	        <title>{safe_title}</title>
	        <style>
	            body {{
	                width: 600px;
	                margin: 0 auto;
	                font-family: Helvetica, Arial, sans-serif;
	            }}
	            .tweet {{
	                border: 1px solid #ccc;
	                padding: 10px;
	                margin-bottom: 10px;
	                border-radius: 5px;
	            }}
	            .tweet-header {{
	                font-weight: bold;
	                margin-bottom: 5px;
	            }}
	            .tweet-link {{
	                text-decoration: none;
	            }}
	            .tweet-link:hover {{
	                opacity: 50%;
	            }}
	            .tweet-time {{
	                color: #555;
	                margin-left: 10px;
	            }}
	            .tweet-stats {{
	                color: #555;
	                margin-top: 10px;
	            }}
	        </style>
	    </head>
	    <body>
	        <h1>{safe_title}</h1>
	    """]

	    if back_link:
	        parts.append(f'<p><a href="{html.escape(back_link[0])}">{html.escape(back_link[1])}</a></p>')

	    # Creation time -> first note tweet with that time. Built on first use so
	    # pages without truncated tweets never parse the note timestamps.
	    note_index = None

	    for tweet in tweets:
	        tweet_data = tweet['tweet']
	        # NOTE(review): tweet and note text are emitted without escaping;
	        # presumably the archive already stores them HTML-escaped -- confirm.
	        tweet_text = tweet_data['full_text']

	        created_at = parse_tweet_time(tweet_data['created_at'])
	        tweet_time = created_at.strftime('%Y-%m-%d %H:%M UTC')
	        tweet_timestamp = int((created_at - epoch).total_seconds())
	        tweet_id = tweet_data['id_str']
	        tweet_url = f"https://twitter.com/{twitter_username}/status/{tweet_id}"

	        if '…' in tweet_text:
	            if note_index is None:
	                note_index = {}
	                for note in note_tweets:
	                    note_time = parse_note_tweet_time(note['noteTweet']['createdAt'])
	                    note_index.setdefault(note_time, note)
	            matching_note_tweet = note_index.get(created_at)
	            if matching_note_tweet:
	                note_core = matching_note_tweet['noteTweet']['core']
	                mentions = tweet_data['entities'].get('user_mentions', [])
	                mention_handles = ' '.join(f"@{mention['screen_name']}" for mention in mentions)
	                note_text = replace_urls(note_core['text'], note_core.get('urls', []))
	                tweet_text = f"{mention_handles} {note_text}"

	        tweet_urls = tweet_data['entities'].get('urls', [])
	        tweet_text = replace_urls(tweet_text, tweet_urls)
	        tweet_text = convert_newlines_to_br(tweet_text)

	        favorite_count = tweet_data['favorite_count']
	        retweet_count = tweet_data['retweet_count']

	        parts.append(f"""
	        <div class="tweet">
	            <div class="tweet-header" id="{tweet_timestamp}">
	                <a class="tweet-link" href="#{tweet_id}" id="{tweet_id}">🔗</a>
	                {safe_name}
	                <span class="tweet-time">
	                    <a href="{html.escape(tweet_url)}">{tweet_time}</a>
	                </span>
	            </div>
	            <p>{tweet_text}</p>
	            <div class="tweet-stats">
	                Likes: {favorite_count} | Retweets: {retweet_count}
	            </div>
	        </div>
	        """)

	    parts.append("""<p>Want your own Twitter archive? <a href="https://gist.github.com/JD-P/fc473872bbff4b48b5235cbe4aaeba1d">Modify this script</a>.</p>""")

	    parts.append("""<p xmlns:cc="http://creativecommons.org/ns#" xmlns:dct="http://purl.org/dc/terms/"><span property="dct:title">Twitter Archive</span> by <a rel="cc:attributionURL dct:creator" property="cc:attributionName" href="https://jdpressman.com/">John David Pressman</a> is marked with <a href="https://creativecommons.org/publicdomain/zero/1.0/?ref=chooser-v1" target="_blank" rel="license noopener noreferrer" style="display:inline-block;">CC0 1.0<img style="height:22px!important;margin-left:3px;vertical-align:text-bottom;" src="https://mirrors.creativecommons.org/presskit/icons/cc.svg?ref=chooser-v1" alt=""><img style="height:22px!important;margin-left:3px;vertical-align:text-bottom;" src="https://mirrors.creativecommons.org/presskit/icons/zero.svg?ref=chooser-v1" alt=""></a></p>""")

	    parts.append("""
	    </body>
	    </html>
	    """)

	    return ''.join(parts)

	def generate_index_page(name, sorted_month_keys, grouped_tweets):
	    """Render the archive index: a link to the all-tweets page plus one
	    link per month with its tweet count.

	    Args:
	        name: Display name used in the title and heading (HTML-escaped).
	        sorted_month_keys: Iterable of ``(year, month)`` keys in the order
	            the months should be listed.
	        grouped_tweets: Mapping from each ``(year, month)`` key to the
	            collection of tweets in that month; only its length is used.

	    Returns:
	        The complete HTML document as a ``str``.
	    """
	    safe_name = html.escape(name)
	    parts = [f"""
	    <html>
	    <head>
	        <meta charset="UTF-8">
	        <title>{safe_name}'s Tweet Archive</title>
	        <style>
	            body {{
	                width: 600px;
	                margin: 0 auto;
	                font-family: Helvetica, Arial, sans-serif;
	            }}
	            h1 {{
	                margin-bottom: 20px;
	            }}
	            ul {{
	                list-style-type: none;
	                padding: 0;
	            }}
	            li {{
	                margin: 5px 0;
	            }}
	            a {{
	                text-decoration: none;
	                color: #1da1f2;
	            }}
	            a:hover {{
	                text-decoration: underline;
	            }}
	        </style>
	    </head>
	    <body>
	        <h1>{safe_name}'s Tweet Archive</h1>
	        <p><a href="tweets.html">All tweets in one page</a></p>
	        <h2>Tweets by month:</h2>
	        <ul>
	    """]

	    for key in sorted_month_keys:
	        year, month = key
	        month_name = datetime(year, month, 1).strftime('%B %Y')
	        filename = f"tweets_{year}_{month:02d}.html"
	        count = len(grouped_tweets[key])
	        # Avoid the ungrammatical "1 tweets" for single-tweet months.
	        noun = "tweet" if count == 1 else "tweets"
	        parts.append(f'<li><a href="{html.escape(filename)}">{html.escape(month_name)}</a> ({count} {noun})</li>')

	    parts.append("""
	        </ul>
	        <p>Want your own Twitter archive? <a href="https://gist.github.com/JD-P/fc473872bbff4b48b5235cbe4aaeba1d">Modify this script</a>.</p>
	        <p xmlns:cc="http://creativecommons.org/ns#" xmlns:dct="http://purl.org/dc/terms/"><span property="dct:title">Twitter Archive</span> by <a rel="cc:attributionURL dct:creator" property="cc:attributionName" href="https://jdpressman.com/">John David Pressman</a> is marked with <a href="https://creativecommons.org/publicdomain/zero/1.0/?ref=chooser-v1" target="_blank" rel="license noopener noreferrer" style="display:inline-block;">CC0 1.0<img style="height:22px!important;margin-left:3px;vertical-align:text-bottom;" src="https://mirrors.creativecommons.org/presskit/icons/cc.svg?ref=chooser-v1" alt=""><img style="height:22px!important;margin-left:3px;vertical-align:text-bottom;" src="https://mirrors.creativecommons.org/presskit/icons/zero.svg?ref=chooser-v1" alt=""></a></p>
	    </body>
	    </html>
	    """)
	    return ''.join(parts)

	def main():
	    """Build the whole archive in the current directory.

	    Reads both input files, drops superseded tweet revisions, sorts the
	    tweets by creation time, optionally removes retweets, then writes
	    ``tweets.html``, one ``tweets_YYYY_MM.html`` per month and
	    ``tweets_index.html``.
	    """
	    args = parse_arguments()
	    tweets = extract_json_from_file(args.tweets_file)
	    note_tweets = extract_json_from_file(args.note_tweets_file)

	    # Filter and sort tweets
	    tweets = filter_edited_tweets(tweets)
	    tweets.sort(key=lambda tweet: parse_tweet_time(tweet['tweet']['created_at']))

	    # Apply retweet filter
	    if args.exclude_retweets:
	        tweets = [tweet for tweet in tweets if not tweet['tweet']['full_text'].startswith("RT @")]

	    # Generate main page
	    main_html = generate_html_content(
	        tweets=tweets,
	        note_tweets=note_tweets,
	        name=args.name,
	        twitter_username=args.twitter_username,
	        page_title=f"{args.name}'s Tweets",
	        back_link=("tweets_index.html", "View tweets by month"),
	    )
	    with open("tweets.html", "w", encoding='utf-8') as f:
	        f.write(main_html)

	    # Group tweets by (year, month) of creation
	    grouped_tweets = {}
	    for tweet in tweets:
	        created_at = parse_tweet_time(tweet['tweet']['created_at'])
	        grouped_tweets.setdefault((created_at.year, created_at.month), []).append(tweet)

	    # Generate monthly pages, most recent month first
	    sorted_month_keys = sorted(grouped_tweets, reverse=True)
	    for key in sorted_month_keys:
	        year, month = key
	        month_title = f"{args.name}'s Tweets - {datetime(year, month, 1).strftime('%B %Y')}"
	        filename = f"tweets_{year}_{month:02d}.html"

	        monthly_html = generate_html_content(
	            tweets=grouped_tweets[key],
	            note_tweets=note_tweets,
	            name=args.name,
	            twitter_username=args.twitter_username,
	            page_title=month_title,
	            back_link=("tweets_index.html", "Back to Archive Index"),
	        )

	        with open(filename, "w", encoding='utf-8') as f:
	            f.write(monthly_html)

	    # Generate index page
	    index_html = generate_index_page(args.name, sorted_month_keys, grouped_tweets)
	    with open("tweets_index.html", "w", encoding='utf-8') as f:
	        f.write(index_html)

	    print("HTML files 'tweets.html', monthly archives, and 'tweets_index.html' have been created.")

	# Run the generator only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    main()