Chain of Density Summarization
Gist: harperreed/b7b1ae56df4a269925e0668b823693eb (created September 16, 2023)
# Import required packages
from dotenv import load_dotenv  # For loading environment variables from a .env file
from html2text import html2text  # For HTML to markdown conversion
from readability import Document  # For extracting the main article content from HTML
from typing import List  # For type hinting
import json  # For JSON parsing
import logging  # For logging errors
import openai  # OpenAI GPT API
import os  # For OS-level operations
import requests  # For HTTP requests
import tiktoken  # For token counting
#
# ▄████▄ ██░ ██ ▄▄▄ ██▓ ███▄ █ ▒█████ █████▒
# ▒██▀ ▀█ ▓██░ ██▒▒████▄ ▓██▒ ██ ▀█ █ ▒██▒ ██▒▓██ ▒
# ▒▓█ ▄ ▒██▀▀██░▒██ ▀█▄ ▒██▒▓██ ▀█ ██▒ ▒██░ ██▒▒████ ░
# ▒▓▓▄ ▄██▒░▓█ ░██ ░██▄▄▄▄██ ░██░▓██▒ ▐▌██▒ ▒██ ██░░▓█▒ ░
# ▒ ▓███▀ ░░▓█▒░██▓ ▓█ ▓██▒░██░▒██░ ▓██░ ░ ████▓▒░░▒█░
# ░ ░▒ ▒ ░ ▒ ░░▒░▒ ▒▒ ▓▒█░░▓ ░ ▒░ ▒ ▒ ░ ▒░▒░▒░ ▒ ░
# ░ ▒ ▒ ░▒░ ░ ▒ ▒▒ ░ ▒ ░░ ░░ ░ ▒░ ░ ▒ ▒░ ░
# ░ ░ ░░ ░ ░ ▒ ▒ ░ ░ ░ ░ ░ ░ ░ ▒ ░ ░
# ░ ░ ░ ░ ░ ░ ░ ░ ░ ░
# ░
# ▓█████▄ ▓█████ ███▄ █ ██████ ██▓▄▄▄█████▓▓██ ██▓
# ▒██▀ ██▌▓█ ▀ ██ ▀█ █ ▒██ ▒ ▓██▒▓ ██▒ ▓▒ ▒██ ██▒
# ░██ █▌▒███ ▓██ ▀█ ██▒░ ▓██▄ ▒██▒▒ ▓██░ ▒░ ▒██ ██░
# ░▓█▄ ▌▒▓█ ▄ ▓██▒ ▐▌██▒ ▒ ██▒░██░░ ▓██▓ ░ ░ ▐██▓░
# ░▒████▓ ░▒████▒▒██░ ▓██░▒██████▒▒░██░ ▒██▒ ░ ░ ██▒▓░
# ▒▒▓ ▒ ░░ ▒░ ░░ ▒░ ▒ ▒ ▒ ▒▓▒ ▒ ░░▓ ▒ ░░ ██▒▒▒
# ░ ▒ ▒ ░ ░ ░░ ░░ ░ ▒░░ ░▒ ░ ░ ▒ ░ ░ ▓██ ░▒░
# ░ ░ ░ ░ ░ ░ ░ ░ ░ ░ ▒ ░ ░ ▒ ▒ ░░
# ░ ░ ░ ░ ░ ░ ░ ░
# ░ ░ ░
# ██▓███ ██▀███ ▒█████ ███▄ ▄███▓ ██▓███ ▄▄▄█████▓ ██▓ ███▄ █ ▄████
# ▓██░ ██▒▓██ ▒ ██▒▒██▒ ██▒▓██▒▀█▀ ██▒▓██░ ██▒▓ ██▒ ▓▒▓██▒ ██ ▀█ █ ██▒ ▀█▒
# ▓██░ ██▓▒▓██ ░▄█ ▒▒██░ ██▒▓██ ▓██░▓██░ ██▓▒▒ ▓██░ ▒░▒██▒▓██ ▀█ ██▒▒██░▄▄▄░
# ▒██▄█▓▒ ▒▒██▀▀█▄ ▒██ ██░▒██ ▒██ ▒██▄█▓▒ ▒░ ▓██▓ ░ ░██░▓██▒ ▐▌██▒░▓█ ██▓
# ▒██▒ ░ ░░██▓ ▒██▒░ ████▓▒░▒██▒ ░██▒▒██▒ ░ ░ ▒██▒ ░ ░██░▒██░ ▓██░░▒▓███▀▒
# ▒▓▒░ ░ ░░ ▒▓ ░▒▓░░ ▒░▒░▒░ ░ ▒░ ░ ░▒▓▒░ ░ ░ ▒ ░░ ░▓ ░ ▒░ ▒ ▒ ░▒ ▒
# ░▒ ░ ░▒ ░ ▒░ ░ ▒ ▒░ ░ ░ ░░▒ ░ ░ ▒ ░░ ░░ ░ ▒░ ░ ░
# ░░ ░░ ░ ░ ░ ░ ▒ ░ ░ ░░ ░ ▒ ░ ░ ░ ░ ░ ░ ░
# ░ ░ ░ ░ ░ ░ ░
#
#
# This snippet does summarization with the GPT-4 model using Chain of Density Prompting.
# You can read more here:
# https://arxiv.org/pdf/2309.04269.pdf
#
# The script takes a URL as input, fetches the page, and uses readability to extract
# the main article content from the HTML. That content is converted to markdown and
# inserted into the prompt for the GPT-4 model. The model then produces a series of
# increasingly dense summaries, and the final one is printed to the console.
#
# The magic is the prompt template, which is the same as the one used in the paper.
#
# Load environment variables from the .env file
load_dotenv()
# Set OpenAI API key from environment variable
openai.api_key = os.getenv("OPENAI_API_KEY")
# Define the token budget for the article text sent to the OpenAI API
openai_token_limit = 4000
print(""" | |
______ ____ _____ ______ ___ ________ ___ ______________ _ __ | |
/ __/ / / / |/ / |/ / _ | / _ \/ _/_ / / _ /_ __/ _/ __ \/ |/ / | |
_\ \/ /_/ / /|_/ / /|_/ / __ |/ , _// / / /_/ __ |/ / _/ // /_/ / / | |
/___/\____/_/ /_/_/ /_/_/ |_/_/|_/___/ /___/_/ |_/_/ /___/\____/_/|_/ | |
""") | |
# Accept URL from user | |
summary_url = input("Enter a URL: ") | |
# Fetch webpage content | |
response = requests.get(summary_url) | |
# Extract summary from HTML content | |
doc = Document(response.content) | |
# Convert HTML summary to markdown format | |
markdown_content = html2text(doc.summary()) | |
# Define prompt template. This is the magical prompt from the paper | |
prompt = """ | |
Article: {} | |
You will generate increasingly concise, entity-dense summaries of the above article. | |
Repeat the following 2 steps 5 times. | |
Step 1. Identify 1-3 informative entities (";" delimited) from the article which are missing from the previously generated summary. | |
Step 2. Write a new, denser summary of identical length which covers every entity and detail from the previous summary plus the missing entities. | |
A missing entity is: | |
- relevant to the main story, | |
- specific yet concise (5 words or fewer), | |
- novel (not in the previous summary), | |
- faithful (present in the article), | |
- anywhere (can be located anywhere in the article). | |
Guidelines: | |
- The first summary should be long (4-5 sentences, ~80 words) yet highly non-specific, containing little information beyond the entities marked as missing. Use overly verbose language and fillers (e.g., "this article discusses") to reach ~80 words. | |
- Make every word count: rewrite the previous summary to improve flow and make space for additional entities. | |
- Make space with fusion, compression, and removal of uninformative phrases like "the article discusses". | |
- The summaries should become highly dense and concise yet self-contained, i.e., easily understood without the article. | |
- Missing entities can appear anywhere in the new summary. | |
- Never drop entities from the previous summary. If space cannot be made, add fewer new entities. | |
Remember, use the exact same number of words for each summary. | |
Answer in JSON. The JSON should be a list (length 5) of dictionaries whose keys are "Missing_Entities" and "Denser_Summary". | |
""" | |
# Initialize tokenizer and count tokens in the markdown content
encoding = tiktoken.get_encoding("cl100k_base")
num_tokens = len(encoding.encode(markdown_content))
# Print token count for the article content
print(f"\nNum Tokens (content): {num_tokens}")
# Check if the token count exceeds the limit defined above
if num_tokens > openai_token_limit:
    # Truncate the text to fit within the token limit
    markdown_content = encoding.decode(encoding.encode(markdown_content)[:openai_token_limit])
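# Note: openai_token_limit only budgets the article text itself; the prompt template
# and the model's JSON reply consume additional tokens on top of this budget.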
# Format the prompt with the markdown content
summaryPrompt = prompt.format(markdown_content)
print("Generating summary...\n")
# Call the OpenAI chat completion API
try:
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "user", "content": summaryPrompt}
        ]
    )
except Exception:  # Log and re-raise if the API call fails
    logging.exception("API call failed")
    raise
# Extract the reply text from the API response
output = response.choices[0]["message"]["content"]
# Attempt to parse the reply as JSON and extract the final 'Denser_Summary'
try:
    output = json.loads(output)
    output = output[-1]["Denser_Summary"]
except json.JSONDecodeError:  # Log the error and fall back to printing the raw reply
    logging.exception("JSON decoding failed")
# Print the final summary
print(f"\nSummary:\n{output}\n\n")