Chain of Density Summarization
Gist: harperreed/b7b1ae56df4a269925e0668b823693eb (created September 16, 2023)
# Import required packages
from dotenv import load_dotenv  # For loading environment variables from a .env file
from html2text import html2text  # For HTML to markdown conversion
from readability import Document  # For extracting the main article content from HTML
from typing import List  # For type hinting
import json  # For JSON parsing
import logging  # For logging errors
import openai  # OpenAI GPT API
import os  # For OS-level operations
import requests  # For HTTP requests
import tiktoken  # For token counting
#
# ▄████▄ ██░ ██ ▄▄▄ ██▓ ███▄ █ ▒█████ █████▒
# ▒██▀ ▀█ ▓██░ ██▒▒████▄ ▓██▒ ██ ▀█ █ ▒██▒ ██▒▓██ ▒
# ▒▓█ ▄ ▒██▀▀██░▒██ ▀█▄ ▒██▒▓██ ▀█ ██▒ ▒██░ ██▒▒████ ░
# ▒▓▓▄ ▄██▒░▓█ ░██ ░██▄▄▄▄██ ░██░▓██▒ ▐▌██▒ ▒██ ██░░▓█▒ ░
# ▒ ▓███▀ ░░▓█▒░██▓ ▓█ ▓██▒░██░▒██░ ▓██░ ░ ████▓▒░░▒█░
# ░ ░▒ ▒ ░ ▒ ░░▒░▒ ▒▒ ▓▒█░░▓ ░ ▒░ ▒ ▒ ░ ▒░▒░▒░ ▒ ░
# ░ ▒ ▒ ░▒░ ░ ▒ ▒▒ ░ ▒ ░░ ░░ ░ ▒░ ░ ▒ ▒░ ░
# ░ ░ ░░ ░ ░ ▒ ▒ ░ ░ ░ ░ ░ ░ ░ ▒ ░ ░
# ░ ░ ░ ░ ░ ░ ░ ░ ░ ░
# ░
# ▓█████▄ ▓█████ ███▄ █ ██████ ██▓▄▄▄█████▓▓██ ██▓
# ▒██▀ ██▌▓█ ▀ ██ ▀█ █ ▒██ ▒ ▓██▒▓ ██▒ ▓▒ ▒██ ██▒
# ░██ █▌▒███ ▓██ ▀█ ██▒░ ▓██▄ ▒██▒▒ ▓██░ ▒░ ▒██ ██░
# ░▓█▄ ▌▒▓█ ▄ ▓██▒ ▐▌██▒ ▒ ██▒░██░░ ▓██▓ ░ ░ ▐██▓░
# ░▒████▓ ░▒████▒▒██░ ▓██░▒██████▒▒░██░ ▒██▒ ░ ░ ██▒▓░
# ▒▒▓ ▒ ░░ ▒░ ░░ ▒░ ▒ ▒ ▒ ▒▓▒ ▒ ░░▓ ▒ ░░ ██▒▒▒
# ░ ▒ ▒ ░ ░ ░░ ░░ ░ ▒░░ ░▒ ░ ░ ▒ ░ ░ ▓██ ░▒░
# ░ ░ ░ ░ ░ ░ ░ ░ ░ ░ ▒ ░ ░ ▒ ▒ ░░
# ░ ░ ░ ░ ░ ░ ░ ░
# ░ ░ ░
# ██▓███ ██▀███ ▒█████ ███▄ ▄███▓ ██▓███ ▄▄▄█████▓ ██▓ ███▄ █ ▄████
# ▓██░ ██▒▓██ ▒ ██▒▒██▒ ██▒▓██▒▀█▀ ██▒▓██░ ██▒▓ ██▒ ▓▒▓██▒ ██ ▀█ █ ██▒ ▀█▒
# ▓██░ ██▓▒▓██ ░▄█ ▒▒██░ ██▒▓██ ▓██░▓██░ ██▓▒▒ ▓██░ ▒░▒██▒▓██ ▀█ ██▒▒██░▄▄▄░
# ▒██▄█▓▒ ▒▒██▀▀█▄ ▒██ ██░▒██ ▒██ ▒██▄█▓▒ ▒░ ▓██▓ ░ ░██░▓██▒ ▐▌██▒░▓█ ██▓
# ▒██▒ ░ ░░██▓ ▒██▒░ ████▓▒░▒██▒ ░██▒▒██▒ ░ ░ ▒██▒ ░ ░██░▒██░ ▓██░░▒▓███▀▒
# ▒▓▒░ ░ ░░ ▒▓ ░▒▓░░ ▒░▒░▒░ ░ ▒░ ░ ░▒▓▒░ ░ ░ ▒ ░░ ░▓ ░ ▒░ ▒ ▒ ░▒ ▒
# ░▒ ░ ░▒ ░ ▒░ ░ ▒ ▒░ ░ ░ ░░▒ ░ ░ ▒ ░░ ░░ ░ ▒░ ░ ░
# ░░ ░░ ░ ░ ░ ░ ▒ ░ ░ ░░ ░ ▒ ░ ░ ░ ░ ░ ░ ░
# ░ ░ ░ ░ ░ ░ ░
#
#
# This snippet does summarization with the GPT-4 model using Chain of Density Prompting.
# You can read more here:
# https://arxiv.org/pdf/2309.04269.pdf
#
# The script takes a URL as input, fetches the page, and uses readability to extract
# the main article content from the HTML. That content is converted to markdown and
# inserted into the prompt for the GPT-4 model. The model then produces a series of
# increasingly dense summaries, and the final one is printed to the console.
#
# The magic is the prompt template, which is the same as the one used in the paper.
#
# Load environment variables from the .env file
load_dotenv()
# Set OpenAI API key from environment variable
openai.api_key = os.getenv("OPENAI_API_KEY")
# Define the token budget for the article text sent to the OpenAI API
openai_token_limit = 4000
print(""" | |
______ ____ _____ ______ ___ ________ ___ ______________ _ __ | |
/ __/ / / / |/ / |/ / _ | / _ \/ _/_ / / _ /_ __/ _/ __ \/ |/ / | |
_\ \/ /_/ / /|_/ / /|_/ / __ |/ , _// / / /_/ __ |/ / _/ // /_/ / / | |
/___/\____/_/ /_/_/ /_/_/ |_/_/|_/___/ /___/_/ |_/_/ /___/\____/_/|_/ | |
""") | |
# Accept URL from user | |
summary_url = input("Enter a URL: ") | |
# Fetch webpage content | |
response = requests.get(summary_url) | |
# Extract summary from HTML content | |
doc = Document(response.content) | |
# Convert HTML summary to markdown format | |
markdown_content = html2text(doc.summary()) | |
# Define prompt template. This is the magical prompt from the paper | |
prompt = """ | |
Article: {} | |
You will generate increasingly concise, entity-dense summaries of the above article. | |
Repeat the following 2 steps 5 times. | |
Step 1. Identify 1-3 informative entities (";" delimited) from the article which are missing from the previously generated summary. | |
Step 2. Write a new, denser summary of identical length which covers every entity and detail from the previous summary plus the missing entities. | |
A missing entity is: | |
- relevant to the main story, | |
- specific yet concise (5 words or fewer), | |
- novel (not in the previous summary), | |
- faithful (present in the article), | |
- anywhere (can be located anywhere in the article). | |
Guidelines: | |
- The first summary should be long (4-5 sentences, ~80 words) yet highly non-specific, containing little information beyond the entities marked as missing. Use overly verbose language and fillers (e.g., "this article discusses") to reach ~80 words. | |
- Make every word count: rewrite the previous summary to improve flow and make space for additional entities. | |
- Make space with fusion, compression, and removal of uninformative phrases like "the article discusses". | |
- The summaries should become highly dense and concise yet self-contained, i.e., easily understood without the article. | |
- Missing entities can appear anywhere in the new summary. | |
- Never drop entities from the previous summary. If space cannot be made, add fewer new entities. | |
Remember, use the exact same number of words for each summary. | |
Answer in JSON. The JSON should be a list (length 5) of dictionaries whose keys are "Missing_Entities" and "Denser_Summary". | |
""" | |
# Initialize tokenizer and count tokens in the markdown content
encoding = tiktoken.get_encoding("cl100k_base")
num_tokens = len(encoding.encode(markdown_content))
# Print token count for the article content
print(f"\nNum Tokens (content): {num_tokens}")
# Check if the token count exceeds the limit defined above
if num_tokens > openai_token_limit:
    # Truncate the text to fit within the token limit
    markdown_content = encoding.decode(encoding.encode(markdown_content)[:openai_token_limit])
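# Note: openai_token_limit only budgets the article text itself; the prompt template
# and the model's JSON reply consume additional tokens on top of this budget.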
# Format the prompt with the markdown content
summaryPrompt = prompt.format(markdown_content)
print("Generating summary...\n")
# Call the OpenAI chat completion API
try:
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "user", "content": summaryPrompt}
        ]
    )
except Exception:  # Log and re-raise if the API call fails
    logging.exception("API call failed")
    raise
# Extract the reply text from the API response
output = response.choices[0]["message"]["content"]
# Attempt to parse the reply as JSON and extract the final 'Denser_Summary'
try:
    output = json.loads(output)
    output = output[-1]["Denser_Summary"]
except json.JSONDecodeError:  # Log the error and fall back to printing the raw reply
    logging.exception("JSON decoding failed")
# Print the final summary
print(f"\nSummary:\n{output}\n\n")