import base64
import json
import os
import webbrowser
from io import BytesIO
from PIL import Image
from azure.cognitiveservices.speech import SpeechConfig, SpeechSynthesisOutputFormat, SpeechSynthesizer
from azure.cognitiveservices.speech.audio import AudioOutputConfig
from openai import OpenAI
from pdf2image import convert_from_bytes
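# Suggested setup, assuming default PyPI package names:
#   pip install openai pdf2image pillow azure-cognitiveservices-speech
# pdf2image additionally requires the poppler utilities to be installed on the system.
# The script reads two environment variables: OPENAI_API_KEY and AZURE_KEY.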
def get_pdf_bytes_colab() -> tuple[str, bytes]:
    """Upload a PDF interactively when running inside Google Colab."""
    from google.colab import files

    upload = files.upload()
    return list(upload.keys())[0], list(upload.values())[0]
def get_pdf_bytes(file_path) -> tuple[str, bytes]:
    """Read a PDF from disk and return its base name and raw bytes."""
    with open(file_path, 'rb') as file:
        return os.path.basename(file_path), file.read()
def get_pdf_pages(pdf_bytes: bytes):
    """
    Extracts pages from a PDF file as PIL images.
    Displays the grid of images.
    :param pdf_bytes: raw bytes of the PDF file
    :return: list of PIL images, one per page
    """
    pages = convert_from_bytes(pdf_bytes, dpi=200)
    image_width, image_height = pages[0].size
    num_columns = 4
    padding = image_width // 10
    num_rows = (len(pages) + num_columns - 1) // num_columns
    grid_width = num_columns * image_width + (num_columns + 1) * padding
    grid_height = num_rows * image_height + (num_rows + 1) * padding
    # Create a new blank image with a white background
    grid_image = Image.new('RGB', (grid_width, grid_height), (255, 255, 255))
    # Paste images into the grid
    for index, image in enumerate(pages):
        row = index // num_columns
        col = index % num_columns
        x = col * (image_width + padding)
        y = row * (image_height + padding)
        grid_image.paste(image, (x + padding, y + padding))
    grid_image.show()
    return pages
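# For example, a 10-page PDF with 4 columns yields num_rows = (10 + 4 - 1) // 4 = 3,
# i.e. a 4x3 contact sheet with one padding-sized gap around every page thumbnail.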
def select_page(pages: list) -> Image.Image:
    """Ask the user which page to read until a valid number is entered."""
    while True:
        page_number = input(f"Which page do you want to read? [1-{len(pages)}] ")
        try:
            page_number = int(page_number) - 1
            assert 0 <= page_number < len(pages)
            return pages[page_number]
        except (ValueError, AssertionError):
            print("An error occurred")
def extract_text(page):
    """Send the page image to GPT-4o and parse the extracted text blocks from its JSON reply."""
    print("Converting the page to text using ChatGPT...")
    client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
    sys = """
    This image is a PDF page. It probably contains text. Your task is to extract the text verbatim from the page.
    Follow these instructions:
    - Format the extracted text as Markdown, if possible. Pay attention to bold words if possible.
    - Don't change any word, unless you cannot in any other way understand the text.
    - Only return the JSON response, nothing else.
    - If the page contains images or tables, try to extract their caption only: don't extract table data or image content.
    - If the page contains 2 columns of text at any point, try to extract text from the left column, then from the right column. Be sure you parse both columns, don't stop at the left column.
    - Ignore superscripts and footnotes.
    - Focus on the center of the page, not the edges.
    - Return the text as JSON with the following structure: [<block>], where <block> is a JSON object representing semantic blocks from the pdf. Each block may have a `title` extracted from the image, and a `column` property if you can identify that the layout was two-column. It is mandatory, though, that it has a `text` property. A good split point for blocks is lines or sentences that start with a number + dot, like "1. Introduction" or "1.1 Related work".
    An example response could be:
    [{"title": "Abstract", "column": "left", "text": "This is the **abstract** of the paper"}, {"title": "Introduction", "column": "right", "text": "This is the *introduction* of the paper"}]
    If you don't return valid JSON, I will be fired. Think carefully.
    """
    # Encode the page image as a base64 JPEG for the vision API
    buffered = BytesIO()
    page.save(buffered, format="JPEG")
    b64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "user", "content": [
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}},
                {"type": "text", "text": sys}
            ]},
            # Trailing assistant turn that nudges the model toward a raw JSON reply
            {"role": "assistant", "content": '[{"text":"}]'}
        ]
    )
    return json.loads(response.choices[0].message.content)
def parse_blocks(blocks: list) -> list:
    """Drop figure/table captions and merge untitled blocks into the preceding block."""
    def is_valid(b: dict) -> bool:
        title = b.get("title", "").lower()
        # Short titles like "Figure 1" or "Table 2" are captions, not sections
        if ("figure" in title or "table" in title) and len(title) < 10:
            return False
        return True

    blocks = [b for b in blocks if is_valid(b)]
    merged = blocks[:1]
    for b in blocks[1:]:
        if "title" not in b:
            merged[-1]["text"] += " " + b["text"]
        else:
            merged.append(b)
    return merged
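# A worked example with hypothetical blocks: the short "Figure 2" caption is dropped,
# and the untitled block is appended to the section that precedes it.
#   parse_blocks([
#       {"title": "Figure 2", "text": "A diagram"},
#       {"title": "1. Introduction", "text": "First part."},
#       {"text": "Second part."},
#   ])
#   # -> [{"title": "1. Introduction", "text": "First part. Second part."}]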
def synthesize(block: dict, voice: str, output_dir: str) -> tuple[str, str, str] | None:
    """Synthesize one text block to a WAV file with Azure Speech; returns (title, text, path)."""
    title = block.get("title", "")
    text = block["text"]
    # Derive a filesystem-safe output path from the PDF name, voice and block title
    output_dir = output_dir.replace("/", " ").replace(".pdf", "")
    output = f"{output_dir}/{voice}/{title.replace('/', ' ')}.wav"
    os.makedirs(os.path.dirname(output), exist_ok=True)
    if not os.path.exists(output):
        print("Synthesizing the paragraph...", title if title else "untitled")
        # Read the title with emphasis, then pause before the body text
        ssml_title = f"""<emphasis>{title}</emphasis><break time="2s" />""" if title else ""
        lang = voice[:5]
        ssml = f'''
        <speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="{lang}">
            <voice name="{voice}">{ssml_title} {text}</voice>
        </speak>
        '''
        speech_config = SpeechConfig(subscription=os.environ["AZURE_KEY"], region="westeurope")
        speech_config.set_speech_synthesis_output_format(SpeechSynthesisOutputFormat.Riff24Khz16BitMonoPcm)
        audio_config = AudioOutputConfig(filename=output)
        speech_config.speech_synthesis_voice_name = voice
        synthesizer = SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)
        result = synthesizer.speak_ssml(ssml)
        if result.cancellation_details is not None:
            print("An error occurred:", result.cancellation_details.error_details)
            return None
    else:
        print("An audio file for this section already exists. Skipping")
    return title, text, output
def make_html(main_title: str, audio_files: list[tuple[str, str, str]]):
    """Build a single self-contained HTML page with one audio player per block."""
    def b64(file_path: str) -> str:
        with open(file_path, "rb") as f:
            return base64.b64encode(f.read()).decode("utf-8")

    # Embed each WAV file directly into the page as a base64 data URI
    encoded_audio_files = [b64(audio[2]) for audio in audio_files]
    audio_elements = [f'<audio controls><source src="data:audio/wav;base64,{encoded_audio}" type="audio/wav"></audio>'
                      for encoded_audio in encoded_audio_files]
    blocks = [
        f"""<div class="border border-gray-200 rounded-lg"><h1 class="font-bold">{title}</h1><p class="text-sm">{text}</p>{e}</div>"""
        for (title, text, _), e in zip(audio_files, audio_elements)]
    blocks_html = "\n".join(blocks)
    html = f"""
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>{main_title}</title>
        <script src="https://cdn.tailwindcss.com"></script>
    </head>
    <body>
        <h1 class="text-2xl font-bold">{main_title}</h1>
        <div class="container p-4">
            <div class="flex flex-col gap-4">
                {blocks_html}
            </div>
        </div>
    </body>
    </html>
    """
    with open(f"{main_title}.html", "w") as f:
        f.write(html)
    webbrowser.open("file://" + os.path.abspath(f"{main_title}.html"))
def read_aloud(pdf_name: str, pdf_bytes: bytes, voice: str):
    """Full pipeline: pick a page, extract its text, synthesize audio, open the result page."""
    pages = get_pdf_pages(pdf_bytes)
    page = select_page(pages)
    blocks = parse_blocks(extract_text(page))
    audio_files = [synthesize(b, voice, output_dir=pdf_name) for b in blocks]
    audio_files = [f for f in audio_files if f is not None]
    print(f"Generated {len(audio_files)} audio files")
    make_html(pdf_name, audio_files)
def how_to_use():
    pdf_name, pdf_input = get_pdf_bytes("path to pdf.pdf")
    read_aloud(pdf_name, pdf_input, voice="en-US-AvaMultilingualNeural")
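# A minimal entry-point sketch; "paper.pdf" is a placeholder path, not a bundled file.
if __name__ == "__main__":
    pdf_name, pdf_input = get_pdf_bytes("paper.pdf")  # or get_pdf_bytes_colab() on Colab
    read_aloud(pdf_name, pdf_input, voice="en-US-AvaMultilingualNeural")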