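"""
Read a PDF page aloud.

Pipeline: convert the PDF to page images, let the user pick a page, extract its
text as structured blocks with GPT-4o, synthesize each block to a WAV file with
Azure Speech, and open an HTML page that embeds the audio players.

Requires the OPENAI_API_KEY and AZURE_KEY environment variables, and the
poppler binaries (needed by pdf2image) on the PATH.
"""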
import base64
import json
import os
import webbrowser
from io import BytesIO

from PIL import Image
from azure.cognitiveservices.speech import SpeechConfig, SpeechSynthesisOutputFormat, SpeechSynthesizer
from azure.cognitiveservices.speech.audio import AudioOutputConfig
from openai import OpenAI
from pdf2image import convert_from_bytes


def get_pdf_bytes_colab() -> tuple[str, bytes]:
    """Upload a PDF from the browser when running inside Google Colab."""
    from google.colab import files

    upload = files.upload()
    return list(upload.keys())[0], list(upload.values())[0]


def get_pdf_bytes(file_path: str) -> tuple[str, bytes]:
    """Read a PDF from the local filesystem."""
    with open(file_path, 'rb') as file:
        return os.path.basename(file_path), file.read()


def get_pdf_pages(pdf_bytes: bytes) -> list:
    """
    Extracts pages from a PDF file as PIL images.
    Displays the grid of images.

    :param pdf_bytes: raw bytes of the PDF file
    :return: list of PIL images, one per page
    """
    pages = convert_from_bytes(pdf_bytes, dpi=200)
    image_width, image_height = pages[0].size
    num_columns = 4
    padding = image_width // 10
    num_rows = (len(pages) + num_columns - 1) // num_columns
    grid_width = num_columns * image_width + (num_columns + 1) * padding
    grid_height = num_rows * image_height + (num_rows + 1) * padding
    # Create a new blank image with a white background
    grid_image = Image.new('RGB', (grid_width, grid_height), (255, 255, 255))
    # Paste each page into the grid, row by row
    for index, image in enumerate(pages):
        row = index // num_columns
        col = index % num_columns
        x = col * (image_width + padding)
        y = row * (image_height + padding)
        grid_image.paste(image, (x + padding, y + padding))
    grid_image.show()
    return pages


def select_page(pages: list) -> Image.Image:
    """Ask the user which page to read (1-indexed)."""
    while True:
        page_number = input(f"Which page do you want to read? [1-{len(pages)}] ")
        try:
            page_number = int(page_number) - 1
            assert 0 <= page_number < len(pages)
            return pages[page_number]
        except (ValueError, AssertionError):
            print("An error occurred")


def extract_text(page) -> list:
    """Extract the text of a page image as structured blocks, using GPT-4o."""
    print("Converting the page to text using ChatGPT...")
    client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
    sys = """
    This image is a PDF page. It probably contains text. Your task is to extract the text verbatim from the page.
    Follow these instructions:

    - Format the extracted text as Markdown, if possible. Pay attention to bold words if possible.
    - Don't change any word, unless you cannot in any other way understand the text.
    - Only return the JSON response, nothing else.
    - If the page contains images or tables, try to extract their caption only: don't extract table data or image content.
    - If the page contains 2 columns of text at any point, try to extract text from the left column, then from the right column. Be sure you parse both columns, don't stop at the left column.
    - Ignore superscripts and footnotes.
    - Focus on the center of the page, not the edges.
    - Return the text as JSON with the following structure: [<block>], where <block> is a JSON object representing a semantic block from the pdf. Each block may have a `title` extracted from the image, and a `column` property if you can identify that the layout was two-column. It is mandatory, though, that it has a `text` property. A good split point for blocks is lines or sentences that start with a number + dot, like "1. Introduction" or "1.1 Related work".

    An example response could be:

    [{"title": "Abstract", "column": "left", "text": "This is the **abstract** of the paper"}, {"title": "Introduction", "column": "right", "text": "This is the *introduction* of the paper"}]

    If you don't return valid JSON, I will be fired. Think carefully.
    """
    buffered = BytesIO()
    page.save(buffered, format="JPEG")
    b64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "user", "content": [
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}},
                {"type": "text", "text": sys}
            ]},
            # a partial assistant message nudges the model towards replying with raw JSON
            {"role": "assistant", "content": '[{"text":"}]'}
        ]
    )
    return json.loads(response.choices[0].message.content)


def parse_blocks(blocks: list) -> list:
    """Drop short figure/table captions and merge untitled blocks into the previous one."""
    def is_valid(b: dict) -> bool:
        title = b.get("title", "").lower()
        if ("figure" in title or "table" in title) and len(title) < 10:
            return False
        return True

    blocks = [b for b in blocks if is_valid(b)]
    merged = blocks[:1]
    for b in blocks[1:]:
        if "title" not in b:
            # an untitled block is a continuation of the previous one
            merged[-1]["text"] += " " + b["text"]
        else:
            merged.append(b)
    return merged
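
# Example (hypothetical input, for illustration): the "Figure 1" caption is
# dropped, and the untitled block is merged into the one before it:
#   parse_blocks([{"title": "1. Intro", "text": "a"}, {"text": "b"}, {"title": "Figure 1", "text": "c"}])
#   -> [{"title": "1. Intro", "text": "a b"}]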


def synthesize(block: dict, voice: str, output_dir: str) -> tuple[str, str, str] | None:
    """Synthesize a block to a WAV file with Azure Speech. Returns (title, text, path), or None on error."""
    title = block.get("title", "")
    text = block["text"]
    output_dir = output_dir.replace("/", " ").replace(".pdf", "")
    output = f"{output_dir}/{voice}/{title}.wav"
    os.makedirs(os.path.dirname(output), exist_ok=True)

    if not os.path.exists(output):
        print("Synthesizing paragraph...", title if title else "untitled")
        ssml_title = f"""<emphasis>{title}</emphasis><break time="2s" />""" if title else ""
        # Azure voice names start with the locale, e.g. "en-US-..."
        lang = voice[:5]
        ssml = f'''
        <speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="{lang}">
            <voice name="{voice}">{ssml_title} {text}</voice>
        </speak>
        '''
        speech_config = SpeechConfig(subscription=os.environ["AZURE_KEY"], region="westeurope")
        speech_config.set_speech_synthesis_output_format(SpeechSynthesisOutputFormat.Riff24Khz16BitMonoPcm)
        audio_config = AudioOutputConfig(filename=output)
        speech_config.speech_synthesis_voice_name = voice
        synthesizer = SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)
        result = synthesizer.speak_ssml(ssml)

        if result.cancellation_details is not None:
            print("An error occurred:", result.cancellation_details.error_details)
            return None
    else:
        print("An audio file for this section already exists. Skipping")

    return title, text, output


def make_html(main_title: str, audio_files: list[tuple[str, str, str]]):
    """Build a standalone HTML page that embeds each audio file as a base64 data URI."""
    def b64(file_path: str) -> str:
        with open(file_path, "rb") as f:
            return base64.b64encode(f.read()).decode("utf-8")

    encoded_audio_files = [b64(audio[2]) for audio in audio_files]
    audio_elements = [f'<audio controls><source src="data:audio/wav;base64,{encoded_audio}" type="audio/wav"></audio>'
                      for encoded_audio in encoded_audio_files]
    blocks = [
        f"""<div class="border border-gray-200 rounded-lg"><h1 class="font-bold">{title}</h1><p class="text-sm">{text}</p>{e}</div>"""
        for (title, text, _), e in zip(audio_files, audio_elements)]
    html = f"""
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>{main_title}</title>
        <script src="https://cdn.tailwindcss.com"></script>
    </head>
    <body>
        <h1 class="text-2xl font-bold">{main_title}</h1>
        <div class="container p-4">
            <div class="flex flex-col gap-4">
                {''.join(blocks)}
            </div>
        </div>
    </body>
    </html>
    """

    with open(f"{main_title}.html", "w") as f:
        f.write(html)

    webbrowser.open("file://" + os.path.abspath(main_title + ".html"))


def read_aloud(pdf_name: str, pdf_bytes: bytes, voice: str):
    """Full pipeline: pick a page, extract its text, synthesize the audio, open the HTML player."""
    pages = get_pdf_pages(pdf_bytes)
    page = select_page(pages)
    blocks = parse_blocks(extract_text(page))
    audio_files = [synthesize(b, voice, output_dir=pdf_name) for b in blocks]
    audio_files = [f for f in audio_files if f is not None]
    print(f"Generated {len(audio_files)} audio files")
    make_html(pdf_name, audio_files)


def how_to_use():
    pdf_name, pdf_input = get_pdf_bytes("path to pdf.pdf")
    read_aloud(pdf_name, pdf_input, voice="en-US-AvaMultilingualNeural")
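

# A minimal entry-point sketch, mirroring how_to_use() above. The PDF path is a
# placeholder; use get_pdf_bytes_colab() instead when running inside Google Colab.
if __name__ == "__main__":
    name, data = get_pdf_bytes("path to pdf.pdf")
    read_aloud(name, data, voice="en-US-AvaMultilingualNeural")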