import base64
import json
import os
import webbrowser
from io import BytesIO
from PIL import Image
from azure.cognitiveservices.speech import SpeechConfig, SpeechSynthesisOutputFormat, SpeechSynthesizer
from azure.cognitiveservices.speech.audio import AudioOutputConfig
from openai import OpenAI
from pdf2image import convert_from_bytes
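# Suggested setup, assuming default PyPI package names:
#   pip install openai pdf2image pillow azure-cognitiveservices-speech
# pdf2image additionally requires the poppler utilities to be installed on the system.
# The script reads two environment variables: OPENAI_API_KEY and AZURE_KEY.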
def get_pdf_bytes_colab() -> tuple[str, bytes]:
    """Upload a PDF interactively when running inside Google Colab."""
    from google.colab import files

    upload = files.upload()
    return list(upload.keys())[0], list(upload.values())[0]
def get_pdf_bytes(file_path) -> tuple[str, bytes]:
    """Read a PDF from disk and return its base name and raw bytes."""
    with open(file_path, 'rb') as file:
        return os.path.basename(file_path), file.read()
def get_pdf_pages(pdf_bytes: bytes):
    """
    Extracts pages from a PDF file as PIL images.
    Displays the grid of images.
    :param pdf_bytes: raw bytes of the PDF file
    :return: list of PIL images, one per page
    """
    pages = convert_from_bytes(pdf_bytes, dpi=200)
    image_width, image_height = pages[0].size
    num_columns = 4
    padding = image_width // 10
    num_rows = (len(pages) + num_columns - 1) // num_columns
    grid_width = num_columns * image_width + (num_columns + 1) * padding
    grid_height = num_rows * image_height + (num_rows + 1) * padding
    # Create a new blank image with a white background
    grid_image = Image.new('RGB', (grid_width, grid_height), (255, 255, 255))
    # Paste images into the grid
    for index, image in enumerate(pages):
        row = index // num_columns
        col = index % num_columns
        x = col * (image_width + padding)
        y = row * (image_height + padding)
        grid_image.paste(image, (x + padding, y + padding))
    grid_image.show()
    return pages
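# For example, a 10-page PDF with 4 columns yields num_rows = (10 + 4 - 1) // 4 = 3,
# i.e. a 4x3 contact sheet with one padding-sized gap around every page thumbnail.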
def select_page(pages: list) -> Image.Image:
    """Ask the user which page to read until a valid number is entered."""
    while True:
        page_number = input(f"Which page do you want to read? [1-{len(pages)}] ")
        try:
            page_number = int(page_number) - 1
            assert 0 <= page_number < len(pages)
            return pages[page_number]
        except (ValueError, AssertionError):
            print("An error occurred")
def extract_text(page):
    """Send the page image to GPT-4o and parse the extracted text blocks from its JSON reply."""
    print("Converting the page to text using ChatGPT...")
    client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
    sys = """
    This image is a PDF page. It probably contains text. Your task is to extract the text verbatim from the page.
    Follow these instructions:
    - Format the extracted text as Markdown, if possible. Pay attention to bold words if possible.
    - Don't change any word, unless you cannot in any other way understand the text.
    - Only return the JSON response, nothing else.
    - If the page contains images or tables, try to extract their caption only: don't extract table data or image content.
    - If the page contains 2 columns of text at any point, try to extract text from the left column, then from the right column. Be sure you parse both columns, don't stop at the left column.
    - Ignore superscripts and footnotes.
    - Focus on the center of the page, not the edges.
    - Return the text as JSON with the following structure: [<block>], where <block> is a JSON object representing semantic blocks from the pdf. Each block may have a `title` extracted from the image, and a `column` property if you can identify that the layout was two-column. It is mandatory, though, that it has a `text` property. A good split point for blocks is lines or sentences that start with a number + dot, like "1. Introduction" or "1.1 Related work".
    An example response could be:
    [{"title": "Abstract", "column": "left", "text": "This is the **abstract** of the paper"}, {"title": "Introduction", "column": "right", "text": "This is the *introduction* of the paper"}]
    If you don't return valid JSON, I will be fired. Think carefully.
    """
    # Encode the page image as a base64 JPEG for the vision API
    buffered = BytesIO()
    page.save(buffered, format="JPEG")
    b64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "user", "content": [
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}},
                {"type": "text", "text": sys}
            ]},
            # Trailing assistant turn that nudges the model toward a raw JSON reply
            {"role": "assistant", "content": '[{"text":"}]'}
        ]
    )
    return json.loads(response.choices[0].message.content)
def parse_blocks(blocks: list) -> list:
    """Drop figure/table captions and merge untitled blocks into the preceding block."""
    def is_valid(b: dict) -> bool:
        title = b.get("title", "").lower()
        # Short titles like "Figure 1" or "Table 2" are captions, not sections
        if ("figure" in title or "table" in title) and len(title) < 10:
            return False
        return True

    blocks = [b for b in blocks if is_valid(b)]
    merged = blocks[:1]
    for b in blocks[1:]:
        if "title" not in b:
            merged[-1]["text"] += " " + b["text"]
        else:
            merged.append(b)
    return merged
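# A worked example with hypothetical blocks: the short "Figure 2" caption is dropped,
# and the untitled block is appended to the section that precedes it.
#   parse_blocks([
#       {"title": "Figure 2", "text": "A diagram"},
#       {"title": "1. Introduction", "text": "First part."},
#       {"text": "Second part."},
#   ])
#   # -> [{"title": "1. Introduction", "text": "First part. Second part."}]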
def synthesize(block: dict, voice: str, output_dir: str) -> tuple[str, str, str] | None:
    """Synthesize one text block to a WAV file with Azure Speech; returns (title, text, path)."""
    title = block.get("title", "")
    text = block["text"]
    # Derive a filesystem-safe output path from the PDF name, voice and block title
    output_dir = output_dir.replace("/", " ").replace(".pdf", "")
    output = f"{output_dir}/{voice}/{title.replace('/', ' ')}.wav"
    os.makedirs(os.path.dirname(output), exist_ok=True)
    if not os.path.exists(output):
        print("Synthesizing the paragraph...", title if title else "untitled")
        # Read the title with emphasis, then pause before the body text
        ssml_title = f"""<emphasis>{title}</emphasis><break time="2s" />""" if title else ""
        lang = voice[:5]
        ssml = f'''
        <speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="{lang}">
            <voice name="{voice}">{ssml_title} {text}</voice>
        </speak>
        '''
        speech_config = SpeechConfig(subscription=os.environ["AZURE_KEY"], region="westeurope")
        speech_config.set_speech_synthesis_output_format(SpeechSynthesisOutputFormat.Riff24Khz16BitMonoPcm)
        audio_config = AudioOutputConfig(filename=output)
        speech_config.speech_synthesis_voice_name = voice
        synthesizer = SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)
        result = synthesizer.speak_ssml(ssml)
        if result.cancellation_details is not None:
            print("An error occurred:", result.cancellation_details.error_details)
            return None
    else:
        print("An audio file for this section already exists. Skipping")
    return title, text, output
def make_html(main_title: str, audio_files: list[tuple[str, str, str]]):
    """Build a single self-contained HTML page with one audio player per block."""
    def b64(file_path: str) -> str:
        with open(file_path, "rb") as f:
            return base64.b64encode(f.read()).decode("utf-8")

    # Embed each WAV file directly into the page as a base64 data URI
    encoded_audio_files = [b64(audio[2]) for audio in audio_files]
    audio_elements = [f'<audio controls><source src="data:audio/wav;base64,{encoded_audio}" type="audio/wav"></audio>'
                      for encoded_audio in encoded_audio_files]
    blocks = [
        f"""<div class="border border-gray-200 rounded-lg"><h1 class="font-bold">{title}</h1><p class="text-sm">{text}</p>{e}</div>"""
        for (title, text, _), e in zip(audio_files, audio_elements)]
    blocks_html = "\n".join(blocks)
    html = f"""
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>{main_title}</title>
        <script src="https://cdn.tailwindcss.com"></script>
    </head>
    <body>
        <h1 class="text-2xl font-bold">{main_title}</h1>
        <div class="container p-4">
            <div class="flex flex-col gap-4">
                {blocks_html}
            </div>
        </div>
    </body>
    </html>
    """
    with open(f"{main_title}.html", "w") as f:
        f.write(html)
    webbrowser.open("file://" + os.path.abspath(f"{main_title}.html"))
def read_aloud(pdf_name: str, pdf_bytes: bytes, voice: str):
    """Full pipeline: pick a page, extract its text, synthesize audio, open the result page."""
    pages = get_pdf_pages(pdf_bytes)
    page = select_page(pages)
    blocks = parse_blocks(extract_text(page))
    audio_files = [synthesize(b, voice, output_dir=pdf_name) for b in blocks]
    audio_files = [f for f in audio_files if f is not None]
    print(f"Generated {len(audio_files)} audio files")
    make_html(pdf_name, audio_files)
def how_to_use():
    pdf_name, pdf_input = get_pdf_bytes("path to pdf.pdf")
    read_aloud(pdf_name, pdf_input, voice="en-US-AvaMultilingualNeural")
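# A minimal entry-point sketch; "paper.pdf" is a placeholder path, not a bundled file.
if __name__ == "__main__":
    pdf_name, pdf_input = get_pdf_bytes("paper.pdf")  # or get_pdf_bytes_colab() on Colab
    read_aloud(pdf_name, pdf_input, voice="en-US-AvaMultilingualNeural")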