Created
May 31, 2024 05:41
-
-
Save mdavalos1993/85180de03d80b78bad9709577c066fd3 to your computer and use it in GitHub Desktop.
Summarize PDF file.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"provenance": [], | |
"authorship_tag": "ABX9TyOUEZESUVOhfvvFldE8Y8Iy", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"language_info": { | |
"name": "python" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/mdavalos1993/85180de03d80b78bad9709577c066fd3/summarize-pdf-file.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"Este proyecto de Google Colab está diseñado para facilitar la carga de un archivo PDF, generar un resumen del contenido y convertir dicho resumen a un archivo de audio.\n", | |
"\n", | |
"**IMPORTANTE:**\n", | |
"Debes agregar un secreto llamado 'OPEN_AI_KEY' en el panel 'Secrets' de Colab con la API Key de tu cuenta en OpenAI.\n", | |
"\n", | |
"\n", | |
"" | |
], | |
"metadata": { | |
"id": "ju9RnpsBJZWi" | |
} | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"1- install libraries" | |
], | |
"metadata": { | |
"id": "jyhZskwo_Zg-" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"collapsed": true, | |
"id": "71A1cw1h_QWZ", | |
"outputId": "e0bf8053-f295-4589-fc52-fd69f30bb79f" | |
}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Collecting openai\n", | |
" Downloading openai-1.30.5-py3-none-any.whl (320 kB)\n", | |
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m320.7/320.7 kB\u001b[0m \u001b[31m2.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
"\u001b[?25hRequirement already satisfied: anyio<5,>=3.5.0 in /usr/local/lib/python3.10/dist-packages (from openai) (3.7.1)\n", | |
"Requirement already satisfied: distro<2,>=1.7.0 in /usr/lib/python3/dist-packages (from openai) (1.7.0)\n", | |
"Collecting httpx<1,>=0.23.0 (from openai)\n", | |
" Downloading httpx-0.27.0-py3-none-any.whl (75 kB)\n", | |
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m75.6/75.6 kB\u001b[0m \u001b[31m5.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
"\u001b[?25hRequirement already satisfied: pydantic<3,>=1.9.0 in /usr/local/lib/python3.10/dist-packages (from openai) (2.7.1)\n", | |
"Requirement already satisfied: sniffio in /usr/local/lib/python3.10/dist-packages (from openai) (1.3.1)\n", | |
"Requirement already satisfied: tqdm>4 in /usr/local/lib/python3.10/dist-packages (from openai) (4.66.4)\n", | |
"Requirement already satisfied: typing-extensions<5,>=4.7 in /usr/local/lib/python3.10/dist-packages (from openai) (4.11.0)\n", | |
"Requirement already satisfied: idna>=2.8 in /usr/local/lib/python3.10/dist-packages (from anyio<5,>=3.5.0->openai) (3.7)\n", | |
"Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio<5,>=3.5.0->openai) (1.2.1)\n", | |
"Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from httpx<1,>=0.23.0->openai) (2024.2.2)\n", | |
"Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)\n", | |
" Downloading httpcore-1.0.5-py3-none-any.whl (77 kB)\n", | |
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m77.9/77.9 kB\u001b[0m \u001b[31m8.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
"\u001b[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)\n", | |
" Downloading h11-0.14.0-py3-none-any.whl (58 kB)\n", | |
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m5.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
"\u001b[?25hRequirement already satisfied: annotated-types>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from pydantic<3,>=1.9.0->openai) (0.7.0)\n", | |
"Requirement already satisfied: pydantic-core==2.18.2 in /usr/local/lib/python3.10/dist-packages (from pydantic<3,>=1.9.0->openai) (2.18.2)\n", | |
"Installing collected packages: h11, httpcore, httpx, openai\n", | |
"Successfully installed h11-0.14.0 httpcore-1.0.5 httpx-0.27.0 openai-1.30.5\n", | |
"Collecting PyMuPDF\n", | |
" Downloading PyMuPDF-1.24.5-cp310-none-manylinux2014_x86_64.whl (3.5 MB)\n", | |
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.5/3.5 MB\u001b[0m \u001b[31m9.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
"\u001b[?25hCollecting PyMuPDFb==1.24.3 (from PyMuPDF)\n", | |
" Downloading PyMuPDFb-1.24.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.8 MB)\n", | |
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m15.8/15.8 MB\u001b[0m \u001b[31m34.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
"\u001b[?25hInstalling collected packages: PyMuPDFb, PyMuPDF\n", | |
"Successfully installed PyMuPDF-1.24.5 PyMuPDFb-1.24.3\n" | |
] | |
} | |
], | |
"source": [ | |
"!pip install openai\n", | |
"!pip install PyMuPDF" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"2- upload pdf file" | |
], | |
"metadata": { | |
"id": "CATOpWa0_2ZN" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"from google.colab import files\n", | |
"\n", | |
"bookUploaded = files.upload()" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 74 | |
}, | |
"collapsed": true, | |
"id": "dRsk2m3e_hIk", | |
"outputId": "76d8ef68-a733-47de-894c-c5c68d9198ed" | |
}, | |
"execution_count": 2, | |
"outputs": [ | |
{ | |
"output_type": "display_data", | |
"data": { | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
], | |
"text/html": [ | |
"\n", | |
" <input type=\"file\" id=\"files-40fc46c8-d66e-4239-a8ab-a6f2d3e7e01f\" name=\"files[]\" multiple disabled\n", | |
" style=\"border:none\" />\n", | |
" <output id=\"result-40fc46c8-d66e-4239-a8ab-a6f2d3e7e01f\">\n", | |
" Upload widget is only available when the cell has been executed in the\n", | |
" current browser session. Please rerun this cell to enable.\n", | |
" </output>\n", | |
" <script>// Copyright 2017 Google LLC\n", | |
"//\n", | |
"// Licensed under the Apache License, Version 2.0 (the \"License\");\n", | |
"// you may not use this file except in compliance with the License.\n", | |
"// You may obtain a copy of the License at\n", | |
"//\n", | |
"// http://www.apache.org/licenses/LICENSE-2.0\n", | |
"//\n", | |
"// Unless required by applicable law or agreed to in writing, software\n", | |
"// distributed under the License is distributed on an \"AS IS\" BASIS,\n", | |
"// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", | |
"// See the License for the specific language governing permissions and\n", | |
"// limitations under the License.\n", | |
"\n", | |
"/**\n", | |
" * @fileoverview Helpers for google.colab Python module.\n", | |
" */\n", | |
"(function(scope) {\n", | |
"function span(text, styleAttributes = {}) {\n", | |
" const element = document.createElement('span');\n", | |
" element.textContent = text;\n", | |
" for (const key of Object.keys(styleAttributes)) {\n", | |
" element.style[key] = styleAttributes[key];\n", | |
" }\n", | |
" return element;\n", | |
"}\n", | |
"\n", | |
"// Max number of bytes which will be uploaded at a time.\n", | |
"const MAX_PAYLOAD_SIZE = 100 * 1024;\n", | |
"\n", | |
"function _uploadFiles(inputId, outputId) {\n", | |
" const steps = uploadFilesStep(inputId, outputId);\n", | |
" const outputElement = document.getElementById(outputId);\n", | |
" // Cache steps on the outputElement to make it available for the next call\n", | |
" // to uploadFilesContinue from Python.\n", | |
" outputElement.steps = steps;\n", | |
"\n", | |
" return _uploadFilesContinue(outputId);\n", | |
"}\n", | |
"\n", | |
"// This is roughly an async generator (not supported in the browser yet),\n", | |
"// where there are multiple asynchronous steps and the Python side is going\n", | |
"// to poll for completion of each step.\n", | |
"// This uses a Promise to block the python side on completion of each step,\n", | |
"// then passes the result of the previous step as the input to the next step.\n", | |
"function _uploadFilesContinue(outputId) {\n", | |
" const outputElement = document.getElementById(outputId);\n", | |
" const steps = outputElement.steps;\n", | |
"\n", | |
" const next = steps.next(outputElement.lastPromiseValue);\n", | |
" return Promise.resolve(next.value.promise).then((value) => {\n", | |
" // Cache the last promise value to make it available to the next\n", | |
" // step of the generator.\n", | |
" outputElement.lastPromiseValue = value;\n", | |
" return next.value.response;\n", | |
" });\n", | |
"}\n", | |
"\n", | |
"/**\n", | |
" * Generator function which is called between each async step of the upload\n", | |
" * process.\n", | |
" * @param {string} inputId Element ID of the input file picker element.\n", | |
" * @param {string} outputId Element ID of the output display.\n", | |
" * @return {!Iterable<!Object>} Iterable of next steps.\n", | |
" */\n", | |
"function* uploadFilesStep(inputId, outputId) {\n", | |
" const inputElement = document.getElementById(inputId);\n", | |
" inputElement.disabled = false;\n", | |
"\n", | |
" const outputElement = document.getElementById(outputId);\n", | |
" outputElement.innerHTML = '';\n", | |
"\n", | |
" const pickedPromise = new Promise((resolve) => {\n", | |
" inputElement.addEventListener('change', (e) => {\n", | |
" resolve(e.target.files);\n", | |
" });\n", | |
" });\n", | |
"\n", | |
" const cancel = document.createElement('button');\n", | |
" inputElement.parentElement.appendChild(cancel);\n", | |
" cancel.textContent = 'Cancel upload';\n", | |
" const cancelPromise = new Promise((resolve) => {\n", | |
" cancel.onclick = () => {\n", | |
" resolve(null);\n", | |
" };\n", | |
" });\n", | |
"\n", | |
" // Wait for the user to pick the files.\n", | |
" const files = yield {\n", | |
" promise: Promise.race([pickedPromise, cancelPromise]),\n", | |
" response: {\n", | |
" action: 'starting',\n", | |
" }\n", | |
" };\n", | |
"\n", | |
" cancel.remove();\n", | |
"\n", | |
" // Disable the input element since further picks are not allowed.\n", | |
" inputElement.disabled = true;\n", | |
"\n", | |
" if (!files) {\n", | |
" return {\n", | |
" response: {\n", | |
" action: 'complete',\n", | |
" }\n", | |
" };\n", | |
" }\n", | |
"\n", | |
" for (const file of files) {\n", | |
" const li = document.createElement('li');\n", | |
" li.append(span(file.name, {fontWeight: 'bold'}));\n", | |
" li.append(span(\n", | |
" `(${file.type || 'n/a'}) - ${file.size} bytes, ` +\n", | |
" `last modified: ${\n", | |
" file.lastModifiedDate ? file.lastModifiedDate.toLocaleDateString() :\n", | |
" 'n/a'} - `));\n", | |
" const percent = span('0% done');\n", | |
" li.appendChild(percent);\n", | |
"\n", | |
" outputElement.appendChild(li);\n", | |
"\n", | |
" const fileDataPromise = new Promise((resolve) => {\n", | |
" const reader = new FileReader();\n", | |
" reader.onload = (e) => {\n", | |
" resolve(e.target.result);\n", | |
" };\n", | |
" reader.readAsArrayBuffer(file);\n", | |
" });\n", | |
" // Wait for the data to be ready.\n", | |
" let fileData = yield {\n", | |
" promise: fileDataPromise,\n", | |
" response: {\n", | |
" action: 'continue',\n", | |
" }\n", | |
" };\n", | |
"\n", | |
" // Use a chunked sending to avoid message size limits. See b/62115660.\n", | |
" let position = 0;\n", | |
" do {\n", | |
" const length = Math.min(fileData.byteLength - position, MAX_PAYLOAD_SIZE);\n", | |
" const chunk = new Uint8Array(fileData, position, length);\n", | |
" position += length;\n", | |
"\n", | |
" const base64 = btoa(String.fromCharCode.apply(null, chunk));\n", | |
" yield {\n", | |
" response: {\n", | |
" action: 'append',\n", | |
" file: file.name,\n", | |
" data: base64,\n", | |
" },\n", | |
" };\n", | |
"\n", | |
" let percentDone = fileData.byteLength === 0 ?\n", | |
" 100 :\n", | |
" Math.round((position / fileData.byteLength) * 100);\n", | |
" percent.textContent = `${percentDone}% done`;\n", | |
"\n", | |
" } while (position < fileData.byteLength);\n", | |
" }\n", | |
"\n", | |
" // All done.\n", | |
" yield {\n", | |
" response: {\n", | |
" action: 'complete',\n", | |
" }\n", | |
" };\n", | |
"}\n", | |
"\n", | |
"scope.google = scope.google || {};\n", | |
"scope.google.colab = scope.google.colab || {};\n", | |
"scope.google.colab._files = {\n", | |
" _uploadFiles,\n", | |
" _uploadFilesContinue,\n", | |
"};\n", | |
"})(self);\n", | |
"</script> " | |
] | |
}, | |
"metadata": {} | |
}, | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Saving git.pdf to git.pdf\n" | |
] | |
} | |
] | |
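}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"3- extract text from pdf" | |
], | |
"metadata": {} | |
}, | |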
{ | |
"cell_type": "code", | |
"source": [ | |
"import fitz\n", | |
"\n", | |
"filePath = list(bookUploaded.keys())[0]\n", | |
"document = fitz.open(filePath)\n", | |
"\n", | |
"pagesContent = []\n", | |
"for page_num in range(document.page_count):\n", | |
" page = document.load_page(page_num)\n", | |
" text = page.get_text()\n", | |
" concatenatedText = f\"page {page_num + 1}\\n{text}\\n\"\n", | |
" pagesContent.append(concatenatedText)\n", | |
"document.close()\n", | |
"\n", | |
"print(f\"page count: {len(pagesContent)}\")" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"collapsed": true, | |
"id": "QvVadsHWBW-o", | |
"outputId": "33cb8df3-3f7f-470e-b4cc-f86fbdb550a2" | |
}, | |
"execution_count": 5, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"page count: 1\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"4- generate summary" | |
], | |
"metadata": { | |
"id": "T-Z2Ju_ZD6rd" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"from openai import OpenAI\n", | |
"from pathlib import Path\n", | |
"from google.colab import userdata\n", | |
"\n", | |
"#add your OPEN_AI_KEY on 'Secrets'\n", | |
"client = OpenAI(api_key=userdata.get(\"OPEN_AI_KEY\"))\n", | |
"\n", | |
"concatenatedPagesContent = \"\\n\".join(pagesContent)\n", | |
"\n", | |
"textToSummary = f\"summarize the following article {concatenatedPagesContent}\"\n", | |
"response = client.chat.completions.create(\n", | |
" model=\"gpt-3.5-turbo\",\n", | |
" messages=[\n", | |
" {\"role\": \"user\", \"content\": textToSummary},\n", | |
" ]\n", | |
")\n", | |
"summary = response.choices[0].message.content\n", | |
"print(summary)" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"collapsed": true, | |
"id": "g9erj8ImD5fV", | |
"outputId": "deb4e68b-ec9b-4f1f-f7d1-f0847ba41ff8" | |
}, | |
"execution_count": 12, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"The second edition of Pro Git addresses the changes and updates in the Git community since the first edition was published over four years ago. Git has become more widely adopted, with improvements in Windows support, graphical user interfaces, IDE support, and business use. The Open Source community has also seen exponential growth, with platforms like GitHub hosting millions of projects and developers. The rise of the HTTP protocol for Git network transactions has simplified the process. The updated edition includes a deeper look at GitHub and its use in the Git community. The author, Scott Chacon, hopes readers will find this edition helpful in navigating the evolving world of Git.\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"5- generate audio" | |
], | |
"metadata": { | |
"id": "rSJODPLMIab1" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"speech_file_path = Path(f\"/content/summary.mp3\")\n", | |
"\n", | |
"audioResponse = client.audio.speech.create(\n", | |
" model=\"tts-1\",\n", | |
" voice=\"fable\",\n", | |
" input= summary\n", | |
" )\n", | |
"audioResponse.stream_to_file(speech_file_path)" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"collapsed": true, | |
"id": "BLOl_bkfIZgK", | |
"outputId": "0db94c9f-26b8-4c26-e2ca-a192a3440ee0" | |
}, | |
"execution_count": 13, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stderr", | |
"text": [ | |
"<ipython-input-13-0b308da95c47>:8: DeprecationWarning: Due to a bug, this method doesn't actually stream the response content, `.with_streaming_response.method()` should be used instead\n", | |
" audioResponse.stream_to_file(speech_file_path)\n" | |
] | |
} | |
] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment