Created
July 13, 2024 07:08
-
-
Save daveebbelaar/05fe2c89adb6824807cc57aa64dc2d4b to your computer and use it in GitHub Desktop.
A service class for interacting with Azure Document Intelligence API.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import logging | |
import requests | |
import time | |
from typing import Union, Dict | |
from config.settings import get_settings | |
class DocumentIntelligenceService:
    """
    A service class for interacting with Azure Document Intelligence API.

    This class provides methods to analyze documents using Azure's Document
    Intelligence service: a document is submitted for analysis and the result
    endpoint is then polled until the operation reports a final status.
    """

    # Network timeout (seconds) applied to every HTTP call. Without it,
    # `requests` waits indefinitely on an unresponsive server.
    REQUEST_TIMEOUT = 60
    # Pause (seconds) before each poll of the result endpoint.
    POLL_INTERVAL = 2

    def __init__(self):
        """
        Initialize the DocumentIntelligenceService with API credentials and endpoint.
        """
        settings = get_settings()
        self.key = settings.document_intelligence.api_key
        self.endpoint = settings.document_intelligence.endpoint
        # Currently only available in East US, West US2, and West Europe
        self.api_version = "2024-02-29-preview"

    def analyze(
        self,
        source: Union[str, bytes],
        is_url: bool = True,
        model_id: str = "prebuilt-layout",
        max_wait: Union[float, None] = None,
    ) -> Dict:
        """
        Analyze a document using Azure Document Intelligence.

        Args:
            source (Union[str, bytes]): The document source, either a URL or base64 encoded content.
            is_url (bool): True if the source is a URL, False if it's base64 encoded content.
            model_id (str): The ID of the model to use for analysis.
            max_wait (Union[float, None]): Maximum number of seconds to keep polling
                for the result. None (the default) polls without an upper bound.

        Returns:
            Dict: The analysis results. The response is returned both when the
                reported status is "succeeded" and when it is "failed".

        Raises:
            requests.HTTPError: If the API request fails.
            requests.RequestException: If a request cannot be completed (e.g. it times out).
            ValueError: If the Operation-Location header is missing in the response.
            TimeoutError: If max_wait is set and elapses before the analysis finishes.
        """
        result_id = self._submit_analysis(source, is_url, model_id)
        return self._get_analysis_results(result_id, model_id, max_wait)

    def _build_url(self, path: str) -> str:
        """
        Build a documentModels URL for the configured endpoint.

        Trailing slashes on the endpoint are dropped so the result never
        contains a double slash before "documentintelligence".

        Args:
            path (str): The part of the URL that follows "documentModels/".

        Returns:
            str: The full request URL including the query string.
        """
        base = self.endpoint.rstrip("/")
        return (
            f"{base}/documentintelligence/documentModels/{path}"
            f"?api-version={self.api_version}&outputContentFormat=markdown"
        )

    @staticmethod
    def _build_payload(source: Union[str, bytes], is_url: bool) -> Dict:
        """
        Build the JSON request body for an analyze call.

        Args:
            source (Union[str, bytes]): The document source, either a URL or base64 encoded content.
            is_url (bool): True if the source is a URL, False if it's base64 encoded content.

        Returns:
            Dict: {"urlSource": ...} or {"base64Source": ...}.

        Raises:
            UnicodeDecodeError: If source is bytes that cannot be decoded as text.
        """
        # bytes are not JSON serializable, so convert them to text first.
        if isinstance(source, (bytes, bytearray)):
            source = source.decode("utf-8")
        return {"urlSource": source} if is_url else {"base64Source": source}

    @staticmethod
    def _extract_result_id(operation_location: str) -> str:
        """
        Extract the result ID from an Operation-Location header value.

        The ID is the last path segment, without the query string.
        """
        return operation_location.split("/")[-1].split("?")[0]

    def _submit_analysis(
        self, source: Union[str, bytes], is_url: bool, model_id: str
    ) -> str:
        """
        Submit a document for analysis to Azure Document Intelligence.

        Args:
            source (Union[str, bytes]): The document source, either a URL or base64 encoded content.
            is_url (bool): True if the source is a URL, False if it's base64 encoded content.
            model_id (str): The ID of the model to use for analysis.

        Returns:
            str: The result ID for the submitted analysis.

        Raises:
            ValueError: If the Operation-Location header is missing in the response.
            requests.HTTPError: If the API request fails.
        """
        url = self._build_url(f"{model_id}:analyze")
        headers = {
            "Content-Type": "application/json",
            "Ocp-Apim-Subscription-Key": self.key,
        }
        data = self._build_payload(source, is_url)

        logging.info("Submitting document for analysis")
        response = requests.post(
            url, headers=headers, json=data, timeout=self.REQUEST_TIMEOUT
        )
        response.raise_for_status()

        operation_location = response.headers.get("Operation-Location")
        if not operation_location:
            raise ValueError("Operation-Location header is missing in the response.")
        return self._extract_result_id(operation_location)

    def _get_analysis_results(
        self,
        result_id: str,
        model_id: str,
        max_wait: Union[float, None] = None,
    ) -> Dict:
        """
        Retrieve the analysis results from Azure Document Intelligence.

        The result endpoint is polled every POLL_INTERVAL seconds until the
        reported status is "succeeded" or "failed".

        Args:
            result_id (str): The ID of the analysis result to retrieve.
            model_id (str): The ID of the model used for analysis.
            max_wait (Union[float, None]): Maximum number of seconds to keep
                polling. None (the default) polls without an upper bound.

        Returns:
            Dict: The analysis results.

        Raises:
            requests.HTTPError: If the API request fails.
            TimeoutError: If max_wait is set and elapses before a final status is reported.
        """
        url = self._build_url(f"{model_id}/analyzeResults/{result_id}")
        headers = {"Ocp-Apim-Subscription-Key": self.key}
        # A monotonic clock keeps the deadline immune to wall-clock adjustments.
        deadline = None if max_wait is None else time.monotonic() + max_wait

        while True:
            logging.info("Waiting for analysis to complete.")
            time.sleep(self.POLL_INTERVAL)
            response = requests.get(
                url, headers=headers, timeout=self.REQUEST_TIMEOUT
            )
            response.raise_for_status()
            data = response.json()
            status = data.get("status")
            if status in ("succeeded", "failed"):
                if status == "failed":
                    # The payload is still returned so callers can inspect it.
                    logging.warning("Document analysis finished with status 'failed'.")
                return data
            if deadline is not None and time.monotonic() >= deadline:
                raise TimeoutError(
                    f"Analysis {result_id} did not complete within {max_wait} seconds."
                )
if __name__ == "__main__":
    # Demo run: analyze a publicly hosted PDF and print what came back.
    service = DocumentIntelligenceService()
    results = service.analyze(
        source="https://s2.q4cdn.com/299287126/files/doc_financials/2024/ar/Amazon-com-Inc-2023-Annual-Report.pdf"
    )

    # Top-level keys of the response.
    print(results.keys())

    # Everything else printed here lives under the "analyzeResult" entry.
    analyze_result = results["analyzeResult"]
    print(analyze_result.keys())
    print(analyze_result["content"])
    print(analyze_result["tables"])
You can use the settings file below (config/settings.py), or simply replace it by loading your environment variables directly.
from pydantic_settings import BaseSettings
from functools import lru_cache
import logging
import sys
from dotenv import load_dotenv
import os
load_dotenv()
@lru_cache
def get_settings():
    """
    Return the application settings.

    The call is memoized with ``lru_cache``, so ``Settings`` is built on the
    first call and the same object is handed back on later calls.
    """
    settings = Settings()
    return settings
class DocumentIntelligenceSettings(BaseSettings):
    """
    API key and endpoint used to reach the Document Intelligence service.
    """

    # Both defaults are looked up in the process environment when this class
    # body is executed; an unset variable yields None.
    # NOTE(review): BaseSettings may additionally populate these fields from
    # environment variables matching the field names - confirm against the
    # pydantic-settings documentation.
    api_key: str = os.environ.get("DOCUMENT_INTELLIGENCE_API_KEY")
    endpoint: str = os.environ.get("DOCUMENT_INTELLIGENCE_ENDPOINT")
class Settings(BaseSettings):
    """
    Top-level settings object, grouping configuration by service.
    """

    # The default instance is created once, when this class body is executed.
    document_intelligence: DocumentIntelligenceSettings = DocumentIntelligenceSettings()
Sorry to bother you, but I was watching the video where you explain this. Is there any possibility you could share the pdf_ingester.py file that you mentioned in the video as a possible use case?
Sorry to bother you, but I was watching the video where you explain this. Is there any possibility you could share the pdf_ingester.py file that you mentioned in the video as a possible use case?
I also need the same file, pdf_ingester.py; I want to try it for my own use case. Could you please provide it here?
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Where can we get the config.settings module?