Skip to content

Instantly share code, notes, and snippets.

@cmoscardi
Created January 9, 2025 15:56
Show Gist options
  • Save cmoscardi/6f43b89778703deb9fcec92720740af4 to your computer and use it in GitHub Desktop.
Save cmoscardi/6f43b89778703deb9fcec92720740af4 to your computer and use it in GitHub Desktop.
MS API batch process script
import glob
import json
import os
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential
# Azure Form Recognizer credentials.
# Each value is taken from the environment when the variable is set, so real
# secrets do not have to be written into this file; otherwise the placeholder
# is used and must be edited by hand before running.
AZURE_ENDPOINT = os.environ.get(
    "AZURE_FORM_RECOGNIZER_ENDPOINT",
    "https://<your-endpoint-name>.cognitiveservices.azure.com/",
)
AZURE_API_KEY = os.environ.get("AZURE_FORM_RECOGNIZER_KEY", "<your-api-key>")

# Single module-level client, built once at import time from the values above.
document_analysis_client = DocumentAnalysisClient(
    endpoint=AZURE_ENDPOINT,
    credential=AzureKeyCredential(AZURE_API_KEY),
)
def analyze_document(file_path):
    """
    Run the "prebuilt-read" model over pages 1-2 of a single file.

    The file is opened in binary mode and submitted through the module-level
    ``document_analysis_client``. Once the analysis finishes, the number and
    dimensions of each returned page are printed.

    Returns the analysis result on success. Any exception raised along the
    way is caught and reported on stdout, and ``None`` is returned instead.
    """
    try:
        with open(file_path, "rb") as pdf_stream:
            # begin_analyze_document returns a poller; .result() blocks until
            # the service has finished.
            analysis = document_analysis_client.begin_analyze_document(
                "prebuilt-read",
                pdf_stream,
                pages="1-2",
            ).result()

        print(f"Analysis results for {file_path}:\n")
        for pg in analysis.pages:
            print(f"Page number: {pg.page_number}")
            print(f"Page width: {pg.width}, height: {pg.height}, unit: {pg.unit}")
    except Exception as err:
        print(f"An error occurred while processing {file_path}: {err}")
        return None
    return analysis
def process_batch(directory_path):
    """
    Process a batch of documents in a directory.

    Every ``*.pdf`` file directly inside ``directory_path`` is passed to
    ``analyze_document``. For each file that is analyzed successfully, two
    files are written next to it:

    * ``<file>.processed.json`` -- the result's ``to_dict()`` output as JSON
    * ``<file>.content.txt``    -- the result's ``content`` text (UTF-8)

    Files for which ``analyze_document`` returns ``None`` (it reports its own
    error) are skipped so that one bad document does not abort the batch.

    Args:
        directory_path: Path of the directory to scan (not recursive).

    Returns:
        None. If the path does not exist, a message is printed and nothing
        else happens.
    """
    if not os.path.exists(directory_path):
        print("The specified directory does not exist.")
        return

    # Escape the directory part so glob metacharacters in its name ("[", "*",
    # "?") are matched literally; sort so the processing order is stable.
    pattern = os.path.join(glob.escape(directory_path), "*.pdf")
    for file_path in sorted(glob.glob(pattern)):
        fname = os.path.basename(file_path)
        print("-" * 40)
        print(f"Processing document: {file_path}")

        result = analyze_document(file_path)
        if result is None:
            # Analysis failed and was already reported; there is nothing to
            # serialize, so move on to the next document.
            print(f"Skipping {file_path}: no analysis result.")
            print("-" * 40)
            continue

        out_json_fname = os.path.join(directory_path, f"{fname}.processed.json")
        out_txt_fname = os.path.join(directory_path, f"{fname}.content.txt")

        with open(out_json_fname, "w", encoding="utf-8") as out_json:
            json.dump(result.to_dict(), out_json)
        print(f"Wrote json result {out_json_fname}")

        # Explicit UTF-8: extracted text may contain characters that the
        # platform default encoding cannot represent.
        with open(out_txt_fname, "w", encoding="utf-8") as out_txt:
            out_txt.write(result.content)
        print(f"Wrote txt of content {out_txt_fname}")
        print("-" * 40)
if __name__ == "__main__":
    import sys

    # Directory containing documents to process: the first command-line
    # argument when one is given, otherwise the placeholder below (which must
    # then be edited by hand).
    documents_directory = sys.argv[1] if len(sys.argv) > 1 else "<your-directory-path>"
    process_batch(documents_directory)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment