Created
January 9, 2025 15:56
-
-
Save cmoscardi/6f43b89778703deb9fcec92720740af4 to your computer and use it in GitHub Desktop.
MS API batch process script
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import glob | |
import json | |
import os | |
from azure.ai.formrecognizer import DocumentAnalysisClient | |
from azure.core.credentials import AzureKeyCredential | |
# Azure Form Recognizer credentials.
# Each value is taken from the environment when the variable is set, so real
# secrets do not have to be written into this file; the placeholders below
# are only used as a fallback when the variable is absent.
AZURE_ENDPOINT = os.environ.get(
    "AZURE_FORM_RECOGNIZER_ENDPOINT",
    "https://<your-endpoint-name>.cognitiveservices.azure.com/",
)
AZURE_API_KEY = os.environ.get("AZURE_FORM_RECOGNIZER_KEY", "<your-api-key>")

# Module-level client built once from the endpoint and key above.
document_analysis_client = DocumentAnalysisClient(
    endpoint=AZURE_ENDPOINT,
    credential=AzureKeyCredential(AZURE_API_KEY),
)
def analyze_document(file_path, model_id="prebuilt-read", pages="1-2"):
    """
    Analyze a single document using Azure Document Intelligence.

    Args:
        file_path: Path of the document to send; it is opened in binary mode.
        model_id: Model identifier passed as the first argument of
            ``begin_analyze_document``. Defaults to ``"prebuilt-read"``.
        pages: Page selection forwarded as the ``pages`` keyword argument
            (for example ``"1-2"``). Defaults to ``"1-2"``.

    Returns:
        The object returned by ``poller.result()`` on success, or ``None``
        if any exception was raised. Errors are printed, not re-raised, so
        callers must check for ``None``.
    """
    try:
        with open(file_path, "rb") as document:
            poller = document_analysis_client.begin_analyze_document(
                model_id, document, pages=pages
            )
            result = poller.result()

        print(f"Analysis results for {file_path}:\n")
        for page in result.pages:
            print(f"Page number: {page.page_number}")
            print(f"Page width: {page.width}, height: {page.height}, unit: {page.unit}")
        return result
    except Exception as e:
        # Best-effort: report the failure and signal it with an explicit None
        # so that one bad document does not have to stop a batch.
        print(f"An error occurred while processing {file_path}: {e}")
        return None
def process_batch(directory_path):
    """
    Process a batch of documents in a directory.

    Every ``*.pdf`` file directly inside ``directory_path`` is passed to
    ``analyze_document``. For each document that yields a result, two files
    are written into the same directory: ``<name>.processed.json`` (the
    result's ``to_dict()`` serialized as JSON) and ``<name>.content.txt``
    (the result's ``content``). Documents whose analysis returned ``None``
    are skipped.

    Args:
        directory_path: Directory to scan for PDF files.

    Returns:
        None. If ``directory_path`` does not exist, a message is printed and
        nothing else happens.
    """
    if not os.path.exists(directory_path):
        print("The specified directory does not exist.")
        return

    # Escape the directory part so characters such as "[" or "*" in its name
    # are matched literally instead of being treated as glob wildcards.
    pattern = f"{glob.escape(directory_path)}/*.pdf"
    for file_path in glob.glob(pattern):
        fname = os.path.basename(file_path)
        print("-" * 40)
        print(f"Processing document: {file_path}")
        result = analyze_document(file_path)
        if result is None:
            # analyze_document already printed the error; move on rather than
            # crashing on result.to_dict() and losing the rest of the batch.
            print(f"Skipping {file_path}: no analysis result.")
            print("-" * 40)
            continue

        out_json_fname = f"{directory_path}/{fname}.processed.json"
        out_txt_fname = f"{directory_path}/{fname}.content.txt"
        with open(out_json_fname, "w", encoding="utf-8") as out_json:
            json.dump(result.to_dict(), out_json)
        print(f"Wrote json result {out_json_fname}")
        # Extracted text may contain non-ASCII characters; write it as UTF-8
        # instead of relying on the platform default encoding.
        with open(out_txt_fname, "w", encoding="utf-8") as out_txt:
            out_txt.write(result.content)
        print(f"Wrote txt of content {out_txt_fname}")
        print("-" * 40)
if __name__ == "__main__":
    import sys

    # Directory containing documents to process: the first command-line
    # argument when one is given, otherwise the placeholder below.
    documents_directory = sys.argv[1] if len(sys.argv) > 1 else "<your-directory-path>"
    process_batch(documents_directory)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment