Created
January 19, 2025 07:40
-
-
Save oshea00/a7bc3590cdc9b7e8862a9589b119b26a to your computer and use it in GitHub Desktop.
Example using Amazon Textract
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# @meta version 1.0.0
# @option --pdf path to pdf file.
# The file is processed and saved as 'raw-text.txt'
# in the current directory.
# NOTE: eval must run in the current shell (no backticks / command
# substitution around it), otherwise the argc_* variables it defines are
# lost in a subshell.
eval "$(argc --argc-eval "$0" "$@")"

# Overview: uploads the PDF given by --pdf to S3, starts an asynchronous
# Textract text-detection job on it, polls until the job leaves IN_PROGRESS,
# then writes the full response to textract-results.json and the text of
# every LINE block to raw-text.txt (both in the current directory).
# Requires on PATH: argc, aws (AWS CLI), jq.
# Exit status: 0 on success, 1 on any failure.

# Print a message to stderr and exit with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

if [[ -z "${argc_pdf:-}" ]]; then
  echo "Try --help" >&2
  exit 1
fi

# Variables
PDFDOC=$argc_pdf
BUCKET_NAME="upload-docs.limpidfox.com"
DOCUMENT_NAME="upload.pdf"
REGION="us-east-1" # Change to your desired region

# Upload the document. Abort on failure so a stale upload.pdf left in the
# bucket by a previous run is never processed by mistake.
aws s3 cp "$PDFDOC" "s3://${BUCKET_NAME}/${DOCUMENT_NAME}" \
  || die "Upload of '$PDFDOC' to s3://${BUCKET_NAME}/${DOCUMENT_NAME} failed"

# Build the document location with jq so the values are JSON-escaped.
DOCUMENT_LOCATION=$(jq -cn \
  --arg bucket "$BUCKET_NAME" \
  --arg name "$DOCUMENT_NAME" \
  '{S3Object: {Bucket: $bucket, Name: $name}}') \
  || die "Failed to build the document location JSON"

# Start the document text detection
JOB_ID=$(aws textract start-document-text-detection \
  --document-location "$DOCUMENT_LOCATION" \
  --region "$REGION" \
  --query "JobId" \
  --output text) || die "Failed to start the Textract job"
[[ -n "$JOB_ID" ]] || die "Textract did not return a JobId"
echo "Job started with JobId: $JOB_ID"

# Polling loop to check job status
STATUS="IN_PROGRESS"
while [[ "$STATUS" == "IN_PROGRESS" ]]; do
  echo "Waiting for job to complete..."
  sleep 5 # Wait for 5 seconds before checking again
  # --max-results 1 keeps the poll cheap: only JobStatus is needed here,
  # not a full page of blocks.
  STATUS=$(aws textract get-document-text-detection \
    --job-id "$JOB_ID" \
    --region "$REGION" \
    --max-results 1 \
    --query "JobStatus" \
    --output text) || die "Failed to query the status of job $JOB_ID"
  echo "Current status: $STATUS"
done

# Check if the job completed successfully
if [[ "$STATUS" != "SUCCEEDED" ]]; then
  die "Job failed with status: $STATUS"
fi

echo "Job succeeded. Fetching results..."

# Results are returned in pages linked by NextToken; fetch every page into
# a private temp directory, then merge them into one document.
PAGE_DIR=$(mktemp -d) || die "Failed to create a temporary directory"
trap 'rm -rf -- "$PAGE_DIR"' EXIT

next_token=""
page_no=0
while :; do
  # Zero-padded names keep the glob below in fetch order.
  page_file=$(printf '%s/page-%06d.json' "$PAGE_DIR" "$page_no")
  token_args=()
  if [[ -n "$next_token" ]]; then
    token_args=(--next-token "$next_token")
  fi

  # --output json is explicit because jq parses the file; the user's
  # configured default output format may be something else.
  aws textract get-document-text-detection \
    --job-id "$JOB_ID" \
    --region "$REGION" \
    --output json \
    "${token_args[@]}" > "$page_file" \
    || die "Failed to fetch results page $page_no for job $JOB_ID"

  next_token=$(jq -r '.NextToken // empty' "$page_file") \
    || die "Failed to parse results page $page_no"
  page_no=$((page_no + 1))
  [[ -n "$next_token" ]] || break
done

# Merge: metadata from the first page, Blocks concatenated across all pages.
jq -s '(.[0] | del(.NextToken)) + {Blocks: (map(.Blocks // []) | add)}' \
  "$PAGE_DIR"/page-*.json > textract-results.json \
  || die "Failed to merge result pages into textract-results.json"
echo "Results saved to textract-results.json"

# Extract raw text (one LINE block per output line) using jq
jq -r '.Blocks[] | select(.BlockType == "LINE") | .Text' \
  textract-results.json > raw-text.txt \
  || die "Failed to extract raw text from textract-results.json"
echo "Raw text saved to raw-text.txt"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment