Created
February 11, 2025 15:24
-
-
Save do-me/13a034757071cf6c91deda809acc33ee to your computer and use it in GitHub Desktop.
Docling Bash script for converting a directory of PDFs into a directory of text files and a single LLM-ingestible text file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Convert every PDF found under INPUT_DIR to plain text with docling, keep the
# per-document text files in TEMP_DIR, and concatenate them into OUTPUT_FILE
# with each document wrapped in START_TAG / END_TAG and followed by SEPARATOR.
#
# Usage: run from the directory that holds the PDFs, or override the settings
# through the environment, e.g.
#   INPUT_DIR=~/papers OUTPUT_FILE=all.txt NUM_THREADS=8 ./script.sh
#
# Note: the search is recursive and text files are named after the PDF's base
# name, so two PDFs with the same name in different subdirectories share one
# text file in TEMP_DIR.

# Directory searched for PDFs (defaults to the current directory).
INPUT_DIR="${INPUT_DIR:-.}"
# Combined, LLM-ready output file.
OUTPUT_FILE="${OUTPUT_FILE:-llm_ready.txt}"
# Directory docling writes the per-document text files into.
TEMP_DIR="${TEMP_DIR:-temp_pdf_text}"
# Value passed to docling's --num-threads option.
NUM_THREADS="${NUM_THREADS:-14}"

# Markers wrapped around each document in the combined file.
START_TAG="<DOCUMENT_START>"
END_TAG="<DOCUMENT_END>"
# ANSI-C quoting so the separator contains real newlines; a plain "\n" inside
# double quotes would be written out literally by echo.
SEPARATOR=$'\n\n--------------------------------------------------------------------------------\n\n'

#######################################
# Format a duration as HH:MM:SS.
# Arguments: $1 - duration in whole seconds
# Outputs:   the formatted duration on stdout (no trailing newline)
#######################################
format_hms() {
  local seconds=$1
  printf '%02d:%02d:%02d' \
    $((seconds / 3600)) $(((seconds % 3600) / 60)) $((seconds % 60))
}

#######################################
# Append one converted document to the combined file.
# Globals:   START_TAG, END_TAG, SEPARATOR (read)
# Arguments: $1 - text file holding the document's content
#            $2 - combined output file to append to
# Returns:   non-zero if reading the text file or writing the output fails
#######################################
append_document() {
  local text_file=$1
  local combined_file=$2
  {
    printf '%s\n' "$START_TAG" &&
      cat -- "$text_file" &&
      printf '%s\n' "$END_TAG" &&
      printf '%s\n' "$SEPARATOR"
  } >>"$combined_file"
}

#######################################
# Convert all PDFs and build the combined file, printing progress and an ETA.
# Globals:   INPUT_DIR, OUTPUT_FILE, TEMP_DIR, NUM_THREADS (read)
# Outputs:   progress and summary on stdout, warnings and errors on stderr
# Returns:   0 if every PDF was converted, 1 otherwise
#######################################
main() {
  local pdf_files=()
  local pdf_file base_name temp_file
  local total_files file_count=0 failed_count=0
  local start_time end_time doc_start_time doc_end_time docling_status
  local elapsed elapsed_sum=0 avg_time progress remaining_files
  local estimated_remaining total_time

  if ! mkdir -p -- "$TEMP_DIR"; then
    printf 'Error: cannot create temporary directory: %s\n' "$TEMP_DIR" >&2
    return 1
  fi

  # Start from an empty combined file.
  if ! : >"$OUTPUT_FILE"; then
    printf 'Error: cannot write output file: %s\n' "$OUTPUT_FILE" >&2
    return 1
  fi

  # Collect the PDFs once, NUL-delimited, so unusual file names are counted
  # and iterated correctly. Reading via process substitution keeps the loop
  # in the current shell.
  while IFS= read -r -d '' pdf_file; do
    pdf_files+=("$pdf_file")
  done < <(find "$INPUT_DIR" -name '*.pdf' -print0)
  total_files=${#pdf_files[@]}

  if ((total_files > 0)) && ! command -v docling >/dev/null 2>&1; then
    printf 'Error: docling was not found in PATH.\n' >&2
    return 1
  fi

  start_time=$(date +%s)

  for pdf_file in "${pdf_files[@]}"; do
    file_count=$((file_count + 1))

    # The script expects docling to write <base name>.txt into TEMP_DIR.
    base_name=${pdf_file##*/}
    temp_file="$TEMP_DIR/${base_name%.pdf}.txt"

    doc_start_time=$(date +%s)
    # stdin is detached so the converter cannot read from the terminal or an
    # enclosing pipeline.
    docling --from pdf --to text "$pdf_file" --output "$TEMP_DIR" \
      --num-threads "$NUM_THREADS" </dev/null
    docling_status=$?
    doc_end_time=$(date +%s)
    elapsed=$((doc_end_time - doc_start_time))

    # Only append when the conversion succeeded and produced the text file;
    # otherwise a stale file from an earlier run could be picked up.
    if ((docling_status == 0)) && [[ -f "$temp_file" ]] &&
      append_document "$temp_file" "$OUTPUT_FILE"; then
      :
    else
      failed_count=$((failed_count + 1))
      printf '\nWarning: could not convert %s; skipping.\n' "$pdf_file" >&2
    fi

    # Progress, running average and ETA (integer seconds).
    progress=$((file_count * 100 / total_files))
    elapsed_sum=$((elapsed_sum + elapsed))
    avg_time=$((elapsed_sum / file_count))
    remaining_files=$((total_files - file_count))
    estimated_remaining=$(format_hms $((avg_time * remaining_files)))

    # \r keeps the progress report on a single, continuously updated line.
    printf "Progress: %3d%% (%d/%d) | Avg: %ds | ETA: %s | Processing: %s\r" \
      "$progress" "$file_count" "$total_files" "$avg_time" \
      "$estimated_remaining" "$base_name"
  done

  # Move past the progress line.
  echo

  end_time=$(date +%s)
  total_time=$(format_hms $((end_time - start_time)))

  echo "Conversion complete."
  echo "Total time: $total_time"
  echo "Output file: $OUTPUT_FILE"
  echo "Temporary files are located in: $TEMP_DIR"

  if ((failed_count > 0)); then
    printf 'Warning: %d of %d file(s) failed to convert.\n' \
      "$failed_count" "$total_files" >&2
    return 1
  fi
  return 0
}

main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
With progress bar in this format: