Created
May 26, 2025 19:58
-
-
Save manzke/d9f90883440f43cb697e155b4c83a643 to your computer and use it in GitHub Desktop.
AI-based enricher - takes an image and lets LLMs generate metadata for it
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const fs = require('fs');
const path = require('path');

const sharp = require('sharp');
const { OpenAI, AzureOpenAI } = require('openai');
const { Anthropic } = require('@anthropic-ai/sdk');
const { GoogleGenAI } = require('@google/genai');
// const { GoogleGenAI } = require('@google/genai'); // Comment out as we'll use @google/generative-ai
const yargs = require('yargs/yargs');
const { hideBin } = require('yargs/helpers');
const dotenv = require('dotenv');
// Load environment variables from the first .env file that exists.
// Candidates are listed in priority order; later ones are ignored once one is found.
const possibleEnvPaths = [
  path.resolve(__dirname, '.env'), // next to this script
  path.resolve(__dirname, '../.env'), // one directory above this script
  path.resolve(process.cwd(), '.env'), // current working directory
];
const firstExistingEnvPath = possibleEnvPaths.find((candidate) => fs.existsSync(candidate));
const envLoaded = firstExistingEnvPath !== undefined;
if (envLoaded) {
  console.log(`Loading environment variables from: ${firstExistingEnvPath}`);
  dotenv.config({ path: firstExistingEnvPath });
} else {
  console.warn('Warning: No .env file found. Make sure to provide environment variables.');
}
// --- Shared Constants ---
/**
 * Instruction text sent together with the image to every AI service.
 * Written one prompt line per array entry; the entries are joined with '\n'.
 */
const COMMON_ANALYSIS_PROMPT = [
  'Analyze this image comprehensively. Based on its content, provide the following:',
  '1. A descriptive title (max 10 words).',
  '2. A short description (2-3 sentences).',
  '3. A comprehensive summary.',
  '4. A comprehensive fulltext detailing all visible elements, context, and any inferred information (as much detail as possible).',
  '5. A list of possible entities (like named entity recognition - people, places, organizations, objects).',
  '6. A possible document type (e.g., invoice, contract, photo, illustration, diagram, receipt, business card, etc.).',
  "Please format your response as a JSON object with the following keys: 'title', 'description', 'summary', 'fulltext', 'entities' (as a list of strings), and 'document_type'.",
].join('\n');
// --- Helper Functions ---
/**
 * Loads an image, applies its EXIF orientation, shrinks it so that neither side
 * exceeds 1024px (smaller images are left at their size), re-encodes it as JPEG
 * and returns the result as a base64 string.
 * @param {string} imagePath - Path to the image file
 * @returns {Promise<string|null>} Base64 encoded JPEG data, or null on error
 */
async function encodeImageToBase64Resized(imagePath) {
  const maxDim = 1024;
  try {
    const buffer = await sharp(imagePath)
      // Bake the EXIF orientation into the pixels. sharp drops metadata on output
      // by default, so without this a rotated phone photo is sent sideways.
      .rotate()
      // 'inside' keeps the aspect ratio within a maxDim x maxDim box, so the
      // longer side never needs to be picked by hand.
      .resize({ width: maxDim, height: maxDim, fit: 'inside', withoutEnlargement: true })
      // Always output as JPEG for best compatibility and compression
      .jpeg({ quality: 80 })
      .toBuffer();
    return buffer.toString('base64');
  } catch (error) {
    console.error(`Error encoding/resizing image ${imagePath}: ${error.message}`);
    return null;
  }
}
/**
 * Analyzes every supported image directly inside a folder with the selected
 * service and writes one `<name>_<service>_analysis.json` file per image.
 *
 * Files are processed in sorted name order. Sub-directories are skipped. When two
 * images share a base name (e.g. photo.jpg and photo.png) the later one gets its
 * extension added to the output name so the first result is not overwritten.
 *
 * @param {string} folderPath - Path to the folder containing images
 * @param {string} service - Service to use ('google', 'openai', 'anthropic')
 * @param {string} outputDir - Directory to save results (created when missing)
 */
async function analyzeImagesInFolder(folderPath, service, outputDir) {
  const supportedExtensions = ['.jpg', '.jpeg', '.png', '.gif', '.webp'];

  // Resolve the analyzer once so an unknown service is reported a single time
  // instead of once per file.
  let analyze;
  switch (service) {
    case 'google':
      analyze = analyzeWithGoogle;
      break;
    case 'openai':
      analyze = analyzeWithOpenAiAzure;
      break;
    case 'anthropic':
      analyze = analyzeWithAnthropic;
      break;
    default:
      console.error(`Error: Unknown service '${service}'`);
      return;
  }

  const imageFiles = fs
    .readdirSync(folderPath, { withFileTypes: true })
    // A directory called "album.png" is not an image
    .filter((entry) => !entry.isDirectory())
    .map((entry) => entry.name)
    .filter((name) => supportedExtensions.includes(path.extname(name).toLowerCase()))
    .sort();

  const usedBaseNames = new Set();
  for (const file of imageFiles) {
    const imagePath = path.join(folderPath, file);
    console.log(`\n---\nAnalyzing: ${imagePath}`);
    const analysisResult = await analyze(imagePath);
    if (!analysisResult || Object.keys(analysisResult).length === 0) {
      console.log("No analysis result generated or an error occurred.");
      continue;
    }

    const extension = path.extname(file);
    let baseFilename = path.basename(file, extension);
    if (usedBaseNames.has(baseFilename)) {
      // Same base name as an earlier image: keep both results
      baseFilename = `${baseFilename}_${extension.slice(1).toLowerCase()}`;
    }
    usedBaseNames.add(baseFilename);

    const outputFilename = path.join(outputDir, `${baseFilename}_${service}_analysis.json`);
    try {
      fs.mkdirSync(outputDir, { recursive: true });
      fs.writeFileSync(outputFilename, JSON.stringify(analysisResult, null, 2));
      console.log(`Analysis saved to: ${outputFilename}`);
    } catch (error) {
      console.error(`Error saving analysis to JSON: ${error.message}`);
    }
  }
}
/**
 * Maps an image path to a MIME type using only its file extension
 * (case-insensitive). Unrecognised extensions produce a warning and fall back
 * to 'image/jpeg'.
 * @param {string} imagePath - Path to the image file
 * @returns {string} MIME type of the image
 */
function getMediaType(imagePath) {
  const mimeTypesByExtension = new Map([
    ['.png', 'image/png'],
    ['.jpg', 'image/jpeg'],
    ['.jpeg', 'image/jpeg'],
    ['.gif', 'image/gif'],
    ['.webp', 'image/webp'],
  ]);
  const knownType = mimeTypesByExtension.get(path.extname(imagePath).toLowerCase());
  if (knownType !== undefined) {
    return knownType;
  }
  console.warn(`Warning: Unknown image type for ${imagePath}, defaulting to image/jpeg. Consider adding more specific handling.`);
  return 'image/jpeg';
}
/**
 * Extracts and parses the JSON payload of a model response.
 *
 * Several readings of the text are tried in order until one parses:
 *   1. the trimmed text as-is,
 *   2. the text with a leading/trailing Markdown fence stripped,
 *   3. the content of the first fenced block anywhere in the text,
 *   4. the span from the first '{' to the last '}'.
 *
 * The raw response is only logged when nothing could be parsed.
 *
 * @param {string} responseText - Response text to parse as JSON
 * @param {string} serviceName - Name of the service for error reporting
 * @returns {object} Parsed JSON value, or an error-shaped object with
 *   document_type 'Error' and the raw text in `description`
 */
function parseJsonResponse(responseText, serviceName) {
  const candidates = [];
  if (typeof responseText === 'string') {
    const trimmed = responseText.trim();
    candidates.push(trimmed);

    if (trimmed.startsWith('```')) {
      // Also covers a response that was cut off before the closing fence
      candidates.push(trimmed.replace(/^```[a-zA-Z]*\n?/, '').replace(/```$/, '').trim());
    }

    // Models often wrap the fenced JSON in a sentence or two of prose
    const fenced = trimmed.match(/```[a-zA-Z]*\s*([\s\S]*?)```/);
    if (fenced) {
      candidates.push(fenced[1].trim());
    }

    const firstBrace = trimmed.indexOf('{');
    const lastBrace = trimmed.lastIndexOf('}');
    if (firstBrace !== -1 && lastBrace > firstBrace) {
      candidates.push(trimmed.slice(firstBrace, lastBrace + 1));
    }
  }

  let firstError = null;
  for (const candidate of candidates) {
    try {
      return JSON.parse(candidate);
    } catch (error) {
      // Report the failure of the most direct reading, not of a later fallback
      if (firstError === null) {
        firstError = error;
      }
    }
  }

  const reason = firstError === null ? 'response is not a string' : firstError.message;
  console.error(`Error decoding JSON from ${serviceName} response: ${reason}`);
  console.error(`Raw response text: ${responseText}`);
  return {
    title: `Error: Could not parse JSON response from ${serviceName}`,
    description: responseText,
    summary: '',
    fulltext: '',
    entities: [],
    document_type: 'Error'
  };
}
/**
 * Checks that sharp can read the image's metadata, logging the outcome.
 * @param {string} imagePath - Path to the image file
 * @returns {Promise<boolean>} True when the metadata could be read, false otherwise
 */
async function validateImage(imagePath) {
  let readable = true;
  try {
    await sharp(imagePath).metadata();
  } catch (error) {
    readable = false;
    console.error(`Error: Cannot open or read image file ${imagePath}: ${error.message}`);
  }
  if (readable) {
    console.log(`Successfully validated image: ${imagePath}`);
  }
  return readable;
}
// --- AI Service Interaction Functions ---
/**
 * Analyzes the image with Google Gemini through the @google/genai SDK.
 *
 * The model defaults to 'gemini-2.5-flash-preview-05-20' and can be overridden
 * with the GOOGLE_GEMINI_MODEL environment variable.
 *
 * @param {string} imagePath - Path to the image file
 * @returns {Promise<object>} Parsed analysis result, or an empty object when the
 *   API key is missing or blank, the image cannot be encoded, the response
 *   carries no text, or the request fails
 */
async function analyzeWithGoogle(imagePath) {
  console.log(`Analyzing ${imagePath} with Google Gemini...`);
  const apiKey = process.env.GOOGLE_API_KEY;
  if (!apiKey) {
    console.error("Error: GOOGLE_API_KEY not found in environment variables.");
    return {};
  }
  if (apiKey.trim() === '') {
    console.error("Error: GOOGLE_API_KEY is empty or invalid.");
    return {};
  }
  const modelName = process.env.GOOGLE_GEMINI_MODEL || 'gemini-2.5-flash-preview-05-20';
  try {
    const ai = new GoogleGenAI({ apiKey });
    console.log(`Using Google Gemini model: ${modelName}`);
    // Resize and encode image to base64 (JPEG, max 1024px)
    const base64Image = await encodeImageToBase64Resized(imagePath);
    if (!base64Image) {
      return {};
    }
    const result = await ai.models.generateContent({
      model: modelName,
      contents: [
        {
          role: 'user',
          parts: [
            { text: COMMON_ANALYSIS_PROMPT },
            {
              inlineData: {
                data: base64Image,
                // The encoder above always produces JPEG, whatever the source format
                mimeType: 'image/jpeg'
              }
            }
          ]
        }
      ]
    });
    // In this SDK, result.text is a property, not a function
    const responseText = result.text;
    if (!responseText) {
      console.error("Error: No valid text content in response from Google Gemini.");
      return {};
    }
    // parseJsonResponse reports parse failures itself and returns an error-shaped
    // object, so no extra try/catch is needed around it.
    return parseJsonResponse(responseText, "Google Gemini");
  } catch (error) {
    console.error(`An unexpected error occurred in analyzeWithGoogle (Gemini): ${error.message}`);
    // Handle specific model-related errors
    if (error.message && error.message.includes('model')) {
      console.error("This appears to be a model availability error.");
      console.error(`Please check that '${modelName}' is correct and available in your region.`);
      console.error("Set GOOGLE_GEMINI_MODEL to use a different Gemini model.");
    }
    // Handle API key issues
    if (error.message && error.message.includes('API key')) {
      console.error("Please check your GOOGLE_API_KEY environment variable.");
    }
    return {};
  }
}
/**
 * Analyzes the image with an Azure OpenAI chat deployment.
 *
 * Reads AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_KEY and AZURE_OPENAI_DEPLOYMENT_NAME
 * from the environment. The API version defaults to '2024-02-01' and can be
 * overridden with AZURE_OPENAI_API_VERSION.
 *
 * @param {string} imagePath - Path to the image file
 * @returns {Promise<object>} Parsed analysis result, or an empty object when the
 *   configuration is incomplete, the image cannot be encoded, the response has
 *   no message content, or the request fails
 */
async function analyzeWithOpenAiAzure(imagePath) {
  console.log(`Analyzing ${imagePath} with OpenAI on Azure...`);
  const azureEndpoint = process.env.AZURE_OPENAI_ENDPOINT;
  const apiKey = process.env.AZURE_OPENAI_KEY;
  const deploymentName = process.env.AZURE_OPENAI_DEPLOYMENT_NAME;
  const apiVersion = process.env.AZURE_OPENAI_API_VERSION || '2024-02-01';
  if (!azureEndpoint || !apiKey || !deploymentName) {
    console.error("Error: Azure OpenAI environment variables (AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_KEY, AZURE_OPENAI_DEPLOYMENT_NAME) not fully set.");
    return {};
  }
  try {
    // AzureOpenAI is the SDK's Azure client: it takes the endpoint, deployment
    // and API version directly instead of a nested `azure` option.
    const client = new AzureOpenAI({
      endpoint: azureEndpoint,
      apiKey,
      deployment: deploymentName,
      apiVersion,
    });
    const base64Image = await encodeImageToBase64Resized(imagePath);
    if (!base64Image) {
      return {};
    }
    // The encoder always produces JPEG, whatever the source format
    const mediaType = 'image/jpeg';
    const response = await client.chat.completions.create({
      model: deploymentName,
      messages: [
        {
          role: 'user',
          content: [
            { type: 'text', text: COMMON_ANALYSIS_PROMPT },
            {
              type: 'image_url',
              image_url: { url: `data:${mediaType};base64,${base64Image}` }
            }
          ]
        }
      ],
      max_tokens: 2048
    });
    const firstChoice = response.choices && response.choices[0];
    const responseText = firstChoice && firstChoice.message && firstChoice.message.content;
    if (!responseText) {
      console.error("Error: Unexpected response structure from OpenAI.");
      console.error(`Raw response: ${JSON.stringify(response)}`);
      return {};
    }
    return parseJsonResponse(responseText, "OpenAI");
  } catch (error) {
    console.error(`An unexpected error occurred in analyzeWithOpenAiAzure: ${error.message}`);
    if (error.status) {
      console.error(`Status code: ${error.status}`);
    }
    if (error.response) {
      // Serialising can itself throw (e.g. circular references); a logging
      // problem must not turn into a rejected promise.
      try {
        console.error(`Response: ${JSON.stringify(error.response)}`);
      } catch (stringifyError) {
        console.error(`Response could not be serialized: ${stringifyError.message}`);
      }
    }
    return {};
  }
}
/**
 * Analyzes the image with the Anthropic Messages API.
 *
 * The model defaults to 'claude-3-5-sonnet-20240620' and can be overridden with
 * the ANTHROPIC_MODEL environment variable. The text of all text blocks in the
 * reply is concatenated before parsing.
 *
 * @param {string} imagePath - Path to the image file
 * @returns {Promise<object>} Parsed analysis result, or an empty object when the
 *   API key is missing, the image cannot be encoded, the reply contains no
 *   text, or the request fails
 */
async function analyzeWithAnthropic(imagePath) {
  console.log(`Analyzing ${imagePath} with Anthropic Claude...`);
  const apiKey = process.env.ANTHROPIC_API_KEY;
  if (!apiKey) {
    console.error("Error: ANTHROPIC_API_KEY not found in environment variables.");
    return {};
  }
  const model = process.env.ANTHROPIC_MODEL || 'claude-3-5-sonnet-20240620';
  try {
    const client = new Anthropic({ apiKey });
    const base64Image = await encodeImageToBase64Resized(imagePath);
    if (!base64Image) {
      return {};
    }
    const message = await client.messages.create({
      model,
      max_tokens: 2048,
      messages: [
        {
          role: "user",
          content: [
            {
              type: "image",
              source: {
                type: "base64",
                // The encoder always produces JPEG, whatever the source format
                media_type: 'image/jpeg',
                data: base64Image
              }
            },
            { type: "text", text: COMMON_ANALYSIS_PROMPT }
          ]
        }
      ]
    });
    // Collect every block that carries text instead of assuming it is the first one
    const responseText = Array.isArray(message.content)
      ? message.content
        .filter((block) => block && typeof block.text === 'string')
        .map((block) => block.text)
        .join('')
      : '';
    if (!responseText) {
      console.error("Error: Unexpected response structure from Anthropic.");
      console.error(`Raw response: ${JSON.stringify(message)}`);
      return {};
    }
    if (message.stop_reason === 'max_tokens') {
      // A reply cut off at the token limit is the usual cause of unparseable JSON
      console.warn("Warning: Anthropic response was truncated at max_tokens; the JSON may be incomplete.");
    }
    return parseJsonResponse(responseText, "Anthropic");
  } catch (error) {
    console.error(`An unexpected error occurred in analyzeWithAnthropic: ${error.message}`);
    if (error.status) {
      console.error(`Status code: ${error.status}`);
    }
    return {};
  }
}
/**
 * Command-line entry point: parses the arguments and analyzes either a single
 * image or every image in a folder with the selected service.
 *
 * A path that is a directory is processed in folder mode whether or not
 * --folder was given; with --folder the path must be a directory. In single
 * image mode the result is printed and saved as
 * `<output-dir>/<name>_<service>_analysis.json`.
 *
 * Sets process.exitCode to 1 when the input is missing or invalid, or when the
 * single-image analysis yields no result.
 *
 * @returns {Promise<void>}
 */
async function main() {
  // Parse command line arguments
  const argv = yargs(hideBin(process.argv))
    .usage('Usage: $0 <image_path|folder_path> --service [service] [options]')
    .option('service', {
      alias: 's',
      describe: 'AI service to use for analysis',
      choices: ['google', 'openai', 'anthropic'],
      demandOption: true
    })
    .option('output-dir', {
      alias: 'o',
      describe: 'Directory to save the analysis results',
      default: 'output'
    })
    .option('folder', {
      alias: 'f',
      type: 'boolean',
      describe: 'If set, treat the path as a folder and analyze all images inside'
    })
    .demandCommand(1, 'Please provide the path to an image file or folder')
    .help()
    .argv;
  // A purely numeric positional may arrive as a number; fs/path need a string
  const inputPath = String(argv._[0]);
  const service = argv.service;
  const outputDir = argv['output-dir'];
  const folderRequested = Boolean(argv.folder);

  const pathExists = fs.existsSync(inputPath);
  const isDirectory = pathExists && fs.statSync(inputPath).isDirectory();

  if (folderRequested && !isDirectory) {
    console.error(`Error: Folder not found at ${inputPath}`);
    process.exitCode = 1;
    return;
  }
  if (!pathExists) {
    console.error(`Error: Image file not found at ${inputPath}`);
    process.exitCode = 1;
    return;
  }
  if (isDirectory) {
    // Directories go through folder mode even without --folder; the image
    // validator below would only reject them.
    await analyzeImagesInFolder(inputPath, service, outputDir);
    return;
  }

  // Single image mode
  const isValid = await validateImage(inputPath);
  if (!isValid) {
    process.exitCode = 1;
    return;
  }
  let analysisResult;
  switch (service) {
    case 'google':
      analysisResult = await analyzeWithGoogle(inputPath);
      break;
    case 'openai':
      analysisResult = await analyzeWithOpenAiAzure(inputPath);
      break;
    case 'anthropic':
      analysisResult = await analyzeWithAnthropic(inputPath);
      break;
    default:
      console.error(`Error: Unknown service '${service}'`);
      process.exitCode = 1;
      return;
  }
  if (!analysisResult || Object.keys(analysisResult).length === 0) {
    console.log("No analysis result generated or an error occurred.");
    process.exitCode = 1;
    return;
  }

  console.log("--- Analysis Result ---");
  console.log(`Title: ${analysisResult.title || 'N/A'}`);
  console.log(`Description: ${analysisResult.description || 'N/A'}`);
  console.log(`Summary: ${analysisResult.summary || 'N/A'}`);
  console.log(`Fulltext: ${analysisResult.fulltext || 'N/A'}`);
  console.log(`Entities: ${JSON.stringify(analysisResult.entities || [])}`);
  console.log(`Document Type: ${analysisResult.document_type || 'N/A'}`);

  const baseFilename = path.basename(inputPath, path.extname(inputPath));
  const outputFilename = path.join(outputDir, `${baseFilename}_${service}_analysis.json`);
  try {
    fs.mkdirSync(outputDir, { recursive: true });
    fs.writeFileSync(outputFilename, JSON.stringify(analysisResult, null, 2));
    console.log(`Analysis saved to: ${outputFilename}`);
  } catch (error) {
    console.error(`Error saving analysis to JSON: ${error.message}`);
    process.exitCode = 1;
  }
}
// Script entry point: run main() and exit with status 1 if it rejects.
void (async () => {
  try {
    await main();
  } catch (error) {
    console.error('Unexpected error:', error);
    process.exit(1);
  }
})();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment