Skip to content

Instantly share code, notes, and snippets.

@manzke
Created May 26, 2025 19:58
Show Gist options
  • Save manzke/d9f90883440f43cb697e155b4c83a643 to your computer and use it in GitHub Desktop.
Save manzke/d9f90883440f43cb697e155b4c83a643 to your computer and use it in GitHub Desktop.
AI-based enricher - takes an image and lets an LLM generate metadata for it
const fs = require('fs');
const path = require('path');
const sharp = require('sharp');
const { OpenAI, AzureOpenAI } = require('openai');
const { Anthropic } = require('@anthropic-ai/sdk');
const { GoogleGenAI } = require('@google/genai');
// const { GoogleGenAI } = require('@google/genai'); // Comment out as we'll use @google/generative-ai
const yargs = require('yargs/yargs');
const { hideBin } = require('yargs/helpers');
const dotenv = require('dotenv');
// Environment bootstrap: look for a .env file in several places, in priority
// order, and load only the first one that exists.
const possibleEnvPaths = [
  path.resolve(__dirname, '.env'), // next to this script
  path.resolve(__dirname, '../.env'), // one directory above this script
  path.resolve(process.cwd(), '.env') // wherever the command was started from
];

const firstExistingEnvPath = possibleEnvPaths.find((candidate) => fs.existsSync(candidate));
const envLoaded = firstExistingEnvPath !== undefined;

if (envLoaded) {
  console.log(`Loading environment variables from: ${firstExistingEnvPath}`);
  dotenv.config({ path: firstExistingEnvPath });
} else {
  // Not fatal: the variables may already be present in the process environment.
  console.warn('Warning: No .env file found. Make sure to provide environment variables.');
}
// --- Shared Constants ---
// Prompt text sent together with the image by every analyzeWith* function in
// this file. It asks the model to answer with a JSON object whose keys
// ('title', 'description', 'summary', 'fulltext', 'entities', 'document_type')
// are the ones main() prints, so keep both in sync when editing this text.
const COMMON_ANALYSIS_PROMPT = `Analyze this image comprehensively. Based on its content, provide the following:
1. A descriptive title (max 10 words).
2. A short description (2-3 sentences).
3. A comprehensive summary.
4. A comprehensive fulltext detailing all visible elements, context, and any inferred information (as much detail as possible).
5. A list of possible entities (like named entity recognition - people, places, organizations, objects).
6. A possible document type (e.g., invoice, contract, photo, illustration, diagram, receipt, business card, etc.).
Please format your response as a JSON object with the following keys: 'title', 'description', 'summary', 'fulltext', 'entities' (as a list of strings), and 'document_type'.`;
// --- Helper Functions ---
/**
 * Reads an image, scales it down when either side is larger than 1024px,
 * re-encodes it as JPEG (quality 80) and returns the bytes as base64.
 * @param {string} imagePath - Location of the image file
 * @returns {Promise<string|null>} Base64 payload, or null if anything fails
 */
async function encodeImageToBase64Resized(imagePath) {
  const MAX_DIMENSION = 1024;
  try {
    const pipeline = sharp(imagePath);
    const { width, height } = await pipeline.metadata();
    const exceedsLimit = width > MAX_DIMENSION || height > MAX_DIMENSION;

    // Only the longer side is pinned; the other one is left undefined so the
    // resize call derives it.
    const prepared = exceedsLimit
      ? pipeline.resize({
          width: width > height ? MAX_DIMENSION : undefined,
          height: height >= width ? MAX_DIMENSION : undefined,
          fit: 'inside',
          withoutEnlargement: true
        })
      : pipeline;

    // Everything is emitted as JPEG regardless of the input format.
    const jpegBytes = await prepared.jpeg({ quality: 80 }).toBuffer();
    return jpegBytes.toString('base64');
  } catch (error) {
    console.error(`Error encoding/resizing image ${imagePath}: ${error.message}`);
    return null;
  }
}
/**
 * Runs the selected AI service over every supported image found directly in a
 * folder and writes one `<name>_<service>_analysis.json` file per image.
 * @param {string} folderPath - Folder whose entries are scanned
 * @param {string} service - Service to use ('google', 'openai', 'anthropic')
 * @param {string} outputDir - Directory that receives the JSON results
 */
async function analyzeImagesInFolder(folderPath, service, outputDir) {
  const imageExtensions = new Set(['.jpg', '.jpeg', '.png', '.gif', '.webp']);

  // Wrapped in arrows so an analyzer is only looked up when it is actually used.
  const analyzers = new Map([
    ['google', (target) => analyzeWithGoogle(target)],
    ['openai', (target) => analyzeWithOpenAiAzure(target)],
    ['anthropic', (target) => analyzeWithAnthropic(target)]
  ]);

  const imageNames = fs
    .readdirSync(folderPath)
    .filter((entry) => imageExtensions.has(path.extname(entry).toLowerCase()));

  for (const imageName of imageNames) {
    const imagePath = path.join(folderPath, imageName);
    console.log(`\n---\nAnalyzing: ${imagePath}`);

    const analyze = analyzers.get(service);
    if (!analyze) {
      console.error(`Error: Unknown service '${service}'`);
      continue;
    }

    const analysisResult = await analyze(imagePath);
    const hasResult = Boolean(analysisResult) && Object.keys(analysisResult).length > 0;
    if (!hasResult) {
      console.log("No analysis result generated or an error occurred.");
      continue;
    }

    if (!fs.existsSync(outputDir)) {
      fs.mkdirSync(outputDir, { recursive: true });
    }
    const stem = path.basename(imagePath, path.extname(imagePath));
    const outputFilename = path.join(outputDir, `${stem}_${service}_analysis.json`);
    try {
      fs.writeFileSync(outputFilename, JSON.stringify(analysisResult, null, 2));
      console.log(`Analysis saved to: ${outputFilename}`);
    } catch (error) {
      console.error(`Error saving analysis to JSON: ${error.message}`);
    }
  }
}
/**
 * Maps an image file's extension (case-insensitive) to its MIME type.
 * Unrecognised extensions produce a warning and fall back to 'image/jpeg'.
 * @param {string} imagePath - Path to the image file
 * @returns {string} MIME type of the image
 */
function getMediaType(imagePath) {
  const mimeByExtension = new Map([
    ['.png', 'image/png'],
    ['.jpg', 'image/jpeg'],
    ['.jpeg', 'image/jpeg'],
    ['.gif', 'image/gif'],
    ['.webp', 'image/webp']
  ]);

  const knownType = mimeByExtension.get(path.extname(imagePath).toLowerCase());
  if (knownType !== undefined) {
    return knownType;
  }

  console.warn(`Warning: Unknown image type for ${imagePath}, defaulting to image/jpeg. Consider adding more specific handling.`);
  return 'image/jpeg';
}
/**
 * Parses a model response as JSON, tolerating the wrappers LLMs commonly add.
 *
 * Two candidates are tried in order:
 *   1. the trimmed text with a leading/trailing Markdown code fence removed;
 *   2. the span from the first '{' to the last '}' (covers prose placed before
 *      or after the JSON object, including prose around a fenced block).
 *
 * The raw response is only logged when parsing fails, so successful runs do
 * not write to stderr.
 * @param {string} responseText - Response text to parse as JSON
 * @param {string} serviceName - Name of the service for error reporting
 * @returns {object} Parsed JSON value, or an error-shaped object (document_type 'Error')
 *   carrying the same keys as a regular analysis result
 */
function parseJsonResponse(responseText, serviceName) {
  try {
    const candidates = collectJsonCandidates(responseText);
    let firstError;
    for (const candidate of candidates) {
      try {
        return JSON.parse(candidate);
      } catch (candidateError) {
        // Deliberately not fatal: remember the first failure and try the next candidate.
        if (!firstError) {
          firstError = candidateError;
        }
      }
    }
    throw firstError;
  } catch (error) {
    console.error(`Error decoding JSON from ${serviceName} response: ${error.message}`);
    console.error(`Raw response text: ${responseText}`);
    return {
      title: `Error: Could not parse JSON response from ${serviceName}`,
      description: responseText,
      summary: '',
      fulltext: '',
      entities: [],
      document_type: 'Error'
    };
  }
}

/**
 * Builds the list of strings worth handing to JSON.parse for a raw response.
 * Throws a TypeError when responseText is not a string (handled by the caller).
 * @param {string} responseText - Raw response text
 * @returns {string[]} Candidate JSON strings, most literal first
 */
function collectJsonCandidates(responseText) {
  const trimmed = responseText.trim();

  let unfenced = trimmed;
  if (unfenced.startsWith('```')) {
    unfenced = unfenced.replace(/^```[a-zA-Z]*\n?/, '').replace(/```$/, '').trim();
  }
  const candidates = [unfenced];

  const firstBrace = trimmed.indexOf('{');
  const lastBrace = trimmed.lastIndexOf('}');
  if (firstBrace !== -1 && lastBrace > firstBrace) {
    const objectSpan = trimmed.slice(firstBrace, lastBrace + 1);
    if (objectSpan !== unfenced) {
      candidates.push(objectSpan);
    }
  }
  return candidates;
}
/**
 * Checks that an image file can be opened by reading its metadata.
 * Logs the outcome either way.
 * @param {string} imagePath - Path to the image file
 * @returns {Promise<boolean>} True when the metadata could be read, false otherwise
 */
async function validateImage(imagePath) {
  let readable = false;
  try {
    // The metadata itself is not needed; a successful read is the whole check.
    await sharp(imagePath).metadata();
    readable = true;
    console.log(`Successfully validated image: ${imagePath}`);
  } catch (error) {
    readable = false;
    console.error(`Error: Cannot open or read image file ${imagePath}: ${error.message}`);
  }
  return readable;
}
// --- AI Service Interaction Functions ---
/**
 * Analyzes the image with Google Gemini through the GoogleGenAI client.
 *
 * The image is resized and re-encoded to JPEG before upload, so the MIME type
 * sent is always 'image/jpeg'. All failures are logged and reported as an
 * empty object rather than thrown.
 * @param {string} imagePath - Path to the image file
 * @returns {Promise<object>} Parsed analysis result, an error-shaped object when
 *   the reply is not valid JSON, or {} when the request could not be made
 */
async function analyzeWithGoogle(imagePath) {
  console.log(`Analyzing ${imagePath} with Google Gemini Pro Vision...`);

  // Validate the key once, up front, before any image work or network call.
  const apiKey = process.env.GOOGLE_API_KEY;
  if (!apiKey) {
    console.error("Error: GOOGLE_API_KEY not found in environment variables.");
    return {};
  }
  if (apiKey.trim() === '') {
    console.error("Error: GOOGLE_API_KEY is empty or invalid.");
    return {};
  }

  try {
    const ai = new GoogleGenAI({ apiKey });
    const modelName = 'gemini-2.5-flash-preview-05-20';
    console.log(`Using Google Gemini model: ${modelName}`);

    // Resize and encode image to base64 (JPEG, max 1024px)
    const base64Image = await encodeImageToBase64Resized(imagePath);
    if (!base64Image) {
      return {};
    }
    const mediaType = 'image/jpeg';

    const result = await ai.models.generateContent({
      model: modelName,
      contents: [
        {
          role: 'user',
          parts: [
            { text: COMMON_ANALYSIS_PROMPT },
            {
              inlineData: {
                data: base64Image,
                mimeType: mediaType
              }
            }
          ]
        }
      ]
    });

    // In this SDK, result.text is a property, not a function.
    const responseText = result.text;
    if (!responseText) {
      console.error("Error: No valid text content in response from Google Gemini.");
      return {};
    }

    // parseJsonResponse handles malformed JSON itself (it logs and returns an
    // error-shaped object), so no additional try/catch is needed here.
    return parseJsonResponse(responseText, "Google Gemini");
  } catch (error) {
    console.error(`An unexpected error occurred in analyzeWithGoogle (Gemini): ${error.message}`);
    // Handle specific model-related errors
    if (error.message && error.message.includes('model')) {
      console.error("This appears to be a model availability error.");
      console.error("Please check if the model name is correct and available in your region.");
      console.error("Available Gemini models include: gemini-1.5-pro, gemini-1.0-pro");
    }
    // Handle API key issues
    if (error.message && error.message.includes('API key')) {
      console.error("Please check your GOOGLE_API_KEY environment variable.");
    }
    return {};
  }
}
/**
 * Analyzes the image using an OpenAI model deployed on Azure.
 *
 * Reads AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_KEY and AZURE_OPENAI_DEPLOYMENT_NAME
 * from the environment. The image is resized and re-encoded to JPEG and sent as
 * a base64 data URL. All failures are logged and reported as an empty object
 * rather than thrown.
 * @param {string} imagePath - Path to the image file
 * @returns {Promise<object>} Parsed analysis result, an error-shaped object when
 *   the reply is not valid JSON, or {} when the request could not be made
 */
async function analyzeWithOpenAiAzure(imagePath) {
  console.log(`Analyzing ${imagePath} with OpenAI on Azure...`);
  const azureEndpoint = process.env.AZURE_OPENAI_ENDPOINT;
  const apiKey = process.env.AZURE_OPENAI_KEY;
  const deploymentName = process.env.AZURE_OPENAI_DEPLOYMENT_NAME;
  if (!azureEndpoint || !apiKey || !deploymentName) {
    console.error("Error: Azure OpenAI environment variables (AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_KEY, AZURE_OPENAI_DEPLOYMENT_NAME) not fully set.");
    return {};
  }
  try {
    // The generic OpenAI client has no `azure` option block, so passing one
    // leaves it pointed at api.openai.com. AzureOpenAI is the SDK's dedicated
    // client for Azure endpoints, deployments and API versions.
    const client = new AzureOpenAI({
      endpoint: azureEndpoint,
      apiKey,
      deployment: deploymentName,
      apiVersion: '2024-02-01',
    });
    const base64Image = await encodeImageToBase64Resized(imagePath);
    if (!base64Image) {
      return {};
    }
    const mediaType = 'image/jpeg';
    const response = await client.chat.completions.create({
      model: deploymentName,
      messages: [
        {
          role: 'user',
          content: [
            { type: 'text', text: COMMON_ANALYSIS_PROMPT },
            {
              type: 'image_url',
              image_url: { url: `data:${mediaType};base64,${base64Image}` }
            }
          ]
        }
      ],
      max_tokens: 2048
    });

    const firstChoice = response.choices && response.choices[0];
    const responseText = firstChoice && firstChoice.message && firstChoice.message.content;
    if (!responseText) {
      console.error("Error: Unexpected response structure from OpenAI.");
      console.error(`Raw response: ${JSON.stringify(response)}`);
      return {};
    }
    return parseJsonResponse(responseText, "OpenAI");
  } catch (error) {
    console.error(`An unexpected error occurred in analyzeWithOpenAiAzure: ${error.message}`);
    if (error.status) {
      console.error(`Status code: ${error.status}`);
    }
    if (error.response) {
      console.error(`Response: ${JSON.stringify(error.response)}`);
    }
    return {};
  }
}
/**
 * Sends the image to Anthropic Claude together with the shared analysis prompt
 * and parses the first content block of the reply as JSON.
 * Failures are logged and reported as an empty object rather than thrown.
 * @param {string} imagePath - Path to the image file
 * @returns {Promise<object>} Analysis result, or {} when no result is available
 */
async function analyzeWithAnthropic(imagePath) {
  console.log(`Analyzing ${imagePath} with Anthropic Claude...`);

  const apiKey = process.env.ANTHROPIC_API_KEY;
  if (!apiKey) {
    console.error("Error: ANTHROPIC_API_KEY not found in environment variables.");
    return {};
  }

  try {
    const anthropic = new Anthropic({ apiKey });

    const encodedImage = await encodeImageToBase64Resized(imagePath);
    if (!encodedImage) {
      return {};
    }

    // The encoder always emits JPEG, so the media type is fixed.
    const imageBlock = {
      type: "image",
      source: {
        type: "base64",
        media_type: 'image/jpeg',
        data: encodedImage
      }
    };
    const promptBlock = { type: "text", text: COMMON_ANALYSIS_PROMPT };

    const reply = await anthropic.messages.create({
      model: "claude-3-5-sonnet-20240620",
      max_tokens: 2048,
      messages: [{ role: "user", content: [imageBlock, promptBlock] }]
    });

    const blocks = reply.content;
    const leadingBlock = Array.isArray(blocks) ? blocks[0] : undefined;
    if (!leadingBlock || !leadingBlock.text) {
      console.error("Error: Unexpected response structure from Anthropic.");
      console.error(`Raw response: ${JSON.stringify(reply)}`);
      return {};
    }
    return parseJsonResponse(leadingBlock.text, "Anthropic");
  } catch (error) {
    console.error(`An unexpected error occurred in analyzeWithAnthropic: ${error.message}`);
    if (error.status) {
      console.error(`Status code: ${error.status}`);
    }
    return {};
  }
}
/**
 * CLI entry point: parses the command line, runs the selected AI service on a
 * single image (default) or on every image in a folder (--folder), prints the
 * single-image result and saves results as JSON in the output directory.
 *
 * Failures detected here (missing path, unreadable image, empty analysis,
 * write error) are logged and flagged through a non-zero process exit code so
 * that calling scripts can detect them.
 * @returns {Promise<void>}
 */
async function main() {
  // Log an error and mark the run as failed; the process exits on its own once
  // main() returns, so pending output is not cut off.
  const fail = (message) => {
    console.error(message);
    process.exitCode = 1;
  };

  // Parse command line arguments
  const argv = yargs(hideBin(process.argv))
    .usage('Usage: $0 <image_path|folder_path> --service [service] [options]')
    .option('service', {
      alias: 's',
      describe: 'AI service to use for analysis',
      choices: ['google', 'openai', 'anthropic'],
      demandOption: true
    })
    .option('output-dir', {
      alias: 'o',
      describe: 'Directory to save the analysis results',
      default: 'output'
    })
    .option('folder', {
      alias: 'f',
      type: 'boolean',
      describe: 'If set, treat the path as a folder and analyze all images inside'
    })
    .demandCommand(1, 'Please provide the path to an image file or folder')
    .help()
    .argv;

  // yargs converts numeric-looking positionals to numbers; the fs and path
  // calls below need a string, so normalise it once here.
  const inputPath = String(argv._[0]);
  const service = argv.service;
  const outputDir = argv['output-dir'];
  const isFolder = argv.folder;

  if (isFolder) {
    if (!fs.existsSync(inputPath) || !fs.statSync(inputPath).isDirectory()) {
      fail(`Error: Folder not found at ${inputPath}`);
      return;
    }
    await analyzeImagesInFolder(inputPath, service, outputDir);
    return;
  }

  // Single image mode (default)
  if (!fs.existsSync(inputPath)) {
    fail(`Error: Image file not found at ${inputPath}`);
    return;
  }
  const isValid = await validateImage(inputPath);
  if (!isValid) {
    // validateImage already logged the reason.
    process.exitCode = 1;
    return;
  }

  let analysisResult;
  switch (service) {
    case 'google':
      analysisResult = await analyzeWithGoogle(inputPath);
      break;
    case 'openai':
      analysisResult = await analyzeWithOpenAiAzure(inputPath);
      break;
    case 'anthropic':
      analysisResult = await analyzeWithAnthropic(inputPath);
      break;
    default:
      // Defensive only: the 'choices' list above restricts the accepted values.
      fail(`Error: Unknown service '${service}'`);
      return;
  }

  if (!analysisResult || Object.keys(analysisResult).length === 0) {
    console.log("No analysis result generated or an error occurred.");
    process.exitCode = 1;
    return;
  }

  console.log("--- Analysis Result ---");
  console.log(`Title: ${analysisResult.title || 'N/A'}`);
  console.log(`Description: ${analysisResult.description || 'N/A'}`);
  console.log(`Summary: ${analysisResult.summary || 'N/A'}`);
  console.log(`Fulltext: ${analysisResult.fulltext || 'N/A'}`);
  console.log(`Entities: ${JSON.stringify(analysisResult.entities || [])}`);
  console.log(`Document Type: ${analysisResult.document_type || 'N/A'}`);

  if (!fs.existsSync(outputDir)) {
    fs.mkdirSync(outputDir, { recursive: true });
  }
  const baseFilename = path.basename(inputPath, path.extname(inputPath));
  const outputFilename = path.join(outputDir, `${baseFilename}_${service}_analysis.json`);
  try {
    fs.writeFileSync(outputFilename, JSON.stringify(analysisResult, null, 2));
    console.log(`Analysis saved to: ${outputFilename}`);
  } catch (error) {
    fail(`Error saving analysis to JSON: ${error.message}`);
  }
}
// Script entry: start the CLI; if main() rejects, report the error and exit
// with status 1.
const reportFatalError = (fatalError) => {
  console.error('Unexpected error:', fatalError);
  process.exit(1);
};
main().catch(reportFatalError);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment