Skip to content

Instantly share code, notes, and snippets.

@andrewgcodes
Created January 20, 2025 19:40
Show Gist options
  • Save andrewgcodes/278769f94c0887180ed02335129d0c6d to your computer and use it in GitHub Desktop.
Save andrewgcodes/278769f94c0887180ed02335129d0c6d to your computer and use it in GitHub Desktop.
/**
 * Shared settings read by every Firecrawl request in this script.
 *
 * Fields:
 *  - key          Firecrawl API key; sent as a Bearer token in the Authorization header.
 *  - baseUrl      Root of the Firecrawl v1 REST API; endpoint paths are appended to it.
 *  - timeout      Value forwarded as the `timeout` request option (intended as seconds).
 *                 NOTE(review): `timeout` does not appear among the documented
 *                 UrlFetchApp.fetch advanced parameters - confirm it has any effect.
 *  - maxAttempts  Upper bound on status polls for a single extraction job.
 *  - initialDelay Base wait, in milliseconds, used by the polling backoff.
 */
const API_CONFIG = {
  key: "...",
  baseUrl: 'https://api.firecrawl.dev/v1',
  timeout: 30,
  maxAttempts: 10,
  initialDelay: 1000
};
/**
 * Entry point used as a Google Sheets custom function.
 * Validates its arguments, starts a Firecrawl extraction job for the given
 * site and waits for the result.
 *
 * Every failure is reported as a returned string (never thrown) so the cell
 * shows a readable message.
 *
 * @param {string} input - The URL, domain or e-mail address identifying the site.
 * @param {string} prompt - The question or instruction for the LLM.
 * @returns {string} - The extracted data as a formatted string, or an error message.
 * @customfunction
 */
function EXTRACT(input, prompt) {
  // The template ships with the placeholder "..." as the key. It is truthy,
  // so it has to be rejected explicitly or an unconfigured script would
  // still call the API.
  const apiKey = API_CONFIG.key;
  if (!apiKey || apiKey === "...") {
    return "Error: API key not found. Please set your Firecrawl API key in API_CONFIG.key.";
  }

  // Normalise the input into an https:// URL; null means it was not usable.
  const url = processInput(input);
  if (!url) {
    return "Invalid URL format";
  }

  // Without this guard a missing prompt would be concatenated into the
  // request as the literal text "undefined".
  if (prompt === null || prompt === undefined || String(prompt).trim() === '') {
    return "Error: Prompt is required";
  }

  try {
    // Start the job; a null ID means the API did not accept the request.
    const extractionId = initiateExtraction(url, prompt);
    if (!extractionId) {
      return "Error: Failed to initiate extraction";
    }
    // Poll until the job reaches a final state and return its outcome.
    return monitorExtraction(extractionId);
  } catch (e) {
    // Convert any exception into a displayable message.
    return handleError(e);
  }
}
/**
 * Starts an extraction job by POSTing to the Firecrawl `/extract` endpoint.
 *
 * The user's prompt is wrapped in fixed instructions asking the LLM to answer
 * with a JSON object holding a single 'response' key.
 *
 * @param {string} url - The processed URL to extract data from.
 * @param {string} prompt - The user's request describing what to retrieve.
 * @returns {string|null} - The job ID when the API reports success; null on a
 *   non-2xx response or when the response body's `success` flag is falsy.
 * @throws {Error} When the request or the parsing of its response throws;
 *   the original message is prefixed with "Failed to initiate extraction: ".
 */
function initiateExtraction(url, prompt) {
  const endpoint = `${API_CONFIG.baseUrl}/extract`;

  // Fixed instructions placed before and after the user's own request.
  const instructionPrefix =
    "Given the website content, fulfill the user's request. " +
    "It may be a direct instruction or a question about the contents of the website. " +
    "Here is the user's request: ";
  const instructionSuffix =
    " Provide your concise response in a JSON. " +
    "Importantly, your JSON must only contain one key ('response') and value. " +
    "If you can't fulfill the request given the provided website context, " +
    "your JSON should be key ('response') and value ('Not found')";

  const requestBody = JSON.stringify({
    urls: [url],
    prompt: instructionPrefix + prompt + instructionSuffix
  });

  const requestOptions = {
    method: 'post',
    contentType: 'application/json',
    headers: {
      Authorization: 'Bearer ' + API_CONFIG.key
    },
    payload: requestBody,
    // HTTP error statuses are inspected below instead of being thrown.
    muteHttpExceptions: true,
    timeout: API_CONFIG.timeout
  };

  try {
    const response = UrlFetchApp.fetch(endpoint, requestOptions);
    const statusCode = response.getResponseCode();
    const isSuccess = statusCode >= 200 && statusCode < 300;
    if (!isSuccess) {
      return null;
    }

    const body = JSON.parse(response.getContentText());
    if (body.success) {
      return body.id;
    }
    return null;
  } catch (e) {
    throw new Error(`Failed to initiate extraction: ${e.message}`);
  }
}
/**
 * Polls the Firecrawl API until an extraction job reaches a final state.
 *
 * The wait between polls starts at API_CONFIG.initialDelay and doubles each
 * attempt, but is capped, and polling stops once a total wait budget would be
 * exceeded. Uncapped, the waits would add up to
 * initialDelay * (2^maxAttempts - 1) - about 17 minutes with the defaults -
 * which is far beyond the 30 second limit Google Sheets places on custom
 * functions, so the timeout message below could never be returned.
 *
 * Optional overrides (defaults apply when absent or not numeric):
 *  - API_CONFIG.maxDelay      cap for a single wait, in ms (default 5000)
 *  - API_CONFIG.maxTotalWait  budget for the whole polling phase, in ms (default 25000)
 *
 * @param {string} extractionId - The ID of the initiated extraction job.
 * @returns {string} - The formatted extracted data or an error message.
 */
function monitorExtraction(extractionId) {
  const maxDelay = typeof API_CONFIG.maxDelay === 'number' ? API_CONFIG.maxDelay : 5000;
  const maxTotalWait = typeof API_CONFIG.maxTotalWait === 'number' ? API_CONFIG.maxTotalWait : 25000;
  const startedAt = Date.now();

  for (let attempt = 0; attempt < API_CONFIG.maxAttempts; attempt++) {
    // Exponential backoff, capped so later polls stay reasonably frequent.
    const delay = Math.min(API_CONFIG.initialDelay * Math.pow(2, attempt), maxDelay);

    // Stop before a wait that would overrun the budget. Elapsed time is
    // measured on the wall clock so slow status requests count as well.
    if (Date.now() - startedAt + delay > maxTotalWait) {
      break;
    }
    Utilities.sleep(delay);

    const status = checkExtractionStatus(extractionId);
    // Transport or HTTP problems are reported through the error field.
    if (status.error) {
      return `Error: ${status.error}`;
    }

    switch (status.status) {
      case 'completed':
        return status.data ? formatData(status.data) : "Error: No data in completed response";
      case 'failed':
        return "Error: Extraction failed";
      case 'cancelled':
        return "Error: Extraction was cancelled";
      case 'pending':
      case 'processing':
        // Still running: fall through to the next poll.
        break;
      default:
        return `Error: Unknown status - ${status.status}`;
    }
  }

  // Attempts or time budget exhausted without reaching a final state.
  return "Error: Extraction timed out after maximum attempts";
}
/**
 * Fetches the current state of an extraction job from the Firecrawl API.
 *
 * This function does not throw for request failures: HTTP errors and
 * exceptions raised while fetching or parsing are reported through the
 * `error` field of the returned object.
 *
 * @param {string} extractionId - The ID of the extraction job to check.
 * @returns {Object} - `{ status, data, error }`. On a 2xx response `status`
 *   and `data` are copied from the parsed body and `error` is null; otherwise
 *   `status` is 'error', `data` is null and `error` holds a description.
 */
function checkExtractionStatus(extractionId) {
  const statusUrl = `${API_CONFIG.baseUrl}/extract/${extractionId}`;
  const requestOptions = {
    method: 'get',
    headers: {
      Authorization: 'Bearer ' + API_CONFIG.key
    },
    // HTTP error statuses are turned into an error object below.
    muteHttpExceptions: true,
    timeout: API_CONFIG.timeout
  };

  // Builds the result shape shared by both failure paths.
  const failure = (message) => ({
    status: 'error',
    data: null,
    error: message
  });

  try {
    const response = UrlFetchApp.fetch(statusUrl, requestOptions);
    const httpCode = response.getResponseCode();
    const isSuccess = httpCode >= 200 && httpCode < 300;

    if (!isSuccess) {
      return failure(`HTTP Error: ${httpCode} - ${response.getContentText()}`);
    }

    const body = JSON.parse(response.getContentText());
    return {
      status: body.status,
      data: body.data,
      error: null
    };
  } catch (e) {
    return failure(e.message);
  }
}
/**
 * Recursively flattens extracted data into a single comma-separated string.
 *
 * - null / undefined become "Not found" (also when nested inside a container).
 * - Primitives are converted with String().
 * - Arrays contribute each element, objects each own enumerable property
 *   value (keys are dropped), in order, joined with ", ".
 *
 * @param {any} data - The data to format (object, array, or primitive).
 * @param {string} indent - Unused; kept so existing callers keep working.
 * @returns {string} - A flat string representation of the data.
 */
function formatData(data, indent = '') {
  if (data === null || data === undefined) {
    return "Not found";
  }
  if (typeof data !== 'object') {
    return String(data);
  }

  // Object.keys is used instead of calling data.hasOwnProperty: parsed JSON
  // may itself contain a "hasOwnProperty" key, and objects without a
  // prototype have no such method - either case used to throw a TypeError.
  const values = Array.isArray(data)
    ? data
    : Object.keys(data).map((key) => data[key]);

  return values.map((item) => formatData(item)).join(', ');
}
/**
 * Normalises raw user input into an `https://<domain>` URL.
 *
 * Steps: trim whitespace; if the text contains '@', keep what follows the
 * first '@' (e-mail address -> domain); strip a leading http(s):// and/or
 * www. in any letter case; drop trailing slashes; validate what is left as a
 * dotted host name.
 *
 * Inputs that still contain a path, query or port are rejected, as are host
 * names with empty labels such as "example..com".
 *
 * @param {string} input - Raw input: a URL, a bare domain or an e-mail address.
 * @returns {string|null} - The domain prefixed with 'https://', or null if invalid.
 */
function processInput(input) {
  if (!input || typeof input !== 'string') {
    return null;
  }

  let candidate = input.trim();

  // Treat anything containing '@' as an e-mail address and keep its domain part.
  if (candidate.includes('@')) {
    candidate = candidate.split('@')[1];
  }

  candidate = candidate
    // Case-insensitive so "HTTPS://WWW.Example.com" is handled like its lowercase form.
    .replace(/^(https?:\/\/)?(www\.)?/i, '')
    // A pasted home-page URL usually ends in "/"; that is not part of the host.
    .replace(/\/+$/, '');

  // At least two non-empty labels separated by single dots; one trailing dot
  // (fully-qualified form) is tolerated.
  const domainRegex = /^[a-zA-Z0-9-]+(\.[a-zA-Z0-9-]+)+\.?$/;
  if (!domainRegex.test(candidate)) {
    return null;
  }

  return 'https://' + candidate;
}
/**
 * Turns a caught value into a user-facing message.
 *
 * Timeout-like failures are collapsed into one fixed message; everything else
 * is prefixed with "Exception: ". The function accepts any thrown value (an
 * Error, a string, null, ...) so that the error handler itself cannot throw.
 *
 * @param {Error|*} error - The value caught during execution.
 * @returns {string} - A formatted error message suitable for display.
 */
function handleError(error) {
  let message;
  if (error === null || error === undefined) {
    message = 'Unknown error';
  } else if (typeof error.message === 'string') {
    message = error.message;
  } else {
    // Thrown strings or other non-Error values.
    message = String(error);
  }

  // Compare case-insensitively: messages such as "Timeout: <url>" would
  // otherwise slip past a lowercase-only check.
  const lowered = message.toLowerCase();
  const isTimeout =
    lowered.includes('timeout') ||
    lowered.includes('timed out') ||
    lowered.includes('exceeded maximum execution time');

  if (isTimeout) {
    return "Error: Request timed out";
  }
  return "Exception: " + message;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment