Created
January 20, 2025 19:40
-
-
Save andrewgcodes/278769f94c0887180ed02335129d0c6d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Configuration object for API settings.
 * Contains key constants used throughout the extraction process.
 *
 * Every function in this script reads its settings from here, so this is the
 * only place that needs editing when deploying the script.
 */
const API_CONFIG = {
  // Firecrawl API key, sent as a Bearer token. Replace the "..." placeholder.
  key: "...",
  // Base URL for Firecrawl API endpoints; paths such as `/extract` are appended to it.
  baseUrl: 'https://api.firecrawl.dev/v1',
  // Timeout for HTTP requests in seconds, forwarded as the `timeout` fetch option.
  // NOTE(review): `timeout` does not appear among UrlFetchApp's documented
  // advanced parameters - confirm that it has any effect.
  timeout: 30,
  // Maximum number of attempts to poll for job completion.
  maxAttempts: 10,
  // Initial delay (in ms) before polling for job status; used as the base of the backoff.
  initialDelay: 1000
};
/**
 * Main extraction function exposed as a custom function in Google Sheets.
 * It takes a URL/domain and a prompt, then coordinates the entire extraction
 * process using the Firecrawl API: validate inputs, start a job, poll until done.
 *
 * All failures are reported as a returned string (never thrown), so the message
 * is what the caller sees as the function result.
 *
 * @param {string} input - The URL or domain to extract data from.
 * @param {string} prompt - The extraction prompt/query for the LLM.
 * @returns {string} - Extracted data as a formatted string or an error message.
 */
function EXTRACT(input, prompt) {
  // The key is read from API_CONFIG. The untouched "..." placeholder is treated
  // as "not configured" so the user gets this message instead of an opaque API failure.
  const apiKey = API_CONFIG.key;
  if (typeof apiKey !== 'string' || apiKey.trim() === '' || apiKey.trim() === '...') {
    return "Error: API key not found. Please set your Firecrawl API key in API_CONFIG.key.";
  }

  // Process and validate the input URL/domain using a helper function.
  const url = processInput(input);
  if (!url) {
    return "Invalid URL format";
  }

  // Coerce the prompt to text (a referenced cell may hold a number) and reject
  // empty prompts; otherwise the literal text "undefined" would be sent to the LLM.
  const promptText = (prompt === null || prompt === undefined) ? '' : String(prompt).trim();
  if (!promptText) {
    return "Error: Prompt is required";
  }

  try {
    // Initiate the extraction process and obtain a job ID.
    const extractionId = initiateExtraction(url, promptText);
    if (!extractionId) {
      return "Error: Failed to initiate extraction";
    }
    // Monitor the extraction process until it finishes, then return the result.
    return monitorExtraction(extractionId);
  } catch (e) {
    // Convert any exception into a user-facing message.
    return handleError(e);
  }
}
/**
 * Initiates the extraction process by sending a POST request to the Firecrawl
 * `/extract` endpoint.
 *
 * @param {string} url - The processed URL to extract data from.
 * @param {string} prompt - The extraction prompt describing what data to retrieve.
 * @returns {string|null} - The extraction job ID, or null when the API answers
 *   with a 2xx response that does not report success or carries no ID.
 * @throws {Error} - When the request fails, the response is not valid JSON, or the
 *   API answers with a non-2xx status. The message starts with
 *   "Failed to initiate extraction: " and, for HTTP errors, includes the status
 *   code and the beginning of the response body.
 */
function initiateExtraction(url, prompt) {
  // Construct the API endpoint URL for extraction.
  const endpoint = `${API_CONFIG.baseUrl}/extract`;

  // Wrap the user's request in fixed instructions so the LLM answers with a
  // single-key JSON object ({ response: ... }).
  const payload = {
    "urls": [url],
    "prompt":
      "Given the website content, fulfill the user's request. " +
      "It may be a direct instruction or a question about the contents of the website. " +
      "Here is the user's request: " + prompt +
      " Provide your concise response in a JSON. " +
      "Importantly, your JSON must only contain one key ('response') and value. " +
      "If you can't fulfill the request given the provided website context, " +
      "your JSON should be key ('response') and value ('Not found')"
  };

  // Set up the options for the HTTP POST request.
  const options = {
    'method': 'post',
    'contentType': 'application/json',
    'headers': {
      'Authorization': 'Bearer ' + API_CONFIG.key // Authenticate using API key
    },
    'payload': JSON.stringify(payload),
    'muteHttpExceptions': true, // HTTP error statuses are inspected below instead of thrown by fetch
    'timeout': API_CONFIG.timeout
  };

  try {
    const response = UrlFetchApp.fetch(endpoint, options);
    const responseCode = response.getResponseCode();

    // Surface HTTP failures with their status and (truncated) body rather than
    // collapsing them into null, so the caller can show the actual cause.
    if (responseCode < 200 || responseCode >= 300) {
      const bodyPreview = String(response.getContentText()).slice(0, 200);
      throw new Error(`HTTP ${responseCode} - ${bodyPreview}`);
    }

    const responseData = JSON.parse(response.getContentText());
    // Return the job ID only if the API reports success and actually provided one.
    return responseData.success && responseData.id ? responseData.id : null;
  } catch (e) {
    // Network errors, HTTP errors and JSON parse errors all end up here.
    throw new Error(`Failed to initiate extraction: ${e.message}`);
  }
}
/**
 * Polls the Firecrawl API to monitor the status of an extraction job until completion.
 * Uses exponential backoff between polls, bounded in two ways:
 *  - each individual wait is capped (API_CONFIG.maxDelay, default 5000 ms);
 *  - polling stops once the next wait would push the elapsed time past a total
 *    budget (API_CONFIG.maxTotalWait, default 25000 ms).
 *
 * Without these bounds the waits double from API_CONFIG.initialDelay on every
 * attempt (1 s, 2 s, ... 512 s with the shipped settings), which no spreadsheet
 * custom function can sit through.
 * NOTE(review): the defaults assume Google's published 30 s custom-function
 * runtime limit - confirm against the current Apps Script quotas.
 *
 * @param {string} extractionId - The ID of the initiated extraction job.
 * @returns {string} - The formatted extracted data or an error message.
 */
function monitorExtraction(extractionId) {
  // Both bounds are optional config fields; fall back to defaults when absent.
  const maxDelay = typeof API_CONFIG.maxDelay === 'number' ? API_CONFIG.maxDelay : 5000;
  const maxTotalWait = typeof API_CONFIG.maxTotalWait === 'number' ? API_CONFIG.maxTotalWait : 25000;
  const startedAt = Date.now();

  for (let attempt = 0; attempt < API_CONFIG.maxAttempts; attempt++) {
    // Exponential backoff, capped so a single wait never grows unbounded.
    const delay = Math.min(API_CONFIG.initialDelay * Math.pow(2, attempt), maxDelay);

    // Stop before sleeping if this wait would exceed the overall time budget.
    if (Date.now() - startedAt + delay > maxTotalWait) {
      break;
    }
    Utilities.sleep(delay);

    // Check the current status of the extraction job.
    const status = checkExtractionStatus(extractionId);

    // Transport/HTTP problems are reported through the `error` field.
    if (status.error) {
      return `Error: ${status.error}`;
    }

    switch (status.status) {
      case 'completed':
        // Finished: format and return the extracted data.
        return status.data ? formatData(status.data) : "Error: No data in completed response";
      case 'failed':
        return "Error: Extraction failed";
      case 'cancelled':
        return "Error: Extraction was cancelled";
      case 'pending':
      case 'processing':
        // Job is still in progress; this `break` leaves the switch only, so polling continues.
        break;
      default:
        // Any status this script does not know about is reported verbatim.
        return `Error: Unknown status - ${status.status}`;
    }
  }

  // Attempts or time budget exhausted without the job finishing.
  return "Error: Extraction timed out after maximum attempts";
}
/**
 * Checks the status of an extraction job by querying the Firecrawl API once.
 * Never throws: every failure is reported through the returned object's `error` field.
 *
 * @param {string} extractionId - The ID of the extraction job to check.
 * @returns {Object} - `{ status, data, error }`. On success `status` and `data` are
 *   copied from the API response and `error` is null. On any failure (HTTP error,
 *   response flagged `success: false`, network or JSON exception) `status` is
 *   'error', `data` is null and `error` holds a description.
 */
function checkExtractionStatus(extractionId) {
  // Construct the status endpoint URL; the ID is encoded so it cannot alter the path.
  const endpoint = `${API_CONFIG.baseUrl}/extract/${encodeURIComponent(extractionId)}`;
  const options = {
    'method': 'get',
    'headers': {
      'Authorization': 'Bearer ' + API_CONFIG.key // Use the API key for authentication
    },
    'muteHttpExceptions': true, // HTTP error statuses are inspected below instead of thrown by fetch
    'timeout': API_CONFIG.timeout
  };

  // Shared shape for every failure path.
  const failure = (message) => ({ status: 'error', data: null, error: message });

  try {
    const response = UrlFetchApp.fetch(endpoint, options);
    const responseCode = response.getResponseCode();

    if (responseCode < 200 || responseCode >= 300) {
      return failure(`HTTP Error: ${responseCode} - ${response.getContentText()}`);
    }

    const responseData = JSON.parse(response.getContentText());

    // A 2xx reply can still be flagged unsuccessful. Report that explicitly,
    // mirroring the `success` check done when the job is initiated, instead of
    // passing an undefined status on to the caller.
    if (responseData.success === false) {
      return failure(
        responseData.error ? String(responseData.error) : 'API reported an unsuccessful status request'
      );
    }

    return {
      status: responseData.status, // Job status: completed, pending, failed, etc.
      data: responseData.data,     // Extracted data if available
      error: null
    };
  } catch (e) {
    // Network failures and malformed JSON end up here.
    return failure(e.message);
  }
}
/**
 * Recursively formats the extracted data into a flat, comma-separated string.
 * Object keys are dropped; only values are kept, in property order.
 *
 * @param {any} data - The data to format (could be an object, array, or primitive).
 * @param {string} indent - Unused; kept so existing two-argument calls remain valid.
 * @returns {string} - "Not found" for null/undefined, the string form of a primitive,
 *   or the formatted elements/values of an array/object joined with ", ".
 */
function formatData(data, indent = '') {
  // null and undefined both mean "nothing was extracted".
  if (data === null || data === undefined) {
    return "Not found";
  }

  // Primitives (string, number, boolean, ...) are simply stringified.
  if (typeof data !== 'object') {
    return String(data);
  }

  // Arrays contribute their elements, objects their own enumerable values.
  // Object.keys is used instead of calling data.hasOwnProperty so that payloads
  // containing a "hasOwnProperty" key, or objects without a prototype, cannot
  // break the traversal.
  const values = Array.isArray(data)
    ? data
    : Object.keys(data).map(key => data[key]);

  return values.map(item => formatData(item)).join(', ');
}
/**
 * Processes and validates input URLs.
 * Accepts a bare domain, a full http(s) URL (optionally with path, query or
 * fragment) or an email address, and normalizes it to an https URL.
 *
 * Normalization steps: trim, drop an "http://" or "https://" scheme (any letter
 * case), reduce an email/userinfo prefix to what follows the last "@" of the
 * host part, drop a leading "www.", then validate the host.
 *
 * @param {string} input - Raw input which might be a URL, domain, or even an email.
 * @returns {string|null} - A sanitized URL starting with 'https://' or null if invalid.
 */
function processInput(input) {
  // Check that input is a non-empty string.
  if (!input || typeof input !== 'string') {
    return null;
  }

  // Trim whitespace and remove the scheme, whatever its letter case.
  const withoutScheme = input.trim().replace(/^https?:\/\//i, '');

  // Separate the host from everything after it (path, query, fragment) so that
  // the checks below apply to the host alone.
  const splitAt = withoutScheme.search(/[\/?#]/);
  let host = splitAt === -1 ? withoutScheme : withoutScheme.slice(0, splitAt);
  const rest = splitAt === -1 ? '' : withoutScheme.slice(splitAt);

  // An "@" in the host part means an email address (or URL userinfo): keep the
  // domain after it. An "@" that appears later, inside the path, is left alone.
  const atIndex = host.lastIndexOf('@');
  if (atIndex !== -1) {
    host = host.slice(atIndex + 1);
  }

  // Remove a leading "www." from the host.
  host = host.replace(/^www\./i, '');

  // Validate that the host looks like a domain (at least one dot) and that the
  // remainder contains no whitespace.
  const domainRegex = /^[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$/;
  if (!domainRegex.test(host) || /\s/.test(rest)) {
    return null;
  }

  // Always hand back an https URL.
  return 'https://' + host + rest;
}
/**
 * Centralized error handling that formats exceptions into user-friendly messages.
 * Accepts any thrown value, not only Error instances, and never throws itself.
 *
 * @param {Error} error - The error object caught during execution.
 * @returns {string} - "Error: Request timed out" for timeout-like failures,
 *   otherwise "Exception: " followed by the error's message.
 */
function handleError(error) {
  // Derive a message without assuming `error` is an Error: code may throw
  // strings or other values, and a failure here would mask the real problem.
  let message;
  if (error === null || error === undefined) {
    message = 'Unknown error';
  } else if (typeof error.message === 'string') {
    message = error.message;
  } else {
    message = String(error);
  }

  // Match timeout wording regardless of letter case ("timeout", "Timeout: <url>", ...).
  const lowered = message.toLowerCase();
  const looksLikeTimeout =
    lowered.includes('timeout') ||
    lowered.includes('timed out') ||
    lowered.includes('exceeded maximum execution time');
  if (looksLikeTimeout) {
    return "Error: Request timed out";
  }

  // For other errors, prefix with "Exception:" to indicate an unexpected issue.
  return "Exception: " + message;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment