andrewgcodes · January 20, 2025 19:40
diff --git a/firecrawl_extract_sheets.gs b/firecrawl_extract_sheets.gs
 /**
 * Firecrawl API settings shared by every function in this script.
 */
 const API_CONFIG = {
   // Firecrawl API key; sent as a Bearer token with each request.
   key: '...',
   // Root of the Firecrawl v1 REST API; endpoint paths are appended to it.
   baseUrl: 'https://api.firecrawl.dev/v1',
   // Value forwarded as the `timeout` fetch option (intended as seconds).
   timeout: 30,
   // Upper bound on status polls for a single extraction job.
   maxAttempts: 10,
   // Delay in milliseconds before the first status poll.
   initialDelay: 1000
 };

 /**
 * Google Sheets custom function that answers a prompt about a website via Firecrawl.
 * Validates its arguments, starts an extraction job, then polls until the job resolves.
 * Errors raised while starting or polling the job are converted to strings by handleError.
 *
 * @param {string} input - URL, domain, or other text accepted by processInput.
 * @param {string} prompt - What to extract or answer; must not be blank.
 * @returns {string} - The result from monitorExtraction, or an error message.
 */
 function EXTRACT(input, prompt) {
   // The key is read from API_CONFIG, so that is where the message has to point.
   if (!API_CONFIG.key) {
     return "Error: API key not found. Please set your Firecrawl API key in API_CONFIG.key.";
   }

   // processInput returns null for anything it cannot turn into an https URL.
   const url = processInput(input);
   if (!url) {
     return "Invalid URL format";
   }

   // A missing prompt would otherwise be concatenated into the request as the
   // literal text "undefined" and spend an API call on a meaningless query.
   if (prompt === undefined || prompt === null || String(prompt).trim() === '') {
     return "Error: Prompt is required";
   }

   try {
     // initiateExtraction returns null when the API did not accept the job.
     const extractionId = initiateExtraction(url, prompt);
     if (!extractionId) {
       return "Error: Failed to initiate extraction";
     }

     // Poll until the job resolves and return whatever the monitor reports.
     return monitorExtraction(extractionId);
   } catch (e) {
     return handleError(e);
   }
 }

 /**
 * Starts an extraction job by POSTing to the `/extract` endpoint under API_CONFIG.baseUrl.
 * The user's prompt is wrapped in fixed instructions that ask for a JSON object
 * with a single 'response' key.
 *
 * @param {string} url - The URL to extract from; sent as the only entry of `urls`.
 * @param {string} prompt - The user's request, embedded in the instruction text.
 * @returns {string|null} - The `id` from the response body when the call returns
 *   HTTP 2xx with a truthy `success`; otherwise null.
 * @throws {Error} - "Failed to initiate extraction: ..." when the fetch or the
 *   JSON parsing throws.
 */
 function initiateExtraction(url, prompt) {
   // Fixed instruction text placed before and after the user's request.
   const preamble =
     "Given the website content, fulfill the user's request. " +
     "It may be a direct instruction or a question about the contents of the website. " +
     "Here is the user's request: ";
   const closing =
     " Provide your concise response in a JSON. " +
     "Importantly, your JSON must only contain one key ('response') and value. " +
     "If you can't fulfill the request given the provided website context, " +
     "your JSON should be key ('response') and value ('Not found')";

   const body = {
     "urls": [url],
     "prompt": preamble + prompt + closing
   };

   // NOTE(review): `timeout` is forwarded as-is; confirm UrlFetchApp honors it.
   const request = {
     'method': 'post',
     'contentType': 'application/json',
     'headers': {
       'Authorization': 'Bearer ' + API_CONFIG.key
     },
     'payload': JSON.stringify(body),
     // HTTP error statuses are inspected below instead of being thrown.
     'muteHttpExceptions': true,
     'timeout': API_CONFIG.timeout
   };

   try {
     const reply = UrlFetchApp.fetch(`${API_CONFIG.baseUrl}/extract`, request);
     const code = reply.getResponseCode();

     // Anything outside 2xx means the job was not created.
     if (!(code >= 200 && code < 300)) {
       return null;
     }

     const parsed = JSON.parse(reply.getContentText());
     if (!parsed.success) {
       return null;
     }
     return parsed.id;
   } catch (e) {
     throw new Error(`Failed to initiate extraction: ${e.message}`);
   }
 }

 /**
 * Polls checkExtractionStatus until the job resolves, the attempt limit is hit,
 * or the total wait budget is used up.
 *
 * The delay doubles from API_CONFIG.initialDelay on each attempt but is capped,
 * and polling stops before the cumulative wait exceeds a budget. Without these
 * limits the default settings would sleep 1 s, 2 s, 4 s ... 512 s (about 17
 * minutes), far beyond the 30 seconds a Sheets custom function is allowed to run.
 *
 * Optional overrides (both in milliseconds): API_CONFIG.maxDelay (default 4000)
 * and API_CONFIG.maxWaitMs (default 25000).
 *
 * @param {string} extractionId - The job ID returned when the extraction was started.
 * @returns {string} - formatData(status.data) on completion, otherwise a string
 *   starting with "Error: ".
 */
 function monitorExtraction(extractionId) {
   const maxDelay = typeof API_CONFIG.maxDelay === 'number' ? API_CONFIG.maxDelay : 4000;
   // Kept below 30 s to leave room for the HTTP calls themselves.
   const waitBudget = typeof API_CONFIG.maxWaitMs === 'number' ? API_CONFIG.maxWaitMs : 25000;
   const startedAt = Date.now();

   for (let attempt = 0; attempt < API_CONFIG.maxAttempts; attempt++) {
     const delay = Math.min(API_CONFIG.initialDelay * Math.pow(2, attempt), maxDelay);

     // Always poll at least once; afterwards stop rather than overrun the budget.
     if (attempt > 0 && Date.now() - startedAt + delay > waitBudget) {
       break;
     }
     Utilities.sleep(delay);

     const status = checkExtractionStatus(extractionId);

     // Transport or HTTP failures are reported through the `error` field.
     if (status.error) {
       return `Error: ${status.error}`;
     }

     switch (status.status) {
       case 'completed':
         return status.data ? formatData(status.data) : "Error: No data in completed response";
       case 'failed':
         return "Error: Extraction failed";
       case 'cancelled':
         return "Error: Extraction was cancelled";
       case 'pending':
       case 'processing':
         // Still running: fall through to the next poll.
         break;
       default:
         return `Error: Unknown status - ${status.status}`;
     }
   }

   return "Error: Extraction timed out after maximum attempts";
 }

 /**
 * Fetches the current state of an extraction job with a GET to
 * `/extract/<extractionId>` under API_CONFIG.baseUrl.
 * Failures are reported through the returned object rather than thrown.
 *
 * @param {string} extractionId - The ID of the job to look up.
 * @returns {Object} - `{ status, data, error }`. On HTTP 2xx, `status` and `data`
 *   are copied from the parsed response body and `error` is null. Otherwise
 *   `status` is 'error', `data` is null and `error` describes the failure.
 */
 function checkExtractionStatus(extractionId) {
   // Shared shape for both the HTTP-error and the exception path.
   const failure = (message) => ({ status: 'error', data: null, error: message });

   const endpoint = `${API_CONFIG.baseUrl}/extract/${extractionId}`;
   // NOTE(review): `timeout` is forwarded as-is; confirm UrlFetchApp honors it.
   const request = {
     'method': 'get',
     'headers': {
       'Authorization': 'Bearer ' + API_CONFIG.key
     },
     // HTTP error statuses are inspected below instead of being thrown.
     'muteHttpExceptions': true,
     'timeout': API_CONFIG.timeout
   };

   try {
     const reply = UrlFetchApp.fetch(endpoint, request);
     const code = reply.getResponseCode();

     if (!(code >= 200 && code < 300)) {
       return failure(`HTTP Error: ${code} - ${reply.getContentText()}`);
     }

     const parsed = JSON.parse(reply.getContentText());
     return {
       status: parsed.status,
       data: parsed.data,
       error: null
     };
   } catch (e) {
     // Covers fetch failures as well as an unparseable response body.
     return failure(e.message);
   }
 }

 /**
 * Flattens extracted data into a single comma-separated string.
 * Arrays and objects are walked recursively; object keys are dropped and only
 * the values are kept, in property order.
 *
 * Object values are read with Object.values instead of `data.hasOwnProperty`,
 * so data that has a key named "hasOwnProperty", or no prototype at all, is
 * formatted instead of throwing.
 *
 * @param {any} data - The value to format (object, array, or primitive).
 * @param {string} indent - Unused; kept so existing callers keep working.
 * @returns {string} - "Not found" for null/undefined, String(data) for
 *   primitives, otherwise the formatted values joined with ", ".
 */
 function formatData(data, indent = '') {
   if (data === null || data === undefined) {
     return "Not found";
   }

   // Primitives (string, number, boolean, ...) are rendered directly.
   if (typeof data !== 'object') {
     return String(data);
   }

   // Arrays contribute their items, objects their own enumerable values.
   const values = Array.isArray(data) ? data : Object.values(data);
   return values.map(item => formatData(item)).join(', ');
 }

 /**
 * Turns user input into an `https://` URL, or returns null when it cannot.
 *
 * Two shapes are accepted:
 *  1. A bare domain or an email address ("example.com", "www.example.com",
 *     "user@example.com"): the domain is kept and any "www." prefix is dropped.
 *  2. A full URL with a path, query, or fragment ("https://example.com/pricing",
 *     "example.com/"): the part after the host is kept as typed. These inputs
 *     used to be rejected as invalid.
 *
 * The original bare-domain logic runs first, so every input that was accepted
 * before still produces exactly the same result.
 *
 * @param {string} input - Raw input: a URL, a domain, or an email address.
 * @returns {string|null} - A URL starting with 'https://', or null if the
 *   input is not a non-empty string or contains no valid host.
 */
 function processInput(input) {
   if (!input || typeof input !== 'string') {
     return null;
   }

   const trimmed = input.trim();
   const hostPattern = /^[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$/;

   // Shape 1: bare domain or email. For an email, keep what follows the "@".
   let host = trimmed.includes('@') ? trimmed.split('@')[1] : trimmed;
   host = host.replace(/^(https?:\/\/)?(www\.)?/, '');
   if (hostPattern.test(host)) {
     return 'https://' + host;
   }

   // Shape 2: optional scheme and "www.", a host, then a path/query/fragment
   // without whitespace. The scheme match is case-insensitive.
   const urlParts = /^(?:https?:\/\/)?(?:www\.)?([^\/?#\s]+)([\/?#]\S*)?$/i.exec(trimmed);
   if (urlParts && hostPattern.test(urlParts[1])) {
     return 'https://' + urlParts[1] + (urlParts[2] || '');
   }

   return null;
 }

 /**
 * Converts anything caught in a `catch` block into a message for display.
 * Accepts Error objects as well as thrown strings, null, or other values, so
 * the handler itself can never throw while reporting a failure.
 *
 * @param {Error|any} error - The caught value.
 * @returns {string} - "Error: Request timed out" when the message mentions a
 *   timeout or the execution-time limit, otherwise "Exception: " plus the message.
 */
 function handleError(error) {
   // Derive a message without assuming `error` is an Error instance.
   let message;
   if (error !== null && error !== undefined && typeof error.message === 'string') {
     message = error.message;
   } else if (error === null || error === undefined) {
     message = 'Unknown error';
   } else {
     message = String(error);
   }

   // Case-insensitive so that messages starting with "Timeout" are matched too.
   if (/timeout/i.test(message) || message.includes('Exceeded maximum execution time')) {
     return "Error: Request timed out";
   }
   return "Exception: " + message;
 }
	/**
	* Firecrawl API settings shared by every function in this script.
	*/
	const API_CONFIG = {
	  // Firecrawl API key; sent as a Bearer token with each request.
	  key: '...',
	  // Root of the Firecrawl v1 REST API; endpoint paths are appended to it.
	  baseUrl: 'https://api.firecrawl.dev/v1',
	  // Value forwarded as the `timeout` fetch option (intended as seconds).
	  timeout: 30,
	  // Upper bound on status polls for a single extraction job.
	  maxAttempts: 10,
	  // Delay in milliseconds before the first status poll.
	  initialDelay: 1000
	};

	/**
	* Google Sheets custom function that answers a prompt about a website via Firecrawl.
	* Validates its arguments, starts an extraction job, then polls until the job resolves.
	* Errors raised while starting or polling the job are converted to strings by handleError.
	*
	* @param {string} input - URL, domain, or other text accepted by processInput.
	* @param {string} prompt - What to extract or answer; must not be blank.
	* @returns {string} - The result from monitorExtraction, or an error message.
	*/
	function EXTRACT(input, prompt) {
	  // The key is read from API_CONFIG, so that is where the message has to point.
	  if (!API_CONFIG.key) {
	    return "Error: API key not found. Please set your Firecrawl API key in API_CONFIG.key.";
	  }

	  // processInput returns null for anything it cannot turn into an https URL.
	  const url = processInput(input);
	  if (!url) {
	    return "Invalid URL format";
	  }

	  // A missing prompt would otherwise be concatenated into the request as the
	  // literal text "undefined" and spend an API call on a meaningless query.
	  if (prompt === undefined || prompt === null || String(prompt).trim() === '') {
	    return "Error: Prompt is required";
	  }

	  try {
	    // initiateExtraction returns null when the API did not accept the job.
	    const extractionId = initiateExtraction(url, prompt);
	    if (!extractionId) {
	      return "Error: Failed to initiate extraction";
	    }

	    // Poll until the job resolves and return whatever the monitor reports.
	    return monitorExtraction(extractionId);
	  } catch (e) {
	    return handleError(e);
	  }
	}

	/**
	* Starts an extraction job by POSTing to the `/extract` endpoint under API_CONFIG.baseUrl.
	* The user's prompt is wrapped in fixed instructions that ask for a JSON object
	* with a single 'response' key.
	*
	* @param {string} url - The URL to extract from; sent as the only entry of `urls`.
	* @param {string} prompt - The user's request, embedded in the instruction text.
	* @returns {string|null} - The `id` from the response body when the call returns
	*   HTTP 2xx with a truthy `success`; otherwise null.
	* @throws {Error} - "Failed to initiate extraction: ..." when the fetch or the
	*   JSON parsing throws.
	*/
	function initiateExtraction(url, prompt) {
	  // Fixed instruction text placed before and after the user's request.
	  const preamble =
	    "Given the website content, fulfill the user's request. " +
	    "It may be a direct instruction or a question about the contents of the website. " +
	    "Here is the user's request: ";
	  const closing =
	    " Provide your concise response in a JSON. " +
	    "Importantly, your JSON must only contain one key ('response') and value. " +
	    "If you can't fulfill the request given the provided website context, " +
	    "your JSON should be key ('response') and value ('Not found')";

	  const body = {
	    "urls": [url],
	    "prompt": preamble + prompt + closing
	  };

	  // NOTE(review): `timeout` is forwarded as-is; confirm UrlFetchApp honors it.
	  const request = {
	    'method': 'post',
	    'contentType': 'application/json',
	    'headers': {
	      'Authorization': 'Bearer ' + API_CONFIG.key
	    },
	    'payload': JSON.stringify(body),
	    // HTTP error statuses are inspected below instead of being thrown.
	    'muteHttpExceptions': true,
	    'timeout': API_CONFIG.timeout
	  };

	  try {
	    const reply = UrlFetchApp.fetch(`${API_CONFIG.baseUrl}/extract`, request);
	    const code = reply.getResponseCode();

	    // Anything outside 2xx means the job was not created.
	    if (!(code >= 200 && code < 300)) {
	      return null;
	    }

	    const parsed = JSON.parse(reply.getContentText());
	    if (!parsed.success) {
	      return null;
	    }
	    return parsed.id;
	  } catch (e) {
	    throw new Error(`Failed to initiate extraction: ${e.message}`);
	  }
	}

	/**
	* Polls checkExtractionStatus until the job resolves, the attempt limit is hit,
	* or the total wait budget is used up.
	*
	* The delay doubles from API_CONFIG.initialDelay on each attempt but is capped,
	* and polling stops before the cumulative wait exceeds a budget. Without these
	* limits the default settings would sleep 1 s, 2 s, 4 s ... 512 s (about 17
	* minutes), far beyond the 30 seconds a Sheets custom function is allowed to run.
	*
	* Optional overrides (both in milliseconds): API_CONFIG.maxDelay (default 4000)
	* and API_CONFIG.maxWaitMs (default 25000).
	*
	* @param {string} extractionId - The job ID returned when the extraction was started.
	* @returns {string} - formatData(status.data) on completion, otherwise a string
	*   starting with "Error: ".
	*/
	function monitorExtraction(extractionId) {
	  const maxDelay = typeof API_CONFIG.maxDelay === 'number' ? API_CONFIG.maxDelay : 4000;
	  // Kept below 30 s to leave room for the HTTP calls themselves.
	  const waitBudget = typeof API_CONFIG.maxWaitMs === 'number' ? API_CONFIG.maxWaitMs : 25000;
	  const startedAt = Date.now();

	  for (let attempt = 0; attempt < API_CONFIG.maxAttempts; attempt++) {
	    const delay = Math.min(API_CONFIG.initialDelay * Math.pow(2, attempt), maxDelay);

	    // Always poll at least once; afterwards stop rather than overrun the budget.
	    if (attempt > 0 && Date.now() - startedAt + delay > waitBudget) {
	      break;
	    }
	    Utilities.sleep(delay);

	    const status = checkExtractionStatus(extractionId);

	    // Transport or HTTP failures are reported through the `error` field.
	    if (status.error) {
	      return `Error: ${status.error}`;
	    }

	    switch (status.status) {
	      case 'completed':
	        return status.data ? formatData(status.data) : "Error: No data in completed response";
	      case 'failed':
	        return "Error: Extraction failed";
	      case 'cancelled':
	        return "Error: Extraction was cancelled";
	      case 'pending':
	      case 'processing':
	        // Still running: fall through to the next poll.
	        break;
	      default:
	        return `Error: Unknown status - ${status.status}`;
	    }
	  }

	  return "Error: Extraction timed out after maximum attempts";
	}

	/**
	* Fetches the current state of an extraction job with a GET to
	* `/extract/<extractionId>` under API_CONFIG.baseUrl.
	* Failures are reported through the returned object rather than thrown.
	*
	* @param {string} extractionId - The ID of the job to look up.
	* @returns {Object} - `{ status, data, error }`. On HTTP 2xx, `status` and `data`
	*   are copied from the parsed response body and `error` is null. Otherwise
	*   `status` is 'error', `data` is null and `error` describes the failure.
	*/
	function checkExtractionStatus(extractionId) {
	  // Shared shape for both the HTTP-error and the exception path.
	  const failure = (message) => ({ status: 'error', data: null, error: message });

	  const endpoint = `${API_CONFIG.baseUrl}/extract/${extractionId}`;
	  // NOTE(review): `timeout` is forwarded as-is; confirm UrlFetchApp honors it.
	  const request = {
	    'method': 'get',
	    'headers': {
	      'Authorization': 'Bearer ' + API_CONFIG.key
	    },
	    // HTTP error statuses are inspected below instead of being thrown.
	    'muteHttpExceptions': true,
	    'timeout': API_CONFIG.timeout
	  };

	  try {
	    const reply = UrlFetchApp.fetch(endpoint, request);
	    const code = reply.getResponseCode();

	    if (!(code >= 200 && code < 300)) {
	      return failure(`HTTP Error: ${code} - ${reply.getContentText()}`);
	    }

	    const parsed = JSON.parse(reply.getContentText());
	    return {
	      status: parsed.status,
	      data: parsed.data,
	      error: null
	    };
	  } catch (e) {
	    // Covers fetch failures as well as an unparseable response body.
	    return failure(e.message);
	  }
	}

	/**
	* Flattens extracted data into a single comma-separated string.
	* Arrays and objects are walked recursively; object keys are dropped and only
	* the values are kept, in property order.
	*
	* Object values are read with Object.values instead of `data.hasOwnProperty`,
	* so data that has a key named "hasOwnProperty", or no prototype at all, is
	* formatted instead of throwing.
	*
	* @param {any} data - The value to format (object, array, or primitive).
	* @param {string} indent - Unused; kept so existing callers keep working.
	* @returns {string} - "Not found" for null/undefined, String(data) for
	*   primitives, otherwise the formatted values joined with ", ".
	*/
	function formatData(data, indent = '') {
	  if (data === null || data === undefined) {
	    return "Not found";
	  }

	  // Primitives (string, number, boolean, ...) are rendered directly.
	  if (typeof data !== 'object') {
	    return String(data);
	  }

	  // Arrays contribute their items, objects their own enumerable values.
	  const values = Array.isArray(data) ? data : Object.values(data);
	  return values.map(item => formatData(item)).join(', ');
	}

	/**
	* Turns user input into an `https://` URL, or returns null when it cannot.
	*
	* Two shapes are accepted:
	*  1. A bare domain or an email address ("example.com", "www.example.com",
	*     "user@example.com"): the domain is kept and any "www." prefix is dropped.
	*  2. A full URL with a path, query, or fragment ("https://example.com/pricing",
	*     "example.com/"): the part after the host is kept as typed. These inputs
	*     used to be rejected as invalid.
	*
	* The original bare-domain logic runs first, so every input that was accepted
	* before still produces exactly the same result.
	*
	* @param {string} input - Raw input: a URL, a domain, or an email address.
	* @returns {string|null} - A URL starting with 'https://', or null if the
	*   input is not a non-empty string or contains no valid host.
	*/
	function processInput(input) {
	  if (!input || typeof input !== 'string') {
	    return null;
	  }

	  const trimmed = input.trim();
	  const hostPattern = /^[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$/;

	  // Shape 1: bare domain or email. For an email, keep what follows the "@".
	  let host = trimmed.includes('@') ? trimmed.split('@')[1] : trimmed;
	  host = host.replace(/^(https?:\/\/)?(www\.)?/, '');
	  if (hostPattern.test(host)) {
	    return 'https://' + host;
	  }

	  // Shape 2: optional scheme and "www.", a host, then a path/query/fragment
	  // without whitespace. The scheme match is case-insensitive.
	  const urlParts = /^(?:https?:\/\/)?(?:www\.)?([^\/?#\s]+)([\/?#]\S*)?$/i.exec(trimmed);
	  if (urlParts && hostPattern.test(urlParts[1])) {
	    return 'https://' + urlParts[1] + (urlParts[2] || '');
	  }

	  return null;
	}

	/**
	* Converts anything caught in a `catch` block into a message for display.
	* Accepts Error objects as well as thrown strings, null, or other values, so
	* the handler itself can never throw while reporting a failure.
	*
	* @param {Error|any} error - The caught value.
	* @returns {string} - "Error: Request timed out" when the message mentions a
	*   timeout or the execution-time limit, otherwise "Exception: " plus the message.
	*/
	function handleError(error) {
	  // Derive a message without assuming `error` is an Error instance.
	  let message;
	  if (error !== null && error !== undefined && typeof error.message === 'string') {
	    message = error.message;
	  } else if (error === null || error === undefined) {
	    message = 'Unknown error';
	  } else {
	    message = String(error);
	  }

	  // Case-insensitive so that messages starting with "Timeout" are matched too.
	  if (/timeout/i.test(message) || message.includes('Exceeded maximum execution time')) {
	    return "Error: Request timed out";
	  }
	  return "Exception: " + message;
	}