angezanetti · March 26, 2025 20:32
diff --git a/scrap_ICP.js b/scrap_ICP.js
 // scrap_ICP.js (ES module)

 // ============================================
 // Import required modules
 // ============================================
 import fs from 'fs';
 import csv from 'csv-parser';
 import axios from 'axios';
 import { load } from 'cheerio';
 import pLimit from 'p-limit';
 import TurndownService from 'turndown';

 // ============================================
 // Define CSV column names mapping
 // ============================================
 const CSV_COLUMNS = {
   firstName: 'first_name',
   lastName: 'last_name',
   jobTitle: 'job_title',
   companyName: 'company',
   email: 'email',
   employees: 'employees',
   industry: 'industry',
   linkedinUrl: 'linkedin_url',
   website: 'company_website',
   city: 'city',
   country: 'country'
 };

 // ============================================
 // Runtime settings
 // ============================================
 // OpenAI chat-completions endpoint; the key is read from the OPENAI_API_KEY environment variable.
 const OPENAI_API_KEY = process.env.OPENAI_API_KEY;
 const OPENAI_API_URL = 'https://api.openai.com/v1/chat/completions';

 // Number of leads that may be processed at the same time.
 const concurrencyLimit = 10;
 const limit = pLimit(concurrencyLimit);

 // Shared HTML -> Markdown converter.
 const turndownService = new TurndownService();

 // Markdown longer than this many characters is summarized before the ICP prompt is built.
 const MAX_MARKDOWN_LENGTH = 10000;
 // Number of leading Markdown characters passed to the summary prompt.
 const SUMMARY_INPUT_LIMIT = 3000;

 // Property looked up in the model's JSON answer; also used as the last output column name.
 const TARGET_KEY = 'ICP';

 // Destination file for the results.
 const outputCsvFile = 'output.csv';

 // Open the output file for writing ('w' flag) and emit the header line immediately.
 const outputCsvStream = fs.createWriteStream(outputCsvFile, { flags: 'w' });
 const headerFields = [
   CSV_COLUMNS.firstName,
   CSV_COLUMNS.lastName,
   CSV_COLUMNS.jobTitle,
   CSV_COLUMNS.companyName,
   CSV_COLUMNS.email,
   CSV_COLUMNS.employees,
   CSV_COLUMNS.industry,
   CSV_COLUMNS.linkedinUrl,
   CSV_COLUMNS.website,
   CSV_COLUMNS.city,
   CSV_COLUMNS.country,
   TARGET_KEY
 ];
 const csvHeader = `${headerFields.join(',')}\n`;
 outputCsvStream.write(csvHeader);

 // ============================================
 // CSV value escape function
 // ============================================
 // Ensures that CSV values are properly escaped (e.g., for commas and quotes)
 /**
  * Escape one value for use as a CSV field.
  *
  * null and undefined become an empty field; anything else is converted with
  * String(). A field containing a comma, a double quote, or a line break
  * (LF or CR) is wrapped in double quotes, with embedded quotes doubled.
  *
  * @param {*} value - Raw cell value.
  * @returns {string} The field text, ready to be joined with commas.
  */
 function csvEscape(value) {
   if (value == null) return '';
   const text = String(value);
   // A bare carriage return is also a line break for CSV readers, so it must
   // force quoting just like "\n" does.
   if (!/[",\r\n]/.test(text)) return text;
   return `"${text.replace(/"/g, '""')}"`;
 }

 // ============================================
 // Function to extract a JSON object from a GPT response
 // ============================================
 /**
  * Extract the first parseable JSON object embedded in a model reply.
  *
  * The reply is scanned for "{" characters; for each one the matching "}" is
  * located (ignoring braces inside JSON strings) and the span is handed to
  * JSON.parse. The first span that parses is returned, so surrounding prose,
  * code fences, or stray braces after the object do not break extraction.
  *
  * @param {string} text - Raw reply text.
  * @returns {*} The parsed value, or null when nothing parseable is found
  *   (an error is logged in that case).
  */
 function parseJSONResponse(text) {
   if (typeof text !== 'string') {
     console.error("No JSON object found in response:", text);
     return null;
   }

   // Return the index of the "}" closing the object opened at `start`, or -1.
   const findObjectEnd = (start) => {
     let depth = 0;
     let inString = false;
     let escaped = false;
     for (let i = start; i < text.length; i++) {
       const ch = text[i];
       if (inString) {
         if (escaped) {
           escaped = false;
         } else if (ch === '\\') {
           escaped = true;
         } else if (ch === '"') {
           inString = false;
         }
       } else if (ch === '"') {
         inString = true;
       } else if (ch === '{') {
         depth++;
       } else if (ch === '}') {
         depth--;
         if (depth === 0) return i;
       }
     }
     return -1;
   };

   let lastError = null;
   for (let start = text.indexOf('{'); start !== -1; start = text.indexOf('{', start + 1)) {
     const end = findObjectEnd(start);
     if (end === -1) continue;
     try {
       return JSON.parse(text.slice(start, end + 1));
     } catch (err) {
       // Not valid JSON from this brace; remember why and try the next one.
       lastError = err;
     }
   }

   if (lastError) {
     console.error("Error parsing JSON from response:", lastError);
   } else {
     console.error("No JSON object found in response:", text);
   }
   return null;
 }

 // ============================================
 // Function to summarize long Markdown via GPT-3.5
 // ============================================
 /**
  * Ask the chat-completions API (gpt-3.5-turbo) for a summary of a Markdown text.
  *
  * Only the first SUMMARY_INPUT_LIMIT characters are included in the prompt.
  * When the request fails or yields no usable text, that same truncated
  * excerpt is returned so the result stays bounded in size.
  *
  * @param {string} markdownText - Markdown to summarize.
  * @returns {Promise<string>} The summary, or the truncated input on failure.
  */
 async function summarizeMarkdown(markdownText) {
   // Truncate the text to the allowed input limit first
   const markdownToSummarize = markdownText.slice(0, SUMMARY_INPUT_LIMIT);
   const summaryPrompt = `
 You are an expert in content summarization. Please provide a concise summary of the following Markdown text. Ensure the summary is well under 10,000 characters.
 -------------------------------------------------------
 ${markdownToSummarize}
 -------------------------------------------------------
 Provide only the summary in Markdown.
 `;
   try {
     const response = await axios.post(
       OPENAI_API_URL,
       {
         model: 'gpt-3.5-turbo',
         messages: [{ role: 'user', content: summaryPrompt }],
         temperature: 0.5,
         max_tokens: 1500,
       },
       {
         // Bound the request (scrapeWebsite does the same) so a stalled call
         // cannot occupy a concurrency slot indefinitely.
         timeout: 60000,
         headers: {
           'Content-Type': 'application/json',
           Authorization: `Bearer ${OPENAI_API_KEY}`,
         },
       }
     );
     const summary = response.data.choices[0].message.content;
     if (typeof summary !== 'string' || summary.trim() === '') {
       // Treat a missing/blank completion as a failure so callers always get text.
       throw new Error('OpenAI returned an empty summary');
     }
     console.log("Generated summary for long Markdown.");
     return summary;
   } catch (error) {
     console.error(
       "Error generating summary:",
       error.response ? error.response.data : error.message
     );
     // Fall back to the truncated excerpt rather than the full input, which
     // may be arbitrarily long.
     return markdownToSummarize;
   }
 }

 // ============================================
 // Function to call GPT and get the Ideal Customer Profile (ICP)
 // ============================================
 /**
  * Ask the chat-completions API (gpt-3.5-turbo) for the company's Ideal
  * Customer Profile, based on scraped website Markdown and the lead's fields.
  *
  * @param {string} markdownText - Website content in Markdown (may be empty).
  * @param {object} leadData - Lead fields interpolated into the prompt
  *   (firstName, lastName, jobTitle, companyName, email, employees, industry,
  *   linkedinUrl, website, city, country).
  * @returns {Promise<*>} Whatever parseJSONResponse extracts from the reply
  *   (expected to be an object with an "ICP" key), or null on any failure.
  */
 async function getICPFromChatGPT(markdownText, leadData) {
   const prompt = `
 You're an expert in website analysis and marketing strategy.
 Based on the content extracted from a company website (in Markdown) below:
 -------------------------------------------------------
 ${markdownText}
 -------------------------------------------------------
 and the following lead information:
 First Name: ${leadData.firstName}
 Last Name: ${leadData.lastName}
 Job Title: ${leadData.jobTitle}
 Company: ${leadData.companyName}
 Email: ${leadData.email}
 Employees: ${leadData.employees}
 Industry: ${leadData.industry}
 Linkedin URL: ${leadData.linkedinUrl}
 Website: ${leadData.website}
 City: ${leadData.city}
 Country: ${leadData.country}

 Determine the Ideal Customer Profile (ICP) for this company.
 Provide only a JSON object with the key "ICP" whose value is a short description (less than 4 words) in ENGLISH.
 If no relevant information is found, provide a default harmonious value.
 `;
   try {
     const response = await axios.post(
       OPENAI_API_URL,
       {
         model: 'gpt-3.5-turbo',
         messages: [{ role: 'user', content: prompt }],
         temperature: 0.7,
         max_tokens: 150,
       },
       {
         // Bound the request (scrapeWebsite does the same) so a stalled call
         // cannot occupy a concurrency slot indefinitely.
         timeout: 60000,
         headers: {
           'Content-Type': 'application/json',
           Authorization: `Bearer ${OPENAI_API_KEY}`,
         },
       }
     );
     const reply = response.data.choices[0].message.content;
     if (typeof reply !== 'string') {
       // A missing completion is not an API error; report it as such.
       console.error("No JSON object found in response:", reply);
       return null;
     }
     return parseJSONResponse(reply);
   } catch (error) {
     console.error(
       'Error calling OpenAI:',
       error.response ? error.response.data : error.message
     );
     return null;
   }
 }

 // ============================================
 // Function to fetch the website HTML, convert it to Markdown, and clean it up
 // ============================================
 /**
  * Download a page and return its <body> as whitespace-collapsed Markdown.
  *
  * @param {string} url - Absolute URL to fetch.
  * @returns {Promise<string>} The cleaned Markdown, or "" when the request
  *   fails or the response body is not text.
  */
 async function scrapeWebsite(url) {
   try {
     const response = await axios.get(url, {
       timeout: 30000,
       headers: {
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
       }
     });
     const html = response.data;
     if (typeof html !== 'string') {
       // e.g. a JSON endpoint: axios hands back a parsed object, not markup.
       console.error(`Error fetching HTML from ${url}:`, 'response body is not text');
       return "";
     }
     console.log(`Fetched HTML (${html.length} characters) from ${url}`);
     const $ = load(html);
     // Drop non-content elements so their raw text (JS, CSS) does not end up in
     // the Markdown and count against the length limit.
     $('script, style, noscript').remove();
     const bodyHTML = $('body').html() || "";

     // Convert HTML body to Markdown
     const markdown = turndownService.turndown(bodyHTML);
     // Collapse every run of whitespace to a single space
     const cleanedMarkdown = markdown.replace(/\s+/g, ' ').trim();
     console.log(`Generated Markdown (${cleanedMarkdown.length} characters) for ${url}`);
     return cleanedMarkdown;
   } catch (error) {
     console.error(`Error fetching HTML from ${url}:`, error.message);
     return "";
   }
 }

 // ============================================
 // Function to process an individual lead
 // ============================================
 /**
  * Process one lead: normalize its website URL, scrape the site, summarize
  * over-long content, ask for the ICP, and append one row to the output CSV.
  *
  * @param {object} lead - Parsed CSV row keyed by the CSV_COLUMNS header names.
  * @param {number} index - Zero-based position of the lead (logged as index + 1).
  * @param {number} total - Total number of leads, used for progress logging.
  * @returns {Promise<void>} Resolves after the row is written and a 500 ms pause.
  */
 async function processLead(lead, index, total) {
   // Pull the known columns out of the row; missing columns default to "".
   const {
     [CSV_COLUMNS.firstName]: firstName = "",
     [CSV_COLUMNS.lastName]: lastName = "",
     [CSV_COLUMNS.jobTitle]: jobTitle = "",
     [CSV_COLUMNS.companyName]: companyName = "",
     [CSV_COLUMNS.email]: email = "",
     [CSV_COLUMNS.employees]: employees = "",
     [CSV_COLUMNS.industry]: industry = "",
     [CSV_COLUMNS.linkedinUrl]: linkedinUrl = "",
     [CSV_COLUMNS.website]: websiteRaw = "",
     [CSV_COLUMNS.city]: city = "",
     [CSV_COLUMNS.country]: country = ""
   } = lead;

   console.log(`Raw URL value for ${companyName}: "${websiteRaw}"`);

   // Trim the URL and prepend a scheme only when none is present. Matching
   // "http://" / "https://" (not just an "http" prefix) keeps hosts such as
   // "httpbin.org" from being mistaken for absolute URLs.
   const trimmedWebsite = websiteRaw ? websiteRaw.trim() : "";
   let website = "";
   if (trimmedWebsite) {
     website = /^https?:\/\//i.test(trimmedWebsite) ? trimmedWebsite : `http://${trimmedWebsite}`;
   }

   if (!website) {
     console.warn(`Invalid URL for ${companyName}. Trimmed value: "${trimmedWebsite}"`);
   }

   const leadData = { firstName, lastName, jobTitle, companyName, email, employees, industry, linkedinUrl, website, city, country };

   console.log(`Processing lead ${index + 1} / ${total}: ${companyName}`);

   // Fetch the website content and convert it to Markdown
   let markdownText = "";
   if (website) {
     console.log(`Scraping URL: ${website}`);
     markdownText = (await scrapeWebsite(website)) || "";
     // If the Markdown is too long, summarize it via GPT-3.5
     if (markdownText.length > MAX_MARKDOWN_LENGTH) {
       console.log(`Markdown for ${companyName} exceeds ${MAX_MARKDOWN_LENGTH} characters. Summarizing...`);
       const truncatedMarkdown = markdownText.slice(0, SUMMARY_INPUT_LIMIT);
       const summary = await summarizeMarkdown(truncatedMarkdown);
       // Never continue with a non-string: fall back to the truncated excerpt.
       markdownText = typeof summary === 'string' ? summary : truncatedMarkdown;
       console.log(`Summary obtained for ${companyName} (${markdownText.length} characters).`);
     }
   } else {
     console.warn(`No website content scraped for ${companyName} because URL is empty.`);
   }

   // Call GPT to extract the ICP
   const icpObj = await getICPFromChatGPT(markdownText, leadData);
   let icpValue = "";
   if (icpObj && icpObj[TARGET_KEY]) {
     icpValue = icpObj[TARGET_KEY];
     console.log(`ICP for ${companyName}:`, icpValue);
   } else {
     console.log(`No ICP found for ${companyName}`);
   }

   // Build the output row in the same column order as the header
   const values = [
     firstName,
     lastName,
     jobTitle,
     companyName,
     email,
     employees,
     industry,
     linkedinUrl,
     website,
     city,
     country,
     icpValue
   ].map(csvEscape);

   outputCsvStream.write(values.join(',') + "\n");
   // Short delay between processing leads
   await new Promise((resolve) => setTimeout(resolve, 500));
 }

 // ============================================
 // Main function to read the CSV and process leads in parallel
 // ============================================
 /**
  * Read every row of the input CSV, then process the leads with bounded
  * concurrency (via the shared `limit` limiter).
  *
  * Rows are collected first so each lead gets its real zero-based index and
  * the final total for progress logging.
  *
  * @param {string} csvFilePath - Path of the CSV file to read.
  * @returns {Promise<void>} Resolves when every lead has been processed;
  *   rejects on a read/parse error or, after all leads have settled, with the
  *   first lead-processing error.
  */
 async function processLeads(csvFilePath) {
   return new Promise((resolve, reject) => {
     const rows = [];
     const onStreamError = (err) => {
       console.error("Error reading CSV:", err);
       reject(err);
     };

     fs.createReadStream(csvFilePath)
       // pipe() does not forward source errors (e.g. a missing file), so the
       // file stream needs its own handler.
       .on('error', onStreamError)
       .pipe(csv())
       .on('data', (row) => {
         rows.push(row);
       })
       .on('end', async () => {
         const totalLeads = rows.length;
         console.log(`CSV read complete, processing ${totalLeads} leads.`);
         // Use pLimit to limit concurrency
         const tasks = rows.map((row, index) => limit(() => processLead(row, index, totalLeads)));
         // Wait for every lead to settle so no task is still writing to the
         // output when the caller is told about a failure.
         const outcomes = await Promise.allSettled(tasks);
         const failures = outcomes.filter((outcome) => outcome.status === 'rejected');
         if (failures.length > 0) {
           reject(failures[0].reason);
           return;
         }
         console.log('Processing finished.');
         resolve();
       })
       .on('error', onStreamError);
   });
 }

 // ============================================
 // Start processing: run the script with a CSV file path as an argument
 // ============================================
 // The input CSV path is the first command-line argument.
 const csvFilePath = process.argv[2];
 if (!csvFilePath) {
   console.error("Please specify the path to the CSV file as an argument.");
   process.exit(1);
 }

 processLeads(csvFilePath)
   .then(() => {
     console.log(`Results have been written to ${outputCsvFile}`);
   })
   .catch((err) => {
     console.error("Error during processing:", err);
     // Report the failure to the calling shell instead of exiting with 0.
     process.exitCode = 1;
   })
   .finally(() => {
     // Close the output file on success and on failure alike.
     outputCsvStream.end();
   });
	// scrap_ICP.js (ES module)

	// ============================================
	// Import required modules
	// ============================================
	import fs from 'fs';
	import csv from 'csv-parser';
	import axios from 'axios';
	import { load } from 'cheerio';
	import pLimit from 'p-limit';
	import TurndownService from 'turndown';

	// ============================================
	// Define CSV column names mapping
	// ============================================
	const CSV_COLUMNS = {
	  firstName: 'first_name',
	  lastName: 'last_name',
	  jobTitle: 'job_title',
	  companyName: 'company',
	  email: 'email',
	  employees: 'employees',
	  industry: 'industry',
	  linkedinUrl: 'linkedin_url',
	  website: 'company_website',
	  city: 'city',
	  country: 'country'
	};

	// ============================================
	// Runtime settings
	// ============================================
	// OpenAI chat-completions endpoint; the key is read from the OPENAI_API_KEY environment variable.
	const OPENAI_API_KEY = process.env.OPENAI_API_KEY;
	const OPENAI_API_URL = 'https://api.openai.com/v1/chat/completions';

	// Number of leads that may be processed at the same time.
	const concurrencyLimit = 10;
	const limit = pLimit(concurrencyLimit);

	// Shared HTML -> Markdown converter.
	const turndownService = new TurndownService();

	// Markdown longer than this many characters is summarized before the ICP prompt is built.
	const MAX_MARKDOWN_LENGTH = 10000;
	// Number of leading Markdown characters passed to the summary prompt.
	const SUMMARY_INPUT_LIMIT = 3000;

	// Property looked up in the model's JSON answer; also used as the last output column name.
	const TARGET_KEY = 'ICP';

	// Destination file for the results.
	const outputCsvFile = 'output.csv';

	// Open the output file for writing ('w' flag) and emit the header line immediately.
	const outputCsvStream = fs.createWriteStream(outputCsvFile, { flags: 'w' });
	const headerFields = [
	  CSV_COLUMNS.firstName,
	  CSV_COLUMNS.lastName,
	  CSV_COLUMNS.jobTitle,
	  CSV_COLUMNS.companyName,
	  CSV_COLUMNS.email,
	  CSV_COLUMNS.employees,
	  CSV_COLUMNS.industry,
	  CSV_COLUMNS.linkedinUrl,
	  CSV_COLUMNS.website,
	  CSV_COLUMNS.city,
	  CSV_COLUMNS.country,
	  TARGET_KEY
	];
	const csvHeader = `${headerFields.join(',')}\n`;
	outputCsvStream.write(csvHeader);

	// ============================================
	// CSV value escape function
	// ============================================
	// Ensures that CSV values are properly escaped (e.g., for commas and quotes)
	/**
	 * Escape one value for use as a CSV field.
	 *
	 * null and undefined become an empty field; anything else is converted with
	 * String(). A field containing a comma, a double quote, or a line break
	 * (LF or CR) is wrapped in double quotes, with embedded quotes doubled.
	 *
	 * @param {*} value - Raw cell value.
	 * @returns {string} The field text, ready to be joined with commas.
	 */
	function csvEscape(value) {
	  if (value == null) return '';
	  const text = String(value);
	  // A bare carriage return is also a line break for CSV readers, so it must
	  // force quoting just like "\n" does.
	  if (!/[",\r\n]/.test(text)) return text;
	  return `"${text.replace(/"/g, '""')}"`;
	}

	// ============================================
	// Function to extract a JSON object from a GPT response
	// ============================================
	/**
	 * Extract the first parseable JSON object embedded in a model reply.
	 *
	 * The reply is scanned for "{" characters; for each one the matching "}" is
	 * located (ignoring braces inside JSON strings) and the span is handed to
	 * JSON.parse. The first span that parses is returned, so surrounding prose,
	 * code fences, or stray braces after the object do not break extraction.
	 *
	 * @param {string} text - Raw reply text.
	 * @returns {*} The parsed value, or null when nothing parseable is found
	 *   (an error is logged in that case).
	 */
	function parseJSONResponse(text) {
	  if (typeof text !== 'string') {
	    console.error("No JSON object found in response:", text);
	    return null;
	  }

	  // Return the index of the "}" closing the object opened at `start`, or -1.
	  const findObjectEnd = (start) => {
	    let depth = 0;
	    let inString = false;
	    let escaped = false;
	    for (let i = start; i < text.length; i++) {
	      const ch = text[i];
	      if (inString) {
	        if (escaped) {
	          escaped = false;
	        } else if (ch === '\\') {
	          escaped = true;
	        } else if (ch === '"') {
	          inString = false;
	        }
	      } else if (ch === '"') {
	        inString = true;
	      } else if (ch === '{') {
	        depth++;
	      } else if (ch === '}') {
	        depth--;
	        if (depth === 0) return i;
	      }
	    }
	    return -1;
	  };

	  let lastError = null;
	  for (let start = text.indexOf('{'); start !== -1; start = text.indexOf('{', start + 1)) {
	    const end = findObjectEnd(start);
	    if (end === -1) continue;
	    try {
	      return JSON.parse(text.slice(start, end + 1));
	    } catch (err) {
	      // Not valid JSON from this brace; remember why and try the next one.
	      lastError = err;
	    }
	  }

	  if (lastError) {
	    console.error("Error parsing JSON from response:", lastError);
	  } else {
	    console.error("No JSON object found in response:", text);
	  }
	  return null;
	}

	// ============================================
	// Function to summarize long Markdown via GPT-3.5
	// ============================================
	/**
	 * Ask the chat-completions API (gpt-3.5-turbo) for a summary of a Markdown text.
	 *
	 * Only the first SUMMARY_INPUT_LIMIT characters are included in the prompt.
	 * When the request fails or yields no usable text, that same truncated
	 * excerpt is returned so the result stays bounded in size.
	 *
	 * @param {string} markdownText - Markdown to summarize.
	 * @returns {Promise<string>} The summary, or the truncated input on failure.
	 */
	async function summarizeMarkdown(markdownText) {
	  // Truncate the text to the allowed input limit first
	  const markdownToSummarize = markdownText.slice(0, SUMMARY_INPUT_LIMIT);
	  const summaryPrompt = `
	You are an expert in content summarization. Please provide a concise summary of the following Markdown text. Ensure the summary is well under 10,000 characters.
	-------------------------------------------------------
	${markdownToSummarize}
	-------------------------------------------------------
	Provide only the summary in Markdown.
	`;
	  try {
	    const response = await axios.post(
	      OPENAI_API_URL,
	      {
	        model: 'gpt-3.5-turbo',
	        messages: [{ role: 'user', content: summaryPrompt }],
	        temperature: 0.5,
	        max_tokens: 1500,
	      },
	      {
	        // Bound the request (scrapeWebsite does the same) so a stalled call
	        // cannot occupy a concurrency slot indefinitely.
	        timeout: 60000,
	        headers: {
	          'Content-Type': 'application/json',
	          Authorization: `Bearer ${OPENAI_API_KEY}`,
	        },
	      }
	    );
	    const summary = response.data.choices[0].message.content;
	    if (typeof summary !== 'string' || summary.trim() === '') {
	      // Treat a missing/blank completion as a failure so callers always get text.
	      throw new Error('OpenAI returned an empty summary');
	    }
	    console.log("Generated summary for long Markdown.");
	    return summary;
	  } catch (error) {
	    console.error(
	      "Error generating summary:",
	      error.response ? error.response.data : error.message
	    );
	    // Fall back to the truncated excerpt rather than the full input, which
	    // may be arbitrarily long.
	    return markdownToSummarize;
	  }
	}

	// ============================================
	// Function to call GPT and get the Ideal Customer Profile (ICP)
	// ============================================
	/**
	 * Ask the chat-completions API (gpt-3.5-turbo) for the company's Ideal
	 * Customer Profile, based on scraped website Markdown and the lead's fields.
	 *
	 * @param {string} markdownText - Website content in Markdown (may be empty).
	 * @param {object} leadData - Lead fields interpolated into the prompt
	 *   (firstName, lastName, jobTitle, companyName, email, employees, industry,
	 *   linkedinUrl, website, city, country).
	 * @returns {Promise<*>} Whatever parseJSONResponse extracts from the reply
	 *   (expected to be an object with an "ICP" key), or null on any failure.
	 */
	async function getICPFromChatGPT(markdownText, leadData) {
	  const prompt = `
	You're an expert in website analysis and marketing strategy.
	Based on the content extracted from a company website (in Markdown) below:
	-------------------------------------------------------
	${markdownText}
	-------------------------------------------------------
	and the following lead information:
	First Name: ${leadData.firstName}
	Last Name: ${leadData.lastName}
	Job Title: ${leadData.jobTitle}
	Company: ${leadData.companyName}
	Email: ${leadData.email}
	Employees: ${leadData.employees}
	Industry: ${leadData.industry}
	Linkedin URL: ${leadData.linkedinUrl}
	Website: ${leadData.website}
	City: ${leadData.city}
	Country: ${leadData.country}

	Determine the Ideal Customer Profile (ICP) for this company.
	Provide only a JSON object with the key "ICP" whose value is a short description (less than 4 words) in ENGLISH.
	If no relevant information is found, provide a default harmonious value.
	`;
	  try {
	    const response = await axios.post(
	      OPENAI_API_URL,
	      {
	        model: 'gpt-3.5-turbo',
	        messages: [{ role: 'user', content: prompt }],
	        temperature: 0.7,
	        max_tokens: 150,
	      },
	      {
	        // Bound the request (scrapeWebsite does the same) so a stalled call
	        // cannot occupy a concurrency slot indefinitely.
	        timeout: 60000,
	        headers: {
	          'Content-Type': 'application/json',
	          Authorization: `Bearer ${OPENAI_API_KEY}`,
	        },
	      }
	    );
	    const reply = response.data.choices[0].message.content;
	    if (typeof reply !== 'string') {
	      // A missing completion is not an API error; report it as such.
	      console.error("No JSON object found in response:", reply);
	      return null;
	    }
	    return parseJSONResponse(reply);
	  } catch (error) {
	    console.error(
	      'Error calling OpenAI:',
	      error.response ? error.response.data : error.message
	    );
	    return null;
	  }
	}

	// ============================================
	// Function to fetch the website HTML, convert it to Markdown, and clean it up
	// ============================================
	/**
	 * Download a page and return its <body> as whitespace-collapsed Markdown.
	 *
	 * @param {string} url - Absolute URL to fetch.
	 * @returns {Promise<string>} The cleaned Markdown, or "" when the request
	 *   fails or the response body is not text.
	 */
	async function scrapeWebsite(url) {
	  try {
	    const response = await axios.get(url, {
	      timeout: 30000,
	      headers: {
	        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
	      }
	    });
	    const html = response.data;
	    if (typeof html !== 'string') {
	      // e.g. a JSON endpoint: axios hands back a parsed object, not markup.
	      console.error(`Error fetching HTML from ${url}:`, 'response body is not text');
	      return "";
	    }
	    console.log(`Fetched HTML (${html.length} characters) from ${url}`);
	    const $ = load(html);
	    // Drop non-content elements so their raw text (JS, CSS) does not end up in
	    // the Markdown and count against the length limit.
	    $('script, style, noscript').remove();
	    // Plain logical OR; the "\|\|" in the pasted text was a markdown escape
	    // and is not valid JavaScript.
	    const bodyHTML = $('body').html() || "";

	    // Convert HTML body to Markdown
	    const markdown = turndownService.turndown(bodyHTML);
	    // Collapse every run of whitespace to a single space
	    const cleanedMarkdown = markdown.replace(/\s+/g, ' ').trim();
	    console.log(`Generated Markdown (${cleanedMarkdown.length} characters) for ${url}`);
	    return cleanedMarkdown;
	  } catch (error) {
	    console.error(`Error fetching HTML from ${url}:`, error.message);
	    return "";
	  }
	}

	// ============================================
	// Function to process an individual lead
	// ============================================
	/**
	 * Process one lead: normalize its website URL, scrape the site, summarize
	 * over-long content, ask for the ICP, and append one row to the output CSV.
	 *
	 * @param {object} lead - Parsed CSV row keyed by the CSV_COLUMNS header names.
	 * @param {number} index - Zero-based position of the lead (logged as index + 1).
	 * @param {number} total - Total number of leads, used for progress logging.
	 * @returns {Promise<void>} Resolves after the row is written and a 500 ms pause.
	 */
	async function processLead(lead, index, total) {
	  // Pull the known columns out of the row; missing columns default to "".
	  const {
	    [CSV_COLUMNS.firstName]: firstName = "",
	    [CSV_COLUMNS.lastName]: lastName = "",
	    [CSV_COLUMNS.jobTitle]: jobTitle = "",
	    [CSV_COLUMNS.companyName]: companyName = "",
	    [CSV_COLUMNS.email]: email = "",
	    [CSV_COLUMNS.employees]: employees = "",
	    [CSV_COLUMNS.industry]: industry = "",
	    [CSV_COLUMNS.linkedinUrl]: linkedinUrl = "",
	    [CSV_COLUMNS.website]: websiteRaw = "",
	    [CSV_COLUMNS.city]: city = "",
	    [CSV_COLUMNS.country]: country = ""
	  } = lead;

	  console.log(`Raw URL value for ${companyName}: "${websiteRaw}"`);

	  // Trim the URL and prepend a scheme only when none is present. Matching
	  // "http://" / "https://" (not just an "http" prefix) keeps hosts such as
	  // "httpbin.org" from being mistaken for absolute URLs.
	  const trimmedWebsite = websiteRaw ? websiteRaw.trim() : "";
	  let website = "";
	  if (trimmedWebsite) {
	    website = /^https?:\/\//i.test(trimmedWebsite) ? trimmedWebsite : `http://${trimmedWebsite}`;
	  }

	  if (!website) {
	    console.warn(`Invalid URL for ${companyName}. Trimmed value: "${trimmedWebsite}"`);
	  }

	  const leadData = { firstName, lastName, jobTitle, companyName, email, employees, industry, linkedinUrl, website, city, country };

	  console.log(`Processing lead ${index + 1} / ${total}: ${companyName}`);

	  // Fetch the website content and convert it to Markdown
	  let markdownText = "";
	  if (website) {
	    console.log(`Scraping URL: ${website}`);
	    markdownText = (await scrapeWebsite(website)) || "";
	    // If the Markdown is too long, summarize it via GPT-3.5
	    if (markdownText.length > MAX_MARKDOWN_LENGTH) {
	      console.log(`Markdown for ${companyName} exceeds ${MAX_MARKDOWN_LENGTH} characters. Summarizing...`);
	      const truncatedMarkdown = markdownText.slice(0, SUMMARY_INPUT_LIMIT);
	      const summary = await summarizeMarkdown(truncatedMarkdown);
	      // Never continue with a non-string: fall back to the truncated excerpt.
	      markdownText = typeof summary === 'string' ? summary : truncatedMarkdown;
	      console.log(`Summary obtained for ${companyName} (${markdownText.length} characters).`);
	    }
	  } else {
	    console.warn(`No website content scraped for ${companyName} because URL is empty.`);
	  }

	  // Call GPT to extract the ICP
	  const icpObj = await getICPFromChatGPT(markdownText, leadData);
	  let icpValue = "";
	  if (icpObj && icpObj[TARGET_KEY]) {
	    icpValue = icpObj[TARGET_KEY];
	    console.log(`ICP for ${companyName}:`, icpValue);
	  } else {
	    console.log(`No ICP found for ${companyName}`);
	  }

	  // Build the output row in the same column order as the header
	  const values = [
	    firstName,
	    lastName,
	    jobTitle,
	    companyName,
	    email,
	    employees,
	    industry,
	    linkedinUrl,
	    website,
	    city,
	    country,
	    icpValue
	  ].map(csvEscape);

	  outputCsvStream.write(values.join(',') + "\n");
	  // Short delay between processing leads
	  await new Promise((resolve) => setTimeout(resolve, 500));
	}

	// ============================================
	// Main function to read the CSV and process leads in parallel
	// ============================================
	/**
	 * Read every row of the input CSV, then process the leads with bounded
	 * concurrency (via the shared `limit` limiter).
	 *
	 * Rows are collected first so each lead gets its real zero-based index and
	 * the final total for progress logging.
	 *
	 * @param {string} csvFilePath - Path of the CSV file to read.
	 * @returns {Promise<void>} Resolves when every lead has been processed;
	 *   rejects on a read/parse error or, after all leads have settled, with the
	 *   first lead-processing error.
	 */
	async function processLeads(csvFilePath) {
	  return new Promise((resolve, reject) => {
	    const rows = [];
	    const onStreamError = (err) => {
	      console.error("Error reading CSV:", err);
	      reject(err);
	    };

	    fs.createReadStream(csvFilePath)
	      // pipe() does not forward source errors (e.g. a missing file), so the
	      // file stream needs its own handler.
	      .on('error', onStreamError)
	      .pipe(csv())
	      .on('data', (row) => {
	        rows.push(row);
	      })
	      .on('end', async () => {
	        const totalLeads = rows.length;
	        console.log(`CSV read complete, processing ${totalLeads} leads.`);
	        // Use pLimit to limit concurrency
	        const tasks = rows.map((row, index) => limit(() => processLead(row, index, totalLeads)));
	        // Wait for every lead to settle so no task is still writing to the
	        // output when the caller is told about a failure.
	        const outcomes = await Promise.allSettled(tasks);
	        const failures = outcomes.filter((outcome) => outcome.status === 'rejected');
	        if (failures.length > 0) {
	          reject(failures[0].reason);
	          return;
	        }
	        console.log('Processing finished.');
	        resolve();
	      })
	      .on('error', onStreamError);
	  });
	}

	// ============================================
	// Start processing: run the script with a CSV file path as an argument
	// ============================================
	// The input CSV path is the first command-line argument.
	const csvFilePath = process.argv[2];
	if (!csvFilePath) {
	  console.error("Please specify the path to the CSV file as an argument.");
	  process.exit(1);
	}

	processLeads(csvFilePath)
	  .then(() => {
	    console.log(`Results have been written to ${outputCsvFile}`);
	  })
	  .catch((err) => {
	    console.error("Error during processing:", err);
	    // Report the failure to the calling shell instead of exiting with 0.
	    process.exitCode = 1;
	  })
	  .finally(() => {
	    // Close the output file on success and on failure alike.
	    outputCsvStream.end();
	  });