Skip to content

Instantly share code, notes, and snippets.

@angezanetti
Created March 26, 2025 20:32
Show Gist options
  • Save angezanetti/1842cc23fb0d793da7341d7c6c586083 to your computer and use it in GitHub Desktop.
Save angezanetti/1842cc23fb0d793da7341d7c6c586083 to your computer and use it in GitHub Desktop.
// scraper-icp.js (ES module)
// Enriches a CSV of sales leads with an AI-derived "Ideal Customer Profile"
// (ICP): for each lead the company website is scraped, converted to Markdown,
// and sent to GPT-3.5 which returns a short ICP label written to output.csv.
// ============================================
// Import required modules
// ============================================
import fs from 'fs'; // read input CSV / write output CSV
import csv from 'csv-parser'; // streaming CSV row parser
import axios from 'axios'; // HTTP client (website scraping + OpenAI API)
import { load } from 'cheerio'; // server-side HTML parsing
import pLimit from 'p-limit'; // caps the number of leads processed concurrently
import TurndownService from 'turndown'; // HTML -> Markdown conversion
// ============================================
// Define CSV column names mapping
// ============================================
// Header names expected in the INPUT CSV. The same values (in this order)
// are reused for the output header, so the output mirrors the input columns
// plus the appended ICP column.
const CSV_COLUMNS = {
firstName: "first_name",
lastName: "last_name",
jobTitle: "job_title",
companyName: "company",
email: "email",
employees: "employees",
industry: "industry",
linkedinUrl: "linkedin_url",
website: "company_website",
city: "city",
country: "country"
};
// ============================================
// Configuration and initialization
// ============================================
// OpenAI API configuration (ensure you have set OPENAI_API_KEY in your environment)
// NOTE(review): if OPENAI_API_KEY is unset this stays undefined and every API
// call will fail with 401 — consider failing fast here.
const OPENAI_API_KEY = process.env.OPENAI_API_KEY;
const OPENAI_API_URL = 'https://api.openai.com/v1/chat/completions';
// Limit concurrent lead processing (e.g., 10 at a time)
const concurrencyLimit = 10;
const limit = pLimit(concurrencyLimit);
// Initialize Turndown service to convert HTML to Markdown
const turndownService = new TurndownService();
// Maximum allowed length for Markdown before summarizing it
const MAX_MARKDOWN_LENGTH = 10000;
// Maximum input length for the summary prompt (only first 3000 characters are sent)
const SUMMARY_INPUT_LIMIT = 3000;
// Key used for the extracted Ideal Customer Profile (ICP)
// (must match the JSON key requested in the GPT prompt below)
const TARGET_KEY = "ICP";
// Output CSV file path
const outputCsvFile = 'output.csv';
// Create a writable stream for the output CSV and write the header row.
// Side effect at import time: flags 'w' truncates any existing output.csv.
const outputCsvStream = fs.createWriteStream(outputCsvFile, { flags: 'w' });
// Header row = all input columns (in CSV_COLUMNS order) + the ICP column.
const csvHeader = [
CSV_COLUMNS.firstName,
CSV_COLUMNS.lastName,
CSV_COLUMNS.jobTitle,
CSV_COLUMNS.companyName,
CSV_COLUMNS.email,
CSV_COLUMNS.employees,
CSV_COLUMNS.industry,
CSV_COLUMNS.linkedinUrl,
CSV_COLUMNS.website,
CSV_COLUMNS.city,
CSV_COLUMNS.country,
TARGET_KEY
].join(',') + "\n";
outputCsvStream.write(csvHeader);
// ============================================
// CSV value escape function
// ============================================
// Ensures that CSV values are properly escaped (e.g., for commas and quotes)
/**
 * Escape a single value for inclusion in a CSV row (RFC 4180 style).
 *
 * Null/undefined become the empty string. A value containing a comma,
 * double quote, newline, or carriage return is wrapped in double quotes,
 * with embedded quotes doubled.
 *
 * Fixes vs. the original: a bare carriage return (`\r`) now also triggers
 * quoting (previously it passed through unquoted and could break rows),
 * and the redundant second `includes('"')` check is gone.
 *
 * @param {*} value - Raw cell value (any type; stringified with String()).
 * @returns {string} CSV-safe cell text.
 */
function csvEscape(value) {
  if (value == null) return '';
  let str = String(value);
  // Quote whenever any CSV-special character is present.
  if (/[",\n\r]/.test(str)) {
    str = `"${str.replace(/"/g, '""')}"`;
  }
  return str;
}
// ============================================
// Function to extract a JSON object from a GPT response
// ============================================
/**
 * Extract and parse the JSON object embedded in a GPT reply.
 *
 * Uses a greedy match (first "{" through last "}") so a single object
 * surrounded by prose is recovered intact.
 *
 * @param {string} text - Raw model reply.
 * @returns {object|null} Parsed object, or null when none is found/parseable.
 */
function parseJSONResponse(text) {
  const match = text.match(/{[\s\S]*}/);
  if (!match) {
    console.error("No JSON object found in response:", text);
    return null;
  }
  try {
    return JSON.parse(match[0]);
  } catch (err) {
    console.error("Error parsing JSON from response:", err);
    return null;
  }
}
// ============================================
// Function to summarize long Markdown via GPT-3.5
// ============================================
/**
 * Summarize long Markdown content via GPT-3.5.
 *
 * Only the first SUMMARY_INPUT_LIMIT characters of the input are sent.
 *
 * @param {string} markdownText - Markdown to condense.
 * @returns {Promise<string>} The summary, or the original text if the API call fails.
 */
async function summarizeMarkdown(markdownText) {
  const excerpt = markdownText.slice(0, SUMMARY_INPUT_LIMIT);
  const summaryPrompt = `
You are an expert in content summarization. Please provide a concise summary of the following Markdown text. Ensure the summary is well under 10,000 characters.
-------------------------------------------------------
${excerpt}
-------------------------------------------------------
Provide only the summary in Markdown.
`;
  const requestBody = {
    model: 'gpt-3.5-turbo',
    messages: [{ role: 'user', content: summaryPrompt }],
    temperature: 0.5,
    max_tokens: 1500,
  };
  const requestConfig = {
    headers: {
      'Content-Type': 'application/json',
      Authorization: `Bearer ${OPENAI_API_KEY}`,
    },
  };
  try {
    const { data } = await axios.post(OPENAI_API_URL, requestBody, requestConfig);
    const summary = data.choices[0].message.content;
    console.log("Generated summary for long Markdown.");
    return summary;
  } catch (error) {
    console.error(
      "Error generating summary:",
      error.response ? error.response.data : error.message
    );
    // Best-effort: fall back to the untouched input on failure.
    return markdownText;
  }
}
// ============================================
// Function to call GPT and get the Ideal Customer Profile (ICP)
// ============================================
/**
 * Ask GPT-3.5 for the company's Ideal Customer Profile (ICP).
 *
 * Sends the scraped website Markdown plus the lead's details and expects a
 * JSON object with a single "ICP" key in the reply.
 *
 * @param {string} markdownText - Website content in Markdown (possibly empty).
 * @param {object} leadData - Normalized lead fields (firstName, companyName, ...).
 * @returns {Promise<object|null>} Parsed {ICP: "..."} object, or null on failure.
 */
async function getICPFromChatGPT(markdownText, leadData) {
  const {
    firstName, lastName, jobTitle, companyName, email,
    employees, industry, linkedinUrl, website, city, country,
  } = leadData;
  const prompt = `
You're an expert in website analysis and marketing strategy.
Based on the content extracted from a company website (in Markdown) below:
-------------------------------------------------------
${markdownText}
-------------------------------------------------------
and the following lead information:
First Name: ${firstName}
Last Name: ${lastName}
Job Title: ${jobTitle}
Company: ${companyName}
Email: ${email}
Employees: ${employees}
Industry: ${industry}
Linkedin URL: ${linkedinUrl}
Website: ${website}
City: ${city}
Country: ${country}
Determine the Ideal Customer Profile (ICP) for this company.
Provide only a JSON object with the key "ICP" whose value is a short description (less than 4 words) in ENGLISH.
If no relevant information is found, provide a default harmonious value.
`;
  const requestBody = {
    model: 'gpt-3.5-turbo',
    messages: [{ role: 'user', content: prompt }],
    temperature: 0.7,
    max_tokens: 150,
  };
  const requestConfig = {
    headers: {
      'Content-Type': 'application/json',
      Authorization: `Bearer ${OPENAI_API_KEY}`,
    },
  };
  try {
    const { data } = await axios.post(OPENAI_API_URL, requestBody, requestConfig);
    return parseJSONResponse(data.choices[0].message.content);
  } catch (error) {
    console.error(
      'Error calling OpenAI:',
      error.response ? error.response.data : error.message
    );
    return null;
  }
}
// ============================================
// Function to fetch the website HTML, convert it to Markdown, and clean it up
// ============================================
/**
 * Fetch a web page, extract its <body>, convert to Markdown, and collapse
 * all whitespace runs into single spaces.
 *
 * @param {string} url - Fully qualified URL to fetch.
 * @returns {Promise<string>} Cleaned Markdown, or "" on any fetch error.
 */
async function scrapeWebsite(url) {
  const requestOptions = {
    timeout: 30000,
    headers: {
      // Browser-like UA so sites that block generic clients still respond.
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
    }
  };
  try {
    const { data: html } = await axios.get(url, requestOptions);
    console.log(`Fetched HTML (${html.length} characters) from ${url}`);
    const $ = load(html);
    const bodyHTML = $('body').html() || "";
    // HTML body -> Markdown, then squash whitespace to one line of text.
    const markdown = turndownService.turndown(bodyHTML);
    const cleanedMarkdown = markdown.replace(/\s+/g, ' ').trim();
    console.log(`Generated Markdown (${cleanedMarkdown.length} characters) for ${url}`);
    return cleanedMarkdown;
  } catch (error) {
    console.error(`Error fetching HTML from ${url}:`, error.message);
    return "";
  }
}
// ============================================
// Function to process an individual lead
// ============================================
/**
 * Process a single lead row end-to-end: normalize its website URL, scrape
 * the site, optionally summarize oversized Markdown, ask GPT for the ICP,
 * and append one escaped row to the output CSV stream.
 *
 * @param {object} lead - Raw row from csv-parser, keyed by input header names.
 * @param {number} index - Zero-based position of this lead (for logging only).
 * @param {number} total - Total lead count (for logging only).
 * @returns {Promise<void>}
 */
async function processLead(lead, index, total) {
// Extract lead data from CSV using CSV_COLUMNS mapping.
// Note: the `= ""` defaults only apply when a column is entirely absent
// (undefined); present-but-empty cells already arrive as "".
const {
[CSV_COLUMNS.firstName]: firstName = "",
[CSV_COLUMNS.lastName]: lastName = "",
[CSV_COLUMNS.jobTitle]: jobTitle = "",
[CSV_COLUMNS.companyName]: companyName = "",
[CSV_COLUMNS.email]: email = "",
[CSV_COLUMNS.employees]: employees = "",
[CSV_COLUMNS.industry]: industry = "",
[CSV_COLUMNS.linkedinUrl]: linkedinUrl = "",
[CSV_COLUMNS.website]: websiteRaw = "",
[CSV_COLUMNS.city]: city = "",
[CSV_COLUMNS.country]: country = ""
} = lead;
console.log(`Raw URL value for ${companyName}: "${websiteRaw}"`);
// Trim and validate the website URL; prepend "http://" when no scheme is
// present so axios can fetch bare domains like "example.com".
const trimmedWebsite = websiteRaw ? websiteRaw.trim() : "";
const website = trimmedWebsite
? (trimmedWebsite.startsWith("http") ? trimmedWebsite : `http://${trimmedWebsite}`)
: "";
if (!website) {
console.warn(`Invalid URL for ${companyName}. Trimmed value: "${trimmedWebsite}"`);
}
const leadData = { firstName, lastName, jobTitle, companyName, email, employees, industry, linkedinUrl, website, city, country };
console.log(`Processing lead ${index + 1} / ${total}: ${companyName}`);
if (website) {
console.log(`Scraping URL: ${website}`);
}
// Fetch the website content and convert it to Markdown
let markdownText = "";
if (website) {
markdownText = await scrapeWebsite(website);
// If the Markdown is too long, summarize it via GPT-3.5
if (markdownText.length > MAX_MARKDOWN_LENGTH) {
console.log(`Markdown for ${companyName} exceeds ${MAX_MARKDOWN_LENGTH} characters. Summarizing...`);
// NOTE(review): summarizeMarkdown already slices its input to
// SUMMARY_INPUT_LIMIT, so this pre-truncation is redundant (harmless).
const truncatedMarkdown = markdownText.slice(0, SUMMARY_INPUT_LIMIT);
markdownText = await summarizeMarkdown(truncatedMarkdown);
console.log(`Summary obtained for ${companyName} (${markdownText.length} characters).`);
}
} else {
console.warn(`No website content scraped for ${companyName} because URL is empty.`);
}
// Call GPT to extract the ICP (markdownText may be "" — the prompt then
// relies on the lead fields alone).
const icpObj = await getICPFromChatGPT(markdownText, leadData);
let icpValue = "";
if (icpObj && icpObj[TARGET_KEY]) {
icpValue = icpObj[TARGET_KEY];
console.log(`ICP for ${companyName}:`, icpValue);
} else {
console.log(`No ICP found for ${companyName}`);
}
// Prepare CSV row data — order must match the header written at startup.
const values = [
csvEscape(firstName),
csvEscape(lastName),
csvEscape(jobTitle),
csvEscape(companyName),
csvEscape(email),
csvEscape(employees),
csvEscape(industry),
csvEscape(linkedinUrl),
csvEscape(website),
csvEscape(city),
csvEscape(country),
csvEscape(icpValue)
];
const csvRow = values.join(',') + "\n";
outputCsvStream.write(csvRow);
// Short delay between processing leads (light rate-limiting for the
// OpenAI API and scraped sites).
await new Promise((resolve) => setTimeout(resolve, 500));
}
// ============================================
// Main function to read the CSV and process leads in parallel
// ============================================
/**
 * Read the input CSV and process every lead with bounded concurrency.
 *
 * Each row is queued through the p-limit instance; the returned Promise
 * resolves once the file is fully read AND every queued lead has finished.
 *
 * Fixes vs. the original:
 * - The lead index was computed as `totalLeads - tasks.length - 1` INSIDE
 *   the deferred closure, which p-limit evaluates later — by then both
 *   counters had moved on, so the logged "lead N / total" was wrong for
 *   every row. The index is now captured at enqueue time.
 * - A rejection from Promise.all inside the async 'end' handler previously
 *   left the outer Promise pending forever; it now rejects.
 *
 * @param {string} csvFilePath - Path to the input CSV file.
 * @returns {Promise<void>}
 */
async function processLeads(csvFilePath) {
  return new Promise((resolve, reject) => {
    const tasks = [];
    let totalLeads = 0;
    fs.createReadStream(csvFilePath)
      .pipe(csv())
      .on('data', (row) => {
        // Capture this row's 0-based index NOW, before the closure runs.
        const index = totalLeads;
        totalLeads++;
        // totalLeads is read when the task actually executes, so the
        // "/ total" in the log reflects the rows seen so far.
        tasks.push(limit(() => processLead(row, index, totalLeads)));
      })
      .on('end', async () => {
        console.log(`CSV read complete, processing ${totalLeads} leads.`);
        try {
          await Promise.all(tasks);
          console.log('Processing finished.');
          resolve();
        } catch (err) {
          reject(err);
        }
      })
      .on('error', (err) => {
        console.error("Error reading CSV:", err);
        reject(err);
      });
  });
}
// ============================================
// Start processing: run the script with a CSV file path as an argument
// ============================================
// Entry point: the CSV path is taken from the first CLI argument.
const csvFilePath = process.argv[2];
if (!csvFilePath) {
  console.error("Please specify the path to the CSV file as an argument.");
  process.exit(1);
}
// Top-level await (valid in ES modules); the output stream is always
// closed, whether processing succeeds or fails.
try {
  await processLeads(csvFilePath);
  console.log(`Results have been written to ${outputCsvFile}`);
} catch (err) {
  console.error("Error during processing:", err);
} finally {
  outputCsvStream.end();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment