Created
March 26, 2025 20:32
-
-
Save angezanetti/1842cc23fb0d793da7341d7c6c586083 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// scraper-icp.js (ES module)
// ============================================
// Import required modules
// ============================================
import fs from 'fs'; | |
import csv from 'csv-parser'; | |
import axios from 'axios'; | |
import { load } from 'cheerio'; | |
import pLimit from 'p-limit'; | |
import TurndownService from 'turndown'; | |
// ============================================
// Define CSV column names mapping
// ============================================
/**
 * Maps each lead field name used inside this script to the header of the
 * matching column in the CSV file. Edit the right-hand values when the
 * column headers of the CSV differ.
 */
const CSV_COLUMNS = {
  // Contact
  firstName: 'first_name',
  lastName: 'last_name',
  jobTitle: 'job_title',
  // Company
  companyName: 'company',
  email: 'email',
  employees: 'employees',
  industry: 'industry',
  // Links
  linkedinUrl: 'linkedin_url',
  website: 'company_website',
  // Location
  city: 'city',
  country: 'country',
};
// ============================================
// Configuration and initialization
// ============================================
// OpenAI API configuration (make sure OPENAI_API_KEY is set in your environment)
const OPENAI_API_KEY = process.env.OPENAI_API_KEY;
const OPENAI_API_URL = 'https://api.openai.com/v1/chat/completions';

// At most this many leads are processed at the same time.
const concurrencyLimit = 10;
const limit = pLimit(concurrencyLimit);

// Shared HTML -> Markdown converter.
const turndownService = new TurndownService();

// Markdown longer than this many characters is summarized before the ICP prompt.
const MAX_MARKDOWN_LENGTH = 10000;

// Only the first SUMMARY_INPUT_LIMIT characters are sent to the summary prompt.
const SUMMARY_INPUT_LIMIT = 3000;

// Property read from the model's JSON reply; also the name of the last output column.
const TARGET_KEY = "ICP";

// Destination of the enriched leads.
const outputCsvFile = 'output.csv';

// Opened with the 'w' flag, so the header row below is the first thing written.
const outputCsvStream = fs.createWriteStream(outputCsvFile, { flags: 'w' });

// Header row: the input column names in a fixed order, followed by TARGET_KEY.
const csvHeader = `${[
  'firstName',
  'lastName',
  'jobTitle',
  'companyName',
  'email',
  'employees',
  'industry',
  'linkedinUrl',
  'website',
  'city',
  'country',
]
  .map((field) => CSV_COLUMNS[field])
  .concat(TARGET_KEY)
  .join(',')}\n`;
outputCsvStream.write(csvHeader);
// ============================================
// CSV value escape function
// ============================================
// Ensures that CSV values are properly escaped (e.g., for commas, quotes and line breaks)
/**
 * Escapes a single value for use as one field of a CSV row.
 *
 * `null` and `undefined` become an empty field. Any other value is converted
 * to a string; if it contains a comma, a double quote, a line feed or a
 * carriage return it is wrapped in double quotes, with embedded double quotes
 * doubled.
 *
 * @param {*} value - The raw field value.
 * @returns {string} The value, ready to be joined with commas.
 */
function csvEscape(value) {
  if (value == null) return '';
  const text = String(value);
  // A bare "\r" must be quoted as well: many CSV readers treat it as a row
  // terminator, so leaving it unquoted would split the row.
  if (!/[",\r\n]/.test(text)) {
    return text;
  }
  return `"${text.replace(/"/g, '""')}"`;
}
// ============================================
// Function to extract a JSON object from a GPT response
// ============================================
/**
 * Finds the index of the `}` that closes the `{` located at `start`.
 * Braces inside double-quoted strings (including escaped quotes) are ignored.
 *
 * @param {string} text - Text to scan.
 * @param {number} start - Index of an opening `{` in `text`.
 * @returns {number} Index of the matching `}`, or -1 when there is none.
 */
function findClosingBrace(text, start) {
  let depth = 0;
  let inString = false;
  let escaped = false;
  for (let i = start; i < text.length; i++) {
    const ch = text[i];
    if (inString) {
      if (escaped) {
        escaped = false;
      } else if (ch === '\\') {
        escaped = true;
      } else if (ch === '"') {
        inString = false;
      }
      continue;
    }
    if (ch === '"') {
      inString = true;
    } else if (ch === '{') {
      depth++;
    } else if (ch === '}') {
      depth--;
      if (depth === 0) return i;
    }
  }
  return -1;
}

/**
 * Extracts the first parseable JSON object embedded in a model reply.
 *
 * Each `{` is tried in order as the start of an object; the candidate runs
 * up to its balanced closing brace. This avoids spanning from the first `{`
 * to the very last `}` of the reply, which breaks as soon as the text after
 * the object contains another brace.
 *
 * @param {string} text - Raw reply text (may contain prose around the JSON).
 * @returns {*} The parsed value, or `null` when `text` is not a string or no
 *   parseable object is found (the reason is logged with `console.error`).
 */
function parseJSONResponse(text) {
  if (typeof text !== 'string') {
    console.error("No JSON object found in response:", text);
    return null;
  }
  let lastError = null;
  let start = text.indexOf('{');
  while (start !== -1) {
    const end = findClosingBrace(text, start);
    if (end !== -1) {
      try {
        return JSON.parse(text.slice(start, end + 1));
      } catch (err) {
        // Remember the failure and try the next opening brace.
        lastError = err;
      }
    }
    start = text.indexOf('{', start + 1);
  }
  if (lastError) {
    console.error("Error parsing JSON from response:", lastError);
  } else {
    console.error("No JSON object found in response:", text);
  }
  return null;
}
// ============================================
// Function to summarize long Markdown via GPT-3.5
// ============================================
/**
 * Asks the chat completion endpoint for a summary of a Markdown document.
 *
 * Only the first SUMMARY_INPUT_LIMIT characters of `markdownText` are put in
 * the prompt. Request failures are logged, not thrown.
 *
 * @param {string} markdownText - Markdown to summarize.
 * @returns {Promise<string>} The summary from the reply, or `markdownText`
 *   unchanged when the request or the reply handling fails.
 */
async function summarizeMarkdown(markdownText) {
  const excerpt = markdownText.slice(0, SUMMARY_INPUT_LIMIT);
  const divider = '-------------------------------------------------------';
  const summaryPrompt = [
    '',
    'You are an expert in content summarization. Please provide a concise summary of the following Markdown text. Ensure the summary is well under 10,000 characters.',
    divider,
    `${excerpt}`,
    divider,
    'Provide only the summary in Markdown.',
    '',
  ].join('\n');

  const payload = {
    model: 'gpt-3.5-turbo',
    messages: [{ role: 'user', content: summaryPrompt }],
    temperature: 0.5,
    max_tokens: 1500,
  };
  const requestConfig = {
    headers: {
      'Content-Type': 'application/json',
      Authorization: `Bearer ${OPENAI_API_KEY}`,
    },
  };

  try {
    const { data } = await axios.post(OPENAI_API_URL, payload, requestConfig);
    const summary = data.choices[0].message.content;
    console.log("Generated summary for long Markdown.");
    return summary;
  } catch (error) {
    // Prefer the API's error payload when the server answered at all.
    const details = error.response ? error.response.data : error.message;
    console.error("Error generating summary:", details);
    // Fall back to the text that was passed in.
    return markdownText;
  }
}
// ============================================
// Function to call GPT and get the Ideal Customer Profile (ICP)
// ============================================
/**
 * Asks the chat completion endpoint for the Ideal Customer Profile (ICP) of a
 * company, based on its website content and the lead's CSV fields.
 *
 * @param {string} markdownText - Website content in Markdown (may be empty).
 * @param {object} leadData - Lead fields: firstName, lastName, jobTitle,
 *   companyName, email, employees, industry, linkedinUrl, website, city,
 *   country.
 * @returns {Promise<*>} Whatever `parseJSONResponse` extracts from the reply,
 *   or `null` when the request fails (the failure is logged).
 */
async function getICPFromChatGPT(markdownText, leadData) {
  const divider = '-------------------------------------------------------';
  const prompt = [
    '',
    "You're an expert in website analysis and marketing strategy.",
    'Based on the content extracted from a company website (in Markdown) below:',
    divider,
    `${markdownText}`,
    divider,
    'and the following lead information:',
    `First Name: ${leadData.firstName}`,
    `Last Name: ${leadData.lastName}`,
    `Job Title: ${leadData.jobTitle}`,
    `Company: ${leadData.companyName}`,
    `Email: ${leadData.email}`,
    `Employees: ${leadData.employees}`,
    `Industry: ${leadData.industry}`,
    `Linkedin URL: ${leadData.linkedinUrl}`,
    `Website: ${leadData.website}`,
    `City: ${leadData.city}`,
    `Country: ${leadData.country}`,
    'Determine the Ideal Customer Profile (ICP) for this company.',
    'Provide only a JSON object with the key "ICP" whose value is a short description (less than 4 words) in ENGLISH.',
    'If no relevant information is found, provide a default harmonious value.',
    '',
  ].join('\n');

  const payload = {
    model: 'gpt-3.5-turbo',
    messages: [{ role: 'user', content: prompt }],
    temperature: 0.7,
    max_tokens: 150,
  };
  const requestConfig = {
    headers: {
      'Content-Type': 'application/json',
      Authorization: `Bearer ${OPENAI_API_KEY}`,
    },
  };

  try {
    const { data } = await axios.post(OPENAI_API_URL, payload, requestConfig);
    return parseJSONResponse(data.choices[0].message.content);
  } catch (error) {
    // Prefer the API's error payload when the server answered at all.
    const details = error.response ? error.response.data : error.message;
    console.error('Error calling OpenAI:', details);
    return null;
  }
}
// ============================================
// Function to fetch the website HTML, convert it to Markdown, and clean it up
// ============================================
/**
 * Downloads a web page and returns the content of its <body> as
 * whitespace-collapsed Markdown.
 *
 * <script> and <style> elements are removed before conversion so that inline
 * JavaScript and CSS do not end up in the Markdown (and later in the prompt).
 *
 * @param {string} url - Absolute URL of the page to fetch.
 * @returns {Promise<string>} The cleaned Markdown, or "" when the request
 *   fails, the response body is not text, or conversion throws (the reason is
 *   logged).
 */
async function scrapeWebsite(url) {
  try {
    const response = await axios.get(url, {
      timeout: 30000,
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
      }
    });
    const html = response.data;
    // axios hands back a parsed object for JSON responses; there is no HTML
    // to convert in that case.
    if (typeof html !== 'string') {
      console.warn(`Response from ${url} is not text; skipping Markdown conversion.`);
      return "";
    }
    console.log(`Fetched HTML (${html.length} characters) from ${url}`);
    const $ = load(html);
    // Drop non-content elements; their text would otherwise be carried into
    // the Markdown as plain text.
    $('script, style').remove();
    const bodyHTML = $('body').html() || "";
    // Convert HTML body to Markdown
    const markdown = turndownService.turndown(bodyHTML);
    // Clean the Markdown by collapsing every whitespace run into one space
    const cleanedMarkdown = markdown.replace(/\s+/g, ' ').trim();
    console.log(`Generated Markdown (${cleanedMarkdown.length} characters) for ${url}`);
    return cleanedMarkdown;
  } catch (error) {
    console.error(`Error fetching HTML from ${url}:`, error.message);
    return "";
  }
}
// ============================================
// Function to process an individual lead
// ============================================
/**
 * Enriches one lead: scrapes its company website, asks the model for the
 * ICP, and appends one row to the output CSV.
 *
 * @param {object} lead - One parsed CSV row, keyed by the column names in
 *   CSV_COLUMNS. Missing columns default to "".
 * @param {number} index - Zero-based position of the lead (used for logging).
 * @param {number} total - Total number of leads (used for logging).
 * @returns {Promise<void>} Resolves after the row is handed to the output
 *   stream and a short pause has elapsed.
 */
async function processLead(lead, index, total) {
  // Extract lead data from CSV using CSV_COLUMNS mapping
  const {
    [CSV_COLUMNS.firstName]: firstName = "",
    [CSV_COLUMNS.lastName]: lastName = "",
    [CSV_COLUMNS.jobTitle]: jobTitle = "",
    [CSV_COLUMNS.companyName]: companyName = "",
    [CSV_COLUMNS.email]: email = "",
    [CSV_COLUMNS.employees]: employees = "",
    [CSV_COLUMNS.industry]: industry = "",
    [CSV_COLUMNS.linkedinUrl]: linkedinUrl = "",
    [CSV_COLUMNS.website]: websiteRaw = "",
    [CSV_COLUMNS.city]: city = "",
    [CSV_COLUMNS.country]: country = ""
  } = lead;
  console.log(`Raw URL value for ${companyName}: "${websiteRaw}"`);

  // Trim the website value and make sure it carries a scheme. The check is
  // for a real "http://" or "https://" prefix in any letter case: a bare
  // startsWith("http") would accept hosts such as "httpbin.org" as-is and
  // would prefix "HTTP://example.com" a second time.
  const trimmedWebsite = websiteRaw ? websiteRaw.trim() : "";
  const hasScheme = /^https?:\/\//i.test(trimmedWebsite);
  const website = trimmedWebsite && !hasScheme
    ? `http://${trimmedWebsite}`
    : trimmedWebsite;
  if (!website) {
    console.warn(`Invalid URL for ${companyName}. Trimmed value: "${trimmedWebsite}"`);
  }

  const leadData = { firstName, lastName, jobTitle, companyName, email, employees, industry, linkedinUrl, website, city, country };
  console.log(`Processing lead ${index + 1} / ${total}: ${companyName}`);

  // Fetch the website content and convert it to Markdown
  let markdownText = "";
  if (website) {
    console.log(`Scraping URL: ${website}`);
    markdownText = await scrapeWebsite(website);
    // If the Markdown is too long, summarize it via GPT-3.5
    if (markdownText.length > MAX_MARKDOWN_LENGTH) {
      console.log(`Markdown for ${companyName} exceeds ${MAX_MARKDOWN_LENGTH} characters. Summarizing...`);
      // Truncate here as well: summarizeMarkdown returns its input when the
      // request fails, and the fallback must stay short enough for the ICP prompt.
      const truncatedMarkdown = markdownText.slice(0, SUMMARY_INPUT_LIMIT);
      markdownText = await summarizeMarkdown(truncatedMarkdown);
      console.log(`Summary obtained for ${companyName} (${markdownText.length} characters).`);
    }
  } else {
    console.warn(`No website content scraped for ${companyName} because URL is empty.`);
  }

  // Call GPT to extract the ICP
  const icpObj = await getICPFromChatGPT(markdownText, leadData);
  let icpValue = "";
  if (icpObj && icpObj[TARGET_KEY]) {
    icpValue = icpObj[TARGET_KEY];
    console.log(`ICP for ${companyName}:`, icpValue);
  } else {
    console.log(`No ICP found for ${companyName}`);
  }

  // Prepare the CSV row in the same column order as the header row
  const csvRow = [
    firstName,
    lastName,
    jobTitle,
    companyName,
    email,
    employees,
    industry,
    linkedinUrl,
    website,
    city,
    country,
    icpValue
  ].map(csvEscape).join(',') + "\n";
  outputCsvStream.write(csvRow);

  // Short delay between processing leads
  await new Promise((resolve) => setTimeout(resolve, 500));
}
// ============================================
// Main function to read the CSV and process leads in parallel
// ============================================
/**
 * Reads every row of a CSV file and processes the leads with bounded
 * concurrency (through `limit`).
 *
 * Rows are collected first and dispatched once the file has been read
 * completely, so each lead is given its real zero-based index and the final
 * lead count.
 *
 * @param {string} csvFilePath - Path of the CSV file to read.
 * @returns {Promise<void>} Resolves when every lead has been processed.
 *   Rejects when the file cannot be read, the CSV parser emits an error, or
 *   processing a lead rejects.
 */
function processLeads(csvFilePath) {
  return new Promise((resolve, reject) => {
    const rows = [];
    const fail = (err) => {
      console.error("Error reading CSV:", err);
      reject(err);
    };

    const source = fs.createReadStream(csvFilePath);
    // pipe() does not forward errors, so the file stream (e.g. ENOENT) needs
    // its own handler in addition to the parser's.
    source.on('error', fail);

    source
      .pipe(csv())
      .on('data', (row) => {
        rows.push(row);
      })
      .on('end', () => {
        const totalLeads = rows.length;
        console.log(`CSV read complete, processing ${totalLeads} leads.`);
        // Use pLimit to limit concurrency
        const tasks = rows.map((row, index) =>
          limit(() => processLead(row, index, totalLeads))
        );
        Promise.all(tasks).then(() => {
          console.log('Processing finished.');
          resolve();
        }, reject);
      })
      .on('error', fail);
  });
}
// ============================================
// Start processing: run the script with a CSV file path as an argument
// ============================================
// Path of the input CSV, taken from the first command-line argument.
const csvFilePath = process.argv[2];
if (!csvFilePath) {
  console.error("Please specify the path to the CSV file as an argument.");
  process.exit(1);
}

processLeads(csvFilePath)
  .then(() => {
    console.log(`Results have been written to ${outputCsvFile}`);
  })
  .catch((err) => {
    console.error("Error during processing:", err);
    // Report the failure to the calling shell; exitCode (rather than exit())
    // lets the output stream finish flushing first.
    process.exitCode = 1;
  })
  .finally(() => {
    // Close the output file on success and on failure alike.
    outputCsvStream.end();
  });
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment