shauvik · May 2, 2025 22:49
diff --git a/README.md b/README.md
diff --git a/scrape_proposals.js b/scrape_proposals.js
 const { chromium } = require('playwright');
 const fs = require('fs');
 const path = require('path');


 /**
  * Scrape every proposal page listed in applications.csv, download each
  * proposal's linked file into ./downloads and record one row per proposal
  * in proposals.csv (both created next to this script).
  *
  * Attaches to a Chrome instance that is already running with remote
  * debugging on port 9222 instead of launching a browser. A failure on an
  * individual URL is logged and the loop moves on to the next one.
  *
  * @returns {Promise<void>} Resolves once every URL has been attempted.
  */
 async function main() {
  // Quote a value for CSV, doubling embedded quotes so the row stays parseable.
  const csvQuote = (value) => `"${String(value).replace(/"/g, '""')}"`;

  // Scraped text ends up in a file name: neutralise path separators and
  // characters that are invalid on common file systems.
  const toSafeFileName = (value) => String(value).replace(/[\\/:*?"<>|\x00-\x1f]/g, '_');

  // Read the CSV file (the path is resolved against the current working directory)
  const csvContent = fs.readFileSync('applications.csv', 'utf-8');

  // Extract URLs: the last comma-separated field of each line, kept only if it looks like a link
  const urls = csvContent
    .split('\n')
    .map((line) => line.split(',').pop().trim())
    .filter((url) => url.startsWith('http'));

  // Connect to an existing Chrome browser
  // You'll need to start Chrome with remote debugging enabled:
  // On macOS: /Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome --remote-debugging-port=9222
  const browser = await chromium.connectOverCDP('http://localhost:9222');

  try {
    const context = browser.contexts()[0];
    const page = await context.newPage();

    // Create downloads directory if it doesn't exist (no error when it already does)
    const downloadsDir = path.join(__dirname, 'downloads');
    fs.mkdirSync(downloadsDir, { recursive: true });

    // Create the index file with its header; an existing file is overwritten
    const indexFile = path.join(__dirname, 'proposals.csv');
    fs.writeFileSync(indexFile, 'proposal_id, name, email, title, file_name, url\n');

    // Visit each URL and download the file
    for (const url of urls) {
      try {
        console.log(`Visiting: ${url}`);
        await page.goto(url);

        // Wait for the download link to be present before reading the page
        await page.waitForSelector('div.info__link a');

        // Get the href attribute
        const fileUrl = await page.$eval('div.info__link a', el => el.href);

        // Extract email
        const email = await page.$eval('a[title="Opens default mail application"]', el => el.innerText);

        // Extract name: the text that precedes the first child tag of the contributor element
        const name = await page.$eval('div.body__contributor div.ng-star-inserted', el => el.innerHTML.split('<')[0].trim());

        // Extract title
        const title = await page.$eval('h1.body__title', el => el.innerText);

        // Extract proposal ID: the last path segment of the page URL
        const urlParts = url.split('/');
        const proposalId = urlParts[urlParts.length - 1];

        // Download the file through the browser context's request API
        const fileName = toSafeFileName(`${name}-${proposalId}.pdf`);
        const filePath = path.join(downloadsDir, fileName);
        const getResp = await page.context().request.get(fileUrl);
        if (!getResp.ok()) {
          // Do not save an error page under a .pdf name; report and skip this proposal.
          throw new Error(`Download failed with HTTP ${getResp.status()} for ${fileUrl}`);
        }
        const buffer = await getResp.body();
        fs.writeFileSync(filePath, buffer);
        console.log(`Downloaded: ${fileName} and saved details for ${name}`);

        // Save to CSV; free-text fields are quoted with embedded quotes escaped
        fs.appendFileSync(indexFile, `${proposalId}, ${csvQuote(name)}, ${email}, ${csvQuote(title)}, ${csvQuote(fileName)}, ${url}\n`);

        // Add a small delay between requests
        await page.waitForTimeout(2000);
      } catch (error) {
        console.error(`Error processing ${url}:`, error.message);
      }
    }
  } finally {
    // Always release the CDP connection, even when setup or the loop throws.
    await browser.close();
  }
 }

 // Run the scraper; on a fatal error log it and report failure through the exit status.
 main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
 });
	const { chromium } = require('playwright');
	const fs = require('fs');
	const path = require('path');


	/**
	 * Scrape every proposal page listed in applications.csv, download each
	 * proposal's linked file into ./downloads and record one row per proposal
	 * in proposals.csv (both created next to this script).
	 *
	 * Attaches to a Chrome instance that is already running with remote
	 * debugging on port 9222 instead of launching a browser. A failure on an
	 * individual URL is logged and the loop moves on to the next one.
	 *
	 * @returns {Promise<void>} Resolves once every URL has been attempted.
	 */
	async function main() {
	  // Quote a value for CSV, doubling embedded quotes so the row stays parseable.
	  const csvQuote = (value) => `"${String(value).replace(/"/g, '""')}"`;

	  // Scraped text ends up in a file name: neutralise path separators and
	  // characters that are invalid on common file systems.
	  const toSafeFileName = (value) => String(value).replace(/[\\/:*?"<>|\x00-\x1f]/g, '_');

	  // Read the CSV file (the path is resolved against the current working directory)
	  const csvContent = fs.readFileSync('applications.csv', 'utf-8');

	  // Extract URLs: the last comma-separated field of each line, kept only if it looks like a link
	  const urls = csvContent
	    .split('\n')
	    .map((line) => line.split(',').pop().trim())
	    .filter((url) => url.startsWith('http'));

	  // Connect to an existing Chrome browser
	  // You'll need to start Chrome with remote debugging enabled:
	  // On macOS: /Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome --remote-debugging-port=9222
	  const browser = await chromium.connectOverCDP('http://localhost:9222');

	  try {
	    const context = browser.contexts()[0];
	    const page = await context.newPage();

	    // Create downloads directory if it doesn't exist (no error when it already does)
	    const downloadsDir = path.join(__dirname, 'downloads');
	    fs.mkdirSync(downloadsDir, { recursive: true });

	    // Create the index file with its header; an existing file is overwritten
	    const indexFile = path.join(__dirname, 'proposals.csv');
	    fs.writeFileSync(indexFile, 'proposal_id, name, email, title, file_name, url\n');

	    // Visit each URL and download the file
	    for (const url of urls) {
	      try {
	        console.log(`Visiting: ${url}`);
	        await page.goto(url);

	        // Wait for the download link to be present before reading the page
	        await page.waitForSelector('div.info__link a');

	        // Get the href attribute
	        const fileUrl = await page.$eval('div.info__link a', el => el.href);

	        // Extract email
	        const email = await page.$eval('a[title="Opens default mail application"]', el => el.innerText);

	        // Extract name: the text that precedes the first child tag of the contributor element
	        const name = await page.$eval('div.body__contributor div.ng-star-inserted', el => el.innerHTML.split('<')[0].trim());

	        // Extract title
	        const title = await page.$eval('h1.body__title', el => el.innerText);

	        // Extract proposal ID: the last path segment of the page URL
	        const urlParts = url.split('/');
	        const proposalId = urlParts[urlParts.length - 1];

	        // Download the file through the browser context's request API
	        const fileName = toSafeFileName(`${name}-${proposalId}.pdf`);
	        const filePath = path.join(downloadsDir, fileName);
	        const getResp = await page.context().request.get(fileUrl);
	        if (!getResp.ok()) {
	          // Do not save an error page under a .pdf name; report and skip this proposal.
	          throw new Error(`Download failed with HTTP ${getResp.status()} for ${fileUrl}`);
	        }
	        const buffer = await getResp.body();
	        fs.writeFileSync(filePath, buffer);
	        console.log(`Downloaded: ${fileName} and saved details for ${name}`);

	        // Save to CSV; free-text fields are quoted with embedded quotes escaped
	        fs.appendFileSync(indexFile, `${proposalId}, ${csvQuote(name)}, ${email}, ${csvQuote(title)}, ${csvQuote(fileName)}, ${url}\n`);

	        // Add a small delay between requests
	        await page.waitForTimeout(2000);
	      } catch (error) {
	        console.error(`Error processing ${url}:`, error.message);
	      }
	    }
	  } finally {
	    // Always release the CDP connection, even when setup or the loop throws.
	    await browser.close();
	  }
	}

	// Run the scraper; on a fatal error log it and report failure through the exit status.
	main().catch((error) => {
	  console.error(error);
	  process.exitCode = 1;
	});