Skip to content

Instantly share code, notes, and snippets.

@mokshchadha
Created May 29, 2025 10:04
Show Gist options
  • Save mokshchadha/c56a68506ada2c89018a5701c990c71d to your computer and use it in GitHub Desktop.
Save mokshchadha/c56a68506ada2c89018a5701c990c71d to your computer and use it in GitHub Desktop.
Scrape IIM jobs using playwright and JS - May 2025
// Enhanced job scraper that opens individual job links and scrapes detailed data
const { chromium } = require('playwright');
const fs = require('fs');
const path = require('path');
/** Category listing page that seeds the crawl (banking & finance jobs filter). */
const LISTING_URL = 'https://www.iimjobs.com/c/filter/banking-finance-jobs-in-metros_anywhere%20in%20india_ahmedabad_amritsar_andhra%20pradesh_aurangabad_bangalore_bhubaneshwar_bihar_chandigarh_chennai_chhattisgarh_cochin-kochi_coimbatore_cuttack_dehradun_delhi_delhi%20ncr_faridabad_gandhinagar_ghaziabad_goa_greater%20noida_gujarat_guntur_gurgaon-gurugram_guwahati_haridwar_haryana_hosur_hubli_hyderabad_jaipur_jalandhar_jammu_jammu%20&%20kashmir_jamshedpur_jharkhand_jodhpur_karnataka_kerala_kolkata_lucknow_ludhiana_madurai_maharashtra_mp_mumbai_mysore_nagpur_nasik_navi%20mumbai_noida_odisha_panipat_patiala_patna_pondicherry_pune_punjab_raipur_rajasthan_rajkot_ranchi_sonipat_srinagar_surat_tamil%20nadu_telangana_thane_thiruvananthapuram_udaipur_up_uttarakhand_vadodara-baroda_varanasi-banaras_vijayawada_vishakhapatnam-vizag_warangal-13-87_88_53_45_34_79_3_65_19_14_6_64_70_84_86_58_36_1_40_55_41_13_39_8_77_37_12_57_16_71_72_4_11_46_43_42_63_20_52_31_17_5_60_48_83_9_10_2_73_66_67_68_38_18_50_47_61_85_7_15_74_33_80_62_49_44_54_32_35_69_75_51_21_59_56_81_76_78_82-0-0-1.html';

/** Maximum number of scroll passes made while waiting for more job links to appear. */
const MAX_SCROLL_ATTEMPTS = 15;

/** Selector for anchors whose href contains the "/j/" job-page path segment. */
const JOB_LINK_SELECTOR = 'a[href*="/j/"]';

/** CSV column titles paired with the job-record key that fills each column, in output order. */
const CSV_COLUMNS = [
  ['URL', 'url'],
  ['Title', 'title'],
  ['Company', 'company'],
  ['Location', 'location'],
  ['Experience', 'experience'],
  ['Salary', 'salary'],
  ['Date Posted', 'datePosted'],
  ['Job Type', 'jobType'],
  ['Department', 'department'],
  ['Reports To', 'reportsTo'],
  ['Key Responsibilities', 'keyResponsibilities'],
  ['Requirements', 'requirements'],
  ['Skills', 'skills'],
  ['Education', 'education'],
  ['Description', 'description'],
  ['Tags', 'tags'],
];

/**
 * Quotes one value for CSV output: wraps it in double quotes and doubles any
 * embedded double quote. Arrays are joined with ", "; null/undefined become "".
 *
 * @param {*} value - Field value (string, array of strings, or missing).
 * @returns {string} The quoted CSV field.
 */
function escapeCsvField(value) {
  let text;
  if (Array.isArray(value)) {
    text = value.join(', ');
  } else {
    text = value == null ? '' : String(value);
  }
  return `"${text.replace(/"/g, '""')}"`;
}

/**
 * Serialises job records into CSV text: one header line followed by one line
 * per record. Every data field (including the URL) is quoted.
 *
 * @param {Array<Object>} jobs - Job records keyed as listed in CSV_COLUMNS.
 * @returns {string} CSV content without a trailing newline.
 */
function buildCsv(jobs) {
  const header = CSV_COLUMNS.map(([title]) => title).join(',');
  const rows = jobs.map((job) =>
    CSV_COLUMNS.map(([, key]) => escapeCsvField(job[key])).join(',')
  );
  return [header, ...rows].join('\n');
}

/**
 * Turns raw anchor data into a de-duplicated list of absolute job URLs.
 * Relative and absolute hrefs are both resolved against baseUrl, fragments are
 * dropped, and when the same URL appears more than once the first non-empty
 * link text is kept as its title.
 *
 * @param {Array<{href: string, title: string}>} rawLinks - Anchors collected from the listing page.
 * @param {string} baseUrl - URL the hrefs are resolved against.
 * @returns {Array<{url: string, title: string}>} Unique job links in first-seen order.
 */
function resolveJobLinks(rawLinks, baseUrl) {
  const byUrl = new Map();
  for (const { href, title } of rawLinks) {
    let parsed;
    try {
      parsed = new URL(href, baseUrl);
    } catch (error) {
      console.log(`⚠️ Skipping unparseable job link: ${href}`);
      continue;
    }
    parsed.hash = '';
    const url = parsed.href;
    const known = byUrl.get(url);
    if (!known) {
      byUrl.set(url, { url, title });
    } else if (!known.title && title) {
      known.title = title;
    }
  }
  return Array.from(byUrl.values());
}

/**
 * Builds the placeholder record stored when a job page could not be scraped:
 * every text column reads "Error", except the URL and the listing-page title.
 *
 * @param {string} url - URL of the job page that failed.
 * @param {string} title - Link text captured on the listing page.
 * @returns {Object} Record with the same keys as a scraped job.
 */
function makeErrorRecord(url, title) {
  const record = {};
  for (const [, key] of CSV_COLUMNS) {
    record[key] = 'Error';
  }
  record.url = url;
  record.title = title;
  record.tags = [];
  return record;
}

/**
 * Extracts job details from the currently loaded job page.
 *
 * Passed to page.evaluate, so it executes inside the browser page: it must stay
 * self-contained and may only use browser globals (document, window).
 *
 * @returns {Object} Job record; fields that could not be found are '' (tags: []).
 */
function extractJobDetails() {
  // Returns the trimmed text of the first selector that matches a non-empty element.
  const firstText = (selectors) => {
    for (const selector of selectors) {
      const element = document.querySelector(selector);
      const text = element ? element.textContent.trim() : '';
      if (text) {
        return text;
      }
    }
    return '';
  };

  const bodyText = document.body.textContent;
  const reportsToMatch = bodyText.match(/Reports to:\s*([^\n\r]+)/i);
  // NOTE: because of the `i` flag, `[A-Z]` also matches lowercase letters, so the
  // capture ends at the first newline that is followed by any letter.
  const respMatch = bodyText.match(/Key responsibilities?:\s*([\s\S]*?)(?=\n\n|\n[A-Z]|$)/i);

  // Only elements whose text looks like a hashtag are kept, capped at 10.
  const tags = Array.from(document.querySelectorAll('[href*="#"], .tag, .hashtag'))
    .map((element) => element.textContent.trim())
    .filter((text) => text.startsWith('#') && text.length > 1)
    .slice(0, 10);

  return {
    url: window.location.href,
    title: firstText(['h1', '.job-title', '[data-testid="job-title"]', '.job-header h1', '.position-title']),
    company: firstText(['.company-name', '[data-testid="company-name"]', '.employer-name', 'h2', '.company-title']),
    location: firstText(['[data-testid="job_location"]', '.job-location', '.location', '.job-details .location']),
    experience: firstText(['[data-testid="job_experience"]', '.experience', '.job-experience', '.years-experience']),
    salary: firstText(['.salary', '.compensation', '.pay-range', '[data-testid="salary"]']),
    datePosted: firstText(['[data-testid="date_posted"]', '.date-posted', '.posted-date', '.job-date']),
    // The next five fields are never populated by this scraper; they are kept so
    // the CSV columns stay stable.
    jobType: '',
    department: '',
    reportsTo: reportsToMatch ? reportsToMatch[1].trim() : '',
    keyResponsibilities: respMatch ? respMatch[1].trim().substring(0, 500) : '',
    requirements: '',
    skills: '',
    education: '',
    description: firstText(['.job-description', '.description', '.job-content', '.job-details', '#job-description']).substring(0, 1000),
    tags,
  };
}

/**
 * Scrapes the listing page at LISTING_URL, visits every unique job link found
 * on it, and writes the collected details to
 * `iimjobs_banking_finance_<YYYY-MM-DD>.csv` in the current working directory.
 *
 * Failures on an individual job page are logged and recorded as an "Error" row;
 * any other failure is logged as the main error. The browser is always closed.
 *
 * @returns {Promise<void>} Resolves once the browser has been closed.
 */
async function scrapeJobsToCSV() {
  const browser = await chromium.launch({ headless: false });
  // Array to store all job data
  const allJobsData = [];
  let failedCount = 0;
  try {
    // Created inside the try so the browser is still closed if this throws.
    const page = await browser.newPage();

    console.log('🔄 Loading main page...');
    await page.goto(LISTING_URL, { waitUntil: 'domcontentloaded', timeout: 60000 });
    console.log('⏳ Waiting for content to load...');
    await page.waitForTimeout(3000);

    // Keep scrolling to the bottom until a pass adds no further job links.
    console.log('📜 Scrolling to load all jobs...');
    let previousCount = 0;
    for (let attempt = 1; attempt <= MAX_SCROLL_ATTEMPTS; attempt++) {
      await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
      await page.waitForTimeout(2000);
      const currentCount = await page.$$eval(JOB_LINK_SELECTOR, (links) => links.length);
      console.log(` Scroll ${attempt}: Found ${currentCount} job links`);
      if (currentCount === previousCount) {
        console.log(' No new jobs loaded, stopping scroll');
        break;
      }
      previousCount = currentCount;
    }

    // Collect the anchors, then resolve and de-duplicate them on the Node side.
    const rawLinks = await page.$$eval(JOB_LINK_SELECTOR, (links) =>
      links.map((link) => ({
        href: link.getAttribute('href'),
        title: link.textContent.replace(/[\n\r\t]+/g, ' ').replace(/\s+/g, ' ').trim(),
      }))
    );
    const jobLinks = resolveJobLinks(rawLinks, LISTING_URL);
    console.log(`\n✅ Total job links found: ${jobLinks.length}`);
    if (jobLinks.length === 0) {
      console.log('❌ No job links found. Exiting...');
      return;
    }

    // Process each job link
    console.log('\n🔍 Processing individual job pages...');
    for (const [index, jobLink] of jobLinks.entries()) {
      const fullUrl = jobLink.url;
      console.log(`\n--- Processing Job ${index + 1}/${jobLinks.length} ---`);
      console.log(`URL: ${fullUrl}`);
      try {
        await page.goto(fullUrl, { waitUntil: 'domcontentloaded', timeout: 30000 });
        await page.waitForTimeout(2000);
        const jobData = await page.evaluate(extractJobDetails);
        allJobsData.push(jobData);
        console.log(`✅ Title: ${jobData.title.substring(0, 60)}...`);
        console.log(` Company: ${jobData.company}`);
        console.log(` Location: ${jobData.location}`);
        console.log(` Experience: ${jobData.experience}`);
        // Small delay between requests
        await page.waitForTimeout(1000);
      } catch (error) {
        console.log(`❌ Error processing job ${index + 1}: ${error.message}`);
        // Keep one row per job so the CSV stays aligned with the link list.
        failedCount += 1;
        allJobsData.push(makeErrorRecord(fullUrl, jobLink.title));
      }
    }

    // Generate and save the CSV file
    console.log('\n📊 Generating CSV file...');
    const csvContent = buildCsv(allJobsData);
    const timestamp = new Date().toISOString().split('T')[0];
    const filename = `iimjobs_banking_finance_${timestamp}.csv`;
    fs.writeFileSync(filename, csvContent, 'utf8');
    console.log(`\n✅ CSV file saved: ${filename}`);
    console.log(`📈 Total jobs processed: ${allJobsData.length}`);

    // Summary statistics
    console.log(`\n📊 Summary:`);
    console.log(`✅ Successfully processed: ${allJobsData.length - failedCount}`);
    console.log(`❌ Errors: ${failedCount}`);
    console.log(`📁 File location: ${path.resolve(filename)}`);
  } catch (error) {
    console.error('❌ Main error:', error);
  } finally {
    await browser.close();
  }
}
// Run the scraper. Failures that escape scrapeJobsToCSV (for example the browser
// failing to launch) are logged and reported through a non-zero exit code, so
// shells and schedulers can tell that the run failed.
scrapeJobsToCSV().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment