Created
May 29, 2025 10:04
-
-
Save mokshchadha/c56a68506ada2c89018a5701c990c71d to your computer and use it in GitHub Desktop.
Scrape IIM jobs using playwright and JS - May 2025
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Enhanced job scraper that opens individual job links and scrapes detailed data
const fs = require('fs');
const path = require('path');

const { chromium } = require('playwright');
/** CSS selector matching anchors that point at individual job pages. */
const JOB_LINK_SELECTOR = 'a[href*="/j/"]';

/**
 * CSV layout as [header, job-record key] pairs, in output order.
 * jobType, department, requirements, skills and education are never filled
 * by the extractor; the columns are kept so the file layout stays stable.
 */
const CSV_COLUMNS = [
  ['URL', 'url'],
  ['Title', 'title'],
  ['Company', 'company'],
  ['Location', 'location'],
  ['Experience', 'experience'],
  ['Salary', 'salary'],
  ['Date Posted', 'datePosted'],
  ['Job Type', 'jobType'],
  ['Department', 'department'],
  ['Reports To', 'reportsTo'],
  ['Key Responsibilities', 'keyResponsibilities'],
  ['Requirements', 'requirements'],
  ['Skills', 'skills'],
  ['Education', 'education'],
  ['Description', 'description'],
  ['Tags', 'tags'],
];

/**
 * Candidate selectors per field, tried in order; the first element with
 * non-empty text wins. Must stay JSON-serializable because it is passed
 * into the browser context via page.evaluate().
 */
const FIELD_SELECTORS = {
  title: ['h1', '.job-title', '[data-testid="job-title"]', '.job-header h1', '.position-title'],
  company: ['.company-name', '[data-testid="company-name"]', '.employer-name', 'h2', '.company-title'],
  location: ['[data-testid="job_location"]', '.job-location', '.location', '.job-details .location'],
  experience: ['[data-testid="job_experience"]', '.experience', '.job-experience', '.years-experience'],
  salary: ['.salary', '.compensation', '.pay-range', '[data-testid="salary"]'],
  datePosted: ['[data-testid="date_posted"]', '.date-posted', '.posted-date', '.job-date'],
  description: ['.job-description', '.description', '.job-content', '.job-details', '#job-description'],
};

/**
 * Renders one value as a double-quoted CSV field with embedded quotes doubled.
 * Arrays are joined with ", "; null/undefined become an empty field.
 *
 * @param {*} value - Raw field value.
 * @returns {string} Quoted CSV field.
 */
function toCsvField(value) {
  let text;
  if (Array.isArray(value)) {
    text = value.join(', ');
  } else {
    text = value == null ? '' : String(value);
  }
  return `"${text.replace(/"/g, '""')}"`;
}

/**
 * Builds the CSV document: an unquoted header line followed by one line per job.
 *
 * @param {Object[]} jobs - Job records keyed as in CSV_COLUMNS.
 * @returns {string} CSV text with "\n" line separators.
 */
function buildCsv(jobs) {
  const headerLine = CSV_COLUMNS.map(([header]) => header).join(',');
  const rowLines = jobs.map((job) =>
    CSV_COLUMNS.map(([, key]) => toCsvField(job[key])).join(',')
  );
  return [headerLine, ...rowLines].join('\n');
}

/**
 * Resolves an anchor's href attribute against the page it was found on.
 *
 * @param {?string} href - Raw href attribute value.
 * @param {string} baseUrl - URL of the page containing the anchor.
 * @returns {?string} Absolute URL, or null when href is empty or unparsable.
 */
function resolveJobUrl(href, baseUrl) {
  if (!href) {
    return null;
  }
  try {
    return new URL(href, baseUrl).href;
  } catch (error) {
    return null;
  }
}

/**
 * Turns raw {href, title} anchors into unique {url, title} entries.
 * The first occurrence of a URL wins; a later non-empty title fills in an
 * empty one (a job card may link to the same job from several anchors).
 *
 * @param {{href: ?string, title: string}[]} anchors - Raw anchor data.
 * @param {string} baseUrl - URL used to resolve relative hrefs.
 * @returns {{url: string, title: string}[]} De-duplicated links in page order.
 */
function uniqueJobLinks(anchors, baseUrl) {
  const byUrl = new Map();
  for (const { href, title } of anchors) {
    const url = resolveJobUrl(href, baseUrl);
    if (url === null) {
      continue;
    }
    const known = byUrl.get(url);
    if (known === undefined) {
      byUrl.set(url, { url, title });
    } else if (!known.title && title) {
      known.title = title;
    }
  }
  return [...byUrl.values()];
}

/**
 * Placeholder record for a job page that could not be scraped: every text
 * column is the literal 'Error' except the URL and the listing-page title.
 *
 * @param {string} url - Job page URL.
 * @param {string} title - Link text captured on the listing page.
 * @returns {Object} Record with every CSV_COLUMNS key present.
 */
function makeErrorRecord(url, title) {
  const record = { url, title, tags: [] };
  for (const [, key] of CSV_COLUMNS) {
    if (!(key in record)) {
      record[key] = 'Error';
    }
  }
  return record;
}

/**
 * Scrolls to the bottom of the listing until the number of job links stops
 * growing or maxAttempts scrolls have been made.
 *
 * @param {Object} page - Playwright page showing the listing.
 * @param {number} [maxAttempts=15] - Upper bound on scroll iterations.
 * @returns {Promise<void>}
 */
async function scrollUntilStable(page, maxAttempts = 15) {
  let previousCount = 0;
  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
    await page.waitForTimeout(2000);
    const currentCount = await page.$$eval(JOB_LINK_SELECTOR, (links) => links.length);
    console.log(` Scroll ${attempt}: Found ${currentCount} job links`);
    if (currentCount === previousCount) {
      console.log(' No new jobs loaded, stopping scroll');
      break;
    }
    previousCount = currentCount;
  }
}

/**
 * Extracts job details from the current document.
 * Runs INSIDE the browser via page.evaluate(): it is serialized, so it must
 * not reference anything from the Node.js scope except its argument.
 *
 * @param {Object<string, string[]>} selectors - Candidate selectors per field.
 * @returns {Object} Job record; fields that were not found are '' (tags: []).
 */
function extractJobDetails(selectors) {
  const firstText = (candidates) => {
    for (const selector of candidates) {
      const element = document.querySelector(selector);
      const text = element ? element.textContent.trim() : '';
      if (text) {
        return text;
      }
    }
    return '';
  };

  const pageText = document.body.textContent;
  const reportsToMatch = pageText.match(/Reports to:\s*([^\n\r]+)/i);
  // NOTE(review): because of the `i` flag, `[A-Z]` in the lookahead also
  // matches lowercase letters, so the capture ends at the first line break
  // followed by any letter — confirm this is the intended cut-off.
  const respMatch = pageText.match(/Key responsibilities?:\s*([\s\S]*?)(?=\n\n|\n[A-Z]|$)/i);

  // Only texts that look like hashtags ("#x") are kept; repeats are dropped.
  const tags = new Set();
  document.querySelectorAll('[href*="#"], .tag, .hashtag').forEach((node) => {
    const text = node.textContent.trim();
    if (text.startsWith('#') && text.length > 1) {
      tags.add(text);
    }
  });

  return {
    url: window.location.href,
    title: firstText(selectors.title),
    company: firstText(selectors.company),
    location: firstText(selectors.location),
    experience: firstText(selectors.experience),
    salary: firstText(selectors.salary),
    datePosted: firstText(selectors.datePosted),
    jobType: '',
    department: '',
    reportsTo: reportsToMatch ? reportsToMatch[1].trim() : '',
    // Long free-text fields are truncated to keep CSV cells manageable.
    keyResponsibilities: respMatch ? respMatch[1].trim().substring(0, 500) : '',
    requirements: '',
    skills: '',
    education: '',
    description: firstText(selectors.description).substring(0, 1000),
    tags: [...tags].slice(0, 10),
  };
}

/**
 * Scrapes the banking/finance listing, visits every unique job page, and
 * writes the collected details to `iimjobs_banking_finance_<YYYY-MM-DD>.csv`
 * in the current working directory.
 *
 * Per-job failures are logged and recorded as 'Error' rows; any other failure
 * is logged as the main error. The browser is always closed.
 *
 * @returns {Promise<void>} Resolves when scraping finished (or was aborted).
 */
async function scrapeJobsToCSV() {
  const browser = await chromium.launch({ headless: false });
  // Array to store all job data
  const allJobsData = [];
  let failedCount = 0;

  try {
    // Created inside the try so the browser is closed even if this fails.
    const page = await browser.newPage();

    // Test with banking finance URL
    const testUrl = 'https://www.iimjobs.com/c/filter/banking-finance-jobs-in-metros_anywhere%20in%20india_ahmedabad_amritsar_andhra%20pradesh_aurangabad_bangalore_bhubaneshwar_bihar_chandigarh_chennai_chhattisgarh_cochin-kochi_coimbatore_cuttack_dehradun_delhi_delhi%20ncr_faridabad_gandhinagar_ghaziabad_goa_greater%20noida_gujarat_guntur_gurgaon-gurugram_guwahati_haridwar_haryana_hosur_hubli_hyderabad_jaipur_jalandhar_jammu_jammu%20&%20kashmir_jamshedpur_jharkhand_jodhpur_karnataka_kerala_kolkata_lucknow_ludhiana_madurai_maharashtra_mp_mumbai_mysore_nagpur_nasik_navi%20mumbai_noida_odisha_panipat_patiala_patna_pondicherry_pune_punjab_raipur_rajasthan_rajkot_ranchi_sonipat_srinagar_surat_tamil%20nadu_telangana_thane_thiruvananthapuram_udaipur_up_uttarakhand_vadodara-baroda_varanasi-banaras_vijayawada_vishakhapatnam-vizag_warangal-13-87_88_53_45_34_79_3_65_19_14_6_64_70_84_86_58_36_1_40_55_41_13_39_8_77_37_12_57_16_71_72_4_11_46_43_42_63_20_52_31_17_5_60_48_83_9_10_2_73_66_67_68_38_18_50_47_61_85_7_15_74_33_80_62_49_44_54_32_35_69_75_51_21_59_56_81_76_78_82-0-0-1.html';

    console.log('🚀 Loading main page...');
    await page.goto(testUrl, { waitUntil: 'domcontentloaded', timeout: 60000 });
    console.log('⏳ Waiting for content to load...');
    await page.waitForTimeout(3000);

    // Scroll to load more jobs
    console.log('📜 Scrolling to load all jobs...');
    await scrollUntilStable(page);

    // Get all job links; hrefs are resolved and de-duplicated on the Node side
    const listingUrl = page.url();
    const rawAnchors = await page.$$eval(JOB_LINK_SELECTOR, (anchors) =>
      anchors.map((anchor) => ({
        href: anchor.getAttribute('href'),
        title: anchor.textContent.replace(/\s+/g, ' ').trim(),
      }))
    );
    const jobLinks = uniqueJobLinks(rawAnchors, listingUrl);

    console.log(`\n✅ Total job links found: ${jobLinks.length}`);
    if (jobLinks.length === 0) {
      console.log('❌ No job links found. Exiting...');
      return;
    }

    // Process each job link
    console.log('\n🔍 Processing individual job pages...');
    for (let i = 0; i < jobLinks.length; i++) {
      const { url: fullUrl, title: linkTitle } = jobLinks[i];
      console.log(`\n--- Processing Job ${i + 1}/${jobLinks.length} ---`);
      console.log(`URL: ${fullUrl}`);

      try {
        // Navigate to individual job page
        await page.goto(fullUrl, { waitUntil: 'domcontentloaded', timeout: 30000 });
        await page.waitForTimeout(2000);

        // Extract detailed job information
        const jobData = await page.evaluate(extractJobDetails, FIELD_SELECTORS);
        allJobsData.push(jobData);

        console.log(`✅ Title: ${jobData.title.substring(0, 60)}...`);
        console.log(` Company: ${jobData.company}`);
        console.log(` Location: ${jobData.location}`);
        console.log(` Experience: ${jobData.experience}`);

        // Small delay between requests
        await page.waitForTimeout(1000);
      } catch (error) {
        console.log(`❌ Error processing job ${i + 1}: ${error.message}`);
        // Add error entry so every visited link still has a CSV row
        failedCount++;
        allJobsData.push(makeErrorRecord(fullUrl, linkTitle));
      }
    }

    // Generate CSV content
    console.log('\n📊 Generating CSV file...');
    const csvContent = buildCsv(allJobsData);

    // Save CSV file (date part of the ISO timestamp, i.e. YYYY-MM-DD in UTC)
    const timestamp = new Date().toISOString().slice(0, 10);
    const filename = `iimjobs_banking_finance_${timestamp}.csv`;
    fs.writeFileSync(filename, csvContent, 'utf8');
    console.log(`\n✅ CSV file saved: ${filename}`);
    console.log(`📊 Total jobs processed: ${allJobsData.length}`);

    // Summary statistics (counted directly rather than inferred from row content)
    const successfulJobs = allJobsData.length - failedCount;
    console.log(`\n📈 Summary:`);
    console.log(`✅ Successfully processed: ${successfulJobs}`);
    console.log(`❌ Errors: ${failedCount}`);
    console.log(`📁 File location: ${path.resolve(filename)}`);
  } catch (error) {
    console.error('❌ Main error:', error);
  } finally {
    await browser.close();
  }
}
// Run the scraper. scrapeJobsToCSV logs its own failures; anything that still
// rejects (e.g. the browser failing to launch or close) is reported here and
// reflected in a non-zero exit code.
scrapeJobsToCSV().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment