Last active
December 29, 2015 05:09
-
-
Save verespej/7619547 to your computer and use it in GitHub Desktop.
Scrape course description from UW CSE course page and store as JSON. Implemented using node.js.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var cheerio = require('cheerio'); | |
var http = require('http'); | |
var fs = require('fs'); | |
var url = require('url'); | |
// Catalog page that lists every UW CSE course; also used as the base URL
// when resolving relative links found on the page.
var pageUrl = 'http://www.washington.edu/students/crscat/cse.html';

// Download the catalog page, buffer the whole body, then hand the loaded
// document to parseCoursePage.
http.get(pageUrl, function(res) {
  // Only a 200 response carries the catalog markup. Anything else (redirect,
  // error page) would silently produce an empty or bogus result file.
  if (res.statusCode !== 200) {
    console.error('Request for ' + pageUrl + ' failed with HTTP status ' + res.statusCode);
    // Drain the response so the socket is released.
    res.resume();
    return;
  }

  res.setEncoding('utf8');
  var data = '';
  res.on('data', function(chunk) {
    data += chunk;
  });
  res.on('error', function(err) {
    console.error('Error while reading ' + pageUrl + ': ' + err.message);
  });
  res.on('end', function() {
    parseCoursePage(cheerio.load(data), pageUrl);
  });
}).on('error', function(err) {
  // Without this handler a DNS/connection failure surfaces as an unhandled
  // 'error' event and crashes the process with a raw stack trace.
  console.error('Failed to fetch ' + pageUrl + ': ' + err.message);
});
/**
 * Split the title text that follows a course id into name, credits and
 * areas-of-knowledge codes.
 *
 * The text is expected to look like "Course Name (credits) CODE, CODE".
 * When no parenthesized group is found the whole text is returned as the
 * name, credits is 'unspecified' and the code list is empty.
 *
 * @param {string} titleText Title text with the course id already removed.
 * @returns {{name: string, credits: string, aok: string[]}} Untrimmed name
 *     and credits, plus the trimmed, non-empty comma-separated codes.
 */
function splitCourseTitle(titleText) {
  var parts = { name: titleText, credits: 'unspecified', aok: [] };

  var creditsStart = titleText.indexOf('(');
  // Search for the closing paren only after the opening one, so a stray ')'
  // earlier in the title cannot hide the credits group.
  var creditsEnd = creditsStart >= 0 ? titleText.indexOf(')', creditsStart + 1) : -1;
  if (creditsEnd < 0) {
    return parts;
  }

  parts.credits = titleText.substring(creditsStart + 1, creditsEnd);
  parts.aok = titleText.substring(creditsEnd + 1).split(',')
    .map(function(code) { return code.trim(); })
    .filter(function(code) { return code.length > 0; });
  // Cut exactly at the '('; the caller trims, so a name with no space before
  // the paren keeps its last character.
  parts.name = titleText.substring(0, creditsStart);
  return parts;
}

/**
 * Pull the prerequisite sentence out of a course description.
 *
 * @param {string} description Course description text.
 * @returns {string} Text between "Prerequisite:" and the sentence-ending
 *     period, with whitespace runs collapsed, or 'None' when absent.
 */
function extractPrerequisites(description) {
  // [\s\S] lets the sentence span line breaks; (?!\d) keeps a decimal point
  // such as "2.0" from being mistaken for the end of the sentence.
  var match = /Prerequisite:([\s\S]*?)\.(?!\d)/.exec(description);
  if (match === null) {
    return 'None';
  }
  return match[1].trim().replace(/\s+/g, ' ');
}

/**
 * Build one entry per named anchor on the course page and pass the list to
 * saveResults, which writes it to './courses.json'.
 *
 * For each `a[name]` element the anchor text is taken as the course id, the
 * rest of the parent's text as the title, and the rest of the grandparent's
 * text as the description. Links are the `a` children of the grandparent.
 *
 * @param {Function} $ Document query function (the result of cheerio.load).
 * @param {string} rootUrl Base URL used to resolve relative link hrefs.
 */
function parseCoursePage($, rootUrl) {
  var entries = [];

  $('a[name]').each(function(index, elm) {
    var anchor = $(elm);
    var heading = anchor.parent();
    var container = heading.parent();

    var courseId = anchor.text();
    var titleText = heading.text().substring(courseId.length);
    var courseDescription = container.text().substring(
      courseId.length + titleText.length
    );
    var title = splitCourseTitle(titleText);

    // Collect the links that sit directly under the course container.
    var courseLinks = [];
    container.children('a').each(function(linkIndex, linkObj) {
      var href = $(linkObj).attr('href');
      if (href != null && href.length > 0) {
        courseLinks.push(url.resolve(rootUrl, href.trim()));
      }
    });

    entries.push({
      id: courseId.trim(),
      name: title.name.trim(),
      credits: title.credits.trim(),
      aok: title.aok,
      description: courseDescription.trim(),
      links: courseLinks,
      prereqs: extractPrerequisites(courseDescription)
    });
  });

  saveResults(entries, './courses.json');
}
/**
 * Serialize the results as pretty-printed JSON (2-space indent) and write
 * them to dest as UTF-8, replacing any existing file.
 *
 * The write is synchronous: the file is complete when this returns, the file
 * handle is closed, and a failed write throws instead of emitting an
 * unhandled stream 'error' event.
 *
 * @param {*} results Value to serialize (the list of course entries).
 * @param {string} dest Path of the file to write.
 */
function saveResults(results, dest) {
  console.log('Saving results to ' + dest);
  fs.writeFileSync(dest, JSON.stringify(results, null, 2), { encoding: 'utf8' });
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{
  "name": "uw-cse-course-scraper",
  "description": "Scrape UW CSE courses and put them in a structured format",
  "author": "Hakon Verespej <[email protected]>",
  "dependencies": {
    "cheerio": "0.12.4"
  },
  "devDependencies": {
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
UW CSE Course Scraper
This code contains a Node.js JavaScript program that scrapes and parses the UW CSE course catalog provided at http://www.washington.edu/students/crscat/cse.html. The result is a JSON document containing the parsed content in a structured format.
To use,