Last active
August 29, 2015 14:06
-
-
Save markjgap/1513dbaf52b6d666b425 to your computer and use it in GitHub Desktop.
Written in Python, this program scrapes data from multiple web pages, searches for strings matching a pattern using regular expressions, and makes the necessary modifications so the data can be alphabetized.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os, re # re imports regular expressions | |
# Pages dumped by lynx, in the order they are fetched. The first page
# truncates the output file; every later page is appended to it.
JOB_PAGE_URLS = (
    'http://www.careercast.com/jobs-rated/jobs-rated-2014-ranking-200-jobs-best-worst',
    'http://www.careercast.com/content/top-200-jobs-2014-21-40',
    'http://www.careercast.com/content/top-200-jobs-2014-41-60',
    'http://www.careercast.com/content/top-200-jobs-2014-61-80',
    'http://www.careercast.com/content/top-200-jobs-2014-81-100',
    'http://www.careercast.com/content/top-200-jobs-2014-101-120',
    'http://www.careercast.com/content/top-200-jobs-2014-121-140',
    'http://www.careercast.com/content/top-200-jobs-2014-141-160',
    'http://www.careercast.com/content/top-200-jobs-2014-161-180',
    'http://www.careercast.com/content/top-200-jobs-2014-181-200',
)

# File that first receives the raw page dumps and is then overwritten with
# the alphabetized job list.
JOB_LIST_FILE = 'job_list.txt'

# Matches 1 to 3 digits followed by a period, one space, a word character and
# at least one more character, i.e. "100. The String!".
# Group 1 is the number, group 2 is the text that follows it.
RANKED_JOB_RE = re.compile(r'(\d{1,3})\. (\w.+)')


def fetch_job_pages(urls=JOB_PAGE_URLS, path=JOB_LIST_FILE):
    """Dump every page in *urls* as text into *path* using lynx.

    The first page overwrites *path* ('>'); the remaining pages are appended
    ('>>'). The exit status of lynx is not checked.
    """
    for index, url in enumerate(urls):
        redirect = '>' if index == 0 else '>>'
        # The command is built only from the constants above, so passing it
        # through the shell is acceptable here.
        os.system('lynx -nolist -dump {0} {1} {2}'.format(url, redirect, path))


def alphabetize_jobs(lines):
    """Return the ranked jobs found in *lines* as a sorted list of strings.

    Each line is searched for the first "<number>. <title>" occurrence; lines
    without one are ignored. Every hit is rewritten as "<title> <number> \\n"
    so that sorting the strings orders the entries by title.

    Only the first ". " separates the number from the title, so a title that
    itself contains ". " (e.g. "Sr. Engineer") is kept whole.
    """
    jobs = []
    for line in lines:
        match = RANKED_JOB_RE.search(line)
        if match:
            rank, title = match.groups()
            jobs.append(title + " " + rank + " \n")
    jobs.sort()
    return jobs


def main():
    """Scrape the job pages and replace the dump with the alphabetized list."""
    fetch_job_pages()
    # Read the whole dump and close the file before reopening it for writing.
    with open(JOB_LIST_FILE, 'r') as dump_file:
        lines = dump_file.readlines()
    # Overwrite the raw dump with the alphabetized jobs list.
    with open(JOB_LIST_FILE, 'w') as out_file:
        out_file.writelines(alphabetize_jobs(lines))


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment