Created
November 24, 2017 15:30
-
-
Save kartoch/6935ab592eed72e623263d10227f9c5d to your computer and use it in GitHub Desktop.
Python @ Polytech'Lille - TP2
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# Goal: scrape the "Annuaire" web page of Polytech Lille to gather the list of
# all students and their ids (encoded in base 64). Print the number of
# entries found.
#
# You can use requests for scraping and re for extracting data from the page.
import base64 | |
import requests | |
import re | |
FILTER = '<a href=\'annuaire.php\?a=([\w=]+)\'>([^&]+)\ ([^<]+)<' | |
annuaire = {} | |
if __name__ == '__main__': | |
for c in range(ord('a'),ord('z')+1): | |
r = requests.post('http://www.polytech-lille.fr/annuaire.php', | |
data={'nom':chr(c)}) | |
groups = re.findall(FILTER, r.text) | |
for i in groups: | |
code = int(str(base64.b64decode(i[0]),'utf-8').partition('*')[0]) | |
firstname = i[1] | |
lastname = i[2] | |
if code not in annuaire: | |
annuaire[code] = (firstname,lastname) | |
print(chr(c) + " loaded") | |
print("number of entries: %i" % len(annuaire)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment