Last active
February 3, 2019 17:40
-
-
Save edison12a/7dccdb78a9979807d13a6c661a868332 to your computer and use it in GitHub Desktop.
Code For: https://medium.com/@simicode/how-to-mine-uce-results-of-the-whole-country-using-python-f72092ceaf6d
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# if you want to pull data from an API, requests is your friend | |
import requests | |
# BeautifulSoup helps us extract data from html, xml, .. | |
from bs4 import BeautifulSoup | |
# this is the type of strings extracted from html | |
from bs4.element import NavigableString | |
# Every student's parsed results, one dict per candidate, e.g.
# {'ENG': '4', 'LIT': '7', 'HIS': '4', ...}. Filled in by main().
ug_results = []

# Exclusive upper bounds for the index numbers that get tried: the loops in
# main() run over 1 .. schs-1 and 1 .. stds-1. Set these to a big number like
# 1000 or 10000 -- any number that makes sense as an index number.
schs = 10  # school index numbers tried: 1 .. schs-1
stds = 100  # student index numbers tried per school: 1 .. stds-1

# Endpoint and headers copied from the browser's request; they never change
# between candidates, so they are built once instead of on every iteration.
RESULTS_URL = "https://ereg.uneb.ac.ug/ajax_calls/results_status"
REQUEST_HEADERS = {
    "accept": "*/*",
    "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
    "x-requested-with": "XMLHttpRequest",
    'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1',
}
# Seconds to wait for the server; without a timeout a single stalled
# connection would block the whole scan forever.
REQUEST_TIMEOUT = 30


def build_form_data(sch, std):
    """Return the url-encoded POST body for one candidate index number.

    ``sch`` and ``std`` are integers. They are zero-padded to at least four
    and three digits respectively (never truncated) and joined with an
    encoded "/" (``%2F``), e.g. ``build_form_data(1, 1)`` gives
    ``"index_no=u0001%2F001"``.
    """
    return f"index_no=u{sch:04d}%2F{std:03d}"


def parse_subject_grade(text):
    """Split a string such as ``"ENG : 2"`` into ``("ENG", "2")``.

    Surrounding whitespace is ignored. Strings holding a subject and a grade
    have a length of exactly 7, so anything else -- or a 7-character string
    without the ``" : "`` separator -- returns ``None`` instead of raising.
    """
    text = text.strip()
    if len(text) != 7:
        return None
    subject, separator, grade = text.partition(' : ')
    if not separator:
        return None
    return subject, grade


def extract_results(html):
    """Parse one response page into a ``{subject: grade}`` dict.

    Returns ``None`` when the page has no element with class ``col-md-10``
    (the div that contains the results), which is what an invalid index
    number produces. A page that has the div but no subject lines yields an
    empty dict.
    """
    soup = BeautifulSoup(html, 'html.parser')
    container = soup.find(class_='col-md-10')
    if container is None:
        return None

    student_results = {}
    # The results are bare strings separated by <br> tags, so walk the div's
    # direct children and keep only the text nodes.
    for node in container.children:
        # Exact type check on purpose: isinstance() would also accept
        # NavigableString subclasses such as HTML comments.
        if type(node) is not NavigableString:
            continue
        parsed = parse_subject_grade(str(node))
        if parsed is None:
            continue
        subject, grade = parsed
        student_results[subject] = grade
    return student_results


def fetch_results(sch, std):
    """POST one candidate index number and return its parsed results.

    Returns the dict built by ``extract_results`` or ``None`` when the page
    carries no results. Network problems surface as
    ``requests.RequestException``.
    """
    response = requests.post(
        RESULTS_URL,
        headers=REQUEST_HEADERS,
        data=build_form_data(sch, std),
        timeout=REQUEST_TIMEOUT,
    )
    return extract_results(response.text)


def main():
    """Try every school/student index number and collect the results."""
    # loop over possible school index numbers
    for sch in range(1, schs):
        # loop over possible student index numbers from that school
        for std in range(1, stds):
            try:
                student_results = fetch_results(sch, std)
            except requests.RequestException as e:
                # A failed request only costs this one candidate; report it
                # and carry on with the next index number.
                print(str(e))
                continue
            if student_results is None:
                # invalid index number: nothing to record
                print(f"no results for u{sch:04d}/{std:03d}")
                continue
            print(student_results)  # {'ENG': '4', 'LIT': '7', 'HIS': '4', 'GEO': '5', 'MAT': '6', 'PHY': '6', 'CHE': '7', 'BIO': '6', 'COM': '6', 'CST': '7'}
            # add results to list
            ug_results.append(student_results)
    # do what you want with your results, Have fun!
    print(ug_results)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment