Created
July 25, 2018 13:27
-
-
Save aljiwala/aa805a1b3da090ecdd91dfb9ab519d30 to your computer and use it in GitHub Desktop.
Extract courses from http://education-india.in
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys
from time import sleep
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Cell texts that belong to the pager/summary rows rather than to a course.
_IGNORED_PREFIXES = ('Total Record', 'Showing Page No', '[First] [Prev]')


def _get_soup(url, timeout):
    """Fetch *url* and return its body parsed with BeautifulSoup's html.parser.

    Raises ``requests.RequestException`` on connection failure, timeout, or
    a non-success HTTP status, so an error page is never parsed as data.
    """
    resp = requests.get(url, timeout=timeout)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, 'html.parser')


def _course_short_name(detail_soup, course):
    """Return the short name from a course detail page, or None if absent.

    The header of the ``table.detail`` element has the course name and the
    literal ' Details' removed; the first and last characters of what is
    left are then dropped (presumably a separator/bracket pair -- confirm
    against the live page).
    """
    table = detail_soup.find('table', class_='detail')
    header = table.th if table is not None else None
    if header is None:
        return None
    text = header.text.strip()
    return text.replace(course, '').replace(' Details', '')[1:-1]


def _parse_row(row, base_url, timeout):
    """Turn one listing ``<tr>`` into a dict; empty dict for non-course rows."""
    record = {}
    for index, td in enumerate(row.find_all('td')):
        stripped = td.text.strip()
        if stripped.startswith(_IGNORED_PREFIXES):
            continue
        if index == 0:
            record['sr_no'] = stripped
        elif index == 1:
            record['course'] = stripped
            # A course cell without a link is kept, just without detail data.
            href = td.a.get('href', '') if td.a is not None else ''
            record['course_href'] = href
            if href:
                # urljoin also copes with absolute and root-relative hrefs.
                detail_soup = _get_soup(urljoin(base_url, href), timeout)
                short_name = _course_short_name(detail_soup, stripped)
                if short_name is not None:
                    record['course_sn'] = short_name
        elif index == 2:
            record['duration'] = stripped
        elif index == 3:
            record['eligibility'] = stripped
    return record


def extract_from_ed_india(pages=range(1, 32), delay=2, timeout=30):
    """Scrape the course listing of education-india.in.

    For every listing page in *pages* the third ``<table>`` of the document
    is read row by row; header rows (those containing a ``<th>``) are
    skipped.  Each remaining row may yield a dict with the keys ``sr_no``,
    ``course``, ``course_href``, ``course_sn``, ``duration`` and
    ``eligibility`` (a key is present only when its cell was found).

    Args:
        pages: Iterable of ``PageNumber`` values to request.
        delay: Seconds to sleep after each listing page.
        timeout: Per-request timeout in seconds passed to ``requests.get``.

    Returns:
        The list of extracted course dicts.  The list is also printed,
        preceded by three blank lines.

    Raises:
        requests.RequestException: If any page cannot be fetched.
    """
    base_url = 'http://education-india.in/Education/Courses/'
    final = []
    for page in pages:
        soup = _get_soup('{}?PageNumber={}'.format(base_url, page), timeout)
        tables = soup.find_all('table')
        if len(tables) > 2:
            for row in tables[2].find_all('tr'):
                if row.th:
                    continue
                record = _parse_row(row, base_url, timeout)
                if record:
                    print('sr_no: {}, extracted.'.format(record.get('sr_no')))
                    final.append(record)
        else:
            # Unexpected layout: report it instead of dying on an IndexError.
            print('page {}: course table not found, skipped.'.format(page))
        sleep(delay)

    print()
    print()
    print()
    print(final)
    return final
def main():
    """Run the full crawl, then terminate the interpreter.

    Raises:
        SystemExit: Always, once the crawl has finished.
    """
    extract_from_ed_india()
    # sys.exit() instead of the site-provided exit(): the latter is meant
    # for the interactive shell and is missing when site is not loaded.
    sys.exit()


# Only crawl when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment