A script to download videos from edX using aria2, automatically placing them in a clean folder hierarchy.
# A script that can automatically download videos from edX.
# Currently this is heavily tied to the way my edX account and my computer are
# set up. It downloads by sending the download URL and download directory
# to aria2 running in RPC mode (startup sketch below).
# More info here: http://aria2.sourceforge.net/manual/en/html/aria2c.html#rpc-interface
# You can use http://ziahamza.github.io/webui-aria2/ to see download progress.
# For now, parameters such as username, password, and which course to download
# are provided in the script; I intend to make it more flexible.
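# A minimal sketch of starting aria2 for this script (paths and extra flags
# are up to you; --enable-rpc and --rpc-listen-port are documented aria2c
# options, and 6800 is the port this script connects to):
#
#   aria2c --enable-rpc --rpc-listen-port=6800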
from __future__ import print_function

import os
import xmlrpclib
import cPickle as pickle  # Python 2; this script predates Python 3

import requests
from bs4 import BeautifulSoup

class Edx(object):
    BASE_URL = 'https://courses.edx.org'
    LOGIN_URL = BASE_URL + '/login_ajax'
    DASHBOARD_URL = BASE_URL + '/dashboard'

    def __init__(self, email, password):
        self.__session = requests.Session()
        s = self.__session
        # Fetch the login page first so the POST can carry a CSRF token.
        token = s.get(self.LOGIN_URL).cookies['csrftoken']
        s.post(self.LOGIN_URL, data={
            'email': email,
            'password': password,
            'remember': False,
        }, headers={
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:24'
                          '.0) Gecko/20100101 Firefox/24.0',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
            'Referer': self.LOGIN_URL,
            'X-Requested-With': 'XMLHttpRequest',
            'X-CSRFToken': token,
        })
        self.soup = BeautifulSoup(s.get(self.DASHBOARD_URL).text)
        self.courses = {}
        try:
            # Reuse previously scraped course data if a cache file exists.
            self.courses = pickle.load(open('data.p', 'rb'))
        except IOError:
            print("Cached data not available, downloading from scratch")
    def get_courses(self):
        if self.courses == {}:
            print("Loading Course Data")
            for course in self.soup.find_all('li', class_='course-item'):
                title = course.find('h3').text.strip()
                url = self.BASE_URL + course.article.a['href'].strip()
                self.courses[title] = {'url': url}
            for course in self.courses:
                print("Loading info for Course: %s" % course)
                self.get_course_info(course)
            pickle.dump(self.courses, open('data.p', 'wb'))
        return self.courses
    def get_course_info(self, course):
        course_data = self.courses.get(course)
        if 'chapters' in course_data:
            return course_data['chapters']
        s = self.__session
        # The courseware page (rather than the info page) lists the chapters.
        r = s.get(course_data['url'].replace('info', 'courseware'))
        soup = BeautifulSoup(r.text)
        chapters = {}
        for chapter in soup.find_all('div', class_='chapter'):
            title = chapter.h3.a.text.strip()
            url = self.BASE_URL + chapter.ul.li.a['href']
            chapters[title] = {'url': url, 'sections': {}}
            for section in chapter.find_all('li'):
                section_title = section.a.p.text.strip()
                section_url = self.BASE_URL + section.a['href']
                chapters[title]['sections'][section_title] = {
                    'url': section_url}
        course_data['chapters'] = chapters
        for chapter in chapters:
            self.get_chapter_info(course, chapter)
        return chapters
    def get_chapter_info(self, course, chapter):
        course_data = self.courses.get(course)
        chapter_data = course_data['chapters'][chapter]
        for section in chapter_data['sections']:
            self.get_section_info(course, chapter, section)
        return chapter_data
    def get_section_info(self, course, chapter, section):
        course_data = self.courses.get(course)
        chapter_data = course_data['chapters'][chapter]
        section_data = chapter_data['sections'][section]
        s = self.__session
        r = s.get(section_data['url'])
        soup = BeautifulSoup(r.text)
        videos = []
        # Each unit's markup arrives as escaped HTML inside the
        # 'seq_contents' divs, so it has to be parsed a second time.
        for content in soup.find_all('div', class_='seq_contents'):
            resoup = BeautifulSoup(content.text)
            for video in resoup.find_all('li', class_='video-sources'):
                videos.append(video.a['href'])
        section_data['videos'] = videos
        return section_data
    def download_all_from_section(self, course, chapter, section):
        course_data = self.courses.get(course)
        chapter_data = course_data['chapters'][chapter]
        section_data = chapter_data['sections'][section]
        s = xmlrpclib.ServerProxy("http://localhost:6800/rpc")
        for i, video in enumerate(section_data['videos']):
            # aria2's RPC interface does not expand '~', so expand it here.
            # Each video gets a numbered leaf directory of its own.
            s.aria2.addUri([video], {'dir': os.path.expanduser(
                '~/Studies/EdX/'
                + course + '/'
                + chapter + '/'
                + section + '/'
                + str(i) + '/')})
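
    # Progress can also be watched over the same RPC endpoint; a minimal
    # sketch (aria2.tellActive is part of aria2's documented RPC API):
    #
    #   s = xmlrpclib.ServerProxy("http://localhost:6800/rpc")
    #   for dl in s.aria2.tellActive():
    #       print(dl['gid'], dl['completedLength'], dl['totalLength'])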
    def download_all_from_chapter(self, course, chapter):
        course_data = self.courses.get(course)
        chapter_data = course_data['chapters'][chapter]
        for section in chapter_data['sections']:
            self.download_all_from_section(course, chapter, section)

    def download_all_from_course(self, course):
        course_data = self.courses.get(course)
        for chapter in course_data['chapters']:
            self.download_all_from_chapter(course, chapter)

    def download_all(self):
        for course in self.courses:
            self.download_all_from_course(course)

e = Edx('email', 'password')  # replace with your edX credentials
e.get_courses()
# Full course name, e.g. Louv1.01x Paradigms of Computer Programming
e.download_all_from_course('full course name')
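
# A minimal sketch of a sanity check before queueing anything: confirm the
# aria2 RPC server is reachable (aria2.getVersion is a documented RPC method;
# localhost:6800 matches the endpoint assumed throughout this script):
#
#   rpc = xmlrpclib.ServerProxy("http://localhost:6800/rpc")
#   print(rpc.aria2.getVersion()['version'])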
Hi, does this code support the download of slides? Thanks.