Skip to content

Instantly share code, notes, and snippets.

@xitij2000
Created March 9, 2014 20:40
Show Gist options
  • Save xitij2000/9454233 to your computer and use it in GitHub Desktop.
Save xitij2000/9454233 to your computer and use it in GitHub Desktop.
A script to download videos from EdX using aria2 and automatically put them in a clean folder hierarchy.
# A script that can automatically download videos from Edx
# Currently this is heavily tied to the way my Edx account and my computer are
# set up. It downloads by sending the download URL and download directory
# to aria2 running in RPC mode.
# More info here: http://aria2.sourceforge.net/manual/en/html/aria2c.html#rpc-interface
# You can use http://ziahamza.github.io/webui-aria2/ to see download progress
# For now, parameters such as the username, password, and which course to
# download must be set directly in the script.
# I intend to make it more flexible.
from __future__ import print_function
import xmlrpclib
import requests
from bs4 import BeautifulSoup
class Edx(object):
    """Scrape an edX account and queue its course videos in aria2.

    On construction the client logs in, fetches the dashboard page and
    tries to load a previously pickled course index from ``CACHE_FILE``.
    The index is a nested dict::

        {course_title: {'url': ..., 'chapters': {
            chapter_title: {'url': ..., 'sections': {
                section_title: {'url': ..., 'videos': [url, ...]}}}}}}

    Downloads are not performed by this class; each video URL is handed
    to an aria2 instance over XML-RPC together with a target directory.
    """

    BASE_URL = 'https://courses.edx.org'
    LOGIN_URL = BASE_URL + '/login_ajax'
    DASHBOARD_URL = BASE_URL + '/dashboard'
    # Pickle file holding the scraped course index between runs.
    CACHE_FILE = 'data.p'
    # XML-RPC endpoint of the aria2 instance that performs the downloads.
    ARIA2_RPC_URL = 'http://localhost:6800/rpc'
    # Root of the download folder hierarchy (passed to aria2 verbatim).
    DOWNLOAD_ROOT = '~/Studies/EdX'

    def __init__(self, email, password):
        """Log in with *email* / *password* and load the cached index.

        If the cache file is missing or unreadable, ``self.courses`` is
        left empty so that ``get_courses`` scrapes everything again.
        """
        self.__session = requests.Session()
        session = self.__session
        token = session.get(self.LOGIN_URL).cookies['csrftoken']
        # NOTE(review): the login response is not inspected, so a failed
        # login is not reported here -- consider checking it.
        session.post(self.LOGIN_URL, data={
            'email': email,
            'password': password,
            'remember': False,
        }, headers={
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:24'
                          '.0) Gecko/20100101 Firefox/24.0',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
            'Referer': self.LOGIN_URL,
            'X-Requested-With': 'XMLHttpRequest',
            'X-CSRFToken': token,
        })
        self.soup = BeautifulSoup(session.get(self.DASHBOARD_URL).text)
        self.courses = {}
        try:
            # NOTE: unpickling can execute arbitrary code; only load a cache
            # file that this script wrote itself.
            with open(self.CACHE_FILE, 'rb') as cache:
                self.courses = pickle.load(cache)
        except (IOError, EOFError, pickle.UnpicklingError):
            # Missing, truncated or corrupt cache: fall back to scraping.
            print("Cached data not available, downloading from scratch")

    @staticmethod
    def _courseware_url(info_url):
        """Return *info_url* with its last ``info`` turned into ``courseware``.

        Only the final occurrence is replaced so that an ``info`` appearing
        earlier in the URL (for example inside a course id) is preserved.
        URLs that do not contain ``info`` are returned unchanged.
        """
        head, found, tail = info_url.rpartition('info')
        if not found:
            return info_url
        return head + 'courseware' + tail

    def get_courses(self):
        """Return the course index, scraping and caching it when empty.

        When no cached data was loaded, the dashboard is parsed for
        courses, every course is expanded down to its video URLs, and the
        result is pickled to ``CACHE_FILE``.
        """
        if self.courses:
            return self.courses

        print("Loading Course Data")
        for item in self.soup.find_all('li', class_='course-item'):
            title = item.find('h3').text.strip()
            url = self.BASE_URL + item.article.a['href'].strip()
            self.courses[title] = {'url': url}
        for course in self.courses:
            print("Loading info for Course: %s" % course)
            self.get_course_info(course)
        with open(self.CACHE_FILE, 'wb') as cache:
            pickle.dump(self.courses, cache)
        return self.courses

    def get_course_info(self, course):
        """Return the chapters dict of *course*, scraping it if needed.

        Already scraped chapters are returned as-is. Otherwise the
        courseware page is parsed for chapters and sections, and every
        chapter is expanded via ``get_chapter_info``.

        Raises ``KeyError`` if *course* is not in ``self.courses``.
        """
        course_data = self.courses[course]
        if 'chapters' in course_data:
            return course_data['chapters']

        response = self.__session.get(
            self._courseware_url(course_data['url']))
        page = BeautifulSoup(response.text)
        chapters = {}
        for chapter in page.find_all('div', class_='chapter'):
            title = chapter.h3.a.text.strip()
            sections = {}
            for section in chapter.find_all('li'):
                section_title = section.a.p.text.strip()
                sections[section_title] = {
                    'url': self.BASE_URL + section.a['href']}
            chapters[title] = {
                # The chapter URL is the link of its first section.
                'url': self.BASE_URL + chapter.ul.li.a['href'],
                'sections': sections,
            }
        course_data['chapters'] = chapters
        for chapter in chapters:
            self.get_chapter_info(course, chapter)
        return chapters

    def get_chapter_info(self, course, chapter):
        """Collect video URLs for every section of *chapter*.

        Returns the chapter's dict from the course index.
        """
        chapter_data = self.courses[course]['chapters'][chapter]
        for section in chapter_data['sections']:
            self.get_section_info(course, chapter, section)
        return chapter_data

    def get_section_info(self, course, chapter, section):
        """Fetch the section page and store its video URLs.

        The list is stored under the section's ``'videos'`` key; the
        section's dict is returned.
        """
        section_data = (
            self.courses[course]['chapters'][chapter]['sections'][section])
        response = self.__session.get(section_data['url'])
        page = BeautifulSoup(response.text)
        videos = []
        for content in page.find_all('div', class_='seq_contents'):
            # The text of each 'seq_contents' div is parsed as HTML again
            # to reach the video links inside it.
            inner = BeautifulSoup(content.text)
            for source in inner.find_all('li', class_='video-sources'):
                videos.append(source.a['href'])
        section_data['videos'] = videos
        return section_data

    def download_all_from_section(self, course, chapter, section):
        """Queue every video of *section* in aria2.

        Video number ``i`` is downloaded into
        ``DOWNLOAD_ROOT/<course>/<chapter>/<section>/<i>/``.
        """
        section_data = (
            self.courses[course]['chapters'][chapter]['sections'][section])
        videos = section_data['videos']
        if not videos:
            return
        # One proxy serves all videos of the section.
        server = xmlrpclib.ServerProxy(self.ARIA2_RPC_URL)
        for index, video in enumerate(videos):
            target_dir = '/'.join(
                [self.DOWNLOAD_ROOT, course, chapter, section, str(index)]
            ) + '/'
            server.aria2.addUri([video], {'dir': target_dir})

    def download_all_from_chapter(self, course, chapter):
        """Queue the videos of every section of *chapter*."""
        chapter_data = self.courses[course]['chapters'][chapter]
        for section in chapter_data['sections']:
            self.download_all_from_section(course, chapter, section)

    def download_all_from_course(self, course):
        """Queue the videos of every chapter of *course*."""
        for chapter in self.courses[course]['chapters']:
            self.download_all_from_chapter(course, chapter)

    def download_all(self):
        """Queue the videos of every course in the index."""
        for course in self.courses:
            self.download_all_from_course(course)
import cPickle as pickle
# Run the download only when executed as a script, so that importing this
# module does not log in or queue anything.
if __name__ == '__main__':
    # Replace the placeholders with real account credentials.
    e = Edx('email', 'password')
    # Build the course index (or reuse the cached one).
    e.get_courses()
    # Full course name, e.g. Louv1.01x Paradigms of Computer Programming
    e.download_all_from_course('full course name')
@nittanygabo
Copy link

Hi, does this code support the download of slides? Thanks.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment