Created
July 30, 2014 13:46
-
-
Save bnmnetp/82e0e287a09580431a59 to your computer and use it in GitHub Desktop.
interactivepython database cleanup
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "", | |
"signature": "sha256:962bc8e068d054a027f1195730bba2115cb820266aabc57ae123f73d5294650f" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Data Cleanup Notebook\n", | |
"\n", | |
"Some thoughts before I get started with this. \n", | |
" \n", | |
" * Start by removing short term users. Recalculate for repeatability\n", | |
" \n", | |
" * Remove blank SIDs, instructors, and test courses (gatech,devcourse, and lightly used courses) \n", | |
" \n", | |
" * Remove nontraditional pages (those not begining with courselib or runestone \n", | |
" \n", | |
" * Remove unofficial div_ids\n", | |
" \n", | |
" * Remove outliers. Recalculate them for repeatability\n", | |
" \n", | |
" * Replace runestone with courselib\n", | |
" \n", | |
" * Replace mChoice with mchoice for consistency\n", | |
" \n", | |
" \n", | |
"THINGS YOU NEED BEFORE RUNNING THIS NOTEBOOK:\n", | |
"\n", | |
"Tables:\n", | |
" * useinfo (or something similar)\n", | |
" * courses\n", | |
" \n", | |
"Python Scripts:\n", | |
" * bookParse.py\n", | |
" * getDivs.py\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import getDivs\n", | |
"import bookParse as bp\n", | |
"from getInstructors import getInstructors\n", | |
"import re\n", | |
"import pandas as pd\n", | |
"import psycopg2\n", | |
"import socket\n", | |
"import numpy as np\n", | |
"import pandas.io.sql as psql\n", | |
"from pandas import DataFrame\n", | |
"from sqlalchemy import create_engine\n", | |
"\n", | |
"db_to_clean = 'bmiller'\n", | |
"db_user = 'bmiller'\n", | |
"db_pass = ''\n", | |
"\n", | |
"if db_pass:\n", | |
" engine2 = create_engine('postgresql+psycopg2://%s:%s@localhost/%s' % (db_user, db_pass, db_to_clean))\n", | |
" conn = psycopg2.connect(host=\"localhost\",database=db_to_clean,user=db_user,password=db_pass)\n", | |
"else:\n", | |
" engine2 = create_engine('postgresql+psycopg2://%s@localhost/%s' % (db_user, db_to_clean))\n", | |
" conn = psycopg2.connect(host=\"localhost\",database=db_to_clean,user=db_user)\n", | |
"\n", | |
"connection2 = engine2.connect()\n", | |
"\n", | |
"\n", | |
"%load_ext sql \n", | |
"%sql postgresql://bmiller@localhost/bmiller" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"The sql extension is already loaded. To reload it, use:\n", | |
" %reload_ext sql\n" | |
] | |
}, | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 47, | |
"text": [ | |
"'Connected: bmiller@bmiller'" | |
] | |
} | |
], | |
"prompt_number": 47 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": [ | |
"Remove non-traditional pages, blank sids, and instructors" | |
] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 6, | |
"metadata": {}, | |
"source": [ | |
"Before running the following query, make sure to insert the most recent list of instructors." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"%%sql\n", | |
"DELETE FROM useinfo\n", | |
"WHERE (event = 'page' AND NOT (div_id ~ '^/(courselib|runestone).*')) --#non-traditional pages\n", | |
"OR sid = '' --#blank sids\n", | |
"OR sid IN ('DaveCousins', '01418116', 'sbcmharrison', 'mrcpittway', 'paolobianconi', 'OrganizeFISH', 'brekke', 'haugrudm', 'fnurl', 'Martin Highmore', 'gisc', 'jstephens', 'pguse', 'klaidley', 'Nobula', 'ivipul', 'dgyanek', 'Nicholas Seward', 'asahny', 'chorse', 'rschulz', 'nedoluzhko', 'sara', 'memedu', 'swbcteach', 'bula', 'pbeens', 'wmp', 'potterwalt', 'walkermsum', 'acsizmadia', 'the.krup', 'the_bob_student', 'the_bob', 'jgreenawalt', 'Mcintoshchs', 'gkondrak', 'a.reid', 'mckeownp', 'timbellnz', 'quintin', 'rahmani', 'tiverson', 'gccktaira', 'vtn2', 'bmiller', 'rosew', 'jbschafer', 'keen', '[email protected]', 'as2653', 'RKumar', 'dcaswell', 'dzolzer', 'gvrinten', 'minsu', 'mzmacky2', 'ktaira', 'rkowaney', 'WarrenPettitt', 'tompkins', 'tompkinsJack', 'HullSmurf001', 'tagliarinig', 'CSC131tompkins', 'vellinga', 'pearcej', 'rambasnet', 'mdogy', 'cmurakami', 'ficekr', 'ficek', 'wilf.lendrum', 'shlomor', 'jazcap', 'dlovell', 'alter718', 'polarlemniscate', 'acbart', 'pmisterovich', 'CheeseTurtle', 'mr.octavio', 'mirita81', 'shyampurk', 'robertcduvall', 'ludaesch', 'rpugh', 'khelson1', 'emcpPYTHON', 'horacers', 'JohnDoorenbos', 'dunfeer', 'techplex.engineer', 'misa.tesi', 'kwhillans', 'dr_bwilson', 'r70smith', 'tand', 'bschormann', 'traveltcg', 'mridgway', 'hhtsimpson', 'bmoening', 'mhilland', 'pam.meier', '[email protected]', 'bokhariz', 'MrYantho', 'ine5201', 'avunque', 'rasika', 'dtgreen', 'camingus', 'hogue', 'tstrat', 'stormeb', 'lwchadd', 'xty.dsouza', 'elevel', 'ThaddeusAid', 'davidUrr', 'hitoshi', 'mark.schmitt', 'eallatta', 'mfitzpatrick', 'vcowal', 'salihtuncer', 'warrenpettitt', 'pylearner', 'william.levi.collins', 'Barrie', 'Jacob777', 'vfxpraveen117', 'aoteatech', 'nask.meerpaal', 'jsolin', 'd.giordan', 'tmoffdawg', 'scmreason', 'Mr_Gray', 'case.andrew', 'trampgeek', 'niroshseven', 'decadous', 'MrBate', 'stcrispin', 'mieke.gorissen', 'englert.ervin', 'ericsonga', 'mithron', 'numodica', 'Mathad19', 'JenksW', 'LonnekeDriessen', 'kwaljee', 'larry', 'pentrium', 'moffdawg', 'maxmori77', 'SandeepShelke', 'davelai', 'tommrrtt', 'spock_tpol', 'ekeoid', 'BantamJoe', 'alter718_2', 'MegW', 'elanus', 'cn.gheorghetiteica', 'yoyo_available', 'sdmcgee', 'rdotsch', 'gjvoigt', 'gmacmanus', 'mharrison', 'jfemiani', 'femiani', 'kwright', 'dekrwright', 'jenniferchoffmann', 'mrschellenberg', 'The Salty Seaman', 'shane.dicks1', 'mleblanc', 'ProfNorman', 'metrpy', 'gregmax', 'ironwk85', 'hannutam', 'andrewgtp', 'realblurgh', 'madhavjha', 'acase', 'mchen53', 'jgilgen', 'ofenerci', 'JackTompkins', 'mdeakyne', 'spsofficers', 'comqsph', 'SteveTanimoto', 'jiffp', 'rkumar', 'Cmcguire', 'jheckelsmiller', 'OvidioMarinho', 'dianneoc', 'AppylPye', 'cwormsl2', 'kidbookrev', 'bic', 'afseTestAccount', 'vivek.lpc', '10081453', 'kaspermunch', 'supamrchen', 'johnkershaw', 'mesri', 'jfall', 'cheriegilmore', 'cewing', 'beth-mac', 'mcraig', 'jspacco', 'ranum', 'bcooper', 'xuxiaomin19892', 'cryptogoth', 'rowzer', 'allen', 'isaacdl', 'james adams', 'sciberteach', 'perry.fizzano', 'Green1313', 'cmheisel', 'wavemind', 'steven_king', 'assault', 'jgb', 'komo', '[email protected]', 'FEdericoSOmaschini', 'david236', 'zarial01', 'miguser1', 'kev_jones', 'JuYoungKim', 'garyphillips', 'jasonlanda', 'kfkennedy', 'holcma01', '[email protected]', 'johnpeeb', 'eudaimonious', 'VineetaSingh', 'rmueller', 'kash41', 'rtindell', 'mike.farfaraway', 'vilipold', 'weihongw', 'tonisvahi', 'rafaja', 'aliev', 'zacharyblackwood', 'smckennitt', 'drtech', 'bnminstruct', 'elliott', '[email protected]', 'CameronA', 'nicokie', 'volcan', 'matthewvenn', 'ttttamaki', '[email protected]', 'mpearse', 'gabrinerd', 'danschellenberg', 'alnitak')\n", | |
"--#list of instructors\n", | |
";\n", | |
"\n", | |
"COMMIT;" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"97999 rows affected.\n", | |
"Done.\n" | |
] | |
}, | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 48, | |
"text": [ | |
"[]" | |
] | |
} | |
], | |
"prompt_number": 48 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": [ | |
"Remove known test courses. " | |
] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 6, | |
"metadata": {}, | |
"source": [ | |
"Currently limited to just gatch and devcourse. Will revise when known test courses has been updated." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"%%sql\n", | |
"DELETE FROM useinfo\n", | |
"WHERE course_id IN ('gatech', 'devcourse','overview');\n", | |
"\n", | |
"COMMIT;" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"106732 rows affected.\n", | |
"Done.\n" | |
] | |
}, | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 49, | |
"text": [ | |
"[]" | |
] | |
} | |
], | |
"prompt_number": 49 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": [ | |
"Remove unofficial div_ids" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"pythondsDivInfo = getDivs.getDivInfo('pythonds')\n", | |
"pythondsDivs = pythondsDivInfo['div_id']\n", | |
"str(pythondsDivs) #NOTE: I formatted as a string so I could copy and paste it more easily. Uncomment to get the list" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 50, | |
"text": [ | |
"\"['active5', 'active6', 'active7', 'analysis_1', 'analysis_2', 'analysis_3', 'findMinVid', 'mcpyperform', 'mcpydictperf', 'pythonopsperf', 'active1', 'active2', 'active3', 'divby2', 'baseconvert', 'baseconvert1', 'baseconvert2', 'baseconvert3', 'video_Stack2', 'deqtest', 'ququeuetest', 'queue_1', 'stack_1ac', 'stack_ex_1', 'stack_cl_1', 'stack_1', 'stack_2', 'stack_stringrev', 'stack1_video', 'orderedlistclass', 'unorderedlistcomplete', 'self_check_list1', 'self_check_list2', 'intopost', 'postfixeval', 'postfix1', 'postfix2', 'postfix3', 'video_Stack3', 'palchecker', 'parcheck1', 'qujosephussim', 'qumainsim', 'print_sim_selfcheck', 'intro_8', 'self_check_1', 'list_unique', 'self_check_2', 'listcomp', 'monkeyvid', 'intro_1', 'intro_2', 'intro_3', 'intro_5', 'intro_7', 'scratch_01_01', 'strstuff', 'gcd_cl', 'fraction_class', 'self_check_4', 'fraction', 'complete_cuircuit', 'self_check_5', 'logicgates', 'lst_itsum', 'lst_recsum', 'lst_rectostr', 'recursion_sc_1', 'recursion_sc_2', 'chp12_koch', 'lst_change2', 'lst_dpremember', 'completemaze', 'chp11_recursivesum', 'recursion_scratch_1', 'lst_st', 'lst_recstack', 'question_recsimp_1', 'question_recsimp_2 ', 'hanoi', 'lst_turt1', 'lst_complete_tree', 'recursion_sc_3', 'HASH_1', 'HASH_2', 'hashtablecomplete', 'search3', 'search4', 'BSRCH_1', 'BSRCH_2', 'lst_bubble', 'bubble_anim', 'bubbletrace', 'lst_shortbubble', 'shortbubbletrace', 'question_sort_1', 'lst_insertion', 'insertion_anim', 'insertionsortcodetrace', 'question_sort_3', 'lst_merge', 'merge_anim', 'mergetrace', 'question_sort_5', 'question_sort_6', 'lst_quick', 'quick_anim', 'quicktrace', 'question_sort_7', 'question_sort_8', 'question_sort_9', 'lst_selectionsortcode', 'selection_anim', 'selectionsortcodetrace', 'question_sort_2', 'search1', 'search2', 'question_SRCH_1', 'question_SRCH_2', 'lst_shellSort', 'shell_anim', 'shellSorttrace', 'question_sort_4', 'completeheap', 'heap1', 'tree_list1', 'bin_tree', 'mctree_1', 'mctree_2', 'bintree', 'mctree_3', 'parsebuild', 'bst_1', 'completebstcode']\"" | |
] | |
} | |
], | |
"prompt_number": 50 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"%%sql\n", | |
"DELETE FROM useinfo\n", | |
"WHERE id in \n", | |
"(\n", | |
"SELECT useinfo.id FROM \n", | |
"useinfo\n", | |
"JOIN bookcourses\n", | |
"ON (useinfo.course_id = bookcourses.course_name)\n", | |
"WHERE \n", | |
" book = 'pythonds' \n", | |
" AND event NOT IN ('page','highlight','ac_error')\n", | |
" AND div_id NOT IN ('active5', 'active6', 'active7', 'analysis_1', 'analysis_2', 'analysis_3', 'findMinVid', 'mcpyperform', 'mcpydictperf', 'pythonopsperf', 'active1', 'active2', 'active3', 'divby2', 'baseconvert', 'baseconvert1', 'baseconvert2', 'baseconvert3', 'video_Stack2', 'deqtest', 'ququeuetest', 'queue_1', 'stack_1ac', 'stack_ex_1', 'stack_cl_1', 'stack_1', 'stack_2', 'stack_stringrev', 'stack1_video', 'orderedlistclass', 'unorderedlistcomplete', 'self_check_list1', 'self_check_list2', 'intopost', 'postfixeval', 'postfix1', 'postfix2', 'postfix3', 'video_Stack3', 'palchecker', 'parcheck1', 'qujosephussim', 'qumainsim', 'print_sim_selfcheck', 'intro_8', 'self_check_1', 'list_unique', 'self_check_2', 'listcomp', 'monkeyvid', 'intro_1', 'intro_2', 'intro_3', 'intro_5', 'intro_7', 'scratch_01_01', 'strstuff', 'gcd_cl', 'fraction_class', 'self_check_4', 'fraction', 'complete_cuircuit', 'self_check_5', 'logicgates', 'lst_itsum', 'lst_recsum', 'lst_rectostr', 'recursion_sc_1', 'recursion_sc_2', 'chp12_koch', 'lst_change2', 'lst_dpremember', 'completemaze', 'chp11_recursivesum', 'recursion_scratch_1', 'lst_st', 'lst_recstack', 'question_recsimp_1', 'question_recsimp_2 ', 'hanoi', 'lst_turt1', 'lst_complete_tree', 'recursion_sc_3', 'HASH_1', 'HASH_2', 'hashtablecomplete', 'search3', 'search4', 'BSRCH_1', 'BSRCH_2', 'lst_bubble', 'bubble_anim', 'bubbletrace', 'lst_shortbubble', 'shortbubbletrace', 'question_sort_1', 'lst_insertion', 'insertion_anim', 'insertionsortcodetrace', 'question_sort_3', 'lst_merge', 'merge_anim', 'mergetrace', 'question_sort_5', 'question_sort_6', 'lst_quick', 'quick_anim', 'quicktrace', 'question_sort_7', 'question_sort_8', 'question_sort_9', 'lst_selectionsortcode', 'selection_anim', 'selectionsortcodetrace', 'question_sort_2', 'search1', 'search2', 'question_SRCH_1', 'question_SRCH_2', 'lst_shellSort', 'shell_anim', 'shellSorttrace', 'question_sort_4', 'completeheap', 'heap1', 'tree_list1', 'bin_tree', 'mctree_1', 'mctree_2', 'bintree', 'mctree_3', 'parsebuild', 'bst_1', 'completebstcode')\n", | |
");\n", | |
"\n", | |
"COMMIT;" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"31112 rows affected.\n", | |
"Done.\n" | |
] | |
}, | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 51, | |
"text": [ | |
"[]" | |
] | |
} | |
], | |
"prompt_number": 51 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"thinkcspyDivInfo = getDivs.getDivInfo('thinkcspy')\n", | |
"thinkcspyDivs = thinkcspyDivInfo['div_id']\n", | |
"str(thinkcspyDivs) #NOTE: I formatted as a string so I could copy and paste it more easily. Uncomment to get the list" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 52, | |
"text": [ | |
"\"['chp13_classes4', 'chp13_classes5', 'chp13_classesstr1', 'chp13_classesstr2', 'ch_cl_ex_1_answer', 'ch_cl_02', 'ch_cl_ex_3_answer', 'ch_cl_04', 'ch_cl_05_answer', 'chp13_improveconstructor', 'chp13_classesmid1', 'scratch_cl_01', 'chp13_classes6', 'chp13_classes1', 'chp13_points', 'chp13_classes2', 'fractions_add1', 'scratch_cl_02\\\\t', 'ch_cl2_answer1', 'ch_cl2_q2 ', 'ch_cl2_q3answer', 'ch_cl2_q4', 'ch_cl2_q5answer', 'ch_cl2_answer7', 'ch_cl2_q8', 'fractions_init', 'fractions_gcd', 'fractions_simplify', 'fractions_is', 'fractions_eq1', 'db_ex3_1', 'db_ex3_2', 'db_q_ex3_1', 'db_ex3_3', 'db_ex3_4', 'db_ex3_5', 'db_qex32', 'db_ex3_6', 'db_ex3_7', 'db_ex3_8', 'db_ex3_9', 'db_ex3_10', 'db_ex3_11', 'db_ex3_12', 'ch12_dict11', 'test_question11_4_1', 'chp12_dict6', 'chp12_dict7', 'chp12_dict8', 'chp12_dict9', 'chp12_dict10', 'scratch_11_02', 'test_question11_3_1', 'test_question11_3_2', 'test_question11_3_3', 'test_question11_3_4', 'test_question11_3_5', 'ch12_dict4', 'ch12_dict4a', 'ch12_dict5', 'test_question11_2_1', 'ex_11_01', 'q1_answer', 'ex_11_02', 'ex_11_03', 'ex_11_04', 'ch11_q5_answer', 'chp12_sparse', 'chp12_dict1', 'chp12_dict2', 'chp12_dict3', 'scratch_11_01', 'test_question11_1_1 ', 'test_question11_1_2', 'files_while', 'ex_6_1', 'ch_files_q1answer', 'ex_10_2', 'ex_6_3', 'ch_files_q3answer', 'ex_10_4', 'ex_10_5', 'ch_files_q5answer', 'files_for', 'files_write01', 'ch05_barchart', 'scratch_05_06', 'ch06_boolfun1', 'ch06_boolfun2', 'ch06_newarea', 'test_question5_6_1', 'test_question5_6_2', 'sumofsquares', 'ch04_3', 'ch04_4', 'ch04_5', 'ch04_6', 'ch04_square', 'ch04_clsquare', 'ch04_clsquare_bad', 'test_question5_2_1', 'test_question5_2_2', 'ch06_distance1', 'ch06_distancefinal', 'function_accumulator_pattern', 'sq_accum1', 'sq_accum3', 'scratch_05_04', 'test_question5_4_1', 'question5_4_1p', 'bad_local', 'badsquare_1', 'cl_powerof_bad', 'cl_change_parm', 'test_question5_3_1', 'test_question5_3_2', 'test_question5_3_3', 'function_intro', 'ch04_1', 'ch04_1a', 'ch04_2', 'scratch_05_01', 'test_question5_1_1', 'test_question5_1_2', 'test_question5_1_3', 'test_question5_1_4', 'test_question5_1_5', 'test_question5_1_6', 'test_question5_1_7', 'ch01_2', 'question1_11_1', 'question1_1_1', 'question1_1_2', 'ch01_3', 'question1_12_1', 'scratch_01', 'question1_9_1', 'question1_10_1', 'question1_10_2', 'question1_4_1', 'question1_7_1', 'question1_8_1', 'codelensvid', 'ch01_1', 'firstexample', 'question1_3_1', 'question1_3_2', 'question1_6_1', 'question1_6_2', 'question1_2_1', 'question1_2_2', 'question1_2_3', 'question1_5_1', 'astlab_ast1', 'lab01_1', 'lab01_2', 'l0401', 'lab0401a', 'lab0401aa', 'chp12_single', 'piguessstart', 'seq3nlab1', 'sin1', 'sin2', 'sinlab1', 'sinlab2', 'chp09_02', 'test_question9_3_1', 'test_question9_3_2', 'test_question9_3_3', 'listalias1', 'chp09_is3', 'test_question9_10_1', 'appcon1', 'appcon2', 'appcon3', 'test_question9_15_1', 'chp09_is4', 'chp09_5', 'chp09_concatid', 'test_question9_5_1', 'test_question9_5_2', 'ex_9_2', 'ex_9_3', 'q3_answer', 'ex_9_4', 'ex_9_5', 'q5_answer', 'ex_7_11', 'ex_9_6', 'q7_answer', 'ex_9_7', 'ex_9_8', 'q9_answer', 'ex_9_9', 'ex_9_10', 'q11_answer', 'ex_9_11', 'ex_9_12', 'q13_answer', 'ex_9_13', 'ex_9_14', 'q15_answer', 'listcomp1', 'scratch_09_06', 'test_question9_20_1', 'ch09_11', 'scratch_09_01', 'chp09_01a', 'test_question9_2_1 ', 'test_question9_2_2', 'chp09_4', 'test_question9_4_1', 'test_question9_4_2', 'chp09_meth1', 'chp09_meth2', 'test_question9_13_1', 'test_question9_13_2', 'test_question9_13_3', 'test_question9_13_4', 'scratch_09_03', 'chp09_6', 'test_question9_6_1', 'chp09_01', 'test_question9_1_1 ', 'chp09_03a', 'chp09_03b', 'chp09_for3', 'chp09_for4', 'test_question9_16_1', 'ch09_7', 'item_assign', 'ch09_8', 'ch09_9', 'ch09_10', 'test_question9_7_1', 'chp09_nest', 'test_question9_21_1', 'chp09_is1', 'chp09_is2', 'chp09_istrace', 'ch09_mod2', 'ch09_mod3', 'repref1', 'repref2', 'repref3', 'reprefstep', 'test_question9_12_1', 'test_question9_12_2', 'ch09_split1', 'ch09_split2', 'ch09_join', 'test_question9_22_1', 'list_lsys1', 'list_lsys2', 'ch09_tuple1', 'chp09_tuple2', 'chp09_tuple3', 'scratch_09_07', 'chp09_parm1', 'chp09_parm1_trace', 'ch09_list1', 'pixelex1a', 'test_question7_8_1_1', 'pixelex1', 'test_question7_8_2_1', 'nested1', 'nested2', 'acimg_1', 'test_question7_8_3_1', 'test_question7_8_3_2', 'scratch_07_03', 'ex_7_7', 'q1_answer', 'ex_7_8', 'ex_7_9', 'q3_answer', 'ex_7_14', 'ex_7_13', 'q5_answer', 'ex_7_12', 'ex_7_15', 'q7_answer', 'ex_7_16', 'ex_7_17', 'q9_answer', 'ex_7_18', 'ex_7_19', 'answer_7_11', 'ex_7_20', 'ex_7_21', 'q13_answer', 'ex_7_22', 'ex_7_23', 'q15_answer', 'scratch_07_05', 'chp07_newtonsdef', 'chp07_newtonswhile', 'iter_randwalk1', 'iter_randwalk2', 'test_question7_3_1', 'test_question7_3_2', 'ch07_table1', 'test_question7_7_1', 'ch07_indef1', 'test_question7_4_1', 'ch07_for1', 'ch07_summation', 'whileloop', 'ch07_while1', 'ch07_while2', 'scratch_07_01', 'test_question7_2_1', 'test_question7_2_2', 'mod_q1_answer', 'ex_mod_2', 'mod_q3_answer', 'ex_mod_4', 'randmodvid', 'chmodule_02', 'question4_2_1', 'chmodule_rand', 'chmodule_rand2', 'question4_4_1', 'question4_4_2', 'question4_4_3', 'question4_4_4', 'scratch_04', 'inputvid', 'chmod_01', 'question4_1_1', 'question4_1_2', 'question4_1_3', 'ch03_7', '3_10', '3_11', 'ex_3_1', 'q1_answer', 'ex_3_2', 'ex_3_3', 'q3_answer', 'ex_3_4', 'ex_3_5', 'ex_3_6', 'ex_3_7', 'q7_answer', 'ex_3_8', 'ex_3_9', 'q9_answer', 'ex_3_10', 'ex_3_11', 'q11_answer', 'ex_3_12', 'ex_3_13', 'q13_answer', 'vtest', 'ch03_3', 'test_question3_2_1', '3_6', '3_7', 'ch03_for1', 'ch03_forcolor', 'colorlist', '3_8', '3_9', 'test_question3_4_1', 'test_question3_4_2', 'test_question3_4_3', 'test_question3_4_4', 'ch03_1', 'test_question3_1_0', '3_1', '3_2', '3_3', 'ch03_2', 'test_question3_1_1', 'test_question3_1_2', 'test_question3_1_3', 'test_question3_1_4', '3_4', '3_5', 'scratch_03', 'forloopvid', 'ch03_4', 'advrange', 'ch03_5', 'rangeme', 'ch03_6', 'rangeme2', 'test_question3_5_1', 'test_question3_5_2', 'test_question3_5_3', 'assignvid', 'lst_itsum', 'lst_recsum', 'lst_rectostr', 'recursion_sc_1', 'recursion_sc_2', 'chp12_koch', 'lst_change2', 'lst_dpremember', 'completemaze', 'chp11_recursivesum', 'recursion_scratch_1', 'lst_st', 'lst_recstack', 'question_recsimp_1', 'question_recsimp_2 ', 'hanoi', 'lst_turt1', 'lst_complete_tree', 'recursion_sc_3', 'ch06_boolfun1', 'ch06_boolfun2', 'ch06_boolcodelens', 'test_question6_8_1', 'test_question6_8_2', 'scratch_06_03', 'booleanexpressions', 'ch05_1', 'ch05_1a', 'ch05_2', 'test_question6_1_1', 'sel4', 'scratch_06_02', 'test_question6_7_1', 'test_question6_7_2', 'binaryselection', 'ch05_4', 'test_question6_4_1', 'test_question6_4_2', 'test_question6_4_3', 'ex_6_1', 'ex_6_2', 'ex_6_3', 'q3_question', 'ex_6_4', 'ex_6_5', 'answer_ex_6_5', 'ex_6_6', 'ex_6_7', 'q7_answer', 'ex_6_8', 'ex_6_9', 'q9_answer', 'ex_6_10', 'ex_6_11', 'q11_answer', 'ex_6_12', 'ex_6_13', 'answer_ex_6_13', 'chp05_3', 'test_question6_2_1', 'sel2', 'sel1', 'test_question6_6_1', 'unaryselection', 'ch05_unaryselection', 'test_question6_5_1', 'test_question6_5_2', 'scratch_06_01', 'test_question6_3_1', 'ch02_ex1', 'ex_2_2', 'ex_2_3', 'q3_answer', 'ex_2_4', 'ex_2_5', 'q5_answer ', 'ex_2_6', 'ex_2_7', 'q7_answer', 'ex_2_8', 'ex_2_9', 'q9_answer', 'ex_2_10', 'ex_2_11', 'q11_answer', 'ex_2_12', 'inputvid', 'inputfun', 'int_secs', 'test_question2_7_1', 'ch02_15', 'ch02_16', 'ch02_17', 'ch02_18', 'ch02_19_codelens', 'test_question2_6_1', 'test_question2_6_2', 'test_question2_6_3', 'precedencevid', 'associativityvid', 'ch02_23', 'test_question2_8_1', 'test_question2_8_2', 'reassignmentvid', 'ch07_reassign1', 'ch07_reassign2', 'test_question2_9_1', 'expression_vid', 'ch02_13', 'ch02_14', 'ch02_20', 'ch02_21', 'ch02_22', 'test_question2_2_1', 'updatevid', 'ch07_update1', 'test_question2_10_1', 'test_question2_10_2', 'question2_10_3', 'scratch_02', 'ch02_1', 'ch02_2', 'ch02_3', 'ch02_4', 'ch02_5', 'ch02_6', 'ch02_7', 'ch02_8', 'ch02_8a', 'test_question2_1_1', 'test_question2_1_2', 'test_question2_4_1', 'assignvid', 'ch02_9', 'ch02_9_codelens', 'ch02_10', 'ch02_11', 'test_question2_3_2', 'typesnconvert', 'ch08_run3', 'scratch_08_04', 'ex_8_2', 'ex_8_3', 'q3_answer', 'ex_8_4', 'ex_7_10', 'q5_answer', 'ex_8_5', 'ex_8_6', 'q7_answer', 'ex_8_7', 'ex_8_8', 'q9_answer', 'ex_8_9', 'ex_8_10', 'q11_answer', 'ex_8_11', 'ex_8_12', 'q13_answer', 'ex_8_13', 'ex_8_14', 'q15_answer', 'ex_8_15', 'ex_8_16', 'q17_answer', 'ex_8_17', 'ex_8_18', 'q19_answer', 'ex_8_19', 'ex_8_20', 'q21_answer', 'chp08_index1', 'test_question8_2_1', 'test_question8_2_2', 'chp08_len1', 'chp08_len2', 'ch08_len3', 'test_question8_4_1', 'test_question8_4_2', 'chp08_fun2', 'ch08_add', 'ch08_mult', 'test_question8_1_1 ', 'test_question8_1_2', 'ch08_fun4', 'chp08_fun5', 'chp08_fun6', 'ch08_comp1', 'ch08_comp2', 'chp08_ord1', 'ch08_ord2', 'ch08_ord3', 'test_question8_6_1', 'test_question8_6_2', 'test_question8_6_3', 'chp08_upper', 'ch08_methods1', 'ch08_methods2', 'test_question8_3_1', 'test_question8_3_2', 'cg08_imm1', 'ch08_imm2', 'test_question8_7_1', 'ch08_acc1', 'ch08_acc2', 'test_question8_11_1', 'scratch_08_03', 'chp08_slice1', 'chp08_slice2', 'test_question8_5_1', 'test_question8_5_2', 'scratch_08_01', 'chp8_in1', 'chp8_in2', 'chp8_in3', 'ch08_7', 'ch08_7b', 'ch08_8', 'test_question8_9_1', 'ch08_4', 'ch08_5', 'ch08_6', 'test_question8_8_1', 'test_question8_8_2', 'ch08_7c', 'ch08_7c1', 'test_question8_10_1', 'scratch_08_02', 'string_lsys1', 'strings_lys2']\"" | |
] | |
} | |
], | |
"prompt_number": 52 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"%%sql\n", | |
"DELETE FROM useinfo\n", | |
"WHERE id IN\n", | |
"(\n", | |
"SELECT useinfo.id FROM useinfo\n", | |
"JOIN bookcourses\n", | |
"ON (useinfo.course_id = bookcourses.course_name)\n", | |
"WHERE \n", | |
" book = 'thinkcspy' \n", | |
" AND event NOT IN ('page','highlight','ac_error')\n", | |
" AND div_id NOT IN ('chp13_classes4', 'chp13_classes5', 'chp13_classesstr1', 'chp13_classesstr2', 'ch_cl_ex_1_answer', 'ch_cl_02', 'ch_cl_ex_3_answer', 'ch_cl_04', 'ch_cl_05_answer', 'chp13_improveconstructor', 'chp13_classesmid1', 'scratch_cl_01', 'chp13_classes6', 'chp13_classes1', 'chp13_points', 'chp13_classes2', 'fractions_add1', 'scratch_cl_02\\\\t', 'ch_cl2_answer1', 'ch_cl2_q2 ', 'ch_cl2_q3answer', 'ch_cl2_q4', 'ch_cl2_q5answer', 'ch_cl2_answer7', 'ch_cl2_q8', 'fractions_init', 'fractions_gcd', 'fractions_simplify', 'fractions_is', 'fractions_eq1', 'db_ex3_1', 'db_ex3_2', 'db_q_ex3_1', 'db_ex3_3', 'db_ex3_4', 'db_ex3_5', 'db_qex32', 'db_ex3_6', 'db_ex3_7', 'db_ex3_8', 'db_ex3_9', 'db_ex3_10', 'db_ex3_11', 'db_ex3_12', 'ch12_dict11', 'test_question11_4_1', 'chp12_dict6', 'chp12_dict7', 'chp12_dict8', 'chp12_dict9', 'chp12_dict10', 'scratch_11_02', 'test_question11_3_1', 'test_question11_3_2', 'test_question11_3_3', 'test_question11_3_4', 'test_question11_3_5', 'ch12_dict4', 'ch12_dict4a', 'ch12_dict5', 'test_question11_2_1', 'ex_11_01', 'q1_answer', 'ex_11_02', 'ex_11_03', 'ex_11_04', 'ch11_q5_answer', 'chp12_sparse', 'chp12_dict1', 'chp12_dict2', 'chp12_dict3', 'scratch_11_01', 'test_question11_1_1 ', 'test_question11_1_2', 'files_while', 'ex_6_1', 'ch_files_q1answer', 'ex_10_2', 'ex_6_3', 'ch_files_q3answer', 'ex_10_4', 'ex_10_5', 'ch_files_q5answer', 'files_for', 'files_write01', 'ch05_barchart', 'scratch_05_06', 'ch06_boolfun1', 'ch06_boolfun2', 'ch06_newarea', 'test_question5_6_1', 'test_question5_6_2', 'sumofsquares', 'ch04_3', 'ch04_4', 'ch04_5', 'ch04_6', 'ch04_square', 'ch04_clsquare', 'ch04_clsquare_bad', 'test_question5_2_1', 'test_question5_2_2', 'ch06_distance1', 'ch06_distancefinal', 'function_accumulator_pattern', 'sq_accum1', 'sq_accum3', 'scratch_05_04', 'test_question5_4_1', 'question5_4_1p', 'bad_local', 'badsquare_1', 'cl_powerof_bad', 'cl_change_parm', 'test_question5_3_1', 'test_question5_3_2', 'test_question5_3_3', 'function_intro', 'ch04_1', 'ch04_1a', 'ch04_2', 'scratch_05_01', 'test_question5_1_1', 'test_question5_1_2', 'test_question5_1_3', 'test_question5_1_4', 'test_question5_1_5', 'test_question5_1_6', 'test_question5_1_7', 'ch01_2', 'question1_11_1', 'question1_1_1', 'question1_1_2', 'ch01_3', 'question1_12_1', 'scratch_01', 'question1_9_1', 'question1_10_1', 'question1_10_2', 'question1_4_1', 'question1_7_1', 'question1_8_1', 'codelensvid', 'ch01_1', 'firstexample', 'question1_3_1', 'question1_3_2', 'question1_6_1', 'question1_6_2', 'question1_2_1', 'question1_2_2', 'question1_2_3', 'question1_5_1', 'astlab_ast1', 'lab01_1', 'lab01_2', 'l0401', 'lab0401a', 'lab0401aa', 'chp12_single', 'piguessstart', 'seq3nlab1', 'sin1', 'sin2', 'sinlab1', 'sinlab2', 'chp09_02', 'test_question9_3_1', 'test_question9_3_2', 'test_question9_3_3', 'listalias1', 'chp09_is3', 'test_question9_10_1', 'appcon1', 'appcon2', 'appcon3', 'test_question9_15_1', 'chp09_is4', 'chp09_5', 'chp09_concatid', 'test_question9_5_1', 'test_question9_5_2', 'ex_9_2', 'ex_9_3', 'q3_answer', 'ex_9_4', 'ex_9_5', 'q5_answer', 'ex_7_11', 'ex_9_6', 'q7_answer', 'ex_9_7', 'ex_9_8', 'q9_answer', 'ex_9_9', 'ex_9_10', 'q11_answer', 'ex_9_11', 'ex_9_12', 'q13_answer', 'ex_9_13', 'ex_9_14', 'q15_answer', 'listcomp1', 'scratch_09_06', 'test_question9_20_1', 'ch09_11', 'scratch_09_01', 'chp09_01a', 'test_question9_2_1 ', 'test_question9_2_2', 'chp09_4', 'test_question9_4_1', 'test_question9_4_2', 'chp09_meth1', 'chp09_meth2', 'test_question9_13_1', 'test_question9_13_2', 'test_question9_13_3', 'test_question9_13_4', 'scratch_09_03', 'chp09_6', 'test_question9_6_1', 'chp09_01', 'test_question9_1_1 ', 'chp09_03a', 'chp09_03b', 'chp09_for3', 'chp09_for4', 'test_question9_16_1', 'ch09_7', 'item_assign', 'ch09_8', 'ch09_9', 'ch09_10', 'test_question9_7_1', 'chp09_nest', 'test_question9_21_1', 'chp09_is1', 'chp09_is2', 'chp09_istrace', 'ch09_mod2', 'ch09_mod3', 'repref1', 'repref2', 'repref3', 'reprefstep', 'test_question9_12_1', 'test_question9_12_2', 'ch09_split1', 'ch09_split2', 'ch09_join', 'test_question9_22_1', 'list_lsys1', 'list_lsys2', 'ch09_tuple1', 'chp09_tuple2', 'chp09_tuple3', 'scratch_09_07', 'chp09_parm1', 'chp09_parm1_trace', 'ch09_list1', 'pixelex1a', 'test_question7_8_1_1', 'pixelex1', 'test_question7_8_2_1', 'nested1', 'nested2', 'acimg_1', 'test_question7_8_3_1', 'test_question7_8_3_2', 'scratch_07_03', 'ex_7_7', 'q1_answer', 'ex_7_8', 'ex_7_9', 'q3_answer', 'ex_7_14', 'ex_7_13', 'q5_answer', 'ex_7_12', 'ex_7_15', 'q7_answer', 'ex_7_16', 'ex_7_17', 'q9_answer', 'ex_7_18', 'ex_7_19', 'answer_7_11', 'ex_7_20', 'ex_7_21', 'q13_answer', 'ex_7_22', 'ex_7_23', 'q15_answer', 'scratch_07_05', 'chp07_newtonsdef', 'chp07_newtonswhile', 'iter_randwalk1', 'iter_randwalk2', 'test_question7_3_1', 'test_question7_3_2', 'ch07_table1', 'test_question7_7_1', 'ch07_indef1', 'test_question7_4_1', 'ch07_for1', 'ch07_summation', 'whileloop', 'ch07_while1', 'ch07_while2', 'scratch_07_01', 'test_question7_2_1', 'test_question7_2_2', 'mod_q1_answer', 'ex_mod_2', 'mod_q3_answer', 'ex_mod_4', 'randmodvid', 'chmodule_02', 'question4_2_1', 'chmodule_rand', 'chmodule_rand2', 'question4_4_1', 'question4_4_2', 'question4_4_3', 'question4_4_4', 'scratch_04', 'inputvid', 'chmod_01', 'question4_1_1', 'question4_1_2', 'question4_1_3', 'ch03_7', '3_10', '3_11', 'ex_3_1', 'q1_answer', 'ex_3_2', 'ex_3_3', 'q3_answer', 'ex_3_4', 'ex_3_5', 'ex_3_6', 'ex_3_7', 'q7_answer', 'ex_3_8', 'ex_3_9', 'q9_answer', 'ex_3_10', 'ex_3_11', 'q11_answer', 'ex_3_12', 'ex_3_13', 'q13_answer', 'vtest', 'ch03_3', 'test_question3_2_1', '3_6', '3_7', 'ch03_for1', 'ch03_forcolor', 'colorlist', '3_8', '3_9', 'test_question3_4_1', 'test_question3_4_2', 'test_question3_4_3', 'test_question3_4_4', 'ch03_1', 'test_question3_1_0', '3_1', '3_2', '3_3', 'ch03_2', 'test_question3_1_1', 'test_question3_1_2', 'test_question3_1_3', 'test_question3_1_4', '3_4', '3_5', 'scratch_03', 'forloopvid', 'ch03_4', 'advrange', 'ch03_5', 'rangeme', 'ch03_6', 'rangeme2', 'test_question3_5_1', 'test_question3_5_2', 'test_question3_5_3', 'assignvid', 'lst_itsum', 'lst_recsum', 'lst_rectostr', 'recursion_sc_1', 'recursion_sc_2', 'chp12_koch', 'lst_change2', 'lst_dpremember', 'completemaze', 'chp11_recursivesum', 'recursion_scratch_1', 'lst_st', 'lst_recstack', 'question_recsimp_1', 'question_recsimp_2 ', 'hanoi', 'lst_turt1', 'lst_complete_tree', 'recursion_sc_3', 'ch06_boolfun1', 'ch06_boolfun2', 'ch06_boolcodelens', 'test_question6_8_1', 'test_question6_8_2', 'scratch_06_03', 'booleanexpressions', 'ch05_1', 'ch05_1a', 'ch05_2', 'test_question6_1_1', 'sel4', 'scratch_06_02', 'test_question6_7_1', 'test_question6_7_2', 'binaryselection', 'ch05_4', 'test_question6_4_1', 'test_question6_4_2', 'test_question6_4_3', 'ex_6_1', 'ex_6_2', 'ex_6_3', 'q3_question', 'ex_6_4', 'ex_6_5', 'answer_ex_6_5', 'ex_6_6', 'ex_6_7', 'q7_answer', 'ex_6_8', 'ex_6_9', 'q9_answer', 'ex_6_10', 'ex_6_11', 'q11_answer', 'ex_6_12', 'ex_6_13', 'answer_ex_6_13', 'chp05_3', 'test_question6_2_1', 'sel2', 'sel1', 'test_question6_6_1', 'unaryselection', 'ch05_unaryselection', 'test_question6_5_1', 'test_question6_5_2', 'scratch_06_01', 'test_question6_3_1', 'ch02_ex1', 'ex_2_2', 'ex_2_3', 'q3_answer', 'ex_2_4', 'ex_2_5', 'q5_answer ', 'ex_2_6', 'ex_2_7', 'q7_answer', 'ex_2_8', 'ex_2_9', 'q9_answer', 'ex_2_10', 'ex_2_11', 'q11_answer', 'ex_2_12', 'inputvid', 'inputfun', 'int_secs', 'test_question2_7_1', 'ch02_15', 'ch02_16', 'ch02_17', 'ch02_18', 'ch02_19_codelens', 'test_question2_6_1', 'test_question2_6_2', 'test_question2_6_3', 'precedencevid', 'associativityvid', 'ch02_23', 'test_question2_8_1', 'test_question2_8_2', 'reassignmentvid', 'ch07_reassign1', 'ch07_reassign2', 'test_question2_9_1', 'expression_vid', 'ch02_13', 'ch02_14', 'ch02_20', 'ch02_21', 'ch02_22', 'test_question2_2_1', 'updatevid', 'ch07_update1', 'test_question2_10_1', 'test_question2_10_2', 'question2_10_3', 'scratch_02', 'ch02_1', 'ch02_2', 'ch02_3', 'ch02_4', 'ch02_5', 'ch02_6', 'ch02_7', 'ch02_8', 'ch02_8a', 'test_question2_1_1', 'test_question2_1_2', 'test_question2_4_1', 'assignvid', 'ch02_9', 'ch02_9_codelens', 'ch02_10', 'ch02_11', 'test_question2_3_2', 'typesnconvert', 'ch08_run3', 'scratch_08_04', 'ex_8_2', 'ex_8_3', 'q3_answer', 'ex_8_4', 'ex_7_10', 'q5_answer', 'ex_8_5', 'ex_8_6', 'q7_answer', 'ex_8_7', 'ex_8_8', 'q9_answer', 'ex_8_9', 'ex_8_10', 'q11_answer', 'ex_8_11', 'ex_8_12', 'q13_answer', 'ex_8_13', 'ex_8_14', 'q15_answer', 'ex_8_15', 'ex_8_16', 'q17_answer', 'ex_8_17', 'ex_8_18', 'q19_answer', 'ex_8_19', 'ex_8_20', 'q21_answer', 'chp08_index1', 'test_question8_2_1', 'test_question8_2_2', 'chp08_len1', 'chp08_len2', 'ch08_len3', 'test_question8_4_1', 'test_question8_4_2', 'chp08_fun2', 'ch08_add', 'ch08_mult', 'test_question8_1_1 ', 'test_question8_1_2', 'ch08_fun4', 'chp08_fun5', 'chp08_fun6', 'ch08_comp1', 'ch08_comp2', 'chp08_ord1', 'ch08_ord2', 'ch08_ord3', 'test_question8_6_1', 'test_question8_6_2', 'test_question8_6_3', 'chp08_upper', 'ch08_methods1', 'ch08_methods2', 'test_question8_3_1', 'test_question8_3_2', 'cg08_imm1', 'ch08_imm2', 'test_question8_7_1', 'ch08_acc1', 'ch08_acc2', 'test_question8_11_1', 'scratch_08_03', 'chp08_slice1', 'chp08_slice2', 'test_question8_5_1', 'test_question8_5_2', 'scratch_08_01', 'chp8_in1', 'chp8_in2', 'chp8_in3', 'ch08_7', 'ch08_7b', 'ch08_8', 'test_question8_9_1', 'ch08_4', 'ch08_5', 'ch08_6', 'test_question8_8_1', 'test_question8_8_2', 'ch08_7c', 'ch08_7c1', 'test_question8_10_1', 'scratch_08_02', 'string_lsys1', 'strings_lys2')\n", | |
");\n", | |
"COMMIT;" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"961911 rows affected.\n", | |
"Done.\n" | |
] | |
}, | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 53, | |
"text": [ | |
"[]" | |
] | |
} | |
], | |
"prompt_number": 53 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": [ | |
"Find and remove infered test courses or inactive courses" | |
] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 6, | |
"metadata": {}, | |
"source": [ | |
"In order to clean out test courses, we need to infer which book goes with each course. This will result in the creation of a bookcourses table, a table similar to courses but with the book tacked on the end." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"class CoursesForBook():\n", | |
" def __init__(self,book):\n", | |
" self.name = book\n", | |
" self.courses = []\n", | |
" self.keyWords = []\n", | |
" def addCourse(self,course):\n", | |
" if course not in self.courses:\n", | |
" self.courses.append(course)\n", | |
" def addKeyWords(self, wordlst):\n", | |
" for word in wordlst:\n", | |
" self.keyWords.append(word)\n", | |
" def __str__(self):\n", | |
" string = self.name + '\\n'+''\n", | |
" for i in self.courses:\n", | |
" string +=i + '\\n'\n", | |
" return string" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 54 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"book = bp.makeBookOutline('pythonds')\n", | |
"book2 = bp.makeBookOutline('thinkcspy')\n", | |
"\n", | |
"pythonds = CoursesForBook('pythonds')\n", | |
"pythonds.addKeyWords(['Analysis','Trees','BasicDS'])\n", | |
"thinkcspy = CoursesForBook('thinkcspy')\n", | |
"thinkcspy.addKeyWords(['Labs','SimplePythonData','GeneralIntro', 'moreiteration', 'helloturtle'])" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 55 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"thinkcspy.courses" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 56, | |
"text": [ | |
"[]" | |
] | |
} | |
], | |
"prompt_number": 56 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"query = '''\n", | |
"SELECT distinct course_id, div_id\n", | |
"FROM useinfo where event = 'page'\n", | |
"'''\n", | |
"df = psql.read_sql(query,conn)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 57 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"count = 0\n", | |
"for num in df.index:\n", | |
" count += 1\n", | |
" if count % 500000 == 1:\n", | |
" print(count)\n", | |
" row = df.iloc[num]\n", | |
" course = row.course_id\n", | |
" div = row.div_id\n", | |
" for word in pythonds.keyWords:\n", | |
" try:\n", | |
" if word in div :\n", | |
" pythonds.addCourse(course)\n", | |
" except TypeError:\n", | |
" pass\n", | |
" for word in thinkcspy.keyWords:\n", | |
" try:\n", | |
" if word in div :\n", | |
" thinkcspy.addCourse(course)\n", | |
" except TypeError:\n", | |
" pass" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"1\n" | |
] | |
} | |
], | |
"prompt_number": 58 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"len(pythonds.courses)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 59, | |
"text": [ | |
"14" | |
] | |
} | |
], | |
"prompt_number": 59 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"q = '''\n", | |
"SELECT * FROM courses\n", | |
"'''\n", | |
"course_ids = psql.read_sql(q, conn)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 60 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 6, | |
"metadata": {}, | |
"source": [ | |
"These two lists are the same lists as pythonds.courses and thinkcspy.courses. I simply copied them to save time. " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"#temp1 = ['pythonds', 'COSC122', 'gatech', 'cs151ranum', 'cs22lbcc', 'cs22-lbcc', 'PaideiaCS2', 'CS136', 'GFU-Algorithms', 'GUDataStructures', 'PythonLevel2', 'luther151', 'devcourse']\n", | |
"#temp2 = ['thinkcspy', 'aaron-thinkpy', 'IntroCompScience', 'CSC131HUA', 'CSC131004Fall2013', 'STEM_LWSD', 'CSC131tagliarini', 'CS115', 'GalileoAOIT', 'green', '729G04-HT2013', 'ISTA130_FALL_2013', 'FlemingPSP', 'Duke_CompSci101', 'METR1313', 'CS_Python', 'HPComSci_IB', 'CS1-Python', 'PSSD_Python', 'cs150ranum', 'CS30', '152', 'luther150c_fall13', 'SER101F2013', 'scacs', 'IHSPython', 'CS101', 'python_13', 'SMU_CS101', 'CS1140', 'CS140-WCC-F13', 'ISYS217', 'python', 'ICS2O-3C', 'LearnPython', '10ACS', 'Python', 'ITLP-Python', 'csf13f', 'secondp', 'FCHSCP2013', 'wspython', '01418116_56', 'qwerty', 'Thacker', 'IntroProgrammingJHS', 'Griess_Python', 'ParsonsTest', 'ComputerProgramming', 'CSIS101', 'MrEasonECSFall2013', 'CS@WPS', 'CS_20', 'python4lindseya', 'CSC226', 'BC_GCSECompSci', 'lsmsacs244v', 'pyladies', 'CS244', 'ITEC-1150-01-S14', '152Haugrud', 'CSC131tompkinsj', 'ComputerLiteracyPython', 'bcscomputing', 'BigSnake', 'CSC112800', 'testcamp', 'IC-COMP-171-SP-14', 'ProjectNorKam', 'cs110williams', 'CSIS152Spr2014Ficek', 'SUS13', 'SMU_CS106', 'CSIS101W14', 'ANewCourse', 'SBC2014', 'CSGPython2', 'comp123-s14', 'cs150ranumspr2014', 'NewmanSKE2014', 'CS902', 'peeblescourse', 'Spring2014CS104', 'cf_pyintro', 'HTHcomptech', 'PythonASMSA', 'CodingLevel1', 'PythonLevel1', 'CASMTFeb14', 'Pythonintro', 'PythonIntro', 'neintrocs', 'CSIT902', 'CS902Spr14', 'cap-comp101', 'cs-147-s14', 'LASAIntroCS', 'cs902python', 'APCS-Python', 'WCC_CS140_SP14', 'RMBSProgrammingSkills', 'SolinMediaComp', 'csTestPrject', '1314SP_HCT2447', 'ITLP', 'CS101SP14', 'IS1008', 'ProvaAnn77Corso', 'BeensPy', 'python4bio', 'gatech', 'bet_test', 'devcourse', 'pyocc', 'craig-trial', 'CSC112-800', 'comp115', 'unics1140', 'luther150a', 'luther150bc', 'CMPUT 174 Fall 2012', 'CS-1P', 'CMPUT174Fall2012', 'HCHS_GCSE_Python3', 'overview', 'PHBCS', 'HCHS_GCSE_Python', 'COMP201', 'Explorations', 'SKECS2012PythonI', 'SKECS2013PythonI', 'Python101', 'UniOfHertsCS101', 'BMO', 'AMSACS', 'C152_Sum', 'NZTeach', 'IntroCC', 'Summer2013', 'CBCS_CS101', 'PythonWorkshop', 'OCC_CS_A131_FALL_13', 'Python1', 'bic-think-tutorial', 'test_tssi']" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 61 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 6, | |
"metadata": {}, | |
"source": [ | |
"If running anew, replace temp1 and temp2 with pythonds.courses and thinkcspy.courses respectively." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"f = (lambda x: 'pythonds' if x in pythonds.courses else None or 'thinkcspy' if x in thinkcspy.courses else None)\n", | |
"course_ids['book'] = course_ids['course_name'].map(f)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 62 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"course_ids\n" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>id</th>\n", | |
" <th>course_id</th>\n", | |
" <th>course_name</th>\n", | |
" <th>term_start_date</th>\n", | |
" <th>book</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0 </th>\n", | |
" <td> 144</td>\n", | |
" <td> None</td>\n", | |
" <td> CSC131tagliarini</td>\n", | |
" <td> 2013-08-20</td>\n", | |
" <td> thinkcspy</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1 </th>\n", | |
" <td> 2</td>\n", | |
" <td> pythonds</td>\n", | |
" <td> pythonds</td>\n", | |
" <td> 2011-01-01</td>\n", | |
" <td> pythonds</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2 </th>\n", | |
" <td> 3</td>\n", | |
" <td> thinkcspy</td>\n", | |
" <td> thinkcspy</td>\n", | |
" <td> 2011-01-01</td>\n", | |
" <td> thinkcspy</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3 </th>\n", | |
" <td> 139</td>\n", | |
" <td> None</td>\n", | |
" <td> HPComSci_IB</td>\n", | |
" <td> 2013-08-17</td>\n", | |
" <td> thinkcspy</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4 </th>\n", | |
" <td> 116</td>\n", | |
" <td> None</td>\n", | |
" <td> a</td>\n", | |
" <td> 2013-08-07</td>\n", | |
" <td> None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5 </th>\n", | |
" <td> 6</td>\n", | |
" <td> luther151</td>\n", | |
" <td> luther151</td>\n", | |
" <td> 2011-01-01</td>\n", | |
" <td> pythonds</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6 </th>\n", | |
" <td> 8</td>\n", | |
" <td> python4bio</td>\n", | |
" <td> python4bio</td>\n", | |
" <td> 2011-01-01</td>\n", | |
" <td> thinkcspy</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7 </th>\n", | |
" <td> 88</td>\n", | |
" <td> Python 101</td>\n", | |
" <td> Python 101</td>\n", | |
" <td> 2011-01-01</td>\n", | |
" <td> None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8 </th>\n", | |
" <td> 9</td>\n", | |
" <td> BeensPy</td>\n", | |
" <td> BeensPy</td>\n", | |
" <td> 2011-01-01</td>\n", | |
" <td> thinkcspy</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9 </th>\n", | |
" <td> 10</td>\n", | |
" <td> pyocc</td>\n", | |
" <td> pyocc</td>\n", | |
" <td> 2011-01-01</td>\n", | |
" <td> thinkcspy</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>10 </th>\n", | |
" <td> 11</td>\n", | |
" <td> bet_test</td>\n", | |
" <td> bet_test</td>\n", | |
" <td> 2011-01-01</td>\n", | |
" <td> thinkcspy</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>11 </th>\n", | |
" <td> 117</td>\n", | |
" <td> None</td>\n", | |
" <td> Moffdawg</td>\n", | |
" <td> 2014-02-01</td>\n", | |
" <td> thinkcspy</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>12 </th>\n", | |
" <td> 119</td>\n", | |
" <td> None</td>\n", | |
" <td> web2py</td>\n", | |
" <td> 2013-08-12</td>\n", | |
" <td> None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>13 </th>\n", | |
" <td> 7</td>\n", | |
" <td> COSC122</td>\n", | |
" <td> COSC122</td>\n", | |
" <td> 2013-07-07</td>\n", | |
" <td> pythonds</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>14 </th>\n", | |
" <td> 118</td>\n", | |
" <td> None</td>\n", | |
" <td> STEM 10th Grade Computer Programming</td>\n", | |
" <td> 2013-08-07</td>\n", | |
" <td> None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>15 </th>\n", | |
" <td> 13</td>\n", | |
" <td> CS 115</td>\n", | |
" <td> CS 115</td>\n", | |
" <td> 2011-01-01</td>\n", | |
" <td> None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>16 </th>\n", | |
" <td> 14</td>\n", | |
" <td> CS115</td>\n", | |
" <td> CS115</td>\n", | |
" <td> 2011-01-01</td>\n", | |
" <td> thinkcspy</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>17 </th>\n", | |
" <td> 15</td>\n", | |
" <td> NorKam Programming With Python</td>\n", | |
" <td> NorKam Programming With Python</td>\n", | |
" <td> 2011-01-01</td>\n", | |
" <td> None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>18 </th>\n", | |
" <td> 4</td>\n", | |
" <td> gatech</td>\n", | |
" <td> gatech</td>\n", | |
" <td> 2013-08-09</td>\n", | |
" <td> None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>19 </th>\n", | |
" <td> 120</td>\n", | |
" <td> None</td>\n", | |
" <td> Web2py sem Segredos</td>\n", | |
" <td> 2013-08-10</td>\n", | |
" <td> None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>20 </th>\n", | |
" <td> 121</td>\n", | |
" <td> None</td>\n", | |
" <td> Prj1</td>\n", | |
" <td> 2013-08-10</td>\n", | |
" <td> None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>21 </th>\n", | |
" <td> 122</td>\n", | |
" <td> None</td>\n", | |
" <td> test</td>\n", | |
" <td> 2013-08-10</td>\n", | |
" <td> None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>22 </th>\n", | |
" <td> 124</td>\n", | |
" <td> None</td>\n", | |
" <td> green</td>\n", | |
" <td> 2013-08-12</td>\n", | |
" <td> thinkcspy</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>23 </th>\n", | |
" <td> 16</td>\n", | |
" <td> wm_cs20</td>\n", | |
" <td> wm_cs20</td>\n", | |
" <td> 2011-01-01</td>\n", | |
" <td> None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>24 </th>\n", | |
" <td> 151</td>\n", | |
" <td> None</td>\n", | |
" <td> algopython</td>\n", | |
" <td> 2013-08-23</td>\n", | |
" <td> None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>25 </th>\n", | |
" <td> 125</td>\n", | |
" <td> None</td>\n", | |
" <td> AppliedGenomeInformatics</td>\n", | |
" <td> 2014-09-01</td>\n", | |
" <td> None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>26 </th>\n", | |
" <td> 126</td>\n", | |
" <td> None</td>\n", | |
" <td> AppliedGenomeInformatics2</td>\n", | |
" <td> 2014-09-01</td>\n", | |
" <td> None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>27 </th>\n", | |
" <td> 17</td>\n", | |
" <td> craig-trial</td>\n", | |
" <td> craig-trial</td>\n", | |
" <td> 2011-01-01</td>\n", | |
" <td> None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>28 </th>\n", | |
" <td> 152</td>\n", | |
" <td> None</td>\n", | |
" <td> none</td>\n", | |
" <td> 2013-08-23</td>\n", | |
" <td> None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>29 </th>\n", | |
" <td> 128</td>\n", | |
" <td> None</td>\n", | |
" <td> ISTA130_FALL_2013</td>\n", | |
" <td> 2013-08-26</td>\n", | |
" <td> thinkcspy</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>380</th>\n", | |
" <td> 334</td>\n", | |
" <td> None</td>\n", | |
" <td> Spring2014CS104</td>\n", | |
" <td> 2014-02-01</td>\n", | |
" <td> thinkcspy</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>381</th>\n", | |
" <td> 470</td>\n", | |
" <td> None</td>\n", | |
" <td> Choice</td>\n", | |
" <td> 2014-06-02</td>\n", | |
" <td> None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>382</th>\n", | |
" <td> 499</td>\n", | |
" <td> None</td>\n", | |
" <td> sci30x</td>\n", | |
" <td> 2014-06-19</td>\n", | |
" <td> None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>383</th>\n", | |
" <td> 500</td>\n", | |
" <td> None</td>\n", | |
" <td> sci30x2</td>\n", | |
" <td> 2014-12-11</td>\n", | |
" <td> None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>384</th>\n", | |
" <td> 501</td>\n", | |
" <td> None</td>\n", | |
" <td> Decidability</td>\n", | |
" <td> 2014-06-19</td>\n", | |
" <td> None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>385</th>\n", | |
" <td> 471</td>\n", | |
" <td> None</td>\n", | |
" <td> choice</td>\n", | |
" <td> 2014-05-27</td>\n", | |
" <td> None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>386</th>\n", | |
" <td> 475</td>\n", | |
" <td> None</td>\n", | |
" <td> bnmtest91</td>\n", | |
" <td> 2014-05-28</td>\n", | |
" <td> None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>387</th>\n", | |
" <td> 477</td>\n", | |
" <td> None</td>\n", | |
" <td> bnmtest92</td>\n", | |
" <td> 2014-05-28</td>\n", | |
" <td> None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>388</th>\n", | |
" <td> 476</td>\n", | |
" <td> None</td>\n", | |
" <td> CS104Fall2014</td>\n", | |
" <td> 2014-09-02</td>\n", | |
" <td> None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>389</th>\n", | |
" <td> 478</td>\n", | |
" <td> None</td>\n", | |
" <td> CCPS_Python</td>\n", | |
" <td> 2014-05-29</td>\n", | |
" <td> None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>390</th>\n", | |
" <td> 479</td>\n", | |
" <td> None</td>\n", | |
" <td> ru_python</td>\n", | |
" <td> 2014-06-01</td>\n", | |
" <td> None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>391</th>\n", | |
" <td> 481</td>\n", | |
" <td> None</td>\n", | |
" <td> TestCourse131234</td>\n", | |
" <td> 2014-06-04</td>\n", | |
" <td> None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>392</th>\n", | |
" <td> 482</td>\n", | |
" <td> None</td>\n", | |
" <td> TestCourse131235</td>\n", | |
" <td> 2014-06-04</td>\n", | |
" <td> None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>393</th>\n", | |
" <td> 483</td>\n", | |
" <td> None</td>\n", | |
" <td> TestCourse131236</td>\n", | |
" <td> 2014-06-04</td>\n", | |
" <td> None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>394</th>\n", | |
" <td> 484</td>\n", | |
" <td> None</td>\n", | |
" <td> NYU-CSCI-UA2-005</td>\n", | |
" <td> 2014-09-01</td>\n", | |
" <td> None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>395</th>\n", | |
" <td> 485</td>\n", | |
" <td> None</td>\n", | |
" <td> TestCourse131237</td>\n", | |
" <td> 2014-06-04</td>\n", | |
" <td> None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>396</th>\n", | |
" <td> 486</td>\n", | |
" <td> None</td>\n", | |
" <td> lbcc-cs22</td>\n", | |
" <td> 2014-06-05</td>\n", | |
" <td> None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>397</th>\n", | |
" <td> 487</td>\n", | |
" <td> None</td>\n", | |
" <td> WormsleyTest</td>\n", | |
" <td> 2014-06-05</td>\n", | |
" <td> None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>398</th>\n", | |
" <td> 488</td>\n", | |
" <td> None</td>\n", | |
" <td> Programming1</td>\n", | |
" <td> 2014-06-06</td>\n", | |
" <td> None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>399</th>\n", | |
" <td> 489</td>\n", | |
" <td> None</td>\n", | |
" <td> cchspython</td>\n", | |
" <td> 2014-06-06</td>\n", | |
" <td> None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>400</th>\n", | |
" <td> 472</td>\n", | |
" <td> None</td>\n", | |
" <td> ChoiceCoding</td>\n", | |
" <td> 2014-05-27</td>\n", | |
" <td> None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>401</th>\n", | |
" <td> 490</td>\n", | |
" <td> None</td>\n", | |
" <td> FirenzePasA042</td>\n", | |
" <td> 2014-06-11</td>\n", | |
" <td> None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>402</th>\n", | |
" <td> 491</td>\n", | |
" <td> None</td>\n", | |
" <td> tp-Python</td>\n", | |
" <td> 2014-06-12</td>\n", | |
" <td> None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>403</th>\n", | |
" <td> 492</td>\n", | |
" <td> None</td>\n", | |
" <td> Univ-of-Wash-CSE-373-Autumn-2014</td>\n", | |
" <td> 2014-09-24</td>\n", | |
" <td> None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>404</th>\n", | |
" <td> 493</td>\n", | |
" <td> None</td>\n", | |
" <td> Ordinamentovettori</td>\n", | |
" <td> 2014-06-16</td>\n", | |
" <td> None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>405</th>\n", | |
" <td> 494</td>\n", | |
" <td> None</td>\n", | |
" <td> CS902SU14</td>\n", | |
" <td> 2014-06-16</td>\n", | |
" <td> None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>406</th>\n", | |
" <td> 495</td>\n", | |
" <td> None</td>\n", | |
" <td> 123</td>\n", | |
" <td> 2014-06-16</td>\n", | |
" <td> None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>407</th>\n", | |
" <td> 503</td>\n", | |
" <td> None</td>\n", | |
" <td> ICS3U</td>\n", | |
" <td> 2014-06-18</td>\n", | |
" <td> None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>408</th>\n", | |
" <td> 504</td>\n", | |
" <td> None</td>\n", | |
" <td> Learn_Pythom</td>\n", | |
" <td> 2014-06-20</td>\n", | |
" <td> None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>409</th>\n", | |
" <td> 507</td>\n", | |
" <td> None</td>\n", | |
" <td> WestlakeCSP</td>\n", | |
" <td> 2014-06-22</td>\n", | |
" <td> None</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>410 rows \u00d7 5 columns</p>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 63, | |
"text": [ | |
" id course_id \\\n", | |
"0 144 None \n", | |
"1 2 pythonds \n", | |
"2 3 thinkcspy \n", | |
"3 139 None \n", | |
"4 116 None \n", | |
"5 6 luther151 \n", | |
"6 8 python4bio \n", | |
"7 88 Python 101 \n", | |
"8 9 BeensPy \n", | |
"9 10 pyocc \n", | |
"10 11 bet_test \n", | |
"11 117 None \n", | |
"12 119 None \n", | |
"13 7 COSC122 \n", | |
"14 118 None \n", | |
"15 13 CS 115 \n", | |
"16 14 CS115 \n", | |
"17 15 NorKam Programming With Python \n", | |
"18 4 gatech \n", | |
"19 120 None \n", | |
"20 121 None \n", | |
"21 122 None \n", | |
"22 124 None \n", | |
"23 16 wm_cs20 \n", | |
"24 151 None \n", | |
"25 125 None \n", | |
"26 126 None \n", | |
"27 17 craig-trial \n", | |
"28 152 None \n", | |
"29 128 None \n", | |
".. ... ... \n", | |
"380 334 None \n", | |
"381 470 None \n", | |
"382 499 None \n", | |
"383 500 None \n", | |
"384 501 None \n", | |
"385 471 None \n", | |
"386 475 None \n", | |
"387 477 None \n", | |
"388 476 None \n", | |
"389 478 None \n", | |
"390 479 None \n", | |
"391 481 None \n", | |
"392 482 None \n", | |
"393 483 None \n", | |
"394 484 None \n", | |
"395 485 None \n", | |
"396 486 None \n", | |
"397 487 None \n", | |
"398 488 None \n", | |
"399 489 None \n", | |
"400 472 None \n", | |
"401 490 None \n", | |
"402 491 None \n", | |
"403 492 None \n", | |
"404 493 None \n", | |
"405 494 None \n", | |
"406 495 None \n", | |
"407 503 None \n", | |
"408 504 None \n", | |
"409 507 None \n", | |
"\n", | |
" course_name term_start_date book \n", | |
"0 CSC131tagliarini 2013-08-20 thinkcspy \n", | |
"1 pythonds 2011-01-01 pythonds \n", | |
"2 thinkcspy 2011-01-01 thinkcspy \n", | |
"3 HPComSci_IB 2013-08-17 thinkcspy \n", | |
"4 a 2013-08-07 None \n", | |
"5 luther151 2011-01-01 pythonds \n", | |
"6 python4bio 2011-01-01 thinkcspy \n", | |
"7 Python 101 2011-01-01 None \n", | |
"8 BeensPy 2011-01-01 thinkcspy \n", | |
"9 pyocc 2011-01-01 thinkcspy \n", | |
"10 bet_test 2011-01-01 thinkcspy \n", | |
"11 Moffdawg 2014-02-01 thinkcspy \n", | |
"12 web2py 2013-08-12 None \n", | |
"13 COSC122 2013-07-07 pythonds \n", | |
"14 STEM 10th Grade Computer Programming 2013-08-07 None \n", | |
"15 CS 115 2011-01-01 None \n", | |
"16 CS115 2011-01-01 thinkcspy \n", | |
"17 NorKam Programming With Python 2011-01-01 None \n", | |
"18 gatech 2013-08-09 None \n", | |
"19 Web2py sem Segredos 2013-08-10 None \n", | |
"20 Prj1 2013-08-10 None \n", | |
"21 test 2013-08-10 None \n", | |
"22 green 2013-08-12 thinkcspy \n", | |
"23 wm_cs20 2011-01-01 None \n", | |
"24 algopython 2013-08-23 None \n", | |
"25 AppliedGenomeInformatics 2014-09-01 None \n", | |
"26 AppliedGenomeInformatics2 2014-09-01 None \n", | |
"27 craig-trial 2011-01-01 None \n", | |
"28 none 2013-08-23 None \n", | |
"29 ISTA130_FALL_2013 2013-08-26 thinkcspy \n", | |
".. ... ... ... \n", | |
"380 Spring2014CS104 2014-02-01 thinkcspy \n", | |
"381 Choice 2014-06-02 None \n", | |
"382 sci30x 2014-06-19 None \n", | |
"383 sci30x2 2014-12-11 None \n", | |
"384 Decidability 2014-06-19 None \n", | |
"385 choice 2014-05-27 None \n", | |
"386 bnmtest91 2014-05-28 None \n", | |
"387 bnmtest92 2014-05-28 None \n", | |
"388 CS104Fall2014 2014-09-02 None \n", | |
"389 CCPS_Python 2014-05-29 None \n", | |
"390 ru_python 2014-06-01 None \n", | |
"391 TestCourse131234 2014-06-04 None \n", | |
"392 TestCourse131235 2014-06-04 None \n", | |
"393 TestCourse131236 2014-06-04 None \n", | |
"394 NYU-CSCI-UA2-005 2014-09-01 None \n", | |
"395 TestCourse131237 2014-06-04 None \n", | |
"396 lbcc-cs22 2014-06-05 None \n", | |
"397 WormsleyTest 2014-06-05 None \n", | |
"398 Programming1 2014-06-06 None \n", | |
"399 cchspython 2014-06-06 None \n", | |
"400 ChoiceCoding 2014-05-27 None \n", | |
"401 FirenzePasA042 2014-06-11 None \n", | |
"402 tp-Python 2014-06-12 None \n", | |
"403 Univ-of-Wash-CSE-373-Autumn-2014 2014-09-24 None \n", | |
"404 Ordinamentovettori 2014-06-16 None \n", | |
"405 CS902SU14 2014-06-16 None \n", | |
"406 123 2014-06-16 None \n", | |
"407 ICS3U 2014-06-18 None \n", | |
"408 Learn_Pythom 2014-06-20 None \n", | |
"409 WestlakeCSP 2014-06-22 None \n", | |
"\n", | |
"[410 rows x 5 columns]" | |
] | |
} | |
], | |
"prompt_number": 63 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"course_ids.to_sql('bookcourses',engine2,if_exists='replace')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 64 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 6, | |
"metadata": {}, | |
"source": [ | |
"Removal starts here" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"%%sql\n", | |
"CREATE INDEX bookIndex\n", | |
"ON bookcourses (index);\n", | |
"\n", | |
"COMMIT;" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"Done.\n", | |
"Done.\n" | |
] | |
}, | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 65, | |
"text": [ | |
"[]" | |
] | |
} | |
], | |
"prompt_number": 65 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"query = '''\n", | |
"SELECT book, course_id ,clicks,numDivs,numSids FROM\n", | |
"(\n", | |
"SELECT book, course_name\n", | |
"FROM bookcourses\n", | |
") AS S\n", | |
"JOIN\n", | |
"(SELECT course_id, COUNT(*) AS clicks, COUNT(DISTINCT div_id) AS numDivs, COUNT(DISTINCT sid) AS numSids\n", | |
"FROM useinfo\n", | |
"GROUP BY course_id\n", | |
") AS T\n", | |
"ON (S.course_name = T.course_id)\n", | |
"'''\n", | |
"\n", | |
"course_idInfo = psql.read_sql(query, conn)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 66 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"#Collect all of the courses that don't have a book associated with them\n", | |
"noBook = course_idInfo[course_idInfo['book'] != 'pythonds']\n", | |
"noBook = noBook[noBook['book'] != 'thinkcspy']" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 67 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"df = course_idInfo\n", | |
"df1 = df[df['numdivs'] < 5]\n", | |
"df2 = df[df['numsids'] < 5]\n", | |
"df3 = df[df['clicks']<100]\n", | |
"df4 = pd.merge(df1,df2,how='outer')\n", | |
"df5 = pd.merge(df3,df4,how='outer')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 68 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"remove = pd.merge(df5,noBook, how = 'outer')['course_id']\n" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 69 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"remove" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 70, | |
"text": [ | |
"0 9my0\n", | |
"1 ACcomp11\n", | |
"2 APCS-Python\n", | |
"3 AppliedGenomeInformatics2\n", | |
"4 BASE\n", | |
"5 BC_GCSECompSci\n", | |
"6 C152_Sum\n", | |
"7 CBCS\n", | |
"8 CS140\n", | |
"9 CSC100_J_term\n", | |
"10 CSIS152\n", | |
"11 CSIT902\n", | |
"12 CS_Python\n", | |
"13 CodingLevel1\n", | |
"14 DAM_00\n", | |
"...\n", | |
"75 septtest1\n", | |
"76 stolaf121AS14\n", | |
"77 tc_1\n", | |
"78 test_tssi\n", | |
"79 testcamp\n", | |
"80 testproject\n", | |
"81 IntroCC\n", | |
"82 MrEasonECSFall2013\n", | |
"83 NZTeach\n", | |
"84 PythonASMSA\n", | |
"85 SKECS2012PythonI\n", | |
"86 WPSCS\n", | |
"87 JavaReview\n", | |
"88 Learning\n", | |
"89 SER101F2013\n", | |
"Name: course_id, Length: 90, dtype: object" | |
] | |
} | |
], | |
"prompt_number": 70 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"#This is for the clean database notebook. After making a table of the removed items, I'll remove those course_ids from useinfo \n", | |
"remove.to_sql('outlier_courses',engine2,if_exists='replace')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 71 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"%%sql\n", | |
"CREATE INDEX ourlier_course_Index\n", | |
"ON outlier_courses (index);\n", | |
"\n", | |
"COMMIT;" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"Done.\n", | |
"Done.\n" | |
] | |
}, | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 72, | |
"text": [ | |
"[]" | |
] | |
} | |
], | |
"prompt_number": 72 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"%%sql\n", | |
"DELETE FROM useinfo\n", | |
"WHERE course_id IN \n", | |
"(SELECT course_id FROM outlier_courses);\n", | |
"COMMIT;" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"24827 rows affected.\n", | |
"Done.\n" | |
] | |
}, | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 73, | |
"text": [ | |
"[]" | |
] | |
} | |
], | |
"prompt_number": 73 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": [ | |
"Delete Single Div Users from the database" | |
] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 5, | |
"metadata": {}, | |
"source": [ | |
"Pythonds" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"%%sql\n", | |
"DELETE FROM useinfo WHERE id IN\n", | |
"(\n", | |
"SELECT useinfo.id FROM useinfo JOIN bookcourses\n", | |
" ON (useinfo.course_id = bookcourses.course_name)\n", | |
"WHERE sid IN\n", | |
"(\n", | |
"SELECT sid\n", | |
"FROM \n", | |
"(\n", | |
" SELECT DISTINCT sid,div_id\n", | |
" FROM useinfo JOIN bookcourses\n", | |
" ON (useinfo.course_id = bookcourses.course_name)\n", | |
" WHERE book = 'pythonds'\n", | |
") AS foo\n", | |
"GROUP BY sid HAVING count(*) = 1\n", | |
")\n", | |
"AND book = 'pythonds');\n", | |
"\n", | |
"COMMIT;" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"432951 rows affected.\n", | |
"Done.\n" | |
] | |
}, | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 74, | |
"text": [ | |
"[]" | |
] | |
} | |
], | |
"prompt_number": 74 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 5, | |
"metadata": {}, | |
"source": [ | |
"Thinkcspy" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"%%sql\n", | |
"DELETE FROM useinfo WHERE id IN\n", | |
"(\n", | |
"SELECT useinfo.id FROM useinfo JOIN bookcourses\n", | |
" ON (useinfo.course_id = bookcourses.course_name)\n", | |
"WHERE sid IN\n", | |
"(\n", | |
"SELECT sid\n", | |
"FROM \n", | |
"(\n", | |
" SELECT DISTINCT sid,div_id\n", | |
" FROM useinfo JOIN bookcourses\n", | |
" ON (useinfo.course_id = bookcourses.course_name)\n", | |
" WHERE book = 'thinkcspy'\n", | |
") AS foo\n", | |
"GROUP BY sid HAVING count(*) = 1\n", | |
")\n", | |
"AND book = 'thinkcspy');\n", | |
"\n", | |
"COMMIT;" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"520677 rows affected.\n", | |
"Done.\n" | |
] | |
}, | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 75, | |
"text": [ | |
"[]" | |
] | |
} | |
], | |
"prompt_number": 75 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": [ | |
"Delete Short Term Users from the database" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"%%sql\n", | |
"DELETE FROM useinfo\n", | |
"WHERE sid IN\n", | |
"(\n", | |
"--#Select all users whose duration on the website was less than an hour \n", | |
" SELECT sid FROM\n", | |
" \n", | |
" (\n", | |
" SELECT sid,timestamp,event,act,div_id,course_id \n", | |
" FROM \n", | |
" (\n", | |
" SELECT sid FROM useinfo\n", | |
" GROUP BY sid HAVING COUNT(*) > 1\n", | |
" ) AS T \n", | |
" NATURAL JOIN \n", | |
" (\n", | |
" SELECT sid,min(timestamp) startd, max(timestamp) endd FROM useinfo\n", | |
" GROUP BY sid\n", | |
" ) AS S \n", | |
" NATURAL JOIN \n", | |
" useinfo\n", | |
" WHERE (endd - startd) < ('1 day')::interval\n", | |
" ) AS mvu\n", | |
")\n", | |
";\n", | |
"COMMIT;\n", | |
" \n", | |
" \n" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"3164790 rows affected.\n", | |
"Done.\n" | |
] | |
}, | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 76, | |
"text": [ | |
"[]" | |
] | |
} | |
], | |
"prompt_number": 76 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
" %%sql\n", | |
"DELETE FROM useinfo\n", | |
"WHERE sid IN\n", | |
"(\n", | |
" SELECT sid FROM\n", | |
"--#Select all the users with only one action. \n", | |
" (\n", | |
" SELECT sid,timestamp,event,act,div_id,course_id FROM \n", | |
" (\n", | |
" SELECT sid FROM useinfo GROUP BY sid HAVING COUNT(*) < 2\n", | |
" ) AS R \n", | |
" NATURAL JOIN \n", | |
" useinfo\n", | |
" \n", | |
" ) AS svu\n", | |
" \n", | |
")\n", | |
";\n", | |
"\n", | |
"COMMIT;" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"10451 rows affected.\n", | |
"Done.\n" | |
] | |
}, | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 77, | |
"text": [ | |
"[]" | |
] | |
} | |
], | |
"prompt_number": 77 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 77 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": [ | |
"Remove outliers with respect to clicks and duration." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"query = ''' \n", | |
"SELECT sid, min(timestamp) startd, max(timestamp) endd\n", | |
"FROM useinfo\n", | |
"GROUP BY sid\n", | |
"'''\n", | |
"\n", | |
"d = psql.frame_query(query,conn)\n", | |
"d.head()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>sid</th>\n", | |
" <th>startd</th>\n", | |
" <th>endd</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td> bcselvitazurkevica</td>\n", | |
" <td>2012-10-02 08:15:40</td>\n", | |
" <td>2014-02-11 21:10:49</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td> jPayne</td>\n", | |
" <td>2013-09-10 15:43:59</td>\n", | |
" <td>2013-09-29 20:55:51</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td> lisadawn</td>\n", | |
" <td>2013-08-21 21:21:58</td>\n", | |
" <td>2013-08-26 19:49:02</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td> mackedee</td>\n", | |
" <td>2013-08-27 23:46:49</td>\n", | |
" <td>2013-09-10 01:17:01</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td> +4dB</td>\n", | |
" <td>2013-07-14 01:20:04</td>\n", | |
" <td>2013-10-30 00:11:56</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 78, | |
"text": [ | |
" sid startd endd\n", | |
"0 bcselvitazurkevica 2012-10-02 08:15:40 2014-02-11 21:10:49\n", | |
"1 jPayne 2013-09-10 15:43:59 2013-09-29 20:55:51\n", | |
"2 lisadawn 2013-08-21 21:21:58 2013-08-26 19:49:02\n", | |
"3 mackedee 2013-08-27 23:46:49 2013-09-10 01:17:01\n", | |
"4 +4dB 2013-07-14 01:20:04 2013-10-30 00:11:56" | |
] | |
} | |
], | |
"prompt_number": 78 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"d['duration'] = d['endd'] - d['startd']\n", | |
"d['durdays'] = d['duration'].map(lambda x: x.astype('timedelta64[D]')) #takes a function as a parameter and" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 79 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"df = d[d.duration > np.timedelta64(1,'D')]\n", | |
"#df = df[df.sid != 'bmiller']\n", | |
"#df = df[df.sid != 'ranum' ]\n", | |
"df.index = df.sid" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 80 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"q = '''\n", | |
"SELECT sid, COUNT(*) AS numvisits FROM useinfo\n", | |
"GROUP BY sid \n", | |
"ORDER BY numvisits\n", | |
"'''\n", | |
"visitf = psql.frame_query(q,conn)\n", | |
"visitf.index = visitf.sid" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 81 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"dfr = df['sid'].values\n", | |
"dfrs = set(dfr)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 82 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"count = 1\n", | |
"pairs = []\n", | |
"for u in df.index:\n", | |
" if u in visitf.index:\n", | |
" count += 1\n", | |
" pairs.append((int(df.ix[u].durdays)/1000000000/(60*60*24),visitf.ix[u].numvisits, df.ix[u].sid ))" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 83 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"\n", | |
"pf = DataFrame( {'duration':[x[0] for x in pairs], 'clicks':[x[1] for x in pairs],'sid': [x[2] for x in pairs] } )\n" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 84 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"def getOutliers(dataframe, dimension):\n", | |
" mean = dataframe[dimension].mean()\n", | |
" std = dataframe[dimension].std() \n", | |
"\n", | |
" newDF = dataframe[(dataframe[dimension] - mean)/std < 3.5]\n", | |
" return newDF\n", | |
" print(mean, \" \" , std)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 85 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"temp = getOutliers(pf, 'clicks') #Finds outliers with respects to clicks\n", | |
"temp1 = getOutliers(pf, 'duration') #Finds the rest of the outliers with respect to duration. " | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 86 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"temp1.head()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>clicks</th>\n", | |
" <th>duration</th>\n", | |
" <th>sid</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td> 66</td>\n", | |
" <td> 19</td>\n", | |
" <td> jPayne</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td> 544</td>\n", | |
" <td> 4</td>\n", | |
" <td> lisadawn</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td> 209</td>\n", | |
" <td> 13</td>\n", | |
" <td> mackedee</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td> 171</td>\n", | |
" <td> 107</td>\n", | |
" <td> +4dB</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td> 1563</td>\n", | |
" <td> 111</td>\n", | |
" <td> 0v3r2</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 87, | |
"text": [ | |
" clicks duration sid\n", | |
"1 66 19 jPayne\n", | |
"2 544 4 lisadawn\n", | |
"3 209 13 mackedee\n", | |
"4 171 107 +4dB\n", | |
"5 1563 111 0v3r2" | |
] | |
} | |
], | |
"prompt_number": 87 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"outliers = pd.merge(temp,temp1, how ='outer', on ='sid')\n", | |
"temp1.to_sql('outlierssid',engine2,if_exists='replace')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 88 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"%%sql\n", | |
"CREATE INDEX outlierIndex\n", | |
"ON outlierssid (sid);\n", | |
"\n", | |
"COMMIT;" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"Done.\n", | |
"Done.\n" | |
] | |
}, | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 89, | |
"text": [ | |
"[]" | |
] | |
} | |
], | |
"prompt_number": 89 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"%%sql\n", | |
"DELETE FROM useinfo\n", | |
"WHERE sid IN \n", | |
" (SELECT sid \n", | |
" FROM useinfo\n", | |
" NATURAL JOIN \n", | |
" (\n", | |
" SELECT sid \n", | |
" FROM useinfo\n", | |
" EXCEPT \n", | |
" (SELECT sid \n", | |
" FROM outlierssid\n", | |
" )\n", | |
" ) \n", | |
" AS T\n", | |
" );\n", | |
"COMMIT;" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"628996 rows affected.\n", | |
"Done.\n" | |
] | |
}, | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 90, | |
"text": [ | |
"[]" | |
] | |
} | |
], | |
"prompt_number": 90 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"%%sql\n", | |
"SELECT count(*) FROM useinfo;" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"1 rows affected.\n" | |
] | |
}, | |
{ | |
"html": [ | |
"<table>\n", | |
" <tr>\n", | |
" <th>count</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>9357764</td>\n", | |
" </tr>\n", | |
"</table>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 91, | |
"text": [ | |
"[(9357764,)]" | |
] | |
} | |
], | |
"prompt_number": 91 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": [ | |
"Replace courselib with runestone" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"%%sql\n", | |
"UPDATE useinfo\n", | |
"SET div_id = regexp_replace(div_id,'^/courselib(.*)','/runestone\\1');\n", | |
"\n", | |
"COMMIT;" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"9357764 rows affected.\n", | |
"Done.\n" | |
] | |
}, | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 92, | |
"text": [ | |
"[]" | |
] | |
} | |
], | |
"prompt_number": 92 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": [ | |
"Replace mChoice with mchoice" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"%%sql \n", | |
"UPDATE useinfo\n", | |
"SET event = 'mchoice'\n", | |
"WHERE event = 'mChoice';\n", | |
"\n", | |
"COMMIT;" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"1166988 rows affected.\n", | |
"Done.\n" | |
] | |
}, | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 93, | |
"text": [ | |
"[]" | |
] | |
} | |
], | |
"prompt_number": 93 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"%%sql\n", | |
"SELECT * FROM useinfo\n", | |
"where event = 'mChoice';" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"0 rows affected.\n" | |
] | |
}, | |
{ | |
"html": [ | |
"<table>\n", | |
" <tr>\n", | |
" <th>id</th>\n", | |
" <th>timestamp</th>\n", | |
" <th>sid</th>\n", | |
" <th>event</th>\n", | |
" <th>act</th>\n", | |
" <th>div_id</th>\n", | |
" <th>course_id</th>\n", | |
" </tr>\n", | |
"</table>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 94, | |
"text": [ | |
"[]" | |
] | |
} | |
], | |
"prompt_number": 94 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"%%sql\n", | |
"SELECT count(*) FROM (\n", | |
"SELECT DISTINCT sid FROM useinfo) AS t;" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"1 rows affected.\n" | |
] | |
}, | |
{ | |
"html": [ | |
"<table>\n", | |
" <tr>\n", | |
" <th>count</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>81894</td>\n", | |
" </tr>\n", | |
"</table>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 95, | |
"text": [ | |
"[(81894,)]" | |
] | |
} | |
], | |
"prompt_number": 95 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"%%sql\n", | |
"SELECT count(*) FROM useinfo;" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"1 rows affected.\n" | |
] | |
}, | |
{ | |
"html": [ | |
"<table>\n", | |
" <tr>\n", | |
" <th>count</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>9357764</td>\n", | |
" </tr>\n", | |
"</table>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 96, | |
"text": [ | |
"[(9357764,)]" | |
] | |
} | |
], | |
"prompt_number": 96 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 96 | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment