Created
November 3, 2013 12:49
-
-
Save pronojitsaha/7289970 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import pandas as pd\n", | |
"import numpy as np" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 1 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"!head -n 4 data/recsys-datawa4.csv" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"\"\",\"1648\",\"5136\",\"918\",\"2824\",\"3867\",\"860\",\"3712\",\"2968\",\"3525\",\"4323\",\"3617\",\"4360\",\"2756\",\"89\",\"442\",\"3556\",\"5261\",\"2492\",\"5062\",\"2486\",\"4942\",\"2267\",\"4809\",\"3853\",\"2288\"\r\n", | |
"\"11: Star Wars: Episode IV - A New Hope (1977)\",,4.5,5,4.5,4,4,,5,4,5,,4,,4,3,4,,4.5,4,3.5,,,,,\r\n", | |
"\"12: Finding Nemo (2003)\",,5,5,,4,4,4.5,4.5,4,5,,4,5,4.5,,4,,3.5,4,2,3.5,,,,3.5\r\n", | |
"\"13: Forrest Gump (1994)\",,5,4.5,5,4.5,4.5,,5,4.5,5,5,4.5,4.5,5,3,4,5,3.5,4.5,4.5,4,3.5,4.5,3.5,3.5\r\n" | |
] | |
} | |
], | |
"prompt_number": 70 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"data = pd.read_csv('data/recsys-datawa4.csv', index_col=0, header=0)\n", | |
"data['3712']" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 55, | |
"text": [ | |
"11: Star Wars: Episode IV - A New Hope (1977) NaN\n", | |
"12: Finding Nemo (2003) 4.5\n", | |
"13: Forrest Gump (1994) NaN\n", | |
"14: American Beauty (1999) 4.5\n", | |
"22: Pirates of the Caribbean: The Curse of the Black Pearl (2003) NaN\n", | |
"24: Kill Bill: Vol. 1 (2003) NaN\n", | |
"38: Eternal Sunshine of the Spotless Mind (2004) NaN\n", | |
"63: Twelve Monkeys (a.k.a. 12 Monkeys) (1995) NaN\n", | |
"77: Memento (2000) NaN\n", | |
"85: Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981) NaN\n", | |
"98: Gladiator (2000) NaN\n", | |
"105: Back to the Future (1985) NaN\n", | |
"107: Snatch (2000) NaN\n", | |
"114: Pretty Woman (1990) 4.0\n", | |
"120: The Lord of the Rings: The Fellowship of the Ring (2001) 5.0\n", | |
"...\n", | |
"3049: Ace Ventura: Pet Detective (1994) 5.0\n", | |
"4327: Charlie's Angels (2000) NaN\n", | |
"5503: The Fugitive (1993) 4.0\n", | |
"7443: Chicken Run (2000) 4.0\n", | |
"8358: Cast Away (2000) NaN\n", | |
"8467: Dumb & Dumber (1994) 4.5\n", | |
"8587: The Lion King (1994) NaN\n", | |
"9331: Clear and Present Danger (1994) NaN\n", | |
"9741: Unbreakable (2000) NaN\n", | |
"9802: The Rock (1996) 4.0\n", | |
"9806: The Incredibles (2004) 4.5\n", | |
"10020: Beauty and the Beast (1991) NaN\n", | |
"36657: X-Men (2000) 4.5\n", | |
"36658: X2: X-Men United (2003) 4.5\n", | |
"36955: True Lies (1994) 4.0\n", | |
"Name: 3712, Length: 100, dtype: float64" | |
] | |
} | |
], | |
"prompt_number": 55 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"data.columns" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 4, | |
"text": [ | |
"Index([u'1648', u'5136', u'918', u'2824', u'3867', u'860', u'3712', u'2968', u'3525', u'4323', u'3617', u'4360', u'2756', u'89', u'442', u'3556', u'5261', u'2492', u'5062', u'2486', u'4942', u'2267', u'4809', u'3853', u'2288'], dtype=object)" | |
] | |
} | |
], | |
"prompt_number": 4 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 4, | |
"metadata": {}, | |
"source": [ | |
"Calculate the user-user correlations:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"resultcor = pd.DataFrame({}, index = data.columns)\n", | |
"for col in data.columns:\n", | |
" result = []\n", | |
" for i in range(25):\n", | |
" result.append(data[col].corr(data.ix[:,i]))\n", | |
" resultcor[col] = result\n" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"The top 5 users for a particular user_id. (To calculate for others just change the user_id variable.)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"user_id = '3712'\n", | |
"top5 = resultcor[user_id].order(ascending=False)[1:6]\n", | |
"top5" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 56, | |
"text": [ | |
"2824 0.462910\n", | |
"3867 0.400275\n", | |
"5062 0.247693\n", | |
"442 0.227130\n", | |
"3853 0.193660\n", | |
"Name: 3712, dtype: float64" | |
] | |
} | |
], | |
"prompt_number": 56 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Fill the NaN values with 0 so that they can be excluded from the calculations." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"data1 = data.fillna(0)\n", | |
"data1['3712']" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 63, | |
"text": [ | |
"11: Star Wars: Episode IV - A New Hope (1977) 0.0\n", | |
"12: Finding Nemo (2003) 4.5\n", | |
"13: Forrest Gump (1994) 0.0\n", | |
"14: American Beauty (1999) 4.5\n", | |
"22: Pirates of the Caribbean: The Curse of the Black Pearl (2003) 0.0\n", | |
"24: Kill Bill: Vol. 1 (2003) 0.0\n", | |
"38: Eternal Sunshine of the Spotless Mind (2004) 0.0\n", | |
"63: Twelve Monkeys (a.k.a. 12 Monkeys) (1995) 0.0\n", | |
"77: Memento (2000) 0.0\n", | |
"85: Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981) 0.0\n", | |
"98: Gladiator (2000) 0.0\n", | |
"105: Back to the Future (1985) 0.0\n", | |
"107: Snatch (2000) 0.0\n", | |
"114: Pretty Woman (1990) 4.0\n", | |
"120: The Lord of the Rings: The Fellowship of the Ring (2001) 5.0\n", | |
"...\n", | |
"3049: Ace Ventura: Pet Detective (1994) 5.0\n", | |
"4327: Charlie's Angels (2000) 0.0\n", | |
"5503: The Fugitive (1993) 4.0\n", | |
"7443: Chicken Run (2000) 4.0\n", | |
"8358: Cast Away (2000) 0.0\n", | |
"8467: Dumb & Dumber (1994) 4.5\n", | |
"8587: The Lion King (1994) 0.0\n", | |
"9331: Clear and Present Danger (1994) 0.0\n", | |
"9741: Unbreakable (2000) 0.0\n", | |
"9802: The Rock (1996) 4.0\n", | |
"9806: The Incredibles (2004) 4.5\n", | |
"10020: Beauty and the Beast (1991) 0.0\n", | |
"36657: X-Men (2000) 4.5\n", | |
"36658: X2: X-Men United (2003) 4.5\n", | |
"36955: True Lies (1994) 4.0\n", | |
"Name: 3712, Length: 100, dtype: float64" | |
] | |
} | |
], | |
"prompt_number": 63 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 4, | |
"metadata": {}, | |
"source": [ | |
"Calculate the numerator & denominator values separately. For numerator, iterate over the 100 movies and for each i-th movie iterate over the top 5 user's (top5.index) ratings (data1[index][i]) for that i-th movie and multiplying it with the top 5 user's correlation (top5[index]). For denominator just append the top 5 users correlations. " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"num = {}\n", | |
"denom = {}\n", | |
"for i in range(100):\n", | |
" num[data1.index[i]]= []\n", | |
" denom[data1.index[i]] = []\n", | |
" for index in top5.index:\n", | |
" if (data1[index][i] == 0):\n", | |
" continue\n", | |
" else:\n", | |
" num[data1.index[i]].append(data1[index][i]*top5[index])\n", | |
" denom[data1.index[i]].append(top5[index])" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 64 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 4, | |
"metadata": {}, | |
"source": [ | |
"Calculate the rating for each of the 100 movies as found in denom.keys()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"rating = {}\n", | |
"for key in denom.keys():\n", | |
" rating[key] = []\n", | |
" if (sum(denom[key]) == 0):\n", | |
" continue\n", | |
" else:\n", | |
" rating[key].append(sum(num[key])/sum(denom[key]))\n", | |
" \n", | |
"sorted(rating.items(), key = lambda (key,val): val, reverse=True)[:5]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 65, | |
"text": [ | |
"[('641: Requiem for a Dream (2000)', [4.9999999999999991]),\n", | |
" ('603: The Matrix (1999)', [4.855924039087955]),\n", | |
" ('105: Back to the Future (1985)', [4.7391731227107821]),\n", | |
" ('107: Snatch (2000)', [4.6514324313400932]),\n", | |
" ('155: The Dark Knight (2008)', [4.6225640068346863])]" | |
] | |
} | |
], | |
"prompt_number": 65 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 1, | |
"metadata": {}, | |
"source": [ | |
"Normalization" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"user_id = '3712'\n", | |
"top5 = resultcor[user_id].order(ascending=False)[1:6]\n", | |
"top5" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 66, | |
"text": [ | |
"2824 0.462910\n", | |
"3867 0.400275\n", | |
"5062 0.247693\n", | |
"442 0.227130\n", | |
"3853 0.193660\n", | |
"Name: 3712, dtype: float64" | |
] | |
} | |
], | |
"prompt_number": 66 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"data1 = data.fillna(0)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 67 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"num = {}\n", | |
"denom = {}\n", | |
"for i in range(100):\n", | |
" num[data1.index[i]]= []\n", | |
" denom[data1.index[i]] = []\n", | |
" for index in top5.index:\n", | |
" if (data1[index][i] == 0):\n", | |
" continue\n", | |
" else:\n", | |
" a = (data1[index][i]-data[index].mean())*top5[index]\n", | |
" num[data1.index[i]].append(a)\n", | |
" denom[data1.index[i]].append(top5[index])" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 68 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"rating = {}\n", | |
"for key in denom.keys():\n", | |
" rating[key] = []\n", | |
" if (sum(denom[key]) == 0):\n", | |
" continue\n", | |
" else:\n", | |
" b = data[user_id].mean() + (sum(num[key])/sum(denom[key]))\n", | |
" rating[key].append(b)\n", | |
" \n", | |
"sorted(rating.items(), key = lambda (key,val): val, reverse=True)[:5]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 69, | |
"text": [ | |
"[('641: Requiem for a Dream (2000)', [5.9000000000000004]),\n", | |
" ('603: The Matrix (1999)', [5.5455667767774663]),\n", | |
" ('105: Back to the Future (1985)', [5.5005845132854665]),\n", | |
" ('155: The Dark Knight (2008)', [5.3122067445241967]),\n", | |
" ('121: The Lord of the Rings: The Two Towers (2002)', [5.3065590950228403])]" | |
] | |
} | |
], | |
"prompt_number": 69 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment