Skip to content

Instantly share code, notes, and snippets.

@pronojitsaha
Created November 3, 2013 12:49
Show Gist options
  • Save pronojitsaha/7289970 to your computer and use it in GitHub Desktop.
Save pronojitsaha/7289970 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": ""
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"import pandas as pd\n",
"import numpy as np"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!head -n 4 data/recsys-datawa4.csv"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\"\",\"1648\",\"5136\",\"918\",\"2824\",\"3867\",\"860\",\"3712\",\"2968\",\"3525\",\"4323\",\"3617\",\"4360\",\"2756\",\"89\",\"442\",\"3556\",\"5261\",\"2492\",\"5062\",\"2486\",\"4942\",\"2267\",\"4809\",\"3853\",\"2288\"\r\n",
"\"11: Star Wars: Episode IV - A New Hope (1977)\",,4.5,5,4.5,4,4,,5,4,5,,4,,4,3,4,,4.5,4,3.5,,,,,\r\n",
"\"12: Finding Nemo (2003)\",,5,5,,4,4,4.5,4.5,4,5,,4,5,4.5,,4,,3.5,4,2,3.5,,,,3.5\r\n",
"\"13: Forrest Gump (1994)\",,5,4.5,5,4.5,4.5,,5,4.5,5,5,4.5,4.5,5,3,4,5,3.5,4.5,4.5,4,3.5,4.5,3.5,3.5\r\n"
]
}
],
"prompt_number": 70
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"data = pd.read_csv('data/recsys-datawa4.csv', index_col=0, header=0)\n",
"data['3712']"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 55,
"text": [
"11: Star Wars: Episode IV - A New Hope (1977) NaN\n",
"12: Finding Nemo (2003) 4.5\n",
"13: Forrest Gump (1994) NaN\n",
"14: American Beauty (1999) 4.5\n",
"22: Pirates of the Caribbean: The Curse of the Black Pearl (2003) NaN\n",
"24: Kill Bill: Vol. 1 (2003) NaN\n",
"38: Eternal Sunshine of the Spotless Mind (2004) NaN\n",
"63: Twelve Monkeys (a.k.a. 12 Monkeys) (1995) NaN\n",
"77: Memento (2000) NaN\n",
"85: Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981) NaN\n",
"98: Gladiator (2000) NaN\n",
"105: Back to the Future (1985) NaN\n",
"107: Snatch (2000) NaN\n",
"114: Pretty Woman (1990) 4.0\n",
"120: The Lord of the Rings: The Fellowship of the Ring (2001) 5.0\n",
"...\n",
"3049: Ace Ventura: Pet Detective (1994) 5.0\n",
"4327: Charlie's Angels (2000) NaN\n",
"5503: The Fugitive (1993) 4.0\n",
"7443: Chicken Run (2000) 4.0\n",
"8358: Cast Away (2000) NaN\n",
"8467: Dumb & Dumber (1994) 4.5\n",
"8587: The Lion King (1994) NaN\n",
"9331: Clear and Present Danger (1994) NaN\n",
"9741: Unbreakable (2000) NaN\n",
"9802: The Rock (1996) 4.0\n",
"9806: The Incredibles (2004) 4.5\n",
"10020: Beauty and the Beast (1991) NaN\n",
"36657: X-Men (2000) 4.5\n",
"36658: X2: X-Men United (2003) 4.5\n",
"36955: True Lies (1994) 4.0\n",
"Name: 3712, Length: 100, dtype: float64"
]
}
],
"prompt_number": 55
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"data.columns"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 4,
"text": [
"Index([u'1648', u'5136', u'918', u'2824', u'3867', u'860', u'3712', u'2968', u'3525', u'4323', u'3617', u'4360', u'2756', u'89', u'442', u'3556', u'5261', u'2492', u'5062', u'2486', u'4942', u'2267', u'4809', u'3853', u'2288'], dtype=object)"
]
}
],
"prompt_number": 4
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"Calculate the user-user correlations:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Pearson correlation between every pair of users (the columns of data).\n",
"# DataFrame.corr() skips NaN pairwise, as the per-column Series.corr() loop\n",
"# did, but needs neither the .ix indexer (removed from newer pandas) nor a\n",
"# hard-coded user count (the loop assumed exactly 25 columns).\n",
"resultcor = data.corr()\n"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Find the 5 users most correlated with a particular user_id. (To calculate them for another user, just change the user_id variable.)"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"user_id = '3712'\n",
"# Remove the user's own entry by label instead of assuming it sorts first:\n",
"# slicing [1:6] picks the wrong rows when another user ties at correlation 1.0.\n",
"# sort_values() replaces Series.order(), which newer pandas no longer provides.\n",
"top5 = resultcor[user_id].drop(user_id).sort_values(ascending=False).head(5)\n",
"top5"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 56,
"text": [
"2824 0.462910\n",
"3867 0.400275\n",
"5062 0.247693\n",
"442 0.227130\n",
"3853 0.193660\n",
"Name: 3712, dtype: float64"
]
}
],
"prompt_number": 56
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Fill the NaN values with 0 so that unrated entries can be recognised and excluded from the calculations below (this assumes no genuine rating is 0)."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Replace missing ratings with 0; later cells treat 0 as 'not rated'.\n",
"data1 = data.fillna(0)\n",
"# Show the selected user's column (follows user_id instead of a fixed id).\n",
"data1[user_id]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 63,
"text": [
"11: Star Wars: Episode IV - A New Hope (1977) 0.0\n",
"12: Finding Nemo (2003) 4.5\n",
"13: Forrest Gump (1994) 0.0\n",
"14: American Beauty (1999) 4.5\n",
"22: Pirates of the Caribbean: The Curse of the Black Pearl (2003) 0.0\n",
"24: Kill Bill: Vol. 1 (2003) 0.0\n",
"38: Eternal Sunshine of the Spotless Mind (2004) 0.0\n",
"63: Twelve Monkeys (a.k.a. 12 Monkeys) (1995) 0.0\n",
"77: Memento (2000) 0.0\n",
"85: Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981) 0.0\n",
"98: Gladiator (2000) 0.0\n",
"105: Back to the Future (1985) 0.0\n",
"107: Snatch (2000) 0.0\n",
"114: Pretty Woman (1990) 4.0\n",
"120: The Lord of the Rings: The Fellowship of the Ring (2001) 5.0\n",
"...\n",
"3049: Ace Ventura: Pet Detective (1994) 5.0\n",
"4327: Charlie's Angels (2000) 0.0\n",
"5503: The Fugitive (1993) 4.0\n",
"7443: Chicken Run (2000) 4.0\n",
"8358: Cast Away (2000) 0.0\n",
"8467: Dumb & Dumber (1994) 4.5\n",
"8587: The Lion King (1994) 0.0\n",
"9331: Clear and Present Danger (1994) 0.0\n",
"9741: Unbreakable (2000) 0.0\n",
"9802: The Rock (1996) 4.0\n",
"9806: The Incredibles (2004) 4.5\n",
"10020: Beauty and the Beast (1991) 0.0\n",
"36657: X-Men (2000) 4.5\n",
"36658: X2: X-Men United (2003) 4.5\n",
"36955: True Lies (1994) 4.0\n",
"Name: 3712, Length: 100, dtype: float64"
]
}
],
"prompt_number": 63
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"Calculate the numerator and denominator terms separately. For the numerator, iterate over the 100 movies; for each movie, take every top-5 user (top5.index) who rated it and multiply that user's rating (from data1) by that user's correlation with the target user (from top5). For the denominator, collect the correlations of those same users."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Collect, per movie, the terms of the weighted average over the top-5\n",
"# neighbours: num holds rating * correlation, denom holds the correlation.\n",
"# Only neighbours who rated the movie contribute (0 marks an unrated entry\n",
"# in data1).\n",
"num = {}\n",
"denom = {}\n",
"for movie in data1.index:  # every movie present, not a hard-coded range(100)\n",
"    num[movie] = []\n",
"    denom[movie] = []\n",
"    for neighbour in top5.index:\n",
"        # Label-based lookup instead of chained data1[col][position] indexing.\n",
"        neighbour_rating = data1.loc[movie, neighbour]\n",
"        if neighbour_rating == 0:\n",
"            continue\n",
"        num[movie].append(neighbour_rating * top5[neighbour])\n",
"        denom[movie].append(top5[neighbour])"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 64
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"Calculate the predicted rating for each of the 100 movies (the keys of denom) as the sum of its numerator terms divided by the sum of its denominator terms."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Predicted rating per movie = sum(num) / sum(denom). Movies that no\n",
"# neighbour rated keep an empty list, which sorts after every real score.\n",
"rating = {}\n",
"for movie in denom:\n",
"    rating[movie] = []\n",
"    total_weight = sum(denom[movie])\n",
"    if total_weight != 0:\n",
"        rating[movie].append(sum(num[movie]) / total_weight)\n",
"\n",
"# A tuple-unpacking lambda (lambda (key, val): ...) is a syntax error on\n",
"# Python 3; indexing the (movie, rating) pair works on both Python 2 and 3.\n",
"sorted(rating.items(), key=lambda item: item[1], reverse=True)[:5]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 65,
"text": [
"[('641: Requiem for a Dream (2000)', [4.9999999999999991]),\n",
" ('603: The Matrix (1999)', [4.855924039087955]),\n",
" ('105: Back to the Future (1985)', [4.7391731227107821]),\n",
" ('107: Snatch (2000)', [4.6514324313400932]),\n",
" ('155: The Dark Knight (2008)', [4.6225640068346863])]"
]
}
],
"prompt_number": 65
},
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"Normalization"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"user_id = '3712'\n",
"# Remove the user's own entry by label instead of assuming it sorts first:\n",
"# slicing [1:6] picks the wrong rows when another user ties at correlation 1.0.\n",
"# sort_values() replaces Series.order(), which newer pandas no longer provides.\n",
"top5 = resultcor[user_id].drop(user_id).sort_values(ascending=False).head(5)\n",
"top5"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 66,
"text": [
"2824 0.462910\n",
"3867 0.400275\n",
"5062 0.247693\n",
"442 0.227130\n",
"3853 0.193660\n",
"Name: 3712, dtype: float64"
]
}
],
"prompt_number": 66
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"data1 = data.fillna(0)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 67
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Mean-centred variant: each neighbour's rating is turned into a deviation\n",
"# from that neighbour's own mean before it is weighted by the correlation.\n",
"# The means come from data (NaN for unrated entries), so the 0 placeholders\n",
"# in data1 do not pull them down.\n",
"neighbour_means = data[top5.index].mean()  # computed once, outside the loops\n",
"num = {}\n",
"denom = {}\n",
"for movie in data1.index:  # every movie present, not a hard-coded range(100)\n",
"    num[movie] = []\n",
"    denom[movie] = []\n",
"    for neighbour in top5.index:\n",
"        # Label-based lookup instead of chained data1[col][position] indexing.\n",
"        neighbour_rating = data1.loc[movie, neighbour]\n",
"        if neighbour_rating == 0:  # 0 marks an unrated entry in data1\n",
"            continue\n",
"        deviation = neighbour_rating - neighbour_means[neighbour]\n",
"        num[movie].append(deviation * top5[neighbour])\n",
"        denom[movie].append(top5[neighbour])"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 68
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Normalised prediction = the target user's mean rating plus the weighted\n",
"# mean deviation of the neighbours. The value is not clipped, so it can\n",
"# exceed 5.\n",
"user_mean = data[user_id].mean()  # same for every movie, so compute it once\n",
"rating = {}\n",
"for movie in denom:\n",
"    rating[movie] = []\n",
"    total_weight = sum(denom[movie])\n",
"    if total_weight != 0:\n",
"        rating[movie].append(user_mean + sum(num[movie]) / total_weight)\n",
"\n",
"# A tuple-unpacking lambda (lambda (key, val): ...) is a syntax error on\n",
"# Python 3; indexing the (movie, rating) pair works on both Python 2 and 3.\n",
"sorted(rating.items(), key=lambda item: item[1], reverse=True)[:5]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 69,
"text": [
"[('641: Requiem for a Dream (2000)', [5.9000000000000004]),\n",
" ('603: The Matrix (1999)', [5.5455667767774663]),\n",
" ('105: Back to the Future (1985)', [5.5005845132854665]),\n",
" ('155: The Dark Knight (2008)', [5.3122067445241967]),\n",
" ('121: The Lord of the Rings: The Two Towers (2002)', [5.3065590950228403])]"
]
}
],
"prompt_number": 69
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment