pronojitsaha · November 3, 2013 12:49
diff --git a/Recsys-WA4.ipynb b/Recsys-WA4.ipynb
 {
 "metadata": {
  "name": ""
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import pandas as pd\n",
      "import numpy as np"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 1
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "!head -n 4 data/recsys-datawa4.csv"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\"\",\"1648\",\"5136\",\"918\",\"2824\",\"3867\",\"860\",\"3712\",\"2968\",\"3525\",\"4323\",\"3617\",\"4360\",\"2756\",\"89\",\"442\",\"3556\",\"5261\",\"2492\",\"5062\",\"2486\",\"4942\",\"2267\",\"4809\",\"3853\",\"2288\"\r\n",
        "\"11: Star Wars: Episode IV - A New Hope (1977)\",,4.5,5,4.5,4,4,,5,4,5,,4,,4,3,4,,4.5,4,3.5,,,,,\r\n",
        "\"12: Finding Nemo (2003)\",,5,5,,4,4,4.5,4.5,4,5,,4,5,4.5,,4,,3.5,4,2,3.5,,,,3.5\r\n",
        "\"13: Forrest Gump (1994)\",,5,4.5,5,4.5,4.5,,5,4.5,5,5,4.5,4.5,5,3,4,5,3.5,4.5,4.5,4,3.5,4.5,3.5,3.5\r\n"
       ]
      }
     ],
     "prompt_number": 70
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "data = pd.read_csv('data/recsys-datawa4.csv', index_col=0, header=0)\n",
      "data['3712']"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 55,
       "text": [
        "11: Star Wars: Episode IV - A New Hope (1977)                   NaN\n",
        "12: Finding Nemo (2003)                                         4.5\n",
        "13: Forrest Gump (1994)                                         NaN\n",
        "14: American Beauty (1999)                                      4.5\n",
        "22: Pirates of the Caribbean: The Curse of the Black Pearl (2003)    NaN\n",
        "24: Kill Bill: Vol. 1 (2003)                                    NaN\n",
        "38: Eternal Sunshine of the Spotless Mind (2004)                NaN\n",
        "63: Twelve Monkeys (a.k.a. 12 Monkeys) (1995)                   NaN\n",
        "77: Memento (2000)                                              NaN\n",
        "85: Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)    NaN\n",
        "98: Gladiator (2000)                                            NaN\n",
        "105: Back to the Future (1985)                                  NaN\n",
        "107: Snatch (2000)                                              NaN\n",
        "114: Pretty Woman (1990)                                        4.0\n",
        "120: The Lord of the Rings: The Fellowship of the Ring (2001)    5.0\n",
        "...\n",
        "3049: Ace Ventura: Pet Detective (1994)    5.0\n",
        "4327: Charlie's Angels (2000)              NaN\n",
        "5503: The Fugitive (1993)                  4.0\n",
        "7443: Chicken Run (2000)                   4.0\n",
        "8358: Cast Away (2000)                     NaN\n",
        "8467: Dumb & Dumber (1994)                 4.5\n",
        "8587: The Lion King (1994)                 NaN\n",
        "9331: Clear and Present Danger (1994)      NaN\n",
        "9741: Unbreakable (2000)                   NaN\n",
        "9802: The Rock (1996)                      4.0\n",
        "9806: The Incredibles (2004)               4.5\n",
        "10020: Beauty and the Beast (1991)         NaN\n",
        "36657: X-Men (2000)                        4.5\n",
        "36658: X2: X-Men United (2003)             4.5\n",
        "36955: True Lies (1994)                    4.0\n",
        "Name: 3712, Length: 100, dtype: float64"
       ]
      }
     ],
     "prompt_number": 55
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "data.columns"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 4,
       "text": [
        "Index([u'1648', u'5136', u'918', u'2824', u'3867', u'860', u'3712', u'2968', u'3525', u'4323', u'3617', u'4360', u'2756', u'89', u'442', u'3556', u'5261', u'2492', u'5062', u'2486', u'4942', u'2267', u'4809', u'3853', u'2288'], dtype=object)"
       ]
      }
     ],
     "prompt_number": 4
    },
    {
     "cell_type": "heading",
     "level": 4,
     "metadata": {},
     "source": [
      "Calculate the user-user correlations:"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "resultcor = pd.DataFrame({}, index = data.columns)\n",
      "for col in data.columns:\n",
      "    result = []\n",
      "    for i in range(25):\n",
      "        result.append(data[col].corr(data.ix[:,i]))\n",
      "    resultcor[col] = result\n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Select the five users most strongly correlated with user_id. (To calculate for another user, just change the user_id variable.)"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "user_id = '3712'\n",
      "top5 = resultcor[user_id].order(ascending=False)[1:6]\n",
      "top5"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 56,
       "text": [
        "2824    0.462910\n",
        "3867    0.400275\n",
        "5062    0.247693\n",
        "442     0.227130\n",
        "3853    0.193660\n",
        "Name: 3712, dtype: float64"
       ]
      }
     ],
     "prompt_number": 56
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Fill the NaN values (movies a user has not rated) with 0; a 0 then marks an unrated movie so it can be excluded from the calculations."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "data1 = data.fillna(0)\n",
      "data1['3712']"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 63,
       "text": [
        "11: Star Wars: Episode IV - A New Hope (1977)                   0.0\n",
        "12: Finding Nemo (2003)                                         4.5\n",
        "13: Forrest Gump (1994)                                         0.0\n",
        "14: American Beauty (1999)                                      4.5\n",
        "22: Pirates of the Caribbean: The Curse of the Black Pearl (2003)    0.0\n",
        "24: Kill Bill: Vol. 1 (2003)                                    0.0\n",
        "38: Eternal Sunshine of the Spotless Mind (2004)                0.0\n",
        "63: Twelve Monkeys (a.k.a. 12 Monkeys) (1995)                   0.0\n",
        "77: Memento (2000)                                              0.0\n",
        "85: Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)    0.0\n",
        "98: Gladiator (2000)                                            0.0\n",
        "105: Back to the Future (1985)                                  0.0\n",
        "107: Snatch (2000)                                              0.0\n",
        "114: Pretty Woman (1990)                                        4.0\n",
        "120: The Lord of the Rings: The Fellowship of the Ring (2001)    5.0\n",
        "...\n",
        "3049: Ace Ventura: Pet Detective (1994)    5.0\n",
        "4327: Charlie's Angels (2000)              0.0\n",
        "5503: The Fugitive (1993)                  4.0\n",
        "7443: Chicken Run (2000)                   4.0\n",
        "8358: Cast Away (2000)                     0.0\n",
        "8467: Dumb & Dumber (1994)                 4.5\n",
        "8587: The Lion King (1994)                 0.0\n",
        "9331: Clear and Present Danger (1994)      0.0\n",
        "9741: Unbreakable (2000)                   0.0\n",
        "9802: The Rock (1996)                      4.0\n",
        "9806: The Incredibles (2004)               4.5\n",
        "10020: Beauty and the Beast (1991)         0.0\n",
        "36657: X-Men (2000)                        4.5\n",
        "36658: X2: X-Men United (2003)             4.5\n",
        "36955: True Lies (1994)                    4.0\n",
        "Name: 3712, Length: 100, dtype: float64"
       ]
      }
     ],
     "prompt_number": 63
    },
    {
     "cell_type": "heading",
     "level": 4,
     "metadata": {},
     "source": [
      "Calculate the numerator and denominator values separately. For the numerator, iterate over the movies and, for each movie, iterate over the top 5 users (top5.index), multiplying each user's rating for that movie (from data1) by that user's correlation (from top5). For the denominator, just append the correlations of the top 5 users who rated the movie."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "num = {}\n",
      "denom = {}\n",
      "for i in range(100):\n",
      "    num[data1.index[i]]= []\n",
      "    denom[data1.index[i]] = []\n",
      "    for index in top5.index:\n",
      "        if (data1[index][i] == 0):\n",
      "            continue\n",
      "        else:\n",
      "            num[data1.index[i]].append(data1[index][i]*top5[index])\n",
      "            denom[data1.index[i]].append(top5[index])"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 64
    },
    {
     "cell_type": "heading",
     "level": 4,
     "metadata": {},
     "source": [
      "Calculate the predicted rating for each movie in denom.keys() as sum(num) / sum(denom), and list the five highest."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "rating = {}\n",
      "for key in denom.keys():\n",
      "    rating[key] = []\n",
      "    if (sum(denom[key]) == 0):\n",
      "        continue\n",
      "    else:\n",
      "        rating[key].append(sum(num[key])/sum(denom[key]))\n",
      "       \n",
      "sorted(rating.items(), key = lambda (key,val): val, reverse=True)[:5]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 65,
       "text": [
        "[('641: Requiem for a Dream (2000)', [4.9999999999999991]),\n",
        " ('603: The Matrix (1999)', [4.855924039087955]),\n",
        " ('105: Back to the Future (1985)', [4.7391731227107821]),\n",
        " ('107: Snatch (2000)', [4.6514324313400932]),\n",
        " ('155: The Dark Knight (2008)', [4.6225640068346863])]"
       ]
      }
     ],
     "prompt_number": 65
    },
    {
     "cell_type": "heading",
     "level": 1,
     "metadata": {},
     "source": [
      "Normalization"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "user_id = '3712'\n",
      "top5 = resultcor[user_id].order(ascending=False)[1:6]\n",
      "top5"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 66,
       "text": [
        "2824    0.462910\n",
        "3867    0.400275\n",
        "5062    0.247693\n",
        "442     0.227130\n",
        "3853    0.193660\n",
        "Name: 3712, dtype: float64"
       ]
      }
     ],
     "prompt_number": 66
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "data1 = data.fillna(0)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 67
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "num = {}\n",
      "denom = {}\n",
      "for i in range(100):\n",
      "    num[data1.index[i]]= []\n",
      "    denom[data1.index[i]] = []\n",
      "    for index in top5.index:\n",
      "        if (data1[index][i] == 0):\n",
      "            continue\n",
      "        else:\n",
      "            a = (data1[index][i]-data[index].mean())*top5[index]\n",
      "            num[data1.index[i]].append(a)\n",
      "            denom[data1.index[i]].append(top5[index])"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 68
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "rating = {}\n",
      "for key in denom.keys():\n",
      "    rating[key] = []\n",
      "    if (sum(denom[key]) == 0):\n",
      "        continue\n",
      "    else:\n",
      "        b = data[user_id].mean() + (sum(num[key])/sum(denom[key]))\n",
      "        rating[key].append(b)\n",
      "       \n",
      "sorted(rating.items(), key = lambda (key,val): val, reverse=True)[:5]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 69,
       "text": [
        "[('641: Requiem for a Dream (2000)', [5.9000000000000004]),\n",
        " ('603: The Matrix (1999)', [5.5455667767774663]),\n",
        " ('105: Back to the Future (1985)', [5.5005845132854665]),\n",
        " ('155: The Dark Knight (2008)', [5.3122067445241967]),\n",
        " ('121: The Lord of the Rings: The Two Towers (2002)', [5.3065590950228403])]"
       ]
      }
     ],
     "prompt_number": 69
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
 }
	{
	"metadata": {
	"name": ""
	},
	"nbformat": 3,
	"nbformat_minor": 0,
	"worksheets": [
	{
	"cells": [
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"import pandas as pd\n",
	"import numpy as np"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 1
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"!head -n 4 data/recsys-datawa4.csv"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"\"\",\"1648\",\"5136\",\"918\",\"2824\",\"3867\",\"860\",\"3712\",\"2968\",\"3525\",\"4323\",\"3617\",\"4360\",\"2756\",\"89\",\"442\",\"3556\",\"5261\",\"2492\",\"5062\",\"2486\",\"4942\",\"2267\",\"4809\",\"3853\",\"2288\"\r\n",
	"\"11: Star Wars: Episode IV - A New Hope (1977)\",,4.5,5,4.5,4,4,,5,4,5,,4,,4,3,4,,4.5,4,3.5,,,,,\r\n",
	"\"12: Finding Nemo (2003)\",,5,5,,4,4,4.5,4.5,4,5,,4,5,4.5,,4,,3.5,4,2,3.5,,,,3.5\r\n",
	"\"13: Forrest Gump (1994)\",,5,4.5,5,4.5,4.5,,5,4.5,5,5,4.5,4.5,5,3,4,5,3.5,4.5,4.5,4,3.5,4.5,3.5,3.5\r\n"
	]
	}
	],
	"prompt_number": 70
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"data = pd.read_csv('data/recsys-datawa4.csv', index_col=0, header=0)\n",
	"data['3712']"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"metadata": {},
	"output_type": "pyout",
	"prompt_number": 55,
	"text": [
	"11: Star Wars: Episode IV - A New Hope (1977) NaN\n",
	"12: Finding Nemo (2003) 4.5\n",
	"13: Forrest Gump (1994) NaN\n",
	"14: American Beauty (1999) 4.5\n",
	"22: Pirates of the Caribbean: The Curse of the Black Pearl (2003) NaN\n",
	"24: Kill Bill: Vol. 1 (2003) NaN\n",
	"38: Eternal Sunshine of the Spotless Mind (2004) NaN\n",
	"63: Twelve Monkeys (a.k.a. 12 Monkeys) (1995) NaN\n",
	"77: Memento (2000) NaN\n",
	"85: Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981) NaN\n",
	"98: Gladiator (2000) NaN\n",
	"105: Back to the Future (1985) NaN\n",
	"107: Snatch (2000) NaN\n",
	"114: Pretty Woman (1990) 4.0\n",
	"120: The Lord of the Rings: The Fellowship of the Ring (2001) 5.0\n",
	"...\n",
	"3049: Ace Ventura: Pet Detective (1994) 5.0\n",
	"4327: Charlie's Angels (2000) NaN\n",
	"5503: The Fugitive (1993) 4.0\n",
	"7443: Chicken Run (2000) 4.0\n",
	"8358: Cast Away (2000) NaN\n",
	"8467: Dumb & Dumber (1994) 4.5\n",
	"8587: The Lion King (1994) NaN\n",
	"9331: Clear and Present Danger (1994) NaN\n",
	"9741: Unbreakable (2000) NaN\n",
	"9802: The Rock (1996) 4.0\n",
	"9806: The Incredibles (2004) 4.5\n",
	"10020: Beauty and the Beast (1991) NaN\n",
	"36657: X-Men (2000) 4.5\n",
	"36658: X2: X-Men United (2003) 4.5\n",
	"36955: True Lies (1994) 4.0\n",
	"Name: 3712, Length: 100, dtype: float64"
	]
	}
	],
	"prompt_number": 55
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"data.columns"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"metadata": {},
	"output_type": "pyout",
	"prompt_number": 4,
	"text": [
	"Index([u'1648', u'5136', u'918', u'2824', u'3867', u'860', u'3712', u'2968', u'3525', u'4323', u'3617', u'4360', u'2756', u'89', u'442', u'3556', u'5261', u'2492', u'5062', u'2486', u'4942', u'2267', u'4809', u'3853', u'2288'], dtype=object)"
	]
	}
	],
	"prompt_number": 4
	},
	{
	"cell_type": "heading",
	"level": 4,
	"metadata": {},
	"source": [
	"Calculate the user-user correlations:"
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"resultcor = pd.DataFrame({}, index = data.columns)\n",
	"for col in data.columns:\n",
	" result = []\n",
	" for i in range(25):\n",
	" result.append(data[col].corr(data.ix[:,i]))\n",
	" resultcor[col] = result\n"
	],
	"language": "python",
	"metadata": {},
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Select the five users most strongly correlated with user_id. (To calculate for another user, just change the user_id variable.)"
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"user_id = '3712'\n",
	"top5 = resultcor[user_id].order(ascending=False)[1:6]\n",
	"top5"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"metadata": {},
	"output_type": "pyout",
	"prompt_number": 56,
	"text": [
	"2824 0.462910\n",
	"3867 0.400275\n",
	"5062 0.247693\n",
	"442 0.227130\n",
	"3853 0.193660\n",
	"Name: 3712, dtype: float64"
	]
	}
	],
	"prompt_number": 56
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Fill the NaN values (movies a user has not rated) with 0; a 0 then marks an unrated movie so it can be excluded from the calculations."
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"data1 = data.fillna(0)\n",
	"data1['3712']"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"metadata": {},
	"output_type": "pyout",
	"prompt_number": 63,
	"text": [
	"11: Star Wars: Episode IV - A New Hope (1977) 0.0\n",
	"12: Finding Nemo (2003) 4.5\n",
	"13: Forrest Gump (1994) 0.0\n",
	"14: American Beauty (1999) 4.5\n",
	"22: Pirates of the Caribbean: The Curse of the Black Pearl (2003) 0.0\n",
	"24: Kill Bill: Vol. 1 (2003) 0.0\n",
	"38: Eternal Sunshine of the Spotless Mind (2004) 0.0\n",
	"63: Twelve Monkeys (a.k.a. 12 Monkeys) (1995) 0.0\n",
	"77: Memento (2000) 0.0\n",
	"85: Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981) 0.0\n",
	"98: Gladiator (2000) 0.0\n",
	"105: Back to the Future (1985) 0.0\n",
	"107: Snatch (2000) 0.0\n",
	"114: Pretty Woman (1990) 4.0\n",
	"120: The Lord of the Rings: The Fellowship of the Ring (2001) 5.0\n",
	"...\n",
	"3049: Ace Ventura: Pet Detective (1994) 5.0\n",
	"4327: Charlie's Angels (2000) 0.0\n",
	"5503: The Fugitive (1993) 4.0\n",
	"7443: Chicken Run (2000) 4.0\n",
	"8358: Cast Away (2000) 0.0\n",
	"8467: Dumb & Dumber (1994) 4.5\n",
	"8587: The Lion King (1994) 0.0\n",
	"9331: Clear and Present Danger (1994) 0.0\n",
	"9741: Unbreakable (2000) 0.0\n",
	"9802: The Rock (1996) 4.0\n",
	"9806: The Incredibles (2004) 4.5\n",
	"10020: Beauty and the Beast (1991) 0.0\n",
	"36657: X-Men (2000) 4.5\n",
	"36658: X2: X-Men United (2003) 4.5\n",
	"36955: True Lies (1994) 4.0\n",
	"Name: 3712, Length: 100, dtype: float64"
	]
	}
	],
	"prompt_number": 63
	},
	{
	"cell_type": "heading",
	"level": 4,
	"metadata": {},
	"source": [
	"Calculate the numerator and denominator values separately. For the numerator, iterate over the movies and, for each movie, iterate over the top 5 users (top5.index), multiplying each user's rating for that movie (from data1) by that user's correlation (from top5). For the denominator, just append the correlations of the top 5 users who rated the movie."
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"num = {}\n",
	"denom = {}\n",
	"for i in range(100):\n",
	" num[data1.index[i]]= []\n",
	" denom[data1.index[i]] = []\n",
	" for index in top5.index:\n",
	" if (data1[index][i] == 0):\n",
	" continue\n",
	" else:\n",
	" num[data1.index[i]].append(data1[index][i]*top5[index])\n",
	" denom[data1.index[i]].append(top5[index])"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 64
	},
	{
	"cell_type": "heading",
	"level": 4,
	"metadata": {},
	"source": [
	"Calculate the predicted rating for each movie in denom.keys() as sum(num) / sum(denom), and list the five highest."
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"rating = {}\n",
	"for key in denom.keys():\n",
	" rating[key] = []\n",
	" if (sum(denom[key]) == 0):\n",
	" continue\n",
	" else:\n",
	" rating[key].append(sum(num[key])/sum(denom[key]))\n",
	" \n",
	"sorted(rating.items(), key = lambda (key,val): val, reverse=True)[:5]"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"metadata": {},
	"output_type": "pyout",
	"prompt_number": 65,
	"text": [
	"[('641: Requiem for a Dream (2000)', [4.9999999999999991]),\n",
	" ('603: The Matrix (1999)', [4.855924039087955]),\n",
	" ('105: Back to the Future (1985)', [4.7391731227107821]),\n",
	" ('107: Snatch (2000)', [4.6514324313400932]),\n",
	" ('155: The Dark Knight (2008)', [4.6225640068346863])]"
	]
	}
	],
	"prompt_number": 65
	},
	{
	"cell_type": "heading",
	"level": 1,
	"metadata": {},
	"source": [
	"Normalization"
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"user_id = '3712'\n",
	"top5 = resultcor[user_id].order(ascending=False)[1:6]\n",
	"top5"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"metadata": {},
	"output_type": "pyout",
	"prompt_number": 66,
	"text": [
	"2824 0.462910\n",
	"3867 0.400275\n",
	"5062 0.247693\n",
	"442 0.227130\n",
	"3853 0.193660\n",
	"Name: 3712, dtype: float64"
	]
	}
	],
	"prompt_number": 66
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"data1 = data.fillna(0)"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 67
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"num = {}\n",
	"denom = {}\n",
	"for i in range(100):\n",
	" num[data1.index[i]]= []\n",
	" denom[data1.index[i]] = []\n",
	" for index in top5.index:\n",
	" if (data1[index][i] == 0):\n",
	" continue\n",
	" else:\n",
	" a = (data1[index][i]-data[index].mean())*top5[index]\n",
	" num[data1.index[i]].append(a)\n",
	" denom[data1.index[i]].append(top5[index])"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 68
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"rating = {}\n",
	"for key in denom.keys():\n",
	" rating[key] = []\n",
	" if (sum(denom[key]) == 0):\n",
	" continue\n",
	" else:\n",
	" b = data[user_id].mean() + (sum(num[key])/sum(denom[key]))\n",
	" rating[key].append(b)\n",
	" \n",
	"sorted(rating.items(), key = lambda (key,val): val, reverse=True)[:5]"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"metadata": {},
	"output_type": "pyout",
	"prompt_number": 69,
	"text": [
	"[('641: Requiem for a Dream (2000)', [5.9000000000000004]),\n",
	" ('603: The Matrix (1999)', [5.5455667767774663]),\n",
	" ('105: Back to the Future (1985)', [5.5005845132854665]),\n",
	" ('155: The Dark Knight (2008)', [5.3122067445241967]),\n",
	" ('121: The Lord of the Rings: The Two Towers (2002)', [5.3065590950228403])]"
	]
	}
	],
	"prompt_number": 69
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [],
	"language": "python",
	"metadata": {},
	"outputs": []
	}
	],
	"metadata": {}
	}
	]
	}