@aloknayak29
Created December 12, 2014 06:53
Sentiment analysis of movie reviews using NLTK Naive Bayes
{
"metadata": {
"name": "",
"signature": "sha256:aae6039d9a9d6c7c624b92eeb730fc1a851a50068870318acb4b0252fe12c0fc"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"import nltk"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# build the labeled data: (word list, category) pairs from the movie_reviews corpus\n",
"from nltk.corpus import movie_reviews\n",
"documents = [(list(movie_reviews.words(fileid)), category)\n",
"             for category in movie_reviews.categories()\n",
"             for fileid in movie_reviews.fileids(category)]"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import random\n",
"# shuffle so positive and negative reviews are mixed before the train/test split\n",
"random.shuffle(documents)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# feature extraction: binary contains(word) features\n",
"import nltk\n",
"all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())\n",
"word_features = [w for (w, _) in all_words.most_common(2000)]  # 2000 most frequent words\n",
"\n",
"def document_features(document):\n",
"    document_words = set(document)\n",
"    features = {}\n",
"    for word in word_features:\n",
"        features['contains(%s)' % word] = (word in document_words)\n",
"    return features"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"featuresets = [(document_features(d), c) for (d,c) in documents]\n",
"train_set, test_set = featuresets[100:], featuresets[:100]\n",
"# alternatively, you could split the train/test sets so that each category\n",
"# is represented equally; see the sketch in the next cell\n",
"classifier = nltk.NaiveBayesClassifier.train(train_set)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 6
},
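{
"cell_type": "code",
"collapsed": false,
"input": [
"# A minimal sketch of the equal-per-category train/test split mentioned above,\n",
"# reusing `featuresets` from the previous cell; the pos_feats/neg_feats and *_2\n",
"# names are only illustrative.\n",
"pos_feats = [fs for fs in featuresets if fs[1] == 'pos']\n",
"neg_feats = [fs for fs in featuresets if fs[1] == 'neg']\n",
"train_set2 = pos_feats[50:] + neg_feats[50:]  # hold out 50 reviews per class\n",
"test_set2 = pos_feats[:50] + neg_feats[:50]\n",
"classifier2 = nltk.NaiveBayesClassifier.train(train_set2)\n",
"nltk.classify.accuracy(classifier2, test_set2)"
],
"language": "python",
"metadata": {},
"outputs": []
},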
{
"cell_type": "code",
"collapsed": false,
"input": [
"from nltk import word_tokenize\n",
"# lowercase the tokens so they match the lowercased word_features\n",
"classifier.classify(document_features([w.lower() for w in word_tokenize(\"\"\"\n",
"Though I quite enjoyed their original outing, second time around for our woebegone heroes is a horrible dud. A better title would be \"Horrible Dudes, Too\", or better yet, \"Horrible Tools To...\". On the plus side,the camaraderie between the trio is intact; Aniston and Spacey do not disappoint in some deliciously villainous scenes; Foxx brings much needed braininess; and Pine and Waltz are both welcome additions (though the latter is criminally underutilized). Unfortunately, while there are some decent - or indecent! - gags along the way, this sequel is about as sophomoric as they come and shamefully lame given the calibre of the actors and the size of the film's budget. Beyond the clich\u00e9s and weak story arc, much of the banter isn't that clever or funny. It's too bad that in their haste to strike while the market is hot, the producers and script doctors so compromised quality, settling for a connect-the-dot storyline that looks like it was developed by a bunch of fifth-graders. While good comedy can come from the most simple ideas and should be edgy, this movie frequently peddles the cheapest brand of chuckles by offensive stereotypes and degrading treatment of Latinos, Asians, and African-Americans. The gag reel at the end was the highlight of my film experience.\n",
"\"\"\")]))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 8,
"text": [
"'neg'"
]
}
],
"prompt_number": 8
}
],
"metadata": {}
}
]
}