Last active
June 1, 2017 03:32
-
-
Save arthurl/6a85d4486918c970e4c4a7dc9a80deb4 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"autoscroll": false, | |
"collapsed": false, | |
"ein.tags": "worksheet-0", | |
"slideshow": { | |
"slide-type": "-" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"# Basic imports\n", | |
"\n", | |
"import nltk\n", | |
"from nltk.corpus import movie_reviews\n", | |
"\n", | |
"import random\n", | |
"import string" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"ein.tags": "worksheet-0", | |
"slideshow": { | |
"slide-type": "-" | |
} | |
}, | |
"source": [ | |
"Available categories:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"autoscroll": false, | |
"collapsed": false, | |
"ein.tags": "worksheet-0", | |
"slideshow": { | |
"slide-type": "-" | |
} | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['neg', 'pos']" | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"movie_reviews.categories()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"ein.tags": "worksheet-0", | |
"slideshow": { | |
"slide-type": "-" | |
} | |
}, | |
"source": [ | |
"Split data set into train / validate / test sets." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"autoscroll": false, | |
"collapsed": false, | |
"ein.tags": "worksheet-0", | |
"slideshow": { | |
"slide-type": "-" | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Number of reviews: 2000\n" | |
] | |
} | |
], | |
"source": [ | |
"# Seed the RNG so the shuffled order -- and hence the train/validation/test\n", | |
"# split below -- is reproducible under Restart & Run All.\n", | |
"random.seed(0)\n", | |
"\n", | |
"reviews = [(list(movie_reviews.words(fileid)), sentiment)\n", | |
"           for sentiment in ['neg', 'pos']\n", | |
"           for fileid in movie_reviews.fileids(sentiment)]\n", | |
"random.shuffle(reviews)\n", | |
"\n", | |
"print(\"Number of reviews: {}\".format(len(reviews)))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"autoscroll": false, | |
"collapsed": false, | |
"ein.tags": "worksheet-0", | |
"slideshow": { | |
"slide-type": "-" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"fraction_train, fraction_validation, fraction_test = (0.8, 0.1, 0.1)\n", | |
"\n", | |
"# Compute the cumulative cut points once, then slice.\n", | |
"n_reviews = len(reviews)\n", | |
"cut_train = int(fraction_train * n_reviews)\n", | |
"cut_validation = int((fraction_train + fraction_validation) * n_reviews)\n", | |
"cut_test = int((fraction_train + fraction_validation + fraction_test) * n_reviews)\n", | |
"\n", | |
"train_set = reviews[:cut_train]\n", | |
"validation_set = reviews[cut_train:cut_validation]\n", | |
"test_set = reviews[cut_validation:cut_test]\n", | |
"\n", | |
"# Free the large intermediate list; the three splits hold the data now.\n", | |
"del reviews" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"ein.tags": "worksheet-0", | |
"slideshow": { | |
"slide-type": "-" | |
} | |
}, | |
"source": [ | |
"## Most basic method:" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"ein.tags": "worksheet-0", | |
"slideshow": { | |
"slide-type": "-" | |
} | |
}, | |
"source": [ | |
"Possibly the most basic method that still gives reasonable performance:\n", | |
"\n", | |
"Model: Naive Bayes.\n", | |
"\n", | |
"Feature(s): Which of the most common 3000 words exist in this text?" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"autoscroll": false, | |
"collapsed": false, | |
"ein.tags": "worksheet-0", | |
"slideshow": { | |
"slide-type": "-" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"# BUG FIX: nltk.FreqDist is a dict subclass, so list(FreqDist) yields keys in\n", | |
"# *insertion* order, not frequency order -- slicing [:3000] did NOT pick the\n", | |
"# most common words. Use .most_common() to actually rank by frequency.\n", | |
"common_words = [w for w, _ in nltk.FreqDist(w.lower()\n", | |
"                              for review in train_set\n", | |
"                              for w in filter(lambda x: x not in string.punctuation, review[0])).most_common(3000)]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"autoscroll": false, | |
"collapsed": false, | |
"ein.tags": "worksheet-0", | |
"slideshow": { | |
"slide-type": "-" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"def map_feature(feature, input_iter):\n", | |
"    \"\"\"Apply the feature extractor to each (words, sentiment) pair, keeping the label.\"\"\"\n", | |
"    return [(feature(words), sentiment) for words, sentiment in input_iter]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"autoscroll": false, | |
"collapsed": false, | |
"ein.tags": "worksheet-0", | |
"slideshow": { | |
"slide-type": "-" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"def feature_containsword(text):\n", | |
"    \"\"\"Binary bag-of-words features: for each common word, does it occur in `text`?\"\"\"\n", | |
"    text_words = {w.lower() for w in text}\n", | |
"    return {'exists({})'.format(word): (word in text_words) for word in common_words}" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"autoscroll": false, | |
"collapsed": false, | |
"ein.tags": "worksheet-0", | |
"slideshow": { | |
"slide-type": "-" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"classifier = nltk.NaiveBayesClassifier.train(map_feature(feature_containsword, train_set))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"autoscroll": false, | |
"collapsed": false, | |
"ein.tags": "worksheet-0", | |
"slideshow": { | |
"slide-type": "-" | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"0.79\n" | |
] | |
} | |
], | |
"source": [ | |
"print(nltk.classify.accuracy(classifier, map_feature(feature_containsword, validation_set)))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"autoscroll": false, | |
"collapsed": false, | |
"ein.tags": "worksheet-0", | |
"slideshow": { | |
"slide-type": "-" | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Most Informative Features\n", | |
" exists(finest) = True pos : neg = 8.9 : 1.0\n", | |
" exists(maintains) = True pos : neg = 7.9 : 1.0\n", | |
" exists(outsider) = True pos : neg = 7.4 : 1.0\n", | |
" exists(symbol) = True pos : neg = 7.4 : 1.0\n", | |
" exists(varying) = True pos : neg = 7.4 : 1.0\n", | |
" exists(mulan) = True pos : neg = 7.4 : 1.0\n", | |
" exists(annual) = True pos : neg = 7.4 : 1.0\n", | |
" exists(ugh) = True neg : pos = 7.3 : 1.0\n", | |
" exists(huh) = True neg : pos = 6.9 : 1.0\n", | |
" exists(weaknesses) = True pos : neg = 6.7 : 1.0\n" | |
] | |
} | |
], | |
"source": [ | |
"classifier.show_most_informative_features(10)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"ein.tags": "worksheet-0", | |
"slideshow": { | |
"slide-type": "-" | |
} | |
}, | |
"source": [ | |
"## Slightly more fancy method" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"ein.tags": "worksheet-0", | |
"slideshow": { | |
"slide-type": "-" | |
} | |
}, | |
"source": [ | |
"Max-margin model: SVM.\n", | |
"\n", | |
"Features: Term frequency of word stems in document, ignoring stop words." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"autoscroll": false, | |
"collapsed": false, | |
"ein.tags": "worksheet-0", | |
"slideshow": { | |
"slide-type": "-" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"porter = nltk.PorterStemmer()\n", | |
"\n", | |
"# Build the stop-word set once: set membership is O(1), and this avoids\n", | |
"# re-evaluating the corpus word list for every single token below.\n", | |
"stopwords_en = set(nltk.corpus.stopwords.words('english'))\n", | |
"\n", | |
"# Note: stemming already normalises case to lowercase.\n", | |
"# BUG FIX: use .most_common() -- plain list(FreqDist) iterates in insertion\n", | |
"# order, so slicing it would not select the highest-frequency stems.\n", | |
"common_words = [w for w, _ in nltk.FreqDist(w\n", | |
"                              for review in train_set\n", | |
"                              for w in map(porter.stem,\n", | |
"                                           filter(lambda x: x not in string.punctuation, review[0]))\n", | |
"                              if w not in stopwords_en).most_common(3000)]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": { | |
"autoscroll": false, | |
"collapsed": false, | |
"ein.tags": "worksheet-0", | |
"slideshow": { | |
"slide-type": "-" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"def feature_tf(text):\n", | |
"    \"\"\"Term-frequency features over the common word stems for one document.\"\"\"\n", | |
"    # Build the stop-word set once per call; the original tested membership\n", | |
"    # against the corpus word *list* for every token (O(n) each time).\n", | |
"    stopwords_en = set(nltk.corpus.stopwords.words('english'))\n", | |
"    freq_dist = nltk.FreqDist(w for w in map(porter.stem,\n", | |
"                                             filter(lambda x: x not in string.punctuation, text))\n", | |
"                              if w not in stopwords_en)\n", | |
"    return {'tf({})'.format(word): freq_dist.freq(word) for word in common_words}" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": { | |
"autoscroll": false, | |
"collapsed": false, | |
"ein.tags": "worksheet-0", | |
"slideshow": { | |
"slide-type": "-" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"import sklearn.svm\n", | |
"\n", | |
"classifier = nltk.classify.scikitlearn.SklearnClassifier(sklearn.svm.LinearSVC()).train(map_feature(feature_tf, train_set))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": { | |
"autoscroll": false, | |
"collapsed": false, | |
"ein.tags": "worksheet-0", | |
"slideshow": { | |
"slide-type": "-" | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"0.735\n" | |
] | |
} | |
], | |
"source": [ | |
"print(nltk.classify.accuracy(classifier, map_feature(feature_tf, validation_set)))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"ein.tags": "worksheet-0", | |
"slideshow": { | |
"slide-type": "-" | |
} | |
}, | |
"source": [ | |
"Comments:\n", | |
"\n", | |
"1. I would have used tf-idf.\n", | |
"2. It is interesting that the extremely naive method gets higher accuracy..." | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"name": "python3" | |
}, | |
"name": "mas.ipynb" | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment