Created
December 31, 2015 15:03
-
-
Save bguzryanto/cd4c22104e3ce6a95a92 to your computer and use it in GitHub Desktop.
Sentiment Analysis
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": false, | |
"slideshow": { | |
"slide_type": "slide" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"import numpy as np\n", | |
"\n", | |
"from sklearn.feature_extraction.text import CountVectorizer\n", | |
"from sklearn.feature_extraction.text import TfidfTransformer\n", | |
"\n", | |
"# algoritma svm\n", | |
"from sklearn import svm\n", | |
"\n", | |
"from sklearn.cross_validation import StratifiedKFold as skf\n", | |
"\n", | |
"from sklearn.metrics import classification_report\n", | |
"from sklearn.metrics import accuracy_score" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"def load_file(limit_class=100000000):\n", | |
" with open('./datasets/31122015_PREPROCESSED_STEMMED_CASEFOLDING_WITHKEYWORD.csv', 'r') as csv_file:\n", | |
" reader = csv_file.read()\n", | |
" reader = reader.split('\\n')\n", | |
" data =[]\n", | |
" target = []\n", | |
" lines = []\n", | |
" for row in reader:\n", | |
" lines.append(row)\n", | |
" \n", | |
" count_positif = 0\n", | |
" count_negatif = 0\n", | |
" class_boundary = limit_class\n", | |
" \n", | |
" for line in lines: \n", | |
" row = line.split('\\t')\n", | |
" if len(row) == 2 and row[1] != 'undefined':\n", | |
" if (row[1] == 'negative' or row[1] == 'negatif') and count_negatif < class_boundary:\n", | |
" data.append(row[0])\n", | |
" target.append(row[1])\n", | |
" count_negatif += 1\n", | |
" \n", | |
" if (row[1] == 'positive' or row[1] == 'positif') and count_positif < class_boundary:\n", | |
" data.append(row[0])\n", | |
" target.append(row[1])\n", | |
" count_positif += 1\n", | |
" \n", | |
" return data,target" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def preprocess(data, target):\n", | |
" count_vectorizer = CountVectorizer(binary=True)\n", | |
" data = count_vectorizer.fit_transform(data)\n", | |
" tfidf_data = TfidfTransformer().fit_transform(data)\n", | |
"\n", | |
" return tfidf_data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"def learn_svm(data, target):\n", | |
" svm_clf = svm.SVC(kernel='linear', decision_function_shape='ovr')\n", | |
" svm_clf.fit(data, target)\n", | |
" \n", | |
" # return SVM Model\n", | |
" return svm_clf\n", | |
"\n", | |
"def learn_model(data,target):\n", | |
" nfolds = 5\n", | |
" skf_data = skf(target, n_folds=nfolds)\n", | |
" \n", | |
" for idxtrain, idxtest in skf_data:\n", | |
" # bagi data train dan data test berdasarkan indeks yang telah dibuat oleh skf\n", | |
" \n", | |
" # train data dan target\n", | |
" train_data = data[idxtrain]\n", | |
" train_target = np.array(target)[idxtrain].tolist()\n", | |
"\n", | |
" # test data dan target\n", | |
" test_data = data[idxtest]\n", | |
" test_target = np.array(target)[idxtest].tolist()\n", | |
"\n", | |
" # pelajari modelnya\n", | |
" svm_clf = learn_svm(train_data, train_target)\n", | |
"\n", | |
" # lakukan prediksi\n", | |
" svm_predicted = svm_clf.predict(test_data)\n", | |
"\n", | |
" # evaluasi model\n", | |
" evaluate_model(test_target, svm_predicted)\n", | |
" \n", | |
" \n", | |
" clf = learn_svm(data, target)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def evaluate_model(target_true,target_predicted):\n", | |
" print classification_report(target_true,target_predicted)\n", | |
" print \"The accuracy score is {:.2%}\".format(accuracy_score(target_true,target_predicted))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"data,target = load_file()\n", | |
"tf_idf = preprocess(data, target)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
" precision recall f1-score support\n", | |
"\n", | |
" negative 0.36 0.19 0.25 77\n", | |
" positive 0.66 0.81 0.73 145\n", | |
"\n", | |
"avg / total 0.55 0.60 0.56 222\n", | |
"\n", | |
"The accuracy score is 59.91%\n", | |
" precision recall f1-score support\n", | |
"\n", | |
" negative 0.50 0.06 0.11 77\n", | |
" positive 0.66 0.97 0.78 144\n", | |
"\n", | |
"avg / total 0.60 0.65 0.55 221\n", | |
"\n", | |
"The accuracy score is 65.16%\n", | |
" precision recall f1-score support\n", | |
"\n", | |
" negative 0.61 0.22 0.33 76\n", | |
" positive 0.69 0.92 0.79 144\n", | |
"\n", | |
"avg / total 0.66 0.68 0.63 220\n", | |
"\n", | |
"The accuracy score is 68.18%\n", | |
" precision recall f1-score support\n", | |
"\n", | |
" negative 0.72 0.28 0.40 76\n", | |
" positive 0.71 0.94 0.81 144\n", | |
"\n", | |
"avg / total 0.72 0.71 0.67 220\n", | |
"\n", | |
"The accuracy score is 71.36%\n", | |
" precision recall f1-score support\n", | |
"\n", | |
" negative 0.82 0.24 0.37 76\n", | |
" positive 0.71 0.97 0.82 144\n", | |
"\n", | |
"avg / total 0.75 0.72 0.66 220\n", | |
"\n", | |
"The accuracy score is 71.82%\n" | |
] | |
} | |
], | |
"source": [ | |
"learn_model(tf_idf,target)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.11" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment