Skip to content

Instantly share code, notes, and snippets.

@bguzryanto
Created December 31, 2015 15:03
Show Gist options
  • Save bguzryanto/cd4c22104e3ce6a95a92 to your computer and use it in GitHub Desktop.
Save bguzryanto/cd4c22104e3ce6a95a92 to your computer and use it in GitHub Desktop.
Sentiment Analysis
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false,
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.feature_extraction.text import TfidfTransformer\n",
"\n",
"# SVM classifier\n",
"from sklearn import svm\n",
"\n",
"from sklearn.cross_validation import StratifiedKFold as skf\n",
"\n",
"from sklearn.metrics import classification_report\n",
"from sklearn.metrics import accuracy_score"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def load_file(limit_class=100000000,\n",
"              path='./datasets/31122015_PREPROCESSED_STEMMED_CASEFOLDING_WITHKEYWORD.csv'):\n",
"    \"\"\"Read the tab-separated corpus and return its texts and labels.\n",
"\n",
"    A usable line has exactly two tab-separated fields: the text and its\n",
"    label. Lines with any other field count are skipped, as are lines whose\n",
"    label is not one of 'negative', 'negatif', 'positive' or 'positif'\n",
"    (this includes 'undefined').\n",
"\n",
"    Args:\n",
"        limit_class: Maximum number of rows kept per polarity; further rows\n",
"            of that polarity are dropped.\n",
"        path: Location of the dataset file.\n",
"\n",
"    Returns:\n",
"        A ``(data, target)`` pair of equally long lists in file order, where\n",
"        ``data`` holds the texts and ``target`` the matching labels.\n",
"    \"\"\"\n",
"    negative_labels = ('negative', 'negatif')\n",
"    positive_labels = ('positive', 'positif')\n",
"    data = []\n",
"    target = []\n",
"    count_negatif = 0\n",
"    count_positif = 0\n",
"\n",
"    with open(path, 'r') as csv_file:\n",
"        # Stream the file instead of reading, splitting and copying it whole.\n",
"        for line in csv_file:\n",
"            # Drop the line terminator; stripping '\\r' too keeps files with\n",
"            # Windows line endings from failing every label comparison.\n",
"            row = line.rstrip('\\r\\n').split('\\t')\n",
"            if len(row) != 2:\n",
"                continue\n",
"            text, label = row\n",
"\n",
"            # NOTE(review): both spellings of a polarity share one quota but\n",
"            # are stored verbatim, so a file mixing them would produce four\n",
"            # distinct target strings - confirm whether to normalise.\n",
"            if label in negative_labels and count_negatif < limit_class:\n",
"                data.append(text)\n",
"                target.append(label)\n",
"                count_negatif += 1\n",
"            elif label in positive_labels and count_positif < limit_class:\n",
"                data.append(text)\n",
"                target.append(label)\n",
"                count_positif += 1\n",
"\n",
"    return data, target"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def preprocess(data, target):\n",
"    \"\"\"Vectorise raw texts into a TF-IDF weighted document-term matrix.\n",
"\n",
"    Args:\n",
"        data: Iterable of text documents.\n",
"        target: Accepted for signature symmetry; not used here.\n",
"\n",
"    Returns:\n",
"        The matrix produced by ``TfidfTransformer`` from binary term counts.\n",
"    \"\"\"\n",
"    # binary=True records term presence (0/1) rather than raw frequency.\n",
"    presence_matrix = CountVectorizer(binary=True).fit_transform(data)\n",
"    return TfidfTransformer().fit_transform(presence_matrix)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def learn_svm(data, target):\n",
"    \"\"\"Fit a linear-kernel SVC on the given samples and return it.\n",
"\n",
"    Args:\n",
"        data: Feature matrix, one row per sample.\n",
"        target: Label for each row of ``data``.\n",
"\n",
"    Returns:\n",
"        The fitted ``svm.SVC`` instance.\n",
"    \"\"\"\n",
"    classifier = svm.SVC(decision_function_shape='ovr', kernel='linear')\n",
"    # fit() hands back the estimator itself, so it can be returned directly.\n",
"    return classifier.fit(data, target)\n",
"\n",
"def learn_model(data, target, nfolds=5):\n",
"    \"\"\"Cross-validate a linear SVM, then fit a final model on all samples.\n",
"\n",
"    For every stratified fold a fresh classifier is trained on the training\n",
"    indices, used to predict the held-out indices, and passed to\n",
"    ``evaluate_model``, which prints the report for that fold.\n",
"\n",
"    Args:\n",
"        data: Feature matrix that supports row selection by an index array.\n",
"        target: Sequence of labels, one per row of ``data``.\n",
"        nfolds: Number of stratified folds to evaluate.\n",
"\n",
"    Returns:\n",
"        The classifier fitted on the complete ``data``/``target``.\n",
"    \"\"\"\n",
"    # Convert once so each fold can select its labels by index array.\n",
"    labels = np.array(target)\n",
"\n",
"    # skf is the StratifiedKFold imported from sklearn.cross_validation; it is\n",
"    # built from the labels and iterated directly for (train, test) indices.\n",
"    for idxtrain, idxtest in skf(target, n_folds=nfolds):\n",
"        fold_clf = learn_svm(data[idxtrain], labels[idxtrain])\n",
"        fold_predicted = fold_clf.predict(data[idxtest])\n",
"        evaluate_model(labels[idxtest], fold_predicted)\n",
"\n",
"    # The model trained on every sample used to be built and then discarded;\n",
"    # hand it back so callers can actually use it.\n",
"    return learn_svm(data, target)\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def evaluate_model(target_true, target_predicted):\n",
"    \"\"\"Print the classification report and accuracy for one set of predictions.\n",
"\n",
"    Args:\n",
"        target_true: Ground-truth labels.\n",
"        target_predicted: Labels predicted by the classifier, in the same order.\n",
"    \"\"\"\n",
"    # Single-argument print(...) behaves the same on Python 2 and Python 3,\n",
"    # unlike the print statement, which is a syntax error on Python 3.\n",
"    print(classification_report(target_true, target_predicted))\n",
"    accuracy = accuracy_score(target_true, target_predicted)\n",
"    print(\"The accuracy score is {:.2%}\".format(accuracy))"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Load the labelled texts, then turn them into TF-IDF features.\n",
"data, target = load_file()\n",
"tf_idf = preprocess(data, target)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" negative 0.36 0.19 0.25 77\n",
" positive 0.66 0.81 0.73 145\n",
"\n",
"avg / total 0.55 0.60 0.56 222\n",
"\n",
"The accuracy score is 59.91%\n",
" precision recall f1-score support\n",
"\n",
" negative 0.50 0.06 0.11 77\n",
" positive 0.66 0.97 0.78 144\n",
"\n",
"avg / total 0.60 0.65 0.55 221\n",
"\n",
"The accuracy score is 65.16%\n",
" precision recall f1-score support\n",
"\n",
" negative 0.61 0.22 0.33 76\n",
" positive 0.69 0.92 0.79 144\n",
"\n",
"avg / total 0.66 0.68 0.63 220\n",
"\n",
"The accuracy score is 68.18%\n",
" precision recall f1-score support\n",
"\n",
" negative 0.72 0.28 0.40 76\n",
" positive 0.71 0.94 0.81 144\n",
"\n",
"avg / total 0.72 0.71 0.67 220\n",
"\n",
"The accuracy score is 71.36%\n",
" precision recall f1-score support\n",
"\n",
" negative 0.82 0.24 0.37 76\n",
" positive 0.71 0.97 0.82 144\n",
"\n",
"avg / total 0.75 0.72 0.66 220\n",
"\n",
"The accuracy score is 71.82%\n"
]
}
],
"source": [
"# Cross-validate the SVM on the TF-IDF features; a report is printed per fold.\n",
"learn_model(tf_idf, target)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment