bguzryanto · December 31, 2015 15:03
diff --git a/notebook.ipynb b/notebook.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false,
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.feature_extraction.text import TfidfTransformer\n",
    "\n",
    "# algoritma svm\n",
    "from sklearn import svm\n",
    "\n",
    "from sklearn.cross_validation import StratifiedKFold as skf\n",
    "\n",
    "from sklearn.metrics import classification_report\n",
    "from sklearn.metrics import accuracy_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def load_file(limit_class=100000000,\n",
    "              path='./datasets/31122015_PREPROCESSED_STEMMED_CASEFOLDING_WITHKEYWORD.csv'):\n",
    "    \"\"\"Load labelled documents from a tab-separated text file.\n",
    "\n",
    "    Every usable line holds a text and a label separated by one tab.\n",
    "    Lines that do not split into exactly two fields, and lines whose label\n",
    "    is not one of the accepted spellings, are skipped.\n",
    "\n",
    "    limit_class -- maximum number of documents kept for each class.\n",
    "    path -- file to read; the default is the dataset this notebook was\n",
    "        written for.\n",
    "\n",
    "    Returns (data, target), two parallel lists of texts and labels. The\n",
    "    alternative spellings 'negatif' / 'positif' are stored as 'negative' /\n",
    "    'positive' so that both spellings end up in the same class, matching\n",
    "    the way they share one per-class counter.\n",
    "    \"\"\"\n",
    "    label_names = {\n",
    "        'negative': 'negative', 'negatif': 'negative',\n",
    "        'positive': 'positive', 'positif': 'positive',\n",
    "    }\n",
    "    counts = {'negative': 0, 'positive': 0}\n",
    "    data = []\n",
    "    target = []\n",
    "\n",
    "    with open(path, 'r') as csv_file:\n",
    "        lines = csv_file.read().split('\\n')\n",
    "\n",
    "    for line in lines:\n",
    "        row = line.split('\\t')\n",
    "        if len(row) != 2:\n",
    "            continue\n",
    "\n",
    "        # unknown labels (for example 'undefined') map to None and are dropped\n",
    "        label = label_names.get(row[1])\n",
    "        if label is None or counts[label] >= limit_class:\n",
    "            continue\n",
    "\n",
    "        data.append(row[0])\n",
    "        target.append(label)\n",
    "        counts[label] += 1\n",
    "\n",
    "    return data, target"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def preprocess(data, target):\n",
    "    \"\"\"Turn raw documents into a TF-IDF weighted feature matrix.\n",
    "\n",
    "    data -- documents, passed straight to CountVectorizer.fit_transform.\n",
    "    target -- not used by this function.\n",
    "\n",
    "    Returns the result of TfidfTransformer().fit_transform applied to the\n",
    "    CountVectorizer(binary=True) output for the documents.\n",
    "    \"\"\"\n",
    "    term_presence = CountVectorizer(binary=True).fit_transform(data)\n",
    "    return TfidfTransformer().fit_transform(term_presence)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def learn_svm(data, target):\n",
    "    \"\"\"Fit a linear-kernel SVC on the given samples and labels.\n",
    "\n",
    "    data -- training samples handed to SVC.fit.\n",
    "    target -- training labels handed to SVC.fit.\n",
    "\n",
    "    Returns the fitted classifier.\n",
    "    \"\"\"\n",
    "    classifier = svm.SVC(decision_function_shape='ovr', kernel='linear')\n",
    "    classifier.fit(data, target)\n",
    "    return classifier\n",
    "\n",
    "def learn_model(data,target):\n",
    "    \"\"\"Evaluate a linear SVM with stratified 5-fold cross-validation.\n",
    "\n",
    "    For each fold a model is fitted on the training indices, used to\n",
    "    predict the held-out indices, and scored through evaluate_model.\n",
    "    Afterwards one more model is fitted on all of data / target.\n",
    "\n",
    "    data -- feature matrix; must support indexing with an index array.\n",
    "    target -- class labels, one per row of data.\n",
    "\n",
    "    Returns the classifier fitted on the complete data set.\n",
    "    \"\"\"\n",
    "    nfolds = 5\n",
    "    # converted once so each fold can pick its labels by position\n",
    "    labels = np.array(target)\n",
    "\n",
    "    for idxtrain, idxtest in skf(target, n_folds=nfolds):\n",
    "        # fit on the training part of the fold\n",
    "        svm_clf = learn_svm(data[idxtrain], labels[idxtrain].tolist())\n",
    "\n",
    "        # predict the held-out part and report the scores\n",
    "        svm_predicted = svm_clf.predict(data[idxtest])\n",
    "        evaluate_model(labels[idxtest].tolist(), svm_predicted)\n",
    "\n",
    "    # final model trained on every sample, returned so callers can use it\n",
    "    return learn_svm(data, target)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def evaluate_model(target_true,target_predicted):\n",
    "    \"\"\"Print the classification report and the accuracy of the predictions.\n",
    "\n",
    "    target_true -- reference labels.\n",
    "    target_predicted -- labels produced by the model.\n",
    "    \"\"\"\n",
    "    print(classification_report(target_true, target_predicted))\n",
    "    accuracy = accuracy_score(target_true, target_predicted)\n",
    "    print(\"The accuracy score is {:.2%}\".format(accuracy))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Read the labelled texts, then convert them into the TF-IDF feature matrix.\n",
    "data,target = load_file()\n",
    "tf_idf = preprocess(data, target)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             precision    recall  f1-score   support\n",
      "\n",
      "   negative       0.36      0.19      0.25        77\n",
      "   positive       0.66      0.81      0.73       145\n",
      "\n",
      "avg / total       0.55      0.60      0.56       222\n",
      "\n",
      "The accuracy score is 59.91%\n",
      "             precision    recall  f1-score   support\n",
      "\n",
      "   negative       0.50      0.06      0.11        77\n",
      "   positive       0.66      0.97      0.78       144\n",
      "\n",
      "avg / total       0.60      0.65      0.55       221\n",
      "\n",
      "The accuracy score is 65.16%\n",
      "             precision    recall  f1-score   support\n",
      "\n",
      "   negative       0.61      0.22      0.33        76\n",
      "   positive       0.69      0.92      0.79       144\n",
      "\n",
      "avg / total       0.66      0.68      0.63       220\n",
      "\n",
      "The accuracy score is 68.18%\n",
      "             precision    recall  f1-score   support\n",
      "\n",
      "   negative       0.72      0.28      0.40        76\n",
      "   positive       0.71      0.94      0.81       144\n",
      "\n",
      "avg / total       0.72      0.71      0.67       220\n",
      "\n",
      "The accuracy score is 71.36%\n",
      "             precision    recall  f1-score   support\n",
      "\n",
      "   negative       0.82      0.24      0.37        76\n",
      "   positive       0.71      0.97      0.82       144\n",
      "\n",
      "avg / total       0.75      0.72      0.66       220\n",
      "\n",
      "The accuracy score is 71.82%\n"
     ]
    }
   ],
   "source": [
    "# Run the 5-fold evaluation; a report and an accuracy line are printed per fold.\n",
    "learn_model(tf_idf,target)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
 }
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"collapsed": false,
	"slideshow": {
	"slide_type": "slide"
	}
	},
	"outputs": [],
	"source": [
	"import numpy as np\n",
	"\n",
	"from sklearn.feature_extraction.text import CountVectorizer\n",
	"from sklearn.feature_extraction.text import TfidfTransformer\n",
	"\n",
	"# algoritma svm\n",
	"from sklearn import svm\n",
	"\n",
	"from sklearn.cross_validation import StratifiedKFold as skf\n",
	"\n",
	"from sklearn.metrics import classification_report\n",
	"from sklearn.metrics import accuracy_score"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"def load_file(limit_class=100000000,\n",
	"              path='./datasets/31122015_PREPROCESSED_STEMMED_CASEFOLDING_WITHKEYWORD.csv'):\n",
	"    \"\"\"Load labelled documents from a tab-separated text file.\n",
	"\n",
	"    Every usable line holds a text and a label separated by one tab.\n",
	"    Lines that do not split into exactly two fields, and lines whose label\n",
	"    is not one of the accepted spellings, are skipped.\n",
	"\n",
	"    limit_class -- maximum number of documents kept for each class.\n",
	"    path -- file to read; the default is the dataset this notebook was\n",
	"        written for.\n",
	"\n",
	"    Returns (data, target), two parallel lists of texts and labels. The\n",
	"    alternative spellings 'negatif' / 'positif' are stored as 'negative' /\n",
	"    'positive' so that both spellings end up in the same class, matching\n",
	"    the way they share one per-class counter.\n",
	"    \"\"\"\n",
	"    label_names = {\n",
	"        'negative': 'negative', 'negatif': 'negative',\n",
	"        'positive': 'positive', 'positif': 'positive',\n",
	"    }\n",
	"    counts = {'negative': 0, 'positive': 0}\n",
	"    data = []\n",
	"    target = []\n",
	"\n",
	"    with open(path, 'r') as csv_file:\n",
	"        lines = csv_file.read().split('\\n')\n",
	"\n",
	"    for line in lines:\n",
	"        row = line.split('\\t')\n",
	"        if len(row) != 2:\n",
	"            continue\n",
	"\n",
	"        # unknown labels (for example 'undefined') map to None and are dropped\n",
	"        label = label_names.get(row[1])\n",
	"        if label is None or counts[label] >= limit_class:\n",
	"            continue\n",
	"\n",
	"        data.append(row[0])\n",
	"        target.append(label)\n",
	"        counts[label] += 1\n",
	"\n",
	"    return data, target"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def preprocess(data, target):\n",
	"    \"\"\"Turn raw documents into a TF-IDF weighted feature matrix.\n",
	"\n",
	"    data -- documents, passed straight to CountVectorizer.fit_transform.\n",
	"    target -- not used by this function.\n",
	"\n",
	"    Returns the result of TfidfTransformer().fit_transform applied to the\n",
	"    CountVectorizer(binary=True) output for the documents.\n",
	"    \"\"\"\n",
	"    term_presence = CountVectorizer(binary=True).fit_transform(data)\n",
	"    return TfidfTransformer().fit_transform(term_presence)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"def learn_svm(data, target):\n",
	"    \"\"\"Fit a linear-kernel SVC on the given samples and labels.\n",
	"\n",
	"    data -- training samples handed to SVC.fit.\n",
	"    target -- training labels handed to SVC.fit.\n",
	"\n",
	"    Returns the fitted classifier.\n",
	"    \"\"\"\n",
	"    classifier = svm.SVC(decision_function_shape='ovr', kernel='linear')\n",
	"    classifier.fit(data, target)\n",
	"    return classifier\n",
	"\n",
	"def learn_model(data,target):\n",
	"    \"\"\"Evaluate a linear SVM with stratified 5-fold cross-validation.\n",
	"\n",
	"    For each fold a model is fitted on the training indices, used to\n",
	"    predict the held-out indices, and scored through evaluate_model.\n",
	"    Afterwards one more model is fitted on all of data / target.\n",
	"\n",
	"    data -- feature matrix; must support indexing with an index array.\n",
	"    target -- class labels, one per row of data.\n",
	"\n",
	"    Returns the classifier fitted on the complete data set.\n",
	"    \"\"\"\n",
	"    nfolds = 5\n",
	"    # converted once so each fold can pick its labels by position\n",
	"    labels = np.array(target)\n",
	"\n",
	"    for idxtrain, idxtest in skf(target, n_folds=nfolds):\n",
	"        # fit on the training part of the fold\n",
	"        svm_clf = learn_svm(data[idxtrain], labels[idxtrain].tolist())\n",
	"\n",
	"        # predict the held-out part and report the scores\n",
	"        svm_predicted = svm_clf.predict(data[idxtest])\n",
	"        evaluate_model(labels[idxtest].tolist(), svm_predicted)\n",
	"\n",
	"    # final model trained on every sample, returned so callers can use it\n",
	"    return learn_svm(data, target)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def evaluate_model(target_true,target_predicted):\n",
	"    \"\"\"Print the classification report and the accuracy of the predictions.\n",
	"\n",
	"    target_true -- reference labels.\n",
	"    target_predicted -- labels produced by the model.\n",
	"    \"\"\"\n",
	"    print(classification_report(target_true, target_predicted))\n",
	"    accuracy = accuracy_score(target_true, target_predicted)\n",
	"    print(\"The accuracy score is {:.2%}\".format(accuracy))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Read the labelled texts, then convert them into the TF-IDF feature matrix.\n",
	"data,target = load_file()\n",
	"tf_idf = preprocess(data, target)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	" precision recall f1-score support\n",
	"\n",
	" negative 0.36 0.19 0.25 77\n",
	" positive 0.66 0.81 0.73 145\n",
	"\n",
	"avg / total 0.55 0.60 0.56 222\n",
	"\n",
	"The accuracy score is 59.91%\n",
	" precision recall f1-score support\n",
	"\n",
	" negative 0.50 0.06 0.11 77\n",
	" positive 0.66 0.97 0.78 144\n",
	"\n",
	"avg / total 0.60 0.65 0.55 221\n",
	"\n",
	"The accuracy score is 65.16%\n",
	" precision recall f1-score support\n",
	"\n",
	" negative 0.61 0.22 0.33 76\n",
	" positive 0.69 0.92 0.79 144\n",
	"\n",
	"avg / total 0.66 0.68 0.63 220\n",
	"\n",
	"The accuracy score is 68.18%\n",
	" precision recall f1-score support\n",
	"\n",
	" negative 0.72 0.28 0.40 76\n",
	" positive 0.71 0.94 0.81 144\n",
	"\n",
	"avg / total 0.72 0.71 0.67 220\n",
	"\n",
	"The accuracy score is 71.36%\n",
	" precision recall f1-score support\n",
	"\n",
	" negative 0.82 0.24 0.37 76\n",
	" positive 0.71 0.97 0.82 144\n",
	"\n",
	"avg / total 0.75 0.72 0.66 220\n",
	"\n",
	"The accuracy score is 71.82%\n"
	]
	}
	],
	"source": [
	"# Run the 5-fold evaluation; a report and an accuracy line are printed per fold.\n",
	"learn_model(tf_idf,target)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 2",
	"language": "python",
	"name": "python2"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 2
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython2",
	"version": "2.7.11"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}