Created
July 25, 2013 20:52
-
-
Save aboSamoor/6083650 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "Features-Copy0" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"from sklearn.datasets import fetch_20newsgroups\n", | |
"from sklearn.feature_extraction.text import CountVectorizer\n", | |
"from sklearn.feature_extraction.text import TfidfTransformer\n", | |
"\n", | |
"\n", | |
"categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']\n", | |
"twenty_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)\n", | |
"twenty_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42)\n", | |
"\n", | |
"count_vect = CountVectorizer().fit(twenty_train.data)\n", | |
"X_train_counts = count_vect.transform(twenty_train.data)\n", | |
"Y_test_counts = count_vect.transform(twenty_test.data)\n", | |
"\n", | |
"tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)\n", | |
"X_train_tf = tf_transformer.transform(X_train_counts)\n", | |
"Y_test_tf = tf_transformer.transform(Y_test_counts)\n", | |
"\n", | |
"X_tfidf_ = TfidfTransformer(use_idf=True).fit(X_train_counts)\n", | |
"X_train_tfidf = X_tfidf_.transform(X_train_counts)\n", | |
"Y_test_tfidf = X_tfidf_.transform(Y_test_counts)\n", | |
"\n", | |
"print X_train_counts.shape\n" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"(2257, 18494)\n" | |
] | |
} | |
], | |
"prompt_number": 1 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"from sklearn.linear_model import SGDClassifier\n", | |
"from sklearn import metrics\n", | |
"\n", | |
"reversed_vocab = {v:k for k,v in count_vect.vocabulary_.iteritems()}\n", | |
"\n", | |
"clf = SGDClassifier(loss='log', penalty='l1', alpha=1e-5, n_iter=5)\n", | |
"clf.fit(X_train_tfidf, twenty_train.target)\n", | |
"predicted = clf.predict(Y_test_tfidf)\n", | |
"\n", | |
"\n", | |
"print metrics.classification_report(twenty_test.target, predicted, target_names=twenty_test.target_names)\n", | |
"feature_names = np.asarray(count_vect.get_feature_names())\n", | |
"for i, category in enumerate(twenty_train.target_names):\n", | |
" if len(twenty_train.target_names) == 2 and i == 1:\n", | |
" continue\n", | |
" class_coef = clf.coef_[i]\n", | |
" indices = np.argsort(abs(class_coef))\n", | |
" weights = class_coef[indices]\n", | |
" print \n", | |
" print \"*\" * 40\n", | |
" print category\n", | |
" j = 0 \n", | |
" for k, (weight, index) in enumerate(reversed(zip(weights, indices))):\n", | |
" if k > 30:\n", | |
" break\n", | |
" if weight != 0:\n", | |
" f = feature_names[index]\n", | |
" print f, weight" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
" precision recall f1-score support\n", | |
"\n", | |
" alt.atheism 0.93 0.84 0.88 319\n", | |
" comp.graphics 0.92 0.96 0.94 389\n", | |
" sci.med 0.95 0.92 0.93 396\n", | |
"soc.religion.christian 0.89 0.95 0.92 398\n", | |
"\n", | |
" avg / total 0.92 0.92 0.92 1502\n", | |
"\n", | |
"\n", | |
"****************************************\n", | |
"alt.atheism\n", | |
"atheism 16.7760541382\n", | |
"keith 12.550337349\n", | |
"rutgers -10.9520680884\n", | |
"islamic 10.5068309563\n", | |
"atheists 10.4078644957\n", | |
"rushdie 9.57782636361\n", | |
"morality 9.42468112523\n", | |
"moral 9.21028931502\n", | |
"umd 9.1962134749\n", | |
"church -9.15715192289\n", | |
"mathew 8.83429779925\n", | |
"christ -8.45800782681\n", | |
"thanks -8.42171306628\n", | |
"kmr4 8.22796487656\n", | |
"so 7.86312428944\n", | |
"christians -7.85892815499\n", | |
"wingate 7.62471260906\n", | |
"clh -7.58749581812\n", | |
"matthew 7.50076415621\n", | |
"satan 7.44317351953\n", | |
"may -7.43875109544\n", | |
"psuvm 7.37657804559\n", | |
"evil 7.36177472187\n", | |
"okcforum 7.28853265604\n", | |
"rights 7.10098361628\n", | |
"msg -7.05539802231\n", | |
"mangoe 7.01350500577\n", | |
"lippard 6.98419228055\n", | |
"hiv -6.89484933605\n", | |
"islam 6.8939270716\n", | |
"liar 6.87284221869\n", | |
"\n", | |
"****************************************\n", | |
"comp.graphics\n", | |
"graphics 19.6033446295\n", | |
"tiff 13.2556855151\n", | |
"points 12.3648095777\n", | |
"image 12.03755074\n", | |
"images 11.2724491309\n", | |
"files 10.5224717397\n", | |
"virtual 9.79310808774\n", | |
"sphere 9.74141168702\n", | |
"software 9.43607720443\n", | |
"3d 9.43030449818\n", | |
"video 9.39755744296\n", | |
"3do 9.15621005795\n", | |
"keyboard -8.97735885939\n", | |
"42 8.91233821226\n", | |
"my -8.62256224259\n", | |
"file 8.57653671527\n", | |
"code 8.06541542424\n", | |
"animation 7.87315370552\n", | |
"computer 7.82563892902\n", | |
"of -7.79528550243\n", | |
"windows 7.77685841745\n", | |
"color 7.74937871162\n", | |
"cview 7.74195414079\n", | |
"fractal 7.6256809391\n", | |
"version 7.60649467542\n", | |
"polygon 7.59768039088\n", | |
"god -7.35574337651\n", | |
"card 7.21948844075\n", | |
"people -7.1898247233\n", | |
"pov 7.08769076909\n", | |
"renderman 7.0465853062\n", | |
"\n", | |
"****************************************\n", | |
"sci.med\n", | |
"god" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
" -14.4090759082\n", | |
"graphics -13.3393730028\n", | |
"msg 12.8837182415\n", | |
"doctor 12.521083914\n", | |
"health 11.6132848839\n", | |
"pitt 11.1759190223\n", | |
"treatment 10.9239130307\n", | |
"cancer 10.4735048784\n", | |
"disease 9.96999306506\n", | |
"photography 9.81282821773\n", | |
"medicine 9.32067461869\n", | |
"keyboard 9.01804427801\n", | |
"homeopathy 8.9226642377\n", | |
"medical 8.91824116311\n", | |
"pain 8.58087053796\n", | |
"christian -8.53992157548\n", | |
"christians -8.39324456306\n", | |
"information 7.6581907484\n", | |
"syndrome 7.65628518432\n", | |
"images -7.65426706103\n", | |
"lyme 7.50497315246\n", | |
"med 7.45695710359\n", | |
"jesus -7.43655963838\n", | |
"files -7.41462695908\n", | |
"church -7.3512966304\n", | |
"counselor 7.0693139276\n", | |
"video -7.05398794756\n", | |
"religion -6.96308676999\n", | |
"gordon 6.90038326594\n", | |
"hernia 6.88176522169\n", | |
"diabetes 6.86070243644\n", | |
"\n", | |
"****************************************\n", | |
"soc.religion.christian\n", | |
"church 17.4615613363\n", | |
"rutgers 17.3452667461\n", | |
"christians 16.4093744448\n", | |
"host -15.5884150929\n", | |
"nntp -15.0761008507\n", | |
"clh 14.6944153319\n", | |
"christian 14.2831937151\n", | |
"atheism -13.8817598606\n", | |
"athos 13.283608912\n", | |
"christ 13.0571310916\n", | |
"posting -12.9538003512\n", | |
"heaven 11.0083933286\n", | |
"may 9.56282761613\n", | |
"god 9.51992552803\n", | |
"graphics -9.17919322856\n", | |
"christianity 9.08095740539\n", | |
"bassili 9.01522081702\n", | |
"authority 8.848689572\n", | |
"sin 8.76008305028\n", | |
"easter 8.71452678808\n", | |
"keith -8.63029348337\n", | |
"hell 8.48633449604\n", | |
"arrogance 8.35121496492\n", | |
"scripture 8.08100002546\n", | |
"article -7.99843785333\n", | |
"matthew -7.97344367532\n", | |
"apr 7.93406529413\n", | |
"catholic 7.85597441708\n", | |
"black 7.803324629\n", | |
"jayne 7.48732000941\n", | |
"geneva 7.32659892131\n" | |
] | |
} | |
], | |
"prompt_number": 78 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment