Created
February 11, 2017 20:04
-
-
Save jmsword/61b669321a8f17845ce2a1965d8554af to your computer and use it in GitHub Desktop.
Random Forest
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
from sklearn.ensemble import RandomForestClassifier | |
import sklearn.metrics as skm | |
import pylab as pl | |
# Read in the column names for the dataset.
# features.txt is parsed as whitespace-separated text with no header row.
# sep=r'\s+' is the documented replacement for the deprecated
# delim_whitespace=True argument.
feat = pd.read_csv('features.txt', sep=r'\s+', header=None, index_col=False)

# Name the columns so the dataset column names are isolated in 'column 2'
# (assumes the file has exactly two fields per row -- the assignment below
# fails otherwise).
feat.columns = ['column 1', 'column 2']

# Clean up column names.
# The substitutions are applied in this exact order. regex=False makes every
# pattern a literal string: '(' and ')' are regex metacharacters, and the
# default value of `regex` differs between pandas versions.
_NAME_SUBSTITUTIONS = [
    ('-', ''),
    ('(', ''),
    (')', ''),
    (',', ''),
    ('BodyBody', ''),
    ('Body', ''),
    ('Mag', ''),
    ('mean', 'Mean'),
    ('std', 'STD'),
]
for _old, _new in _NAME_SUBSTITUTIONS:
    feat['column 2'] = feat['column 2'].str.replace(_old, _new, regex=False)
# Read in raw data.
# X_train.txt is whitespace-separated; sep=r'\s+' is the documented
# replacement for the deprecated delim_whitespace=True argument.
X_train = pd.read_csv('train/X_train.txt', sep=r'\s+', header=None, index_col=False)
# The 'y_train' and 'subject_train' files are read with the default
# separator and are each given a single column name below.
y_train = pd.read_csv('train/y_train.txt', header=None, index_col=False)
subjects = pd.read_csv('train/subject_train.txt', header=None, index_col=False)

# Isolate clean column names from the 'feat' DataFrame
features = feat['column 2']

# Assign column names to each raw data file
X_train.columns = features
y_train.columns = ['Activity']
subjects.columns = ['Subject']

# Remove duplicate columns from X_train: only the first column carrying each
# name is kept (the name clean-up can map several raw names to the same
# string).
X_train = X_train.loc[:, ~X_train.columns.duplicated()]
# Combine target, features and subject ids into one frame, aligned on the
# row index. Resulting column order: 'Activity', the feature columns,
# 'Subject'.
data = (
    y_train
    .merge(X_train, left_index=True, right_index=True)
    .merge(subjects, left_index=True, right_index=True)
)

# Replace the 'Activity' labels with their integer category codes
data['Activity'] = data['Activity'].astype('category').cat.codes

# Split the rows by subject id:
#   train      -> subjects 27 and above
#   validation -> subjects 21 up to (but not including) 27
#   test       -> subjects 6 and below
train = data[data['Subject'] >= 27]
test = data[data['Subject'] <= 6]
validation = data[(data['Subject'] >= 21) & (data['Subject'] < 27)]
# Fit a random forest on the training split, list its most important
# features, then score it on the validation and test splits.

# Columns that must not be fed to the model: 'Activity' is the target and
# 'Subject' is only used to split the rows. They are dropped by name because
# the previous positional slice (1:-2) also discarded the last real feature,
# and the `.ix` indexer it relied on no longer exists in pandas >= 1.0.
NON_FEATURE_COLUMNS = ['Activity', 'Subject']

# Fit random forest to training set
train_target = train['Activity']
train_data = train.drop(columns=NON_FEATURE_COLUMNS)
rfc = RandomForestClassifier(n_estimators=50, oob_score=True)
rfc.fit(train_data, train_target)
# oob_score_ is the out-of-bag estimate enabled by oob_score=True above
print('Out-of-bag score: ', rfc.oob_score_)

# Most important features, sorted from most to least important
importances = rfc.feature_importances_
indices = np.argsort(importances)[::-1]
print('Top 10 Features:')
# Slicing (rather than range(10)) stays valid with fewer than 10 features.
# The positional index is mapped back to its column name for readability.
for rank, idx in enumerate(indices[:10], start=1):
    print("%d. feature %d: %s (%f)"
          % (rank, idx, train_data.columns[idx], importances[idx]))

# Define validation set and make predictions
val_target = validation['Activity']
val_data = validation.drop(columns=NON_FEATURE_COLUMNS)
val_pred = rfc.predict(val_data)

# Define test set and make predictions
test_target = test['Activity']
test_data = test.drop(columns=NON_FEATURE_COLUMNS)
test_pred = rfc.predict(test_data)

# Print accuracy scores
print("Validation set Mean Accuracy Score: %f" % (rfc.score(val_data, val_target)))
print("Test set Mean Accuracy Score: %f" % (rfc.score(test_data, test_target)))
# Confusion matrix of true vs. predicted activity on the test split,
# displayed as a colour-mapped matrix plot.
test_cm = skm.confusion_matrix(test_target, test_pred)
pl.matshow(test_cm)
pl.title('Test data confusion matrix')
pl.colorbar()
pl.show()

# Print accuracy, precision, recall, and f1 scores for the test set.
# Precision, recall and F1 use average='weighted'; accuracy takes no
# averaging option.
_weighted = {'average': 'weighted'}
_test_report = (
    ("Accuracy = %f", skm.accuracy_score, {}),
    ("Precision = %f", skm.precision_score, _weighted),
    ("Recall = %f", skm.recall_score, _weighted),
    ("F1 score = %f", skm.f1_score, _weighted),
)
for _template, _metric, _options in _test_report:
    print(_template % (_metric(test_target, test_pred, **_options)))
###### Note: I had to work off of another student's project to figure this assignment out fully.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment