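-----
Assumed setup
-----
These snippets come from a preloaded exercise environment; the DataFrames (volunteer, wine, hiking, running_times_5k, ufo) and some helper objects are assumed to exist already. A minimal sketch of the imports they rely on:
# Core libraries assumed throughout the snippets below
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA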
-----
Missing data - rows
-----
# Check how many values are missing in the category_desc column
print(volunteer["category_desc"].isnull().sum())
# Subset the volunteer dataset
volunteer_subset = volunteer[volunteer["category_desc"].notnull()]
# Print out the shape of the subset
print(volunteer_subset.shape)
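An equivalent subset can be taken with dropna (a small alternative, assuming the same volunteer DataFrame):
# Drop rows where category_desc is missing
volunteer_subset = volunteer.dropna(subset=["category_desc"])
print(volunteer_subset.shape)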
-----
Converting a column type
-----
# Print the head of the hits column
print(volunteer["hits"].head())
# Convert the hits column to type int
volunteer["hits"] = volunteer["hits"].astype("int")
# Look at the dtypes of the dataset
print(volunteer.dtypes)
-----
Stratified sampling
-----
# Create a DataFrame with all columns except category_desc
volunteer_X = volunteer.drop("category_desc", axis=1)
# Create a category_desc labels dataset
volunteer_y = volunteer[["category_desc"]]
# Use stratified sampling to split up the dataset according to the volunteer_y dataset
X_train, X_test, y_train, y_test = train_test_split(volunteer_X, volunteer_y, stratify=volunteer_y)
# Print out the category_desc counts on the training y labels
print(y_train["category_desc"].value_counts())
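To confirm the stratification preserved the class balance, the train and test label proportions can be compared (a quick check, not part of the original exercise):
# Compare class proportions across the split
print(y_train["category_desc"].value_counts(normalize=True))
print(y_test["category_desc"].value_counts(normalize=True))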
-----
Log Normalization
-----
# Print out the variance of the Proline column
print(wine["Proline"].var())
# Apply the log normalization function to the Proline column
wine["Proline_log"] = np.log(wine["Proline"])
# Check the variance of the normalized Proline column
print(wine["Proline_log"].var())
-----
Scaling data - standardizing columns
-----
# Import StandardScaler from scikit-learn
from sklearn.preprocessing import StandardScaler
# Create the scaler
ss = StandardScaler()
# Take a subset of the DataFrame you want to scale
wine_subset = wine[["Ash", "Alcalinity of ash", "Magnesium"]]
# Apply the scaler to the DataFrame subset
wine_subset_scaled = ss.fit_transform(wine_subset)
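fit_transform returns a NumPy array, so the column names are lost; wrapping the result back into a DataFrame keeps them (a convenience sketch using the same wine_subset):
# Rebuild a labeled DataFrame from the scaled array
wine_scaled_df = pd.DataFrame(wine_subset_scaled, columns=wine_subset.columns, index=wine_subset.index)
print(wine_scaled_df.head())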
-----
LabelEncoder
-----
# Set up the LabelEncoder object
enc = LabelEncoder()
# Apply the encoding to the "Accessible" column
hiking["Accessible_enc"] = enc.fit_transform(hiking["Accessible"])
# Compare the two columns
print(hiking[["Accessible_enc", "Accessible"]].head())
-----
Encoding categorical variables - one-hot
-----
# Transform the category_desc column
category_enc = pd.get_dummies(volunteer["category_desc"])
# Take a look at the encoded columns
print(category_enc.head())
-----
Engineering numerical features - taking an average
-----
# Create a list of the columns to average
run_columns = ["run1", "run2", "run3", "run4", "run5"]
# Use apply to create a mean column
running_times_5k["mean"] = running_times_5k.apply(lambda row: row[run_columns].mean(), axis=1)
# Take a look at the results
print(running_times_5k)
-----
Engineering numerical features - datetime
-----
# First, convert string column to date column
volunteer["start_date_converted"] = pd.to_datetime(volunteer["start_date_date"])
# Extract just the month from the converted column
volunteer["start_date_month"] = volunteer["start_date_converted"].apply(lambda row: row.month)
# Take a look at the converted and new month columns
print(volunteer[["start_date_converted", "start_date_month"]].head())
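The same month extraction can be done without apply via the .dt accessor (an equivalent, vectorized alternative):
# Vectorized alternative to the apply call above
volunteer["start_date_month"] = volunteer["start_date_converted"].dt.month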
-----
Engineering features from strings - extraction
-----
# Write a pattern to extract numbers and decimals
def return_mileage(length):
    pattern = re.compile(r"\d+\.\d+")
    # Search the text for a match
    mile = pattern.search(length)
    # If a value is returned, use group(0) to return the found value
    if mile is not None:
        return float(mile.group(0))
# Apply the function to the Length column and take a look at both columns
hiking["Length_num"] = hiking["Length"].apply(return_mileage)
print(hiking[["Length", "Length_num"]].head())
-----
Engineering features from strings - tf/idf
-----
# Take the title text
title_text = volunteer["title"]
# Create the vectorizer method
tfidf_vec = TfidfVectorizer()
# Transform the text into tf-idf vectors
text_tfidf = tfidf_vec.fit_transform(title_text)
-----
Text classification using tf/idf vectors
-----
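The classifier nb is assumed to be created elsewhere; a minimal sketch, assuming a Gaussian Naive Bayes model (which is why the sparse tf-idf matrix is densified with toarray() below):
# Assumed model setup for the snippets that use nb
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()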
# Split the dataset according to the class distribution of category_desc
y = volunteer["category_desc"]
X_train, X_test, y_train, y_test = train_test_split(text_tfidf.toarray(), y, stratify=y)
# Fit the model to the training data
nb.fit(X_train, y_train)
# Print out the model's accuracy
print(nb.score(X_test, y_test))
-----
Exploring text vectors, part 1
-----
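The vocab lookup used below maps tf-idf column indices back to words; it is assumed to exist, and a plausible construction from the fitted vectorizer is:
# Reverse the vectorizer's word -> index vocabulary
vocab = {v: k for k, v in tfidf_vec.vocabulary_.items()}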
# Add in the rest of the parameters
def return_weights(vocab, original_vocab, vector, vector_index, top_n):
    zipped = dict(zip(vector[vector_index].indices, vector[vector_index].data))
    # Let's transform that zipped dict into a series
    zipped_series = pd.Series({vocab[i]: zipped[i] for i in vector[vector_index].indices})
    # Let's sort the series to pull out the top n weighted words
    zipped_index = zipped_series.sort_values(ascending=False)[:top_n].index
    return [original_vocab[i] for i in zipped_index]
# Print out the weighted words
print(return_weights(vocab, tfidf_vec.vocabulary_, text_tfidf, 8, 3))
-----
Exploring text vectors, part 2
-----
def words_to_filter(vocab, original_vocab, vector, top_n):
    filter_list = []
    for i in range(0, vector.shape[0]):
        # Call the function from the previous exercise and extend the list we're creating
        filtered = return_weights(vocab, original_vocab, vector, i, top_n)
        filter_list.extend(filtered)
    # Return the list as a set, so we don't get duplicate word indices
    return set(filter_list)
# Call the function to get the list of word indices
filtered_words = words_to_filter(vocab, tfidf_vec.vocabulary_, text_tfidf, 3)
# By converting filtered_words back to a list, we can use it to filter the columns in the text vector
filtered_text = text_tfidf[:, list(filtered_words)]
-----
# Split the dataset according to the class distribution of category_desc
train_X, test_X, train_y, test_y = train_test_split(filtered_text.toarray(), y, stratify=y)
# Fit the model to the training data
nb.fit(train_X, train_y)
# Print out the model's accuracy
print(nb.score(test_X, test_y))
-----
Using PCA
-----
from sklearn.decomposition import PCA
# Set up PCA and the X vector for dimensionality reduction
pca = PCA()
wine_X = wine.drop("Type", axis=1)
# Apply PCA to the wine dataset
transformed_X = pca.fit_transform(wine_X)
# Look at the percentage of variance explained by the different components
print(pca.explained_variance_ratio_)
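To decide how many components to keep, the cumulative explained variance is often easier to read (a small addition using the same fitted pca):
# Cumulative share of variance captured by the first k components
print(np.cumsum(pca.explained_variance_ratio_))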
-----
UFO PROJECT
-----
Checking column types
-----
# Check the column types
print(ufo.dtypes)
# Change the type of seconds to float
ufo["seconds"] = ufo["seconds"].astype(float)
# Change the date column to type datetime
ufo["date"] = pd.to_datetime(ufo["date"])
# Check the column types
print(ufo[["seconds", "date"]].dtypes)
-----
Dropping missing data
-----
# Check how many values are missing in the length_of_time, state, and type columns
print(ufo[["length_of_time", "state", "type"]].isnull().sum())
# Keep only rows where length_of_time, state, and type are not null
ufo_no_missing = ufo[ufo["length_of_time"].notnull() &
                     ufo["state"].notnull() &
                     ufo["type"].notnull()]
# Print out the shape of the new dataset
print(ufo_no_missing.shape)
-----
Extracting numbers from strings
-----
def return_minutes(time_string):
    # Use \d+ to grab the digits in the string
    pattern = re.compile(r"\d+")
    # Search the string for the pattern
    num = pattern.search(time_string)
    if num is not None:
        return int(num.group(0))
# Apply the extraction to the length_of_time column
ufo["minutes"] = ufo["length_of_time"].apply(return_minutes)
# Take a look at the head of both of the columns
print(ufo[["length_of_time", "minutes"]].head())
-----
Identifying features for standardization
-----
# Check the variance of the seconds and minutes columns
print(ufo[["seconds", "minutes"]].var())
# Log normalize the seconds column
ufo["seconds_log"] = np.log(ufo["seconds"])
# Print out the variance of just the seconds_log column
print(ufo["seconds_log"].var())
-----
Encoding categorical variables
-----
# Use pandas to encode "us" country values as 1 and all others as 0
ufo["country_enc"] = ufo["country"].apply(lambda val: 1 if val == "us" else 0)
# Print the number of unique type values
print(len(ufo["type"].unique()))
# Create a one-hot encoded set of the type values
type_set = pd.get_dummies(ufo["type"])
# Concatenate this set back to the ufo DataFrame
ufo = pd.concat([ufo, type_set], axis=1)
-----
Features from dates
-----
# Look at the first 5 rows of the date column
print(ufo["date"].head())
# Extract the month from the date column
ufo["month"] = ufo["date"].apply(lambda row: row.month)
# Extract the year from the date column
ufo["year"] = ufo["date"].apply(lambda row: row.year)
# Take a look at the head of all three columns
print(ufo[["date", "month", "year"]].head())
-----
Text vectorization
-----
# Take a look at the head of the desc field
print(ufo["desc"].head())
# Create the tfidf vectorizer object
vec = TfidfVectorizer()
# Use vec's fit_transform method on the desc field
desc_tfidf = vec.fit_transform(ufo["desc"])
# Look at the number of columns this creates
print(desc_tfidf.shape)
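The filtering step further below reuses words_to_filter, which also needs an index-to-word lookup for this vectorizer; an assumed construction:
# Reverse vec's word -> index vocabulary for use with words_to_filter
vocab = {v: k for k, v in vec.vocabulary_.items()}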
-----
Selecting the ideal dataset
-----
# Check the correlation between the seconds, seconds_log, and minutes columns
print(ufo[["seconds", "seconds_log", "minutes"]].corr())
# Make a list of features to drop
to_drop = ["city", "country", "date", "desc", "lat", "length_of_time", "long", "minutes", "recorded", "seconds", "state"]
# Drop those features
ufo_dropped = ufo.drop(to_drop, axis=1)
# Let's also filter some words out of the text vector we created
filtered_words = words_to_filter(vocab, vec.vocabulary_, desc_tfidf, 4)
-----
Modeling the UFO dataset, part 1
-----
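X, y, and knn are assumed to be prepared already; a plausible setup, treating the encoded country as the label (an assumption based on the preceding steps, not shown in the original):
# Assumed feature/label split and model for this section
from sklearn.neighbors import KNeighborsClassifier
y = ufo_dropped["country_enc"]
X = ufo_dropped.drop(["country_enc", "type"], axis=1, errors="ignore")  # drop the label and any raw type column
knn = KNeighborsClassifier()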
# Take a look at the features in the X set of data
print(X.columns)
# Split the X and y sets using train_test_split, setting stratify=y
train_X, test_X, train_y, test_y = train_test_split(X, y, stratify=y)
# Fit knn to the training sets
knn.fit(train_X, train_y)
# Print the score of knn on the test sets
print(knn.score(test_X, test_y))
-----
Modeling the UFO dataset, part 2
-----
# Use the list of filtered words we created to filter the text vector
filtered_text = desc_tfidf[:, list(filtered_words)]
# Split the X and y sets using train_test_split, setting stratify=y
train_X, test_X, train_y, test_y = train_test_split(filtered_text.toarray(), y, stratify=y)
# Fit nb to the training sets
nb.fit(train_X, train_y)
# Print the score of nb on the test sets
print(nb.score(test_X, test_y))