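-----
Assumed setup
-----
These snippets come from a preloaded exercise environment; the DataFrames (volunteer, wine, hiking, running_times_5k, ufo) and some helper objects are assumed to exist already. A minimal sketch of the imports they rely on:
# Core libraries assumed throughout the snippets below
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA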
-----
Missing data - rows
-----
# Check how many values are missing in the category_desc column
print(volunteer["category_desc"].isnull().sum())
# Subset the volunteer dataset
volunteer_subset = volunteer[volunteer["category_desc"].notnull()]
# Print out the shape of the subset
print(volunteer_subset.shape)
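An equivalent subset can be taken with dropna (a small alternative, assuming the same volunteer DataFrame):
# Drop rows where category_desc is missing
volunteer_subset = volunteer.dropna(subset=["category_desc"])
print(volunteer_subset.shape)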
-----
Converting a column type
-----
# Print the head of the hits column
print(volunteer["hits"].head())
# Convert the hits column to type int
volunteer["hits"] = volunteer["hits"].astype("int")
# Look at the dtypes of the dataset
print(volunteer.dtypes)
-----
Stratified sampling
-----
# Create a DataFrame with all columns except category_desc
volunteer_X = volunteer.drop("category_desc", axis=1)
# Create a category_desc labels dataset
volunteer_y = volunteer[["category_desc"]]
# Use stratified sampling to split up the dataset according to the volunteer_y dataset
X_train, X_test, y_train, y_test = train_test_split(volunteer_X, volunteer_y, stratify=volunteer_y)
# Print out the category_desc counts on the training y labels
print(y_train["category_desc"].value_counts())
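To confirm the stratification preserved the class balance, the train and test label proportions can be compared (a quick check, not part of the original exercise):
# Compare class proportions across the split
print(y_train["category_desc"].value_counts(normalize=True))
print(y_test["category_desc"].value_counts(normalize=True))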
-----
Log Normalization
-----
# Print out the variance of the Proline column
print(wine["Proline"].var())
# Apply the log normalization function to the Proline column
wine["Proline_log"] = np.log(wine["Proline"])
# Check the variance of the normalized Proline column
print(wine["Proline_log"].var())
-----
Scaling data - standardizing columns
-----
# Import StandardScaler from scikit-learn
from sklearn.preprocessing import StandardScaler
# Create the scaler
ss = StandardScaler()
# Take a subset of the DataFrame you want to scale
wine_subset = wine[["Ash", "Alcalinity of ash", "Magnesium"]]
# Apply the scaler to the DataFrame subset
wine_subset_scaled = ss.fit_transform(wine_subset)
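fit_transform returns a NumPy array, so the column names are lost; wrapping the result back into a DataFrame keeps them (a convenience sketch using the same wine_subset):
# Rebuild a labeled DataFrame from the scaled array
wine_scaled_df = pd.DataFrame(wine_subset_scaled, columns=wine_subset.columns, index=wine_subset.index)
print(wine_scaled_df.head())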
-----
LabelEncoder
-----
# Set up the LabelEncoder object
enc = LabelEncoder()
# Apply the encoding to the "Accessible" column
hiking["Accessible_enc"] = enc.fit_transform(hiking["Accessible"])
# Compare the two columns
print(hiking[["Accessible_enc", "Accessible"]].head())
-----
Encoding categorical variables - one-hot
-----
# Transform the category_desc column
category_enc = pd.get_dummies(volunteer["category_desc"])
# Take a look at the encoded columns
print(category_enc.head())
-----
Engineering numerical features - taking an average
-----
# Create a list of the columns to average
run_columns = ["run1", "run2", "run3", "run4", "run5"]
# Use apply to create a mean column
running_times_5k["mean"] = running_times_5k.apply(lambda row: row[run_columns].mean(), axis=1)
# Take a look at the results
print(running_times_5k)
-----
Engineering numerical features - datetime
-----
# First, convert string column to date column
volunteer["start_date_converted"] = pd.to_datetime(volunteer["start_date_date"])
# Extract just the month from the converted column
volunteer["start_date_month"] = volunteer["start_date_converted"].apply(lambda row: row.month)
# Take a look at the converted and new month columns
print(volunteer[["start_date_converted", "start_date_month"]].head())
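The same month extraction can be done without apply via the .dt accessor (an equivalent, vectorized alternative):
# Vectorized alternative to the apply call above
volunteer["start_date_month"] = volunteer["start_date_converted"].dt.month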
-----
Engineering features from strings - extraction
-----
# Write a pattern to extract numbers and decimals
def return_mileage(length):
    pattern = re.compile(r"\d+\.\d+")
    # Search the text for a match
    mile = pattern.search(length)
    # If a value is returned, use group(0) to return the found value
    if mile is not None:
        return float(mile.group(0))
# Apply the function to the Length column and take a look at both columns
hiking["Length_num"] = hiking["Length"].apply(return_mileage)
print(hiking[["Length", "Length_num"]].head())
-----
Engineering features from strings - tf/idf
-----
# Take the title text
title_text = volunteer["title"]
# Create the vectorizer method
tfidf_vec = TfidfVectorizer()
# Transform the text into tf-idf vectors
text_tfidf = tfidf_vec.fit_transform(title_text)
-----
Text classification using tf/idf vectors
-----
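The classifier nb is assumed to be created elsewhere; a minimal sketch, assuming a Gaussian Naive Bayes model (which is why the sparse tf-idf matrix is densified with toarray() below):
# Assumed model setup for the snippets that use nb
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()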
# Split the dataset according to the class distribution of category_desc
y = volunteer["category_desc"]
X_train, X_test, y_train, y_test = train_test_split(text_tfidf.toarray(), y, stratify=y)
# Fit the model to the training data
nb.fit(X_train, y_train)
# Print out the model's accuracy
print(nb.score(X_test, y_test))
-----
Exploring text vectors, part 1
-----
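The vocab lookup used below maps tf-idf column indices back to words; it is assumed to exist, and a plausible construction from the fitted vectorizer is:
# Reverse the vectorizer's word -> index vocabulary
vocab = {v: k for k, v in tfidf_vec.vocabulary_.items()}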
# Add in the rest of the parameters
def return_weights(vocab, original_vocab, vector, vector_index, top_n):
    zipped = dict(zip(vector[vector_index].indices, vector[vector_index].data))
    # Let's transform that zipped dict into a series
    zipped_series = pd.Series({vocab[i]: zipped[i] for i in vector[vector_index].indices})
    # Let's sort the series to pull out the top n weighted words
    zipped_index = zipped_series.sort_values(ascending=False)[:top_n].index
    return [original_vocab[i] for i in zipped_index]
# Print out the weighted words
print(return_weights(vocab, tfidf_vec.vocabulary_, text_tfidf, 8, 3))
-----
Exploring text vectors, part 2
-----
def words_to_filter(vocab, original_vocab, vector, top_n):
    filter_list = []
    for i in range(0, vector.shape[0]):
        # Call the function from the previous exercise and extend the list we're creating
        filtered = return_weights(vocab, original_vocab, vector, i, top_n)
        filter_list.extend(filtered)
    # Return the list as a set, so we don't get duplicate word indices
    return set(filter_list)
# Call the function to get the list of word indices
filtered_words = words_to_filter(vocab, tfidf_vec.vocabulary_, text_tfidf, 3)
# By converting filtered_words back to a list, we can use it to filter the columns in the text vector
filtered_text = text_tfidf[:, list(filtered_words)]
-----
# Split the dataset according to the class distribution of category_desc
train_X, test_X, train_y, test_y = train_test_split(filtered_text.toarray(), y, stratify=y)
# Fit the model to the training data
nb.fit(train_X, train_y)
# Print out the model's accuracy
print(nb.score(test_X, test_y))
-----
Using PCA
-----
from sklearn.decomposition import PCA
# Set up PCA and the X vector for dimensionality reduction
pca = PCA()
wine_X = wine.drop("Type", axis=1)
# Apply PCA to the wine dataset
transformed_X = pca.fit_transform(wine_X)
# Look at the percentage of variance explained by the different components
print(pca.explained_variance_ratio_)
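To decide how many components to keep, the cumulative explained variance is often easier to read (a small addition using the same fitted pca):
# Cumulative share of variance captured by the first k components
print(np.cumsum(pca.explained_variance_ratio_))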
-----
UFO PROJECT
-----
Checking column types
-----
# Check the column types
print(ufo.dtypes)
# Change the type of seconds to float
ufo["seconds"] = ufo["seconds"].astype(float)
# Change the date column to type datetime
ufo["date"] = pd.to_datetime(ufo["date"])
# Check the column types
print(ufo[["seconds", "date"]].dtypes)
-----
Dropping missing data
-----
# Check how many values are missing in the length_of_time, state, and type columns
print(ufo[["length_of_time", "state", "type"]].isnull().sum())
# Keep only rows where length_of_time, state, and type are not null
ufo_no_missing = ufo[ufo["length_of_time"].notnull() &
                     ufo["state"].notnull() &
                     ufo["type"].notnull()]
# Print out the shape of the new dataset
print(ufo_no_missing.shape)
-----
Extracting numbers from strings
-----
def return_minutes(time_string):
    # Use \d+ to grab the digits in the string
    pattern = re.compile(r"\d+")
    # Search the string for the pattern
    num = pattern.search(time_string)
    if num is not None:
        return int(num.group(0))
# Apply the extraction to the length_of_time column
ufo["minutes"] = ufo["length_of_time"].apply(return_minutes)
# Take a look at the head of both of the columns
print(ufo[["length_of_time", "minutes"]].head())
-----
Identifying features for standardization
-----
# Check the variance of the seconds and minutes columns
print(ufo[["seconds", "minutes"]].var())
# Log normalize the seconds column
ufo["seconds_log"] = np.log(ufo["seconds"])
# Print out the variance of just the seconds_log column
print(ufo["seconds_log"].var())
-----
Encoding categorical variables
-----
# Use pandas to encode "us" country values as 1 and all others as 0
ufo["country_enc"] = ufo["country"].apply(lambda val: 1 if val == "us" else 0)
# Print the number of unique type values
print(len(ufo["type"].unique()))
# Create a one-hot encoded set of the type values
type_set = pd.get_dummies(ufo["type"])
# Concatenate this set back to the ufo DataFrame
ufo = pd.concat([ufo, type_set], axis=1)
-----
Features from dates
-----
# Look at the first 5 rows of the date column
print(ufo["date"].head())
# Extract the month from the date column
ufo["month"] = ufo["date"].apply(lambda row: row.month)
# Extract the year from the date column
ufo["year"] = ufo["date"].apply(lambda row: row.year)
# Take a look at the head of all three columns
print(ufo[["date", "month", "year"]].head())
-----
Text vectorization
-----
# Take a look at the head of the desc field
print(ufo["desc"].head())
# Create the tfidf vectorizer object
vec = TfidfVectorizer()
# Use vec's fit_transform method on the desc field
desc_tfidf = vec.fit_transform(ufo["desc"])
# Look at the number of columns this creates
print(desc_tfidf.shape)
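The filtering step further below reuses words_to_filter, which also needs an index-to-word lookup for this vectorizer; an assumed construction:
# Reverse vec's word -> index vocabulary for use with words_to_filter
vocab = {v: k for k, v in vec.vocabulary_.items()}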
-----
Selecting the ideal dataset
-----
# Check the correlation between the seconds, seconds_log, and minutes columns
print(ufo[["seconds", "seconds_log", "minutes"]].corr())
# Make a list of features to drop
to_drop = ["city", "country", "date", "desc", "lat", "length_of_time", "long", "minutes", "recorded", "seconds", "state"]
# Drop those features
ufo_dropped = ufo.drop(to_drop, axis=1)
# Let's also filter some words out of the text vector we created
filtered_words = words_to_filter(vocab, vec.vocabulary_, desc_tfidf, 4)
-----
Modeling the UFO dataset, part 1
-----
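X, y, and knn are assumed to be prepared already; a plausible setup, treating the encoded country as the label (an assumption based on the preceding steps, not shown in the original):
# Assumed feature/label split and model for this section
from sklearn.neighbors import KNeighborsClassifier
y = ufo_dropped["country_enc"]
X = ufo_dropped.drop(["country_enc", "type"], axis=1, errors="ignore")  # drop the label and any raw type column
knn = KNeighborsClassifier()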
# Take a look at the features in the X set of data
print(X.columns)
# Split the X and y sets using train_test_split, setting stratify=y
train_X, test_X, train_y, test_y = train_test_split(X, y, stratify=y)
# Fit knn to the training sets
knn.fit(train_X, train_y)
# Print the score of knn on the test sets
print(knn.score(test_X, test_y))
-----
Modeling the UFO dataset, part 2
-----
# Use the list of filtered words we created to filter the text vector
filtered_text = desc_tfidf[:, list(filtered_words)]
# Split the X and y sets using train_test_split, setting stratify=y
train_X, test_X, train_y, test_y = train_test_split(filtered_text.toarray(), y, stratify=y)
# Fit nb to the training sets
nb.fit(train_X, train_y)
# Print the score of nb on the test sets
print(nb.score(test_X, test_y))