glamp · December 3, 2018 04:36 · glamp · Aug 29, 2014 · actsasflinn · Dec 23, 2014
diff --git a/lending_club_find_features.R b/lending_club_find_features.R
 # Exploratory feature screen: overlay the density of every numeric column
 # for each value of is_bad, so variables that separate the two stand out.

 # Fail early with a clear message instead of an obscure melt() error.
 if (!"is_bad" %in% names(df)) {
   stop("df must contain an 'is_bad' column", call. = FALSE)
 }

 # Figure out which columns are numeric (and hence have a distribution we can
 # look at). vapply() always yields a named logical vector, unlike sapply().
 numeric_cols <- vapply(df, is.numeric, logical(1))
 # Always keep the label: it is the id variable for melt() below and would be
 # dropped by the numeric filter if is_bad is stored as a factor or logical.
 numeric_cols["is_bad"] <- TRUE

 # Turn the data into long format (key -> value style). drop = FALSE keeps a
 # data frame even when only one column is selected.
 df.lng <- melt(df[, numeric_cols, drop = FALSE], id.vars = "is_bad")
 head(df.lng)

 # Plot the distribution for each value of is_bad, one panel per variable.
 p <- ggplot(
   df.lng,
   aes(x = value, group = is_bad, colour = factor(is_bad))
 )
 # Quick and dirty way to figure out if you have any good variables;
 # scales = "free" lets every panel use its own axis ranges.
 p + geom_density() +
   facet_wrap(~variable, scales = "free")

 # NOTES:
 # - Be careful of using variables that get created AFTER a loan is issued
 #   (principal/interest related).
 # - Any ID variables that are numeric will be plotted as well. Be sure to
 #   ignore those as well.
	# Exploratory feature screen: overlay the density of every numeric column
	# for each value of is_bad, so variables that separate the two stand out.

	# Fail early with a clear message instead of an obscure melt() error.
	if (!"is_bad" %in% names(df)) {
		stop("df must contain an 'is_bad' column", call. = FALSE)
	}

	# Figure out which columns are numeric (and hence have a distribution we can
	# look at). vapply() always yields a named logical vector, unlike sapply().
	numeric_cols <- vapply(df, is.numeric, logical(1))
	# Always keep the label: it is the id variable for melt() below and would be
	# dropped by the numeric filter if is_bad is stored as a factor or logical.
	numeric_cols["is_bad"] <- TRUE

	# Turn the data into long format (key -> value style). drop = FALSE keeps a
	# data frame even when only one column is selected.
	df.lng <- melt(df[, numeric_cols, drop = FALSE], id.vars = "is_bad")
	head(df.lng)

	# Plot the distribution for each value of is_bad, one panel per variable.
	p <- ggplot(
		df.lng,
		aes(x = value, group = is_bad, colour = factor(is_bad))
	)
	# Quick and dirty way to figure out if you have any good variables;
	# scales = "free" lets every panel use its own axis ranges.
	p + geom_density() +
		facet_wrap(~variable, scales = "free")

	# NOTES:
	# - Be careful of using variables that get created AFTER a loan is issued
	#   (principal/interest related).
	# - Any ID variables that are numeric will be plotted as well. Be sure to
	#   ignore those as well.