Last active
August 8, 2020 23:35
-
-
Save adraguidev/a4c99c6fec90705d7541f196acdd4711 to your computer and use it in GitHub Desktop.
SKLearn
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.tree import DecisionTreeClassifier

# Shallow decision tree (depth 3) with a fixed seed for reproducible fits.
# Fix: the estimator used to be bound to `Tree` while every later statement
# referenced `tree`, which raised NameError.
# NOTE(review): X_train, y_train and X_test are not defined in this snippet;
# they are assumed to come from an earlier cell -- confirm.
tree = DecisionTreeClassifier(max_depth=3, random_state=42)
dec_tree = tree.fit(X_train, y_train)
y_pred = tree.predict(X_test)
# Plotting helper: draws the samples and the classifier's decision regions.
# You do not need to understand this block to follow the rest of the notebook.
def visualize_classifier(model, X, y, ax=None, cmap='bwr', resolution=200):
    """Scatter the samples and shade the regions predicted by ``model``.

    Args:
        model: fitted object exposing ``predict`` on an (n, 2) array.
        X: 2-D array; only columns 0 and 1 are plotted.
        y: array of labels used to colour the points. The contour levels are
            ``arange(n_classes + 1) - 0.5``, so regions are only coloured
            correctly when labels are the integers ``0 .. n_classes - 1``.
        ax: axes-like object to draw on; ``plt.gca()`` is used when ``None``.
        cmap: colormap shared by the points and the shaded regions.
        resolution: number of grid points per axis used to sample ``model``.
    """
    # Explicit None check instead of `ax or ...`, so a supplied axes object is
    # never replaced just because it happens to evaluate as falsy.
    if ax is None:
        ax = plt.gca()

    color_limits = (y.min(), y.max())

    # Plot the points first so the axis limits are derived from the data.
    ax.scatter(X[:, 0], X[:, 1], c=y, s=30, cmap=cmap,
               clim=color_limits, zorder=3, alpha=0.5)
    ax.axis('tight')
    ax.set_xlabel('x1')
    ax.set_ylabel('x2')
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()

    # Evaluate the model on a regular grid covering the visible area.
    xx, yy = np.meshgrid(np.linspace(*xlim, num=resolution),
                         np.linspace(*ylim, num=resolution))
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

    # Shade one band per class, drawn underneath the points (zorder=1).
    n_classes = len(np.unique(y))
    ax.contourf(xx, yy, Z, alpha=0.3,
                levels=np.arange(n_classes + 1) - 0.5,
                cmap=cmap, clim=color_limits,
                zorder=1)

    # Restore the limits captured before the contour was drawn.
    ax.set(xlim=xlim, ylim=ylim)
visualize_classifier(tree, X, y)

from sklearn.metrics import accuracy_score

# Predict on X (the original note calls this the training set).
y_pred = tree.predict(X)

# Compare against the true labels; arguments follow the (y_true, y_pred)
# convention used by every other metric call in this file.
# NOTE(review): outside a notebook this bare expression discards its result.
accuracy_score(y, y_pred)

# Confusion-matrix analysis
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y, y_pred))

# Plot the confusion matrix
# NOTE: plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed
# in 1.2; on newer versions use ConfusionMatrixDisplay.from_estimator with the
# same arguments.
from sklearn.metrics import plot_confusion_matrix
plot_confusion_matrix(tree, X, y, cmap=plt.cm.Blues, values_format='.0f')

# Normalized version (each row divided by the true-class count)
plot_confusion_matrix(tree, X, y, cmap=plt.cm.Blues, values_format='.2f', normalize='true')
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import warnings

import matplotlib.pyplot as plt
import numpy as np
from matplotlib import cm
# KMeans is used by Grafico_de_silueta below but was never imported here.
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples

# Silence all warnings for the rest of the session.
warnings.filterwarnings('ignore')
# Wide, short default figure so several subplots fit side by side.
plt.rcParams['figure.figsize'] = (14, 4)
def Grafico_de_silueta(X, n_cluster_list, init, n_init, max_iter, tol):
    """Draw one silhouette plot per cluster count, side by side.

    For every ``k`` in ``n_cluster_list`` a ``KMeans`` model is fitted on
    ``X`` (``random_state=0``), the per-sample silhouette values are drawn as
    horizontal bars grouped by cluster, and the mean silhouette is marked
    with a dashed red line. The figure is shown once all panels are drawn.

    Args:
        X: data passed to ``KMeans.fit_predict`` and ``silhouette_samples``.
        n_cluster_list: sequence of cluster counts, one subplot each.
        init: ``KMeans`` initialisation method.
        n_init: number of ``KMeans`` restarts.
        max_iter: maximum iterations per ``KMeans`` run.
        tol: ``KMeans`` convergence tolerance.
    """
    # One column per requested k. This used to be hard-coded to 4 columns,
    # which failed as soon as more than four values were passed.
    n_plots = len(n_cluster_list)

    for position, k in enumerate(n_cluster_list, start=1):
        plt.subplot(1, n_plots, position)
        km = KMeans(n_clusters=k,
                    init=init,
                    n_init=n_init,
                    max_iter=max_iter,
                    tol=tol,
                    random_state=0)  # fixed seed
        y_km = km.fit_predict(X)
        cluster_labels = np.unique(y_km)
        # Per-sample silhouette values using Euclidean distance.
        silhouette_vals = silhouette_samples(X, y_km, metric='euclidean')

        y_ax_lower = 0
        yticks = []
        # `idx` is deliberately distinct from the outer loop variable, which
        # the original shadowed by reusing `i`.
        for idx, c in enumerate(cluster_labels):
            # Silhouette values of the samples in cluster c, ascending.
            c_silhouette_vals = np.sort(silhouette_vals[y_km == c])
            y_ax_upper = y_ax_lower + len(c_silhouette_vals)
            color = cm.jet(float(idx) / km.n_clusters)
            plt.barh(range(y_ax_lower, y_ax_upper), c_silhouette_vals, height=1.0,
                     edgecolor='none', color=color)
            # Tick at the vertical middle of this cluster's band.
            yticks.append((y_ax_lower + y_ax_upper) / 2.)
            y_ax_lower = y_ax_upper

        silhouette_avg = np.mean(silhouette_vals)
        plt.axvline(silhouette_avg, color="red", linestyle="--")
        plt.yticks(yticks, cluster_labels + 1)  # show clusters as 1-based
        plt.ylabel('Cluster')
        plt.xlabel('Coeficiente de Silueta')
        plt.title("Silouette para k= " + str(km.n_clusters) + "\n" + "Coeficiente de Silueta= "+str(round((silhouette_avg),2)))

    plt.tight_layout()
    plt.show()
# Cluster counts to compare, one silhouette panel per value
n_cluster_list = [2, 3, 4, 5]
init = 'k-means++'  # centroid initialisation method
n_init = 10         # number of times the algorithm is run
max_iter = 300      # maximum iterations for a single run
tol = 1e-04         # tolerance used to declare convergence
Grafico_de_silueta(X, n_cluster_list, init, n_init, max_iter, tol)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Libraries used
import random

# Re-enabled: Grafico_de_cluster below instantiates KMeans.
from sklearn.cluster import KMeans
# Scatter plot of the K-Means clusters and their centroids
def Grafico_de_cluster(X, n_clusters, init, n_init, max_iter, tol, name_colors_c_list, name_colors_edgecolor_list, name_makers_list):
    """Fit K-Means on ``X`` and plot each cluster plus the centroids.

    Only columns 0 and 1 of ``X`` are plotted. Centroids are drawn as large
    red stars and the figure is shown before returning.

    Args:
        X: 2-D array passed to ``KMeans.fit_predict``.
        n_clusters: number of clusters to form.
        init: ``KMeans`` initialisation method.
        n_init: number of ``KMeans`` restarts.
        max_iter: maximum iterations per ``KMeans`` run.
        tol: ``KMeans`` convergence tolerance.
        name_colors_c_list: fill colour per cluster.
        name_colors_edgecolor_list: edge colour per cluster.
        name_makers_list: marker per cluster.

    The three style lists are cycled when there are more clusters than
    entries, so a short list no longer raises IndexError.
    """
    km = KMeans(n_clusters=n_clusters,
                init=init,
                n_init=n_init,
                max_iter=max_iter,
                tol=tol,
                random_state=0)  # fixed seed
    y_km = km.fit_predict(X)

    for i in range(km.n_clusters):
        members = y_km == i
        plt.scatter(X[members, 0],
                    X[members, 1],
                    s=50,
                    c=name_colors_c_list[i % len(name_colors_c_list)],
                    edgecolor=name_colors_edgecolor_list[i % len(name_colors_edgecolor_list)],
                    # The marker used to be picked with random.randint(0, 1):
                    # it changed between runs, could repeat across clusters
                    # and never used entries past the second one.
                    marker=name_makers_list[i % len(name_makers_list)],
                    label='cluster '+str(i+1))

    plt.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1],
                s=250, marker='*', c='red', label='centroides')
    plt.legend()
    plt.grid()
    plt.tight_layout()
    plt.show()
# Per-cluster plot styling and K-Means settings
name_colors_c_list = ['lightgreen', 'orange', 'yellow']   # fill colour of the points
name_colors_edgecolor_list = ['black', 'blue', 'red']     # edge colour of the points
name_makers_list = ['s', 'o', 'v']                        # marker shape of the points
n_clusters = 3      # number of clusters
init = 'k-means++'  # centroid initialisation method
n_init = 10         # number of times the algorithm is run
max_iter = 300      # maximum iterations for a single run
tol = 1e-04         # tolerance used to declare convergence
Grafico_de_cluster(X, n_clusters, init, n_init, max_iter, tol, name_colors_c_list, name_colors_edgecolor_list, name_makers_list)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.datasets import load_breast_cancer
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import scale
import pandas as pd

# Load the data set and standardise the features.
bc = load_breast_cancer()
print(bc)
X = scale(bc.data)
print(X)
y = bc.target

# No random_state is passed, so the split differs on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Two clusters, to be compared with the two target classes.
model = KMeans(n_clusters=2, random_state=0)
model.fit(X_train)
predictions = model.predict(X_test)
labels = model.labels_  # cluster assignment of each training sample

print("labels: ", labels)
print("Predictions: ", predictions)
# NOTE(review): K-Means cluster ids are arbitrary, so cluster 0 need not
# correspond to class 0. This "accuracy" may therefore come out as one minus
# the real agreement -- interpret with care.
print("accuracy: ", accuracy_score(y_test, predictions))
print("Actual: ", y_test)
from sklearn import metrics


def bench_k_means(estimator, name, data):
    """Fit ``estimator`` on ``data`` and print one tab-separated score row.

    The row holds: ``name``, inertia, homogeneity, completeness, V-measure,
    adjusted Rand index, adjusted mutual information and silhouette score.

    Args:
        estimator: clustering estimator exposing ``fit``, ``inertia_`` and
            ``labels_``. It is refitted in place.
        name: label printed in the first column.
        data: samples to cluster.

    The ground-truth labels are read from the module-level ``y``, so ``data``
    must be row-aligned with ``y``.
    """
    estimator.fit(data)
    # Fix: the "% (name, estimator.inertia_," line had been commented out
    # (it starts with '%', so the notebook export treated it as an IPython
    # magic), leaving the format string without arguments -- a syntax error.
    print('%-9s\t%i\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'
          % (name, estimator.inertia_,
             metrics.homogeneity_score(y, estimator.labels_),
             metrics.completeness_score(y, estimator.labels_),
             metrics.v_measure_score(y, estimator.labels_),
             metrics.adjusted_rand_score(y, estimator.labels_),
             metrics.adjusted_mutual_info_score(y, estimator.labels_),
             metrics.silhouette_score(data, estimator.labels_,
                                      metric='euclidean')))
# bench_k_means refits `model` on the full X, so afterwards `model.labels_`
# no longer lines up with y_train.
bench_k_means(model, "1", X)
# NOTE(review): `labels` is assumed to be the training-set assignments
# captured before the refit above (same length as y_train) -- confirm.
print(pd.crosstab(y_train, labels))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn import neighbors, metrics, svm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

# NOTE(review): X (a 2-D array), y (a DataFrame with a 'class' column) and np
# are not defined in this snippet; they are assumed to come from an earlier
# cell -- confirm.

# Encoding approach 1: label-encode every feature column of X in place.
Le = LabelEncoder()
# X.shape[1] instead of len(X[0]) so an empty X does not raise IndexError.
for i in range(X.shape[1]):
    X[:, i] = Le.fit_transform(X[:, i])
print(X)

# Encoding approach 2: explicit mapping for the target classes.
label_mapping = {
    'unacc': 0,
    'acc': 1,
    'good': 2,
    'vgood': 3
}
y['class'] = y['class'].map(label_mapping)
y = np.array(y)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Build the model. The variable is called `knn`, but the estimator is an SVC.
knn = svm.SVC()
knn.fit(X_train, y_train)
prediction = knn.predict(X_test)

# Accuracy of the model on the held-out split
accuracy = metrics.accuracy_score(y_test, prediction)
print("predictions:", prediction)
print("accuracy: ", accuracy)

# Spot-check a single sample: true label vs. predicted label.
# NOTE(review): index 1727 is hard-coded and must be < len(y) -- confirm.
a = 1727
print("actual value ", y[a])
# Predict only the selected row instead of the whole data set.
print("predicted value", knn.predict(X[[a]])[0])

# Fresh split, rebinding the train/test names. The duplicate re-import of
# train_test_split was dropped; it is already imported above.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn import linear_model
from sklearn.model_selection import train_test_split

# Algorithm
# Fix: `linear_mode` was a typo for the imported `linear_model` (NameError).
l_reg = linear_model.LinearRegression()

# Fix: `X.T[0].y` was a typo for two arguments -- the first feature against
# the target.
plt.scatter(X.T[0], y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Train
model = l_reg.fit(X_train, y_train)
predictions = model.predict(X_test)

print("predictions: ", predictions)
# NOTE(review): R2 is computed on the full data set (training rows included),
# not on the held-out split.
print("R2 value: ", l_reg.score(X,y))
print("coedd:", l_reg.coef_)
print("intercept: ", l_reg.intercept_)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# NOTE(review): `tree`, `X`, `y`, `y_pred` and `plt` are assumed to come from
# earlier cells -- confirm.
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y, y_pred))

# Plot the confusion matrix
# NOTE: plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed
# in 1.2; on newer versions use ConfusionMatrixDisplay.from_estimator with the
# same arguments.
from sklearn.metrics import plot_confusion_matrix
plot_confusion_matrix(tree, X, y, cmap=plt.cm.Blues, values_format='.0f')

# Normalized version (each row divided by the true-class count)
plot_confusion_matrix(tree, X, y, cmap=plt.cm.Blues, values_format='.2f', normalize='true')
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Elbow method: plot the K-Means inertia for k = 2 .. 10.
# (range(2, 11) stops at 10; the original note said "2 to 11".)
k_values = range(2, 11)

distortions = []
for i in k_values:
    km = KMeans(n_clusters=i,       # number of clusters to form
                init='k-means++',   # centroid initialisation method
                n_init=10,          # number of times the algorithm is run
                max_iter=300,       # maximum iterations for a single run
                random_state=0)
    km.fit(X)
    distortions.append(km.inertia_)

plt.plot(k_values, distortions, marker='v')
plt.xlabel('número de clúster')
plt.ylabel('Distorsión')
plt.title("Método del codo para data simulada")
plt.tight_layout()
plt.show()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.metrics import euclidean_distances, silhouette_score

# Silhouette method: mean silhouette score for k = 2 .. 10.
k_values = range(2, 11)

sil = []
for i in k_values:
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
    # Fit and score in one step: fit_predict returns the labels for X.
    sil.append(silhouette_score(X, kmeans.fit_predict(X)))

plt.plot(k_values, sil)
plt.ylabel("Silouette")
plt.xlabel("k")
plt.title("Método de Silouette para data simulada")
plt.tight_layout()
plt.show()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model = svm.SVC()
model.fit(X_train, y_train)
predictions = model.predict(X_test)

# Accuracy of the model on the held-out split.
# Fix: the original scored and printed `prediction` (singular), a name this
# snippet never assigns, through `metrics`, which it never imports. That gave
# a NameError or silently reused a stale value from another cell.
accuracy = accuracy_score(y_test, predictions)
print("predictions:", predictions)
print("accuracy: ", accuracy)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment