Skip to content

Instantly share code, notes, and snippets.

@ryanpadilha
Created April 3, 2025 22:06
Show Gist options
  • Save ryanpadilha/a682b52672390be515dfff77bdb0cfb4 to your computer and use it in GitHub Desktop.
Save ryanpadilha/a682b52672390be515dfff77bdb0cfb4 to your computer and use it in GitHub Desktop.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from google.colab import drive
# Mount Google Drive at /content/drive so files stored there can be read by path.
drive.mount('/content/drive')
# Location of the Online Retail workbook on the mounted Google Drive.
file_path = "/content/drive/MyDrive/mex-data/Online Retail.xlsx"

# Number of clusters for K-Means; adjust as needed.
k = 3

# Columns the analysis actually reads; only nulls in these invalidate a row.
REQUIRED_COLUMNS = ["CustomerID", "InvoiceNo", "Quantity", "UnitPrice"]


def clean_transactions(transactions):
    """Return the valid purchase lines with a ``TotalPrice`` column added.

    A row is kept when none of ``REQUIRED_COLUMNS`` is null and both
    ``Quantity`` and ``UnitPrice`` are strictly positive. Rows that are only
    missing an unrelated field (for example a description) are kept, since
    that field is never used downstream.

    Args:
        transactions: DataFrame with at least the ``REQUIRED_COLUMNS``.

    Returns:
        A new DataFrame; the input is not modified.
    """
    valid = transactions.dropna(subset=REQUIRED_COLUMNS)
    valid = valid[(valid["Quantity"] > 0) & (valid["UnitPrice"] > 0)]
    # ``assign`` builds a new frame instead of writing into the filtered one,
    # which avoids pandas' SettingWithCopyWarning.
    return valid.assign(TotalPrice=valid["Quantity"] * valid["UnitPrice"])


def summarize_customers(transactions):
    """Aggregate cleaned purchase lines into one row per customer.

    Args:
        transactions: DataFrame with ``CustomerID``, ``InvoiceNo``,
            ``Quantity`` and ``TotalPrice`` columns.

    Returns:
        DataFrame indexed by ``CustomerID`` with the columns ``NumCompras``
        (number of distinct invoices), ``Quantity`` (total units bought) and
        ``TotalPrice`` (total amount spent).
    """
    return transactions.groupby("CustomerID").agg({
        # One invoice spans several rows (one per item), so count distinct
        # invoice numbers rather than rows to get the number of purchases.
        "InvoiceNo": "nunique",
        "Quantity": "sum",
        "TotalPrice": "sum",
    }).rename(columns={"InvoiceNo": "NumCompras"})


if __name__ == "__main__":
    # Load the data and keep only valid purchase lines.
    df = clean_transactions(pd.read_excel(file_path))

    # Per-customer metrics used as clustering features.
    customer_data = summarize_customers(df)

    # Standardize the features so no single metric dominates the distances.
    scaler = StandardScaler()
    customer_data_scaled = scaler.fit_transform(customer_data)

    # Fit K-Means and attach each customer's cluster label.
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    customer_data["Cluster"] = kmeans.fit_predict(customer_data_scaled)

    # Preview the labelled customers.
    print(customer_data.head())

    # Scatter plot of the clusters: total spent vs. number of purchases.
    plt.figure(figsize=(10, 6))
    sns.scatterplot(
        x=customer_data["TotalPrice"],
        y=customer_data["NumCompras"],
        hue=customer_data["Cluster"],
        palette="viridis",
    )
    plt.xlabel("Total Gasto")
    plt.ylabel("Número de Compras")
    plt.title("Segmentação de Clientes")
    plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment