This script loads SQL queries from a CSV file, generates embeddings with a sentence-transformer model, stores them in ChromaDB, and performs clustering analysis. It groups similar queries with K-means, computes pairwise cosine similarity to find the most similar query pairs, and visualizes the clusters in 2D space using t-SNE dimensionality reduction.
import chromadb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

NUMBER_CLUSTERS = 20

sql_queries = pd.read_csv('41417958-3bac-4c5a-aa25-f9df0a62a676.csv')['query'].tolist()
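# The CSV is expected to contain a 'query' column with one SQL statement per
# row; only that column is used here. For example (illustrative rows, not taken
# from the actual file):
#
#   query
#   "SELECT * FROM users WHERE id = 1"
#   "SELECT name, email FROM customers"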
def main():
    """Run the full pipeline: store queries, cluster them, and report similarities."""
    print("Starting ChromaDB SQL Query Similarity Analysis...")
    _, collection = setup_chroma_db()
    store_queries_in_chroma(collection, sql_queries)
    embeddings, documents, _ = get_embeddings_for_clustering(collection)
    cluster_labels, _ = perform_clustering(
        embeddings,
        documents,
        n_clusters=NUMBER_CLUSTERS
    )
    analyze_query_similarity_matrix(embeddings, documents)
    visualize_clusters(embeddings, cluster_labels, documents)
def setup_chroma_db():
    """Initialize ChromaDB client and a fresh 'sql_queries' collection"""
    client = chromadb.Client()
    try:
        client.delete_collection("sql_queries")
    except Exception as e:
        # Expected on the first run, when the collection does not exist yet.
        print(f"Could not delete existing collection: {e}")
    collection = client.create_collection(
        name="sql_queries",
        metadata={"description": "SQL queries for similarity analysis"}
    )
    return client, collection
def store_queries_in_chroma(collection, queries):
    """Store SQL queries in ChromaDB"""
    ids = [f"query_{i}" for i in range(len(queries))]
    collection.add(
        documents=queries,
        ids=ids,
        metadatas=[
            {"type": "sql_query", "index": i}
            for i in range(len(queries))
        ]
    )
    print(f"Stored {len(queries)} SQL queries in ChromaDB")
def find_similar_queries(collection, query, n_results=3):
    """Find similar queries using ChromaDB"""
    results = collection.query(
        query_texts=[query],
        n_results=n_results
    )
    print(f"\nQuery: {query}")
    print("Similar queries:")
    for i, (doc, distance) in enumerate(zip(results['documents'][0], results['distances'][0])):
        # Note: with ChromaDB's default (L2) distance space, 1 - distance is only
        # a rough score; it reads as cosine similarity only if the collection is
        # configured with a cosine distance space.
        similarity = 1 - distance
        print(f"  {i+1}. Similarity: {similarity:.3f} - {doc}")
def get_embeddings_for_clustering(collection):
    """Fetch the stored documents from ChromaDB and encode them for clustering"""
    results = collection.get()
    documents = results['documents']
    ids = results['ids']
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode(documents)
    return embeddings, documents, ids
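# Note: ChromaDB also keeps its own embeddings for the stored documents, so an
# alternative sketch (assuming its default embedding function is acceptable)
# would fetch them instead of re-encoding with SentenceTransformer:
#     results = collection.get(include=["documents", "embeddings"])
#     embeddings = np.array(results["embeddings"])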
def perform_clustering(embeddings, documents, n_clusters=5):
    """Perform K-means clustering on SQL queries"""
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    cluster_labels = kmeans.fit_predict(embeddings)
    df = pd.DataFrame({
        'query': documents,
        'cluster': cluster_labels
    })
    print(f"\nClustering Results ({n_clusters} clusters):")
    for cluster_id in range(n_clusters):
        cluster_queries = df[df['cluster'] == cluster_id]['query'].tolist()
        print(f"\nCluster {cluster_id} ({len(cluster_queries)} queries):")
        for query in cluster_queries:
            print(f"  - {query}")
    return cluster_labels, df
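# NUMBER_CLUSTERS = 20 is a fixed choice. The helper below is a minimal sketch
# (not called anywhere in this script) for picking the cluster count from the
# data via silhouette score; the candidate range 2-30 is an arbitrary assumption.
def choose_n_clusters(embeddings, k_range=range(2, 31)):
    """Return the k in k_range with the highest silhouette score."""
    from sklearn.metrics import silhouette_score
    scores = {}
    for k in k_range:
        labels = KMeans(n_clusters=k, random_state=42, n_init=10).fit_predict(embeddings)
        scores[k] = silhouette_score(embeddings, labels)
    return max(scores, key=scores.get)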
def visualize_clusters(embeddings, cluster_labels, documents):
    """Project the embeddings to 2D with t-SNE and plot the clusters"""
    tsne = TSNE(
        n_components=2,
        random_state=42,
        perplexity=min(30, len(documents) - 1)
    )
    embeddings_2d = tsne.fit_transform(embeddings)
    plt.figure(figsize=(12, 8))
    scatter = plt.scatter(
        embeddings_2d[:, 0],
        embeddings_2d[:, 1],
        c=cluster_labels,
        # cmap='tab10',
        alpha=0.7
    )
    for i, _ in enumerate(documents):
        label = f'query_{i}'
        plt.annotate(
            label,
            (embeddings_2d[i, 0], embeddings_2d[i, 1]),
            fontsize=8,
            alpha=0.7
        )
    plt.title('SQL Queries Clustering Visualization')
    plt.xlabel('t-SNE Component 1')
    plt.ylabel('t-SNE Component 2')
    plt.colorbar(scatter, label='Cluster Label')
    plt.tight_layout()
    plt.savefig('sql_queries_clustering.png')
def analyze_query_similarity_matrix(embeddings, documents):
    """Print the top 5 most similar query pairs by pairwise cosine similarity"""
    similarity_matrix = cosine_similarity(embeddings)
    print("\nTop 5 Most Similar Query Pairs:")
    print("=" * 50)
    # Only the upper triangle (k=1) is used so each pair is counted once.
    indices = np.triu_indices_from(similarity_matrix, k=1)
    similarities = similarity_matrix[indices]
    top_indices = np.argsort(similarities)[-5:][::-1]
    for idx in top_indices:
        i, j = indices[0][idx], indices[1][idx]
        similarity = similarities[idx]
        print(f"Similarity: {similarity:.3f}")
        print(f"  Query 1: {documents[i]}")
        print(f"  Query 2: {documents[j]}")
        print()
if __name__ == "__main__":
    main()
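Note that find_similar_queries is defined but never called from main(). A minimal sketch of how it could be used, assuming the collection has already been populated by store_queries_in_chroma (the query text below is only an illustrative example, not one of the CSV rows):

    _, collection = setup_chroma_db()
    store_queries_in_chroma(collection, sql_queries)
    find_similar_queries(collection, "SELECT * FROM users WHERE id = 1", n_results=3)

This prints the three stored queries whose embeddings are closest to the example text, using ChromaDB's own query API rather than the explicit cosine-similarity matrix computed by analyze_query_similarity_matrix.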