This script loads SQL queries from a CSV file, generates embeddings using a sentence transformer model, stores them in ChromaDB, and performs clustering analysis. It groups similar queries using K-means, computes pairwise cosine similarity to find the most similar query pairs, and visualizes the clusters in 2D space using t-SNE dimensionality reduction.
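The script expects the CSV to expose a query column with one SQL statement per row (it is read via pd.read_csv(...)['query']). A minimal, illustrative sketch of such an input file (the file name queries.csv and the statements below are placeholders, not the data the gist actually uses):

import pandas as pd

# Illustrative only: any CSV with a 'query' column will satisfy the loader in the script.
pd.DataFrame({
    "query": [
        "SELECT id, name FROM users WHERE active = 1",
        "SELECT name, id FROM users WHERE active = 1",
        "SELECT COUNT(*) FROM orders GROUP BY customer_id",
    ]
}).to_csv("queries.csv", index=False)

The full script follows.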
import chromadb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

NUMBER_CLUSTERS = 20

sql_queries = pd.read_csv('41417958-3bac-4c5a-aa25-f9df0a62a676.csv')['query'].tolist()


def main():
    print("Starting ChromaDB SQL Query Similarity Analysis...")
    _, collection = setup_chroma_db()
    store_queries_in_chroma(collection, sql_queries)
    embeddings, documents, _ = get_embeddings_for_clustering(collection)
    cluster_labels, df = perform_clustering(
        embeddings,
        documents,
        n_clusters=NUMBER_CLUSTERS
    )
    analyze_query_similarity_matrix(embeddings, documents)
    visualize_clusters(embeddings, cluster_labels, documents)


def setup_chroma_db():
    """Initialize ChromaDB client and collection"""
    client = chromadb.Client()
    try:
        client.delete_collection("sql_queries")
    except Exception as e:
        print(f"Error deleting collection: {e}")
    collection = client.create_collection(
        name="sql_queries",
        metadata={"description": "SQL queries for similarity analysis"}
    )
    return client, collection


def store_queries_in_chroma(collection, queries):
    """Store SQL queries in ChromaDB"""
    ids = [f"query_{i}" for i in range(len(queries))]
    collection.add(
        documents=queries,
        ids=ids,
        metadatas=[
            {"type": "sql_query", "index": i}
            for i in range(len(queries))
        ]
    )
    print(f"Stored {len(queries)} SQL queries in ChromaDB")


def find_similar_queries(collection, query, n_results=3):
    """Find similar queries using ChromaDB"""
    results = collection.query(
        query_texts=[query],
        n_results=n_results
    )
    print(f"\nQuery: {query}")
    print("Similar queries:")
    for i, (doc, distance) in enumerate(zip(results['documents'][0], results['distances'][0])):
        similarity = 1 - distance
        print(f" {i+1}. Similarity: {similarity:.3f} - {doc}")


def get_embeddings_for_clustering(collection):
    """Get embeddings from ChromaDB for clustering"""
    results = collection.get()
    documents = results['documents']
    ids = results['ids']
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode(documents)
    return embeddings, documents, ids


def perform_clustering(embeddings, documents, n_clusters=5):
    """Perform K-means clustering on SQL queries"""
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    cluster_labels = kmeans.fit_predict(embeddings)
    df = pd.DataFrame({
        'query': documents,
        'cluster': cluster_labels
    })
    print(f"\nClustering Results ({n_clusters} clusters):")
    for cluster_id in range(n_clusters):
        cluster_queries = df[df['cluster'] == cluster_id]['query'].tolist()
        print(f"\nCluster {cluster_id} ({len(cluster_queries)} queries):")
        for query in cluster_queries:
            print(f" - {query}")
    return cluster_labels, df


def visualize_clusters(embeddings, cluster_labels, documents):
    """Project embeddings to 2D with t-SNE and save a scatter plot of the clusters"""
    tsne = TSNE(
        n_components=2,
        random_state=42,
        perplexity=min(30, len(documents) - 1)
    )
    embeddings_2d = tsne.fit_transform(embeddings)
    plt.figure(figsize=(12, 8))
    scatter = plt.scatter(
        embeddings_2d[:, 0],
        embeddings_2d[:, 1],
        c=cluster_labels,
        # cmap='tab10',
        alpha=0.7
    )
    for i, _ in enumerate(documents):
        label = f'query_{i}'
        plt.annotate(
            label,
            (embeddings_2d[i, 0], embeddings_2d[i, 1]),
            fontsize=8,
            alpha=0.7
        )
    plt.title('SQL Queries Clustering Visualization')
    plt.xlabel('t-SNE Component 1')
    plt.ylabel('t-SNE Component 2')
    plt.colorbar(scatter, label='Cluster Label')
    plt.tight_layout()
    plt.savefig('sql_queries_clustering.png')


def analyze_query_similarity_matrix(embeddings, documents):
    """Compute pairwise cosine similarity and print the top 5 most similar query pairs"""
    similarity_matrix = cosine_similarity(embeddings)
    print("\nTop 5 Most Similar Query Pairs:")
    print("=" * 50)
    indices = np.triu_indices_from(similarity_matrix, k=1)
    similarities = similarity_matrix[indices]
    top_indices = np.argsort(similarities)[-5:][::-1]
    for idx in top_indices:
        i, j = indices[0][idx], indices[1][idx]
        similarity = similarities[idx]
        print(f"Similarity: {similarity:.3f}")
        print(f" Query 1: {documents[i]}")
        print(f" Query 2: {documents[j]}")
        print()


if __name__ == "__main__":
    main()
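Note that find_similar_queries is defined but never called from main(). A minimal sketch of how it could be invoked once the collection has been populated; the example query string is hypothetical:

# Sketch only (not part of the original flow): look up the nearest stored queries
# for an ad-hoc SQL string using ChromaDB's built-in text querying.
_, collection = setup_chroma_db()
store_queries_in_chroma(collection, sql_queries)
find_similar_queries(collection, "SELECT id, name FROM users WHERE active = 1", n_results=3)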