Last active
July 12, 2025 21:40
-
-
Save filmo/626cb097c0dd14f83529a8fb69702a2b to your computer and use it in GitHub Desktop.
Problem: embeddings do not appear to be stored (or retrieved) correctly when using chroma_db
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# llm is a valid OpenAI|Ollama object.
# Settings.embed_model is a valid "nomic-embed-text" embedding model
# producing 768-d vectors.
# This code works and 'default_vector_store.json' contains the embeddings.

# Load every .docx file in the directory; each file becomes one Document.
documents = SimpleDirectoryReader(
    "data/DIR_WORD_DOCs",
    filename_as_id=False,
    file_extractor={'.docx': DocxReader()},
).load_data(num_workers=num_workers)

default_pt_paragraph_separator = '\n\n'

# Chunk documents into ~512-token nodes with a 20-token overlap,
# splitting preferentially on blank-line paragraph boundaries.
text_splitter = SentenceSplitter(
    chunk_size=512,
    chunk_overlap=20,
    paragraph_separator=default_pt_paragraph_separator,
)
Settings.text_splitter = text_splitter

# Metadata extractors: one title per document (derived from its first
# 5 nodes) and 3 "questions this chunk answers" per node.
title_extractor = TitleExtractor(nodes=5, llm=llm, num_workers=num_workers)
qa_extractor = QuestionsAnsweredExtractor(questions=3, llm=llm, num_workers=num_workers)

# Build the index against the default (in-memory) vector store, then
# persist it to disk as JSON under persist_dir (this is what produces
# 'default_vector_store.json' with the embeddings in it).
vector_index = VectorStoreIndex.from_documents(
    documents,
    transformations=[text_splitter, qa_extractor, title_extractor],
    embed_model=Settings.embed_model,
)
vector_index.storage_context.persist(persist_dir='data/RAG_Storage_2')
# If I run queries against this I get top_k nodes that have rational
# similarity scores like 0.7023 or similar.
# The chromadb example below DOES NOT work.

# Same pipeline as above, but persisting into a Chroma collection
# instead of the default in-memory/JSON vector store.
db = chromadb.PersistentClient(path="data/chroma_db")
chroma_collection = db.get_or_create_collection("my_collection")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# NOTE(review): this path is "../data/DIR_WORD_DOCs" while the working
# example above reads "data/DIR_WORD_DOCs" — confirm both runs actually
# indexed the same directory (node counts match, so likely a cwd
# difference, but worth verifying).
documents = SimpleDirectoryReader(
    "../data/DIR_WORD_DOCs",
    filename_as_id=False,
    file_extractor={'.docx': DocxReader()},
).load_data(num_workers=num_workers)

default_pt_paragraph_separator = '\n\n'
text_splitter = SentenceSplitter(
    chunk_size=512,
    chunk_overlap=20,
    paragraph_separator=default_pt_paragraph_separator,
)
Settings.text_splitter = text_splitter

title_extractor = TitleExtractor(nodes=5, llm=llm, num_workers=num_workers)
qa_extractor = QuestionsAnsweredExtractor(questions=3, llm=llm, num_workers=num_workers)

# Identical to the first example except that storage_context routes the
# embedded nodes into the Chroma collection.
vector_index = VectorStoreIndex.from_documents(
    documents,
    transformations=[text_splitter, qa_extractor, title_extractor],
    storage_context=storage_context,
    embed_model=Settings.embed_model,
)

# I can see that it has stored 80 nodes into chromadb (the same number
# of nodes as the first example), so it appears that all the processing
# work is happening correctly.
# When I run queries against the loaded chromadb data, all similarity
# scores are 0.000000 — it's as if there are no embeddings associated
# with the nodes stored in chromadb.
# When I use just the vanilla
# vector_index.storage_context.persist(persist_dir='data/RAG_Storage_2')
# from the first example, similarity scores on searches I know should
# work return reasonable values in the range of 0.65 to 0.80 and I get
# valid search results.
# For queries against chromadb I always get some variant of
# "There's nothing in the context matching the query".
#
# NOTE(review): the query-side code is not shown here. A common cause of
# all-zero similarity scores with Chroma is rebuilding the index at
# query time via load_index_from_storage / from_documents instead of
# VectorStoreIndex.from_vector_store(vector_store, embed_model=...);
# verify how the index used for querying is constructed before digging
# further into the ingestion shown above.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment