@filmo
Last active July 12, 2025 21:40
problem getting embeddings to store in chroma_db
# llm is a valid OpenAI|Ollama object
# Settings.embed_model is a valid "nomic-embed-text" embedding model producing 768-d vectors
# Imports assumed by both examples below (LlamaIndex >= 0.10 package layout):
import chromadb
from llama_index.core import Settings, SimpleDirectoryReader, StorageContext, VectorStoreIndex
from llama_index.core.extractors import QuestionsAnsweredExtractor, TitleExtractor
from llama_index.core.node_parser import SentenceSplitter
from llama_index.readers.file import DocxReader
from llama_index.vector_stores.chroma import ChromaVectorStore

# This code works and 'default_vector_store.json' contains the embeddings
documents = SimpleDirectoryReader(
    "data/DIR_WORD_DOCs",
    filename_as_id=False,
    file_extractor={'.docx': DocxReader()},
).load_data(num_workers=num_workers)
default_pt_paragraph_seperator = '\n\n'
text_splitter = SentenceSplitter(
    chunk_size=512,
    chunk_overlap=20,
    paragraph_separator=default_pt_paragraph_seperator,
)
Settings.text_splitter = text_splitter
title_extractor = TitleExtractor(nodes=5, llm=llm, num_workers=num_workers)
qa_extractor = QuestionsAnsweredExtractor(questions=3, llm=llm, num_workers=num_workers)
vector_index = VectorStoreIndex.from_documents(
    documents,
    transformations=[text_splitter, qa_extractor, title_extractor],
    embed_model=Settings.embed_model,
)
vector_index.storage_context.persist(persist_dir='data/RAG_Storage_2')
# If I run queries against this index, the top_k nodes come back with reasonable similarity scores like 0.7023.
# The chromadb example below DOES NOT work.
db = chromadb.PersistentClient(path="data/chroma_db")
chroma_collection = db.get_or_create_collection("my_collection")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
documents = SimpleDirectoryReader(
    "../data/DIR_WORD_DOCs",
    filename_as_id=False,
    file_extractor={'.docx': DocxReader()},
).load_data(num_workers=num_workers)
default_pt_paragraph_seperator = '\n\n'
text_splitter = SentenceSplitter(
    chunk_size=512,
    chunk_overlap=20,
    paragraph_separator=default_pt_paragraph_seperator,
)
Settings.text_splitter = text_splitter
title_extractor = TitleExtractor(nodes=5, llm=llm, num_workers=num_workers)
qa_extractor = QuestionsAnsweredExtractor(questions=3, llm=llm, num_workers=num_workers)
vector_index = VectorStoreIndex.from_documents(
    documents,
    transformations=[text_splitter, qa_extractor, title_extractor],
    storage_context=storage_context,
    embed_model=Settings.embed_model,
)
# I can see that it has stored 80 nodes in the chromadb (the same number of nodes as the first example), so it appears
# that all the processing work is happening correctly.
# When I run queries against the loaded chromadb data, all similarity scores are 0.000000.
# It's as if there are no embeddings associated with the nodes stored in the chromadb.
# When I use just the vanilla vector_index.storage_context.persist(persist_dir='data/RAG_Storage_2') from the first example,
# my similarity scores on searches I know should work return reasonable values in the range of 0.65 to 0.80 and I get valid
# search results.
# For queries against the chromadb I always get some variant of "There's nothing in the context matching the query".