Created
December 11, 2023 01:08
-
-
Save bendangelo/53f36aff187e64fb0357f6f1c10223f7 to your computer and use it in GitHub Desktop.
Add Chromadb methods to your Rails Models. This is a model concern.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Add to model:
#   include Chromable
#   chroma do
#     hnsw_space :cosine
#     embedding :name
#     document :label
#     metadata :name
#   end
# Usage:
#   Hint.chroma_upsert Hint.all
#   @hints = Hint.chroma_query "dogs", where: {name: {"$ne": "dogs"}}
# Implement your own embedding service; it is called as: embeddings = EmbeddingsService.call texts
# I used https://github.com/michaelfeil/infinity
# Model concern that adds ChromaDB helper methods (query, get, upsert, delete)
# to a Rails model. Each including model configures itself through the `chroma`
# class-level DSL, which stores a ChromaConfiguration on the model class.
#
# Embeddings are produced by an application-provided `EmbeddingsService.call`,
# whose result is indexed positionally (`result[i]`) by this module.
module Chromable
  extend ActiveSupport::Concern

  included do
    # Holds the model's ChromaConfiguration; nil until `chroma` is called.
    class_attribute :chroma_configuration
  end

  class_methods do
    # Declares the Chroma settings for this model. The block is evaluated
    # against a fresh ChromaConfiguration, so `embedding`, `document`,
    # `metadata` and `hnsw_space` can be called bare inside it.
    def chroma(&block)
      self.chroma_configuration = ChromaConfiguration.new
      chroma_configuration.instance_eval(&block) if block_given?
    end

    # Embeds `query` via EmbeddingsService and queries this model's collection
    # with the first returned embedding. The remaining keyword arguments are
    # forwarded unchanged to the collection's `query`.
    def chroma_query(query, results: 10, where: {}, where_document: {}, include: %w[documents])
      embedding = EmbeddingsService.call(query)
      chroma_collection.query(
        query_embeddings: [embedding[0]],
        results: results,
        where: where,
        where_document: where_document,
        include: include
      )
    end

    # Returns the collection for this model, creating it on first use.
    #
    # Memoized in a class-level instance variable so every model keeps its own
    # handle; a `@@` class variable here would be shared by all models that
    # include this concern, handing the first model's collection to the rest.
    def chroma_collection
      # Fall back to a default configuration when `chroma` was never called.
      config = chroma_configuration || ChromaConfiguration.new
      @chroma_collection ||= Chroma::Resources::Collection.get_or_create(
        chroma_collection_name,
        { "hnsw:space": config.hnsw_space_param.to_s }
      )
    end

    # Deletes this model's whole collection and forgets the memoized handle so
    # the next `chroma_collection` call fetches or creates it again.
    # Returns whatever `Chroma::Resources::Collection.delete` returns.
    def chroma_delete_collection
      result = Chroma::Resources::Collection.delete(chroma_collection_name)
      @chroma_collection = nil
      result
    end

    # Collection name: the model's table name, suffixed with "_test" in the
    # Rails test environment.
    def chroma_collection_name
      Rails.env.test? ? "#{table_name}_test" : table_name
    end

    # Delegates to the collection's `count`.
    def chroma_count
      chroma_collection.count
    end

    # Deletes embeddings from the collection. Non-blank `ids` are converted to
    # strings first; all arguments are then forwarded to the collection's
    # `delete`.
    def chroma_delete(ids: nil, where: {}, where_document: {})
      ids = ids.map(&:to_s) if ids.present?
      chroma_collection.delete(ids: ids, where: where, where_document: where_document)
    end

    # Fetches embeddings from the collection. Non-blank `ids` are converted to
    # strings first; all arguments are then forwarded to the collection's `get`.
    def chroma_get(ids: nil, where: {}, sort: nil, limit: nil, offset: nil, page: nil, page_size: nil,
                   where_document: {}, include: %w[documents])
      ids = ids.map(&:to_s) if ids.present?
      chroma_collection.get(
        ids: ids,
        where: where,
        sort: sort,
        limit: limit,
        offset: offset,
        page: page,
        page_size: page_size,
        where_document: where_document,
        include: include
      )
    end

    # Embeds and upserts `items` (a relation or array of records).
    #
    # Records are materialized once and the texts are read from those same
    # objects, so texts, embeddings and records are guaranteed to line up by
    # index. Reading texts with a separate `pluck` would issue a second query
    # on a relation, whose row order or contents may differ.
    def chroma_upsert(items)
      records = items.to_a
      attribute = chroma_configuration.embedding_name
      texts = records.map { |record| record.public_send(attribute) }
      embeddings = EmbeddingsService.call(texts)
      payload = records.each_with_index.map { |record, index| record.to_embedding(embeddings[index]) }
      chroma_collection.upsert(payload)
    end
  end

  # Builds a Chroma::Resources::Embedding for this record: the id as a string,
  # the given embedding, plus the document/metadata fields from the model's
  # ChromaConfiguration when one is set.
  def to_embedding(embedding = nil)
    attributes = { id: id.to_s, embedding: embedding }
    config = self.class.chroma_configuration
    attributes.merge!(config.to_embedding_hash(self)) if config
    Chroma::Resources::Embedding.new(**attributes)
  end

  # Settings collected by the `chroma` DSL block.
  class ChromaConfiguration
    attr_accessor :embedding_name, :document_name, :metadata_names, :hnsw_space_param

    # The distance space defaults to :l2 until `hnsw_space` overrides it.
    def initialize
      self.hnsw_space_param = :l2
    end

    # DSL: name of the attribute whose value is sent to the embedding service.
    def embedding(name)
      self.embedding_name = name
    end

    # DSL: name of the method whose value is stored as the document.
    def document(name)
      self.document_name = name
    end

    # DSL: value used for the collection's "hnsw:space" setting.
    def hnsw_space(name)
      self.hnsw_space_param = name
    end

    # DSL: names of the methods whose values are stored as metadata.
    def metadata(*names)
      self.metadata_names = names
    end

    # Returns a hash with :document and/or :metadata read from
    # `model_instance`, including only the parts that were configured.
    # Metadata keys are the configured names as strings.
    def to_embedding_hash(model_instance)
      result = {}
      result[:document] = model_instance.send(document_name) if document_name
      if metadata_names
        result[:metadata] = metadata_names.to_h { |name| [name.to_s, model_instance.send(name)] }
      end
      result
    end
  end
end
What's the collection type set as? L2 or cosine? And how many dimensions do you have for each embedding? I had 384
I'm using the default chromadb settings, and my embedding size is 1024.
Ok I'll test it again.
I'm using the default chromadb settings, and my embedding size is 1024.
I've found the issue. Searching takes a lot of CPU resources, so if the server is constrained for CPU time, searching will slow to a crawl (Postgres and other services don't have this issue). There must be some optimization on Chroma's side to fix this.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I just tried with 100K documents and the query takes ~40ms. I think there is something wrong with your setup. Are you using the chroma-db gem notebook as is?