Last active
July 18, 2023 22:40
-
-
Save alexdong/006f2761ac2add592843bbc47489e4da to your computer and use it in GitHub Desktop.
Measure the performance of loading 10,000 Embeddings with 1024 Float32 from a binary data file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import Accelerate | |
import Dispatch | |
import Foundation | |
import os | |
let logger = Logger(subsystem: "photos.keepers", category: "EmbeddingVectorSearchEngine") | |
typealias Embedding = [Float32] | |
typealias Distance = Float32 | |
final class EmbeddingVectorSearchEngine { | |
static let shared = EmbeddingVectorSearchEngine() | |
let fileURL: URL | |
var uuids: [UUID] = [] | |
var embeddings: [Embedding] = [] | |
// private init prevents others from using the default initializer for this class | |
private init() { | |
let containerURL = FileManager.default.containerURL(forSecurityApplicationGroupIdentifier: "group.photos.keepers.app")! | |
fileURL = containerURL.appendingPathComponent("Embeddings.dat") | |
} | |
public func insert(_ inputs: [(UUID, Embedding)]) { | |
/// First, update the in-memory records, then write it to disk | |
uuids += inputs.map { $0.0 } | |
embeddings += inputs.map { $0.1 } | |
var data = Data() | |
for (uuid, embedding) in inputs { | |
let uuidTuple = uuid.uuid | |
let uuidBytes: [UInt8] = [ | |
uuidTuple.0, uuidTuple.1, uuidTuple.2, uuidTuple.3, | |
uuidTuple.4, uuidTuple.5, uuidTuple.6, uuidTuple.7, | |
uuidTuple.8, uuidTuple.9, uuidTuple.10, uuidTuple.11, | |
uuidTuple.12, uuidTuple.13, uuidTuple.14, uuidTuple.15 | |
] | |
data.append(contentsOf: uuidBytes) | |
for floatVal in embedding { | |
let floatValData = withUnsafeBytes(of: floatVal) { Data($0) } | |
data.append(floatValData) | |
} | |
} | |
if FileManager.default.fileExists(atPath: fileURL.path()) { | |
if let fileUpdater = try? FileHandle(forWritingTo: fileURL) { | |
fileUpdater.seekToEndOfFile() | |
fileUpdater.write(data) | |
fileUpdater.closeFile() | |
} | |
} else { | |
try! data.write(to: fileURL) | |
} | |
} | |
public func load() { | |
let data = try! Data(contentsOf: fileURL, options: .alwaysMapped) | |
let frameWidth = 16 + 1024 * 4 | |
let totalEmbeddings = data.count / frameWidth // calculate total number of embeddings to process | |
// Preallocate memory with a placeholder UUID | |
// This function preallocates memory for performance reasons. Preallocating memory: | |
// 1) Helps to avoid memory fragmentation: Allocating all needed memory at once makes it more likely | |
// that the data will be stored contiguously in memory, improving access speed. | |
// 2) Avoids a synchronisation point: When appending to an array from multiple threads, we need to | |
// synchronize access to the array which can become a bottleneck. By preallocating, each thread | |
// can write to a distinct region of the array, eliminating the need for synchronization. | |
let placeholderUUID = UUID() | |
uuids = [UUID](repeating: placeholderUUID, count: totalEmbeddings) | |
embeddings = [Embedding](repeating: Embedding(repeating: 0.0, count: 1024), count: totalEmbeddings) | |
DispatchQueue.concurrentPerform(iterations: totalEmbeddings) { index in | |
let start = data.index(data.startIndex, offsetBy: index * frameWidth) | |
let uuidEnd = data.index(start, offsetBy: 16) | |
let uuidData = data[start..<uuidEnd] | |
let uuidBytes = Array(uuidData) | |
uuids[index] = UUID(uuid: ( | |
uuidBytes[0], uuidBytes[1], uuidBytes[2], uuidBytes[3], | |
uuidBytes[4], uuidBytes[5], uuidBytes[6], uuidBytes[7], | |
uuidBytes[8], uuidBytes[9], uuidBytes[10], uuidBytes[11], | |
uuidBytes[12], uuidBytes[13], uuidBytes[14], uuidBytes[15] | |
)) | |
var embedding = Embedding(repeating: 0.0, count: 1024) | |
var floatStart = uuidEnd | |
for i in 0..<1024 { | |
let floatEnd = data.index(floatStart, offsetBy: 4) | |
let floatData = data[floatStart..<floatEnd] | |
floatStart = floatEnd | |
embedding[i] = floatData.withUnsafeBytes { $0.load(as: Float32.self) } | |
} | |
embeddings[index] = embedding | |
} | |
} | |
public func find(input: Embedding, threshold: Distance) -> [UUID] { | |
// The `find` function computes the similarity score between a given vector and a list of embedding vectors. | |
// It runs tasks concurrently, which can result in a significant speedup. The number of | |
// concurrent tasks is set to be three times the number of available CPU cores. | |
// | |
// Each task is responsible for a specific chunk of the embeddings list. An embedding vector is considered a match if | |
// its similarity score with the input vector is greater than a given threshold. | |
let numOfCores = ProcessInfo.processInfo.activeProcessorCount | |
let numOfTasks = numOfCores * 3 | |
let chunkSize = (embeddings.count + numOfTasks - 1) / numOfTasks | |
var results: [Int] = [] | |
let resultQueue = DispatchQueue(label: "photos.keepers.vectorSearch.find", attributes: .concurrent) | |
DispatchQueue.concurrentPerform(iterations: numOfTasks) { taskIndex in | |
let startIndex = taskIndex * chunkSize | |
let endIndex = min(startIndex + chunkSize, embeddings.count) | |
for i in startIndex..<endIndex { | |
let distance = similarityScore(Array(input), embeddings[i]) | |
if distance > threshold { | |
resultQueue.sync(flags: .barrier) { | |
results.append(i) | |
} | |
} | |
} | |
} | |
resultQueue.sync { } | |
return results.map { uuids[$0] } | |
} | |
private func similarityScore(_ vector1: Embedding, _ vector2: Embedding) -> Distance { | |
/* vDSP uses CPU's SIMD to perform the calculations. We could also use Metal Shader | |
and Matrix operation to avoid using a for loop to call `similarityScore` for each. | |
However, given the number of embeddings and the complexities of the Metal Shader, | |
let's stick with vDSP for now. | |
*/ | |
var dotProduct: Float = 0.0 | |
var vector1Norm: Float = 0.0 | |
var vector2Norm: Float = 0.0 | |
let count = vDSP_Length(vector1.count) | |
vDSP_dotpr(vector1, 1, vector2, 1, &dotProduct, count) | |
vDSP_svesq(vector1, 1, &vector1Norm, count) | |
vDSP_svesq(vector2, 1, &vector2Norm, count) | |
return dotProduct / (sqrt(vector1Norm) * sqrt(vector2Norm)) | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment