Skip to content

Instantly share code, notes, and snippets.

@DevWael
Created June 11, 2025 09:15
Show Gist options
  • Save DevWael/ca178a99d4c1370f43a891c039d77f09 to your computer and use it in GitHub Desktop.
Save DevWael/ca178a99d4c1370f43a891c039d77f09 to your computer and use it in GitHub Desktop.
File-based vector storage using sharding with BINARY storage for vectors.
<?php
/**
 * Class SimpleShardedVectorStoreBinary
 *
 * File-based vector storage using sharding with BINARY storage for vectors.
 * Includes:
 * - Binary format for shards (potentially faster I/O & parsing, smaller files).
 * - Optional vector normalization for faster search.
 * - Pre-calculated magnitudes stored.
 * - Transactional writes for shards (temp file + rename).
 * - Metadata filtering during search.
 * - Batch adding of vectors.
 *
 * Shard record layout (records are simply concatenated):
 *   uint16 (big-endian)  ID length in bytes
 *   bytes                ID
 *   uint16 (big-endian)  vector dimension
 *   float[dimension]     vector components (VECTOR_PACK_FORMAT, machine byte order)
 *   float                magnitude (MAGNITUDE_PACK_FORMAT, machine byte order)
 *   uint8                metadata type (0 = none, 1 = JSON string)
 *   [uint32 (big-endian) JSON length, bytes JSON]   only when metadata type is 1
 *
 * LIMITATIONS:
 * - Search is still a LINEAR SCAN across all vectors. No true ANN indexing.
 * - Every write is a read-modify-write of a whole shard without locking, so
 *   concurrent writers to the same shard can lose updates.
 * - The shard of an ID depends on the shard count; reopening a store with a
 *   different $numShards makes existing IDs unreachable by ID lookup.
 * - IDs and vector dimensions are limited to 65535 (uint16 length fields).
 * - Concurrency, error recovery, advanced DB features are not comparable to dedicated vector DBs.
 */
class SimpleShardedVectorStoreBinary {
    private string $storageDir;
    private int $numShards;
    private bool $normalizeVectors;

    private const SHARD_PREFIX  = 'vector_shard_bin_'; // Indicate binary
    private const SHARD_SUFFIX  = '.dat'; // Data file
    private const TEMP_SUFFIX   = '.tmp';
    private const FLOAT_EPSILON = 1e-9;

    // Binary format constants (adjust if needed)
    // 'f' = single-precision float (4 bytes). 'd' = double-precision float (8 bytes).
    // Most embeddings are float32, so 'f' is usually appropriate.
    private const VECTOR_PACK_FORMAT    = 'f'; // For individual vector elements
    private const MAGNITUDE_PACK_FORMAT = 'f'; // For storing the magnitude

    // Metadata types for binary storage
    private const METADATA_TYPE_NULL        = 0;
    private const METADATA_TYPE_JSON_STRING = 1; // Store metadata as a JSON string

    // Largest value representable by the uint16 length fields (ID length, dimension).
    private const MAX_UINT16 = 0xFFFF;

    /**
     * @param string $storageDir       Directory holding the shard files; created if missing.
     * @param int    $numShards        Number of shard files IDs are distributed over (must be > 0).
     * @param bool   $normalizeVectors Whether vectors are scaled to unit length before being stored.
     *
     * @throws \InvalidArgumentException If $numShards is not positive.
     * @throws \RuntimeException         If the directory cannot be created or is not writable.
     */
    public function __construct( string $storageDir, int $numShards = 10, bool $normalizeVectors = true ) {
        if ( $numShards <= 0 ) {
            throw new \InvalidArgumentException( "Number of shards must be positive." );
        }
        $this->storageDir       = rtrim( $storageDir, '/\\' );
        $this->numShards        = $numShards;
        $this->normalizeVectors = $normalizeVectors;

        // The trailing is_dir() tolerates another process creating the directory
        // between our first check and the mkdir() call.
        if ( ! is_dir( $this->storageDir )
            && ! mkdir( $this->storageDir, 0777, true )
            && ! is_dir( $this->storageDir )
        ) {
            throw new \RuntimeException( "Failed to create storage directory: {$this->storageDir}" );
        }
        if ( ! is_writable( $this->storageDir ) ) {
            throw new \RuntimeException( "Storage directory is not writable: {$this->storageDir}" );
        }
    }

    /**
     * Maps an ID to its shard number via CRC32 modulo the shard count.
     */
    private function getShardIndex( string $id ): int {
        return abs( crc32( $id ) ) % $this->numShards;
    }

    /**
     * Builds the file path of a shard.
     */
    private function getShardPath( int $shardIndex ): string {
        return $this->storageDir . '/' . self::SHARD_PREFIX . $shardIndex . self::SHARD_SUFFIX;
    }

    /**
     * Byte width of one value packed with the given float format code.
     */
    private static function packedFloatSize( string $format ): int {
        return ( $format === 'f' ) ? 4 : 8;
    }

    /**
     * Validates a vector and converts it to a zero-indexed list of floats.
     *
     * @param array $vector Candidate vector (any keys; values must be numeric).
     *
     * @return array|null List of finite floats, or null when the vector is empty,
     *                    too long for the uint16 dimension field, or contains a
     *                    non-numeric / non-finite value.
     */
    private function toFloatList( array $vector ): ?array {
        $dimension = count( $vector );
        if ( $dimension === 0 || $dimension > self::MAX_UINT16 ) {
            return null;
        }
        $floats = array();
        foreach ( $vector as $value ) {
            if ( ! is_numeric( $value ) ) {
                return null;
            }
            $float = (float) $value;
            if ( ! is_finite( $float ) ) {
                return null;
            }
            $floats[] = $float;
        }
        return $floats;
    }

    /**
     * Validates the inputs of an add operation and builds the in-memory item.
     *
     * @param string $id
     * @param array  $vector
     * @param mixed  $metadata Anything json_encode() accepts, or null.
     *
     * @return array|null The item (id, vector, dimension, magnitude, metadata),
     *                    or null when the ID or vector cannot be stored.
     */
    private function buildItem( string $id, array $vector, $metadata ): ?array {
        // The ID length is stored as uint16; longer IDs would corrupt the shard.
        if ( strlen( $id ) > self::MAX_UINT16 ) {
            return null;
        }
        $floats = $this->toFloatList( $vector );
        if ( $floats === null ) {
            return null;
        }
        $magnitude = $this->magnitude( $floats );
        if ( ! is_finite( $magnitude ) ) {
            return null; // Sum of squares overflowed.
        }

        $vectorToStore    = $floats;
        $magnitudeToStore = $magnitude;
        if ( $this->normalizeVectors ) {
            if ( $magnitude > self::FLOAT_EPSILON ) {
                $vectorToStore    = $this->normalizeVector( $floats, $magnitude );
                $magnitudeToStore = 1.0;
            } else {
                // (Near-)zero vectors cannot be normalized; magnitude 0 marks them
                // so that searches skip them.
                $magnitudeToStore = 0.0;
            }
        }

        return array(
            'id'        => $id,
            'vector'    => $vectorToStore,
            'dimension' => count( $floats ),
            'magnitude' => $magnitudeToStore,
            'metadata'  => $metadata,
        );
    }

    /**
     * Parses one record starting at $offset.
     *
     * @param string $data       Entire shard content.
     * @param int    $length     strlen( $data ).
     * @param int    $offset     Start of the record; advanced past the record only on success.
     * @param int    $shardIndex Used in warning messages only.
     *
     * @return array|null The decoded item, or null when the record is truncated.
     */
    private function readRecord( string $data, int $length, int &$offset, int $shardIndex ): ?array {
        $pos           = $offset;
        $vectorSize    = self::packedFloatSize( self::VECTOR_PACK_FORMAT );
        $magnitudeSize = self::packedFloatSize( self::MAGNITUDE_PACK_FORMAT );

        // 1. ID length (uint16) followed by the ID bytes.
        if ( $pos + 2 > $length ) {
            return null;
        }
        $idLength = unpack( 'n', $data, $pos )[1];
        $pos     += 2;
        if ( $pos + $idLength > $length ) {
            return null;
        }
        $id   = (string) substr( $data, $pos, $idLength );
        $pos += $idLength;

        // 2. Vector dimension (uint16) followed by the packed floats.
        if ( $pos + 2 > $length ) {
            return null;
        }
        $dimension = unpack( 'n', $data, $pos )[1];
        $pos      += 2;
        $vectorByteLength = $dimension * $vectorSize;
        if ( $pos + $vectorByteLength > $length ) {
            return null;
        }
        $vector = array();
        if ( $dimension > 0 ) {
            // unpack() returns 1-based keys; array_values re-indexes from 0.
            $vector = array_values( unpack( self::VECTOR_PACK_FORMAT . $dimension, $data, $pos ) );
        }
        $pos += $vectorByteLength;

        // 3. Magnitude (its own pack format, which may differ from the vector's).
        if ( $pos + $magnitudeSize > $length ) {
            return null;
        }
        $magnitude = unpack( self::MAGNITUDE_PACK_FORMAT, $data, $pos )[1];
        $pos      += $magnitudeSize;

        // 4. Metadata type (byte) and, for JSON metadata, uint32 length + payload.
        if ( $pos + 1 > $length ) {
            return null;
        }
        $metadataType = unpack( 'C', $data, $pos )[1];
        $pos         += 1;
        $metadata     = null;
        if ( $metadataType === self::METADATA_TYPE_JSON_STRING ) {
            if ( $pos + 4 > $length ) {
                return null;
            }
            $metadataLength = unpack( 'N', $data, $pos )[1];
            $pos           += 4;
            if ( $pos + $metadataLength > $length ) {
                return null;
            }
            $metadataJson = (string) substr( $data, $pos, $metadataLength );
            $pos         += $metadataLength;
            $metadata     = json_decode( $metadataJson, true );
            if ( json_last_error() !== JSON_ERROR_NONE ) {
                trigger_error( "Failed to decode metadata JSON for ID '{$id}' in shard {$shardIndex}: " . json_last_error_msg(), E_USER_WARNING );
                $metadata = array( '_parsing_error' => json_last_error_msg() ); // Store error instead of failing all
            }
        }

        $offset = $pos;
        return array(
            'id'        => $id,
            'vector'    => $vector,
            'dimension' => $dimension, // Store dimension for consistency checks
            'magnitude' => $magnitude,
            'metadata'  => $metadata,
        );
    }

    /**
     * Loads data from a binary shard file.
     *
     * A truncated trailing record is dropped and reported with an E_USER_WARNING;
     * all complete records before it are still returned.
     *
     * @param int $shardIndex
     *
     * @return array The data from the shard (empty when the file is missing or empty).
     * @throws \RuntimeException If the file exists but cannot be read.
     */
    private function loadShard( int $shardIndex ): array {
        $filePath = $this->getShardPath( $shardIndex );
        if ( ! file_exists( $filePath ) ) {
            return array();
        }
        $binaryContent = file_get_contents( $filePath );
        if ( $binaryContent === false ) {
            throw new \RuntimeException( "Failed to read shard file: {$filePath}" );
        }

        $items         = array();
        $offset        = 0;
        $contentLength = strlen( $binaryContent );
        while ( $offset < $contentLength ) {
            $item = $this->readRecord( $binaryContent, $contentLength, $offset, $shardIndex );
            if ( $item === null ) {
                break; // Truncated record; $offset still points at its start.
            }
            $items[] = $item;
        }
        if ( $offset !== $contentLength ) {
            trigger_error( "Shard file {$filePath} may be corrupt or incompletely read. Read {$offset} of {$contentLength} bytes.", E_USER_WARNING );
        }
        return $items;
    }

    /**
     * Serializes one item into its binary record.
     *
     * @param array $item Item as produced by buildItem() / loadShard().
     *
     * @return string
     * @throws \RuntimeException If the ID or vector does not fit the uint16 length fields.
     */
    private function encodeItem( array $item ): string {
        $id = (string) ( $item['id'] ?? '' );
        // Re-index and cast so that pack() never sees string keys (which the
        // spread operator would reject or treat as named arguments).
        $vector    = array_map( static fn( $value ): float => (float) $value, array_values( $item['vector'] ?? array() ) );
        $idLength  = strlen( $id );
        $dimension = count( $vector );
        if ( $idLength > self::MAX_UINT16 || $dimension > self::MAX_UINT16 ) {
            throw new \RuntimeException( "Item '" . substr( $id, 0, 64 ) . "' exceeds binary format limits (ID bytes: {$idLength}, dimension: {$dimension}, max: " . self::MAX_UINT16 . ")." );
        }

        $record = pack( 'n', $idLength ) . $id . pack( 'n', $dimension );
        if ( $dimension > 0 ) {
            $record .= pack( self::VECTOR_PACK_FORMAT . '*', ...$vector );
        }
        $record .= pack( self::MAGNITUDE_PACK_FORMAT, (float) ( $item['magnitude'] ?? 0.0 ) );

        $metadata = $item['metadata'] ?? null;
        if ( $metadata === null ) {
            return $record . pack( 'C', self::METADATA_TYPE_NULL );
        }

        $metadataJson = json_encode( $metadata );
        if ( $metadataJson === false ) {
            // Capture the message before encoding again, which resets the error state.
            $errorMessage = json_last_error_msg();
            trigger_error( "Failed to JSON encode metadata for ID '{$id}': " . $errorMessage, E_USER_WARNING );
            $metadataJson = (string) json_encode( array( '_encoding_error' => $errorMessage ) );
        }
        return $record
            . pack( 'C', self::METADATA_TYPE_JSON_STRING )
            . pack( 'N', strlen( $metadataJson ) )
            . $metadataJson;
    }

    /**
     * Saves data to a binary shard file using transactional writes.
     *
     * The shard is written to a uniquely named temp file and then renamed over
     * the real shard, so readers never observe a half-written file.
     *
     * @param int   $shardIndex
     * @param array $shardData
     *
     * @throws \RuntimeException If encoding or saving fails.
     */
    private function saveShard( int $shardIndex, array $shardData ): void {
        $shardPath = $this->getShardPath( $shardIndex );
        $tempPath  = $this->storageDir . '/' . self::SHARD_PREFIX . $shardIndex . uniqid( '_temp_', true ) . self::TEMP_SUFFIX;

        $binaryOutput = '';
        foreach ( $shardData as $item ) {
            $binaryOutput .= $this->encodeItem( $item );
        }

        if ( file_put_contents( $tempPath, $binaryOutput ) === false ) {
            if ( file_exists( $tempPath ) ) {
                @unlink( $tempPath );
            }
            throw new \RuntimeException( "Failed to write to temporary shard file: {$tempPath}" );
        }
        if ( ! rename( $tempPath, $shardPath ) ) {
            if ( file_exists( $tempPath ) ) {
                @unlink( $tempPath );
            }
            throw new \RuntimeException( "Failed to rename temporary shard file '{$tempPath}' to '{$shardPath}'." );
        }
    }

    /**
     * Adds a single vector.
     *
     * @param string $id       Unique ID (at most 65535 bytes).
     * @param array  $vector   Non-empty list of finite numeric values (at most 65535 entries).
     * @param mixed  $metadata Optional JSON-encodable metadata.
     *
     * @return bool True when stored; false when the input is invalid or the ID already exists.
     * @throws \RuntimeException If the shard cannot be read or written.
     */
    public function addVector( string $id, array $vector, $metadata = null ): bool {
        $newItem = $this->buildItem( $id, $vector, $metadata );
        if ( $newItem === null ) {
            return false;
        }

        $shardIndex = $this->getShardIndex( $id );
        $shardData  = $this->loadShard( $shardIndex );
        foreach ( $shardData as $item ) {
            if ( $item['id'] === $id ) {
                return false; // ID already exists
            }
        }

        $shardData[] = $newItem;
        $this->saveShard( $shardIndex, $shardData );
        return true;
    }

    /**
     * Adds multiple vectors in a batch. More efficient than single adds for large
     * amounts because each affected shard is loaded and written only once.
     *
     * @param array $vectorsData Array of ['id' => ..., 'vector' => ..., 'metadata' => ...]
     *
     * @return array ['succeeded' => count, 'failed_ids' => [...], 'duplicate_ids' => [...]]
     * @throws \RuntimeException If an existing shard cannot be read.
     */
    public function addVectorsBatch( array $vectorsData ): array {
        $results        = array( 'succeeded' => 0, 'failed_ids' => array(), 'duplicate_ids' => array() );
        $pendingByShard = array();

        // Pass 1: validate every entry and group the resulting items by shard.
        foreach ( $vectorsData as $vData ) {
            if ( ! isset( $vData['id'], $vData['vector'] ) || empty( $vData['vector'] ) || ! is_array( $vData['vector'] ) ) {
                $results['failed_ids'][] = $vData['id'] ?? 'unknown_id_missing_vector';
                continue;
            }
            $rawId = $vData['id'];
            // Only values with a string representation can serve as an ID; anything
            // else is reported as a failure instead of raising a TypeError.
            $isStringable = is_scalar( $rawId ) || ( is_object( $rawId ) && method_exists( $rawId, '__toString' ) );
            if ( ! $isStringable ) {
                $results['failed_ids'][] = 'unknown_id_invalid_type';
                continue;
            }
            $item = $this->buildItem( (string) $rawId, $vData['vector'], $vData['metadata'] ?? null );
            if ( $item === null ) {
                $results['failed_ids'][] = $rawId;
                continue;
            }
            $pendingByShard[ $this->getShardIndex( $item['id'] ) ][] = array( 'raw_id' => $rawId, 'item' => $item );
        }

        // Pass 2: merge into each shard, skipping IDs that already exist.
        foreach ( $pendingByShard as $shardIndex => $pendingEntries ) {
            $currentShardData = $this->loadShard( $shardIndex );

            // Hash set of known IDs: O(1) lookups and exact key matching, so
            // numeric-looking IDs such as '1000' and '1e3' are not conflated.
            $knownIds = array();
            foreach ( $currentShardData as $existingItem ) {
                $knownIds[ $existingItem['id'] ] = true;
            }

            $entriesToAdd = array();
            foreach ( $pendingEntries as $entry ) {
                $id = $entry['item']['id'];
                if ( isset( $knownIds[ $id ] ) ) {
                    $results['duplicate_ids'][] = $entry['raw_id'];
                    continue;
                }
                $knownIds[ $id ] = true; // Prevent adding same ID twice in one batch
                $entriesToAdd[]  = $entry;
            }
            if ( empty( $entriesToAdd ) ) {
                continue;
            }

            $newShardData = array_merge( $currentShardData, array_column( $entriesToAdd, 'item' ) );
            try {
                $this->saveShard( $shardIndex, $newShardData );
                $results['succeeded'] += count( $entriesToAdd );
            } catch ( \Exception $e ) {
                trigger_error( "Batch add failed for shard {$shardIndex}: " . $e->getMessage(), E_USER_WARNING );
                foreach ( $entriesToAdd as $failedEntry ) {
                    $results['failed_ids'][] = $failedEntry['raw_id'];
                }
            }
        }
        return $results;
    }

    /**
     * Removes the vector with the given ID.
     *
     * @return bool True when an item was removed, false when the ID was not found.
     * @throws \RuntimeException If the shard cannot be read or written.
     */
    public function removeVector( string $id ): bool {
        $shardIndex = $this->getShardIndex( $id );
        $shardData  = $this->loadShard( $shardIndex );
        $remaining  = array_values( array_filter( $shardData, static fn( $item ) => $item['id'] !== $id ) );
        if ( count( $remaining ) === count( $shardData ) ) {
            return false;
        }
        $this->saveShard( $shardIndex, $remaining );
        return true;
    }

    /**
     * Fetches one stored item by ID.
     *
     * @return array|null The item (id, vector, dimension, magnitude, metadata) or null when absent.
     * @throws \RuntimeException If the shard cannot be read.
     */
    public function getVectorById( string $id ): ?array {
        foreach ( $this->loadShard( $this->getShardIndex( $id ) ) as $item ) {
            if ( $item['id'] === $id ) {
                return $item;
            }
        }
        return null;
    }

    /**
     * Turns the $metadataFilter argument of findSimilar() into a predicate.
     *
     * @param mixed $metadataFilter Callable, associative array of required key/value pairs, or null.
     *
     * @return callable|null Predicate receiving an item's metadata, or null for "no filtering".
     */
    private function buildMetadataFilter( $metadataFilter ): ?callable {
        if ( is_callable( $metadataFilter ) ) {
            return $metadataFilter;
        }
        if ( ! is_array( $metadataFilter ) || empty( $metadataFilter ) ) {
            return null;
        }
        return static function ( $metadata ) use ( $metadataFilter ): bool {
            if ( ! is_array( $metadata ) ) {
                return false;
            }
            foreach ( $metadataFilter as $key => $value ) {
                // array_key_exists (not isset) so that a required value of null can match.
                if ( ! array_key_exists( $key, $metadata ) || $metadata[ $key ] !== $value ) {
                    return false;
                }
            }
            return true;
        };
    }

    /**
     * Finds the stored vectors most similar (cosine similarity) to the query.
     *
     * Performs a linear scan over every shard. Items whose dimension differs
     * from the query are skipped with an E_USER_WARNING; zero-magnitude items
     * are skipped silently.
     *
     * @param array $queryVector    Non-empty list of finite numeric values.
     * @param int   $topK           Maximum number of results; values <= 0 yield no results.
     * @param mixed $metadataFilter Callable receiving the item's metadata and returning bool,
     *                              or an array of key => value pairs that must all match
     *                              strictly, or null for no filtering.
     *
     * @return array List of ['id' => ..., 'metadata' => ..., 'score' => float in [-1, 1]],
     *               sorted by descending score.
     * @throws \RuntimeException If a shard cannot be read.
     */
    public function findSimilar( array $queryVector, int $topK = 5, $metadataFilter = null ): array {
        if ( $topK <= 0 ) {
            return array();
        }
        $query = $this->toFloatList( $queryVector );
        if ( $query === null ) {
            return array();
        }
        $queryMagnitude = $this->magnitude( $query );
        if ( ! is_finite( $queryMagnitude ) || $queryMagnitude < self::FLOAT_EPSILON ) {
            return array();
        }
        $normalizedQueryVector = $this->normalizeVector( $query, $queryMagnitude );
        $queryDimension        = count( $query );
        $filterFn              = $this->buildMetadataFilter( $metadataFilter );

        $allResults = array();
        for ( $i = 0; $i < $this->numShards; $i ++ ) {
            foreach ( $this->loadShard( $i ) as $item ) {
                if ( $filterFn !== null && ! $filterFn( $item['metadata'] ?? null ) ) {
                    continue;
                }
                if ( $item['dimension'] !== $queryDimension ) {
                    trigger_error( "Dimension mismatch ID '{$item['id']}' (stored {$item['dimension']}, query {$queryDimension}). Skipping.", E_USER_WARNING );
                    continue;
                }
                $storedMagnitude = (float) $item['magnitude'];
                if ( $storedMagnitude < self::FLOAT_EPSILON ) {
                    continue;
                }
                // The query is unit length, so dividing by the stored magnitude gives
                // the cosine; for normalized items the magnitude is exactly 1.0.
                $similarity = $this->dotProduct( $normalizedQueryVector, $item['vector'] ) / $storedMagnitude;
                if ( is_nan( $similarity ) ) {
                    continue; // Corrupt components would otherwise poison the sort.
                }
                $allResults[] = array(
                    'id'       => $item['id'],
                    'metadata' => $item['metadata'] ?? null,
                    'score'    => max( - 1.0, min( 1.0, $similarity ) ),
                );
            }
        }
        usort( $allResults, static fn( $a, $b ) => $b['score'] <=> $a['score'] );
        return array_slice( $allResults, 0, $topK );
    }

    // --- Helper Functions (normalizeVector, dotProduct, magnitude) ---

    /**
     * Scales a vector to unit length.
     *
     * @param array      $vec
     * @param float|null $magnitude Pre-computed magnitude of $vec, or null to compute it here.
     *
     * @return array Zero-indexed list of floats; all zeros when the magnitude is (near) zero.
     */
    private function normalizeVector( array $vec, ?float $magnitude = null ): array {
        if ( $magnitude === null ) {
            $magnitude = $this->magnitude( $vec );
        }
        if ( $magnitude < self::FLOAT_EPSILON ) {
            return array_fill( 0, count( $vec ), 0.0 );
        }
        $normalized = array();
        foreach ( $vec as $value ) {
            $normalized[] = (float) $value / $magnitude;
        }
        return $normalized;
    }

    /**
     * Dot product over the indices 0..count($vec1)-1.
     *
     * @return float NAN when an index is missing in either vector or a value is not numeric.
     */
    private function dotProduct( array $vec1, array $vec2 ): float {
        $result = 0.0;
        $count  = count( $vec1 );
        for ( $i = 0; $i < $count; $i ++ ) {
            if ( ! isset( $vec1[ $i ], $vec2[ $i ] ) || ! is_numeric( $vec1[ $i ] ) || ! is_numeric( $vec2[ $i ] ) ) {
                return NAN;
            }
            $result += (float) $vec1[ $i ] * (float) $vec2[ $i ];
        }
        return $result;
    }

    /**
     * Euclidean length of a vector.
     *
     * @return float NAN when any value is not numeric.
     */
    private function magnitude( array $vec ): float {
        $sumOfSquares = 0.0;
        foreach ( $vec as $value ) {
            if ( ! is_numeric( $value ) ) {
                return NAN;
            }
            $sumOfSquares += (float) $value * (float) $value;
        }
        return sqrt( max( 0.0, $sumOfSquares ) );
    }

    // --- Management Functions (getAllVectors, clearAll) ---

    /**
     * Loads every item from every shard. Use with caution on large stores.
     *
     * @return array List of items (id, vector, dimension, magnitude, metadata).
     * @throws \RuntimeException If a shard cannot be read.
     */
    public function getAllVectors(): array {
        $allData = array();
        for ( $i = 0; $i < $this->numShards; $i ++ ) {
            foreach ( $this->loadShard( $i ) as $item ) {
                $allData[] = $item;
            }
        }
        return $allData;
    }

    /**
     * Deletes all shard files of this store plus any leftover temp files.
     *
     * Only shards 0..numShards-1 are touched; files written under a larger
     * shard count are left in place.
     *
     * @return bool False when at least one shard file could not be deleted.
     */
    public function clearAll(): bool {
        $success = true;
        for ( $i = 0; $i < $this->numShards; $i ++ ) {
            $filePath = $this->getShardPath( $i );
            if ( file_exists( $filePath ) && ! unlink( $filePath ) ) {
                trigger_error( "Failed to delete shard file: {$filePath}", E_USER_WARNING );
                $success = false;
            }
            $tempPattern = $this->storageDir . '/' . self::SHARD_PREFIX . $i . '_temp_*' . self::TEMP_SUFFIX;
            // glob() returns false on error (and on some systems for "no match").
            foreach ( ( glob( $tempPattern ) ?: array() ) as $tempFile ) {
                @unlink( $tempFile ); // Best-effort cleanup of abandoned temp files.
            }
        }
        if ( ! is_dir( $this->storageDir ) ) {
            @mkdir( $this->storageDir, 0777, true );
        }
        return $success;
    }
}
// --- Example Usage ---
// Demonstrates single adds, batch adds (including duplicate detection),
// similarity search and lookup by ID.
// NOTE: this demo runs whenever the file is executed or included, and it
// deletes any existing shards in the demo directory below.
$storageDirectory = './vector_store_binary_shards';
$numberOfShards   = 3;
$normalizeOnAdd   = true;
$vectorStore      = new SimpleShardedVectorStoreBinary( $storageDirectory, $numberOfShards, $normalizeOnAdd );
echo "Using Binary Store. Normalization on Add: " . ( $normalizeOnAdd ? 'Enabled' : 'Disabled' ) . "\n";

// Start from an empty store so the results below are reproducible.
$vectorStore->clearAll();
echo "Store cleared.\n";

$vectorsToAdd = array(
    array( 'id' => 'cat1_bin', 'vector' => array( 0.1, 0.8, 0.1, 0.5 ), 'metadata' => array( 'topic' => 'animal', 'source' => 'A' ) ),
    array( 'id' => 'dog1_bin', 'vector' => array( 0.9, 0.1, 0.1, 0.2 ), 'metadata' => array( 'topic' => 'animal', 'source' => 'B' ) ),
    array( 'id' => 'sky1_bin', 'vector' => array( 0.1, 0.1, 0.9, 0.8 ), 'metadata' => array( 'topic' => 'weather', 'source' => 'C' ) ),
);
$vectorsToAddMore = array( // For batch add
    array( 'id' => 'cat2_bin', 'vector' => array( 0.2, 0.7, 0.0, 0.6 ), 'metadata' => array( 'topic' => 'animal', 'source' => 'A' ) ),
    // new source
    array( 'id' => 'dog2_bin', 'vector' => array( 0.8, 0.2, 0.0, 0.3 ), 'metadata' => array( 'topic' => 'animal', 'source' => 'D' ) ),
    array( 'id' => 'sky2_bin', 'vector' => array( 0.0, 0.2, 0.8, 0.7 ), 'metadata' => array( 'topic' => 'weather', 'source' => 'A' ) ),
    // Re-uses the ID 'cat1_bin' added individually above, so the batch reports it under 'duplicate_ids'.
    array( 'id' => 'cat1_bin', 'vector' => array( 0.1, 0.1, 0.1, 0.1 ), 'metadata' => array( 'topic' => 'duplicate_test' ) ),
);

echo "\nAdding vectors individually...\n";
foreach ( $vectorsToAdd as $v ) {
    if ( $vectorStore->addVector( $v['id'], $v['vector'], $v['metadata'] ) ) {
        echo " - Added ID: {$v['id']}\n";
    } else {
        echo " - Failed or duplicate ID: {$v['id']}\n";
    }
}

echo "\nAdding vectors in batch...\n";
// Expected: three successes and 'cat1_bin' listed as a duplicate.
$batchResult = $vectorStore->addVectorsBatch( $vectorsToAddMore );
echo "Batch Add Results:\n";
print_r( $batchResult );

// Test search
$queryVector = array( 0.15, 0.75, 0.05, 0.55 ); // 4D query
echo "\nSearching for vectors similar to: [" . implode( ', ', $queryVector ) . "]\n";
$similar = $vectorStore->findSimilar( $queryVector, 3 );
print_r( $similar );

// Test get by ID
$item = $vectorStore->getVectorById( 'dog1_bin' );
echo "\nRetrieved 'dog1_bin':\n";
print_r( $item );

// Print only the count; dumping every vector can be very verbose on large stores.
echo "\nTotal vectors stored: " . count( $vectorStore->getAllVectors() ) . "\n";
echo "\nDone.\n";
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment