Created
May 3, 2026 13:49
-
-
Save giannitedesco/c1f66bfc69e17d0e9ce535ef4e6c1fb2 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #pragma once | |
| #include <stdint.h> | |
| #define IDX_KEY_WIDTH 32U | |
| #define IDX_FANOUT_BITS 10U | |
| #define IDX_FANOUT (1U << IDX_FANOUT_BITS) | |
| #define IDX_FANOUT_MASK (IDX_FANOUT - 1) | |
| /* Maximum depth of the tree */ | |
| #define IDX_LEVELS ((IDX_KEY_WIDTH + IDX_FANOUT_BITS - 1) \ | |
| / IDX_FANOUT_BITS) | |
| /* File format: | |
| * | |
| * The start of the file is structured as a sequence of fixed-size pages. Pages | |
| * are `uint32_t * IDX_FANOUT` bytes in size. | |
| * | |
| * header: Page zero is a header. | |
| * - We store lowest and highest keys so we can quickly exclude point queries | |
| * which lie outside of that range without any further I/O. | |
| * - We store the first leaf so that we can jump ahead to scan all values | |
| * without recursing the whole tree. | |
| * - We store the last leaf so that during lookup we know to avoid searching in | |
| * any of the empty space beyond the last key. This means that we don't need | |
| * to include any per-node header. | |
| * - We store a magic number so that we can identify the file | |
| * - There is a lot of unused space that we can use to store checksums, or | |
| * bloom filters, or whatever else we may need. | |
| * | |
| * internal nodes: Internal nodes are just arrays of splitting keys. The | |
| * page-number of the child-node is implicit as nodes in the tree are stored in | |
| * breadth-first search (BFS) order. | |
| * | |
| * leaf nodes: Due to BFS order, leaf nodes are contiguous and due to b-tree | |
| * structure the keys are in order. Therefore the leaf-node layer is a | |
| * contiguous, page-aligned array of all keys which are in the index. | |
| * | |
| * From here on out, there is no more page-structure. | |
| * | |
| * The format of values is user-defined, and they are completely optional. | |
| * Without values, the index is capable only of set-membership checks. | |
| * | |
| * scalar values : if values are all fixed sized then we don't need any | |
| * pointers to them because we can use that known size, combined with implicit | |
| * layout, to locate the value. The size of these scalar values (which might be | |
| * zero) is stored in the header. | |
| * | |
| * If values are variably sized then the scalar part can include (or entirely | |
| * consist of) an offset into the file. | |
| * | |
| * -- page structured -- | |
| * [ header ] | |
| * [ internal nodes ] | |
| * [ leaf nodes ] | |
| * -- free-form / byte packed -- | |
| * [ OPTIONAL: scalar values ] | |
| * [ OPTIONAL: space for variably sized values ] | |
| */ | |
| struct idx_node { | |
| uint32_t key[IDX_FANOUT]; | |
| }; | |
| /* Header invariants (for recognizing a valid file): | |
| * 1. magic == IDX_MAGIC | |
| * 2. hi_key >= lo_key | |
| * 3. (hi_key - lo_key) >= nr_keys | |
| * | |
| * The ident field is user-defined. The ident and magic together identify the | |
| * type of file, its structure and interpretation. | |
| */ | |
| struct idx_hdr { | |
| uint64_t nr_keys; | |
| uint16_t ident; | |
| #define IDX_MAGIC 0x2b62 | |
| uint16_t magic; | |
| uint32_t lo_key; | |
| uint32_t hi_key; | |
| uint16_t val_size; | |
| } __attribute__((packed)); | |
| static_assert(sizeof(struct idx_hdr) <= sizeof(struct idx_node), | |
| "Header must fit inside one page"); | |
| #define IDX_SLACK_SPACE (sizeof(struct idx_node) \ | |
| - sizeof(struct idx_hdr)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment