Skip to content

Instantly share code, notes, and snippets.

@Tetralux
Last active December 10, 2024 17:16
Show Gist options
  • Save Tetralux/60a6f8079a55a5e76ba53e3122510d0d to your computer and use it in GitHub Desktop.
Save Tetralux/60a6f8079a55a5e76ba53e3122510d0d to your computer and use it in GitHub Desktop.
A way to serialise strings along with a blob
/*
A short demonstration of how atomized strings (strings turned into single ints to make comparison faster and storage simpler) can be serialised into a stream (or file),
alongside the blobs that refer to them, without doing anything more than just memcpy'ing everything into the stream.
It achieves this using the following techniques:
1) Atoms are stored in the Person structs only. They are i32s.
2) Atomizer turns strings into i32s by copying their length (u32le) into a buffer, followed by the string data. The current offset into the buffer is the atom for that string.
3) Both the slice of Persons and the atomizer's buffer can be written directly into a stream without alteration. See the comment in main() that shows the resulting output.
Written by Tetralux 2024-10-12.
*/
package main
import "core:fmt"
import "core:bytes"
import "core:mem"
import "core:io"
import "core:encoding/endian"
import "core:slice"
// Person is the record that gets serialised. It holds no string data itself:
// the name is an Atom that refers to an entry in an Atomizer's buffer.
// write_persons_to_stream copies the in-memory bytes of these structs straight
// into the stream; the example output in main() shows 8 bytes per Person
// (4-byte atom, 1-byte race, 3 bytes of padding).
Person :: struct {
// Atom identifying this person's name inside an Atomizer.
name: Atom,
// The person's race, stored as a single byte (see Race).
race: Race,
}
// Race is a one-byte enumeration with explicit values, so the numeric value
// that ends up in the serialised stream is fixed by this declaration.
Race :: enum u8 {
UNKNOWN = 0,
HUMAN = 1,
ORC = 2,
DRAGON = 3,
CAT = 4,
}
// An Atom is the byte offset into Atomizer.data at which a string's
// length-prefixed entry begins (see atomize_safe).
Atom :: i32
// Atomizer interns strings into one contiguous byte buffer and hands out Atoms.
Atomizer :: struct {
// Concatenated entries; each entry is a u32le byte length followed by the string bytes.
data: [dynamic]byte,
// Lookup from a string to the Atom already assigned to it, used to avoid storing duplicates.
names: map[string]Atom,
}
// atomize_safe returns the Atom for `s`, adding `s` to the atomizer if it has
// not been seen before.
//
// A new entry is appended to `x.data` as a u32le byte length followed by the
// string bytes; the Atom is the offset at which that entry starts.
//
// Returns a non-nil `err` if growing the buffer fails. In that case the buffer
// is rolled back to its previous length so that it never ends with an orphaned
// length header (which would corrupt the data when written to a stream).
//
// NOTE: `s` itself is used as the key in `x.names`; no copy of the string is
// made here, so the memory behind `s` must stay valid for as long as the
// atomizer is used.
atomize_safe :: proc(x: ^Atomizer, s: string) -> (atom: Atom, err: mem.Allocator_Error) {
	if existing, found := x.names[s]; found {
		return existing, nil
	}

	entry_start := len(x.data)
	atom = Atom(entry_start)

	length := u32le(len(s))
	header := transmute([4]byte)length
	append(&x.data, ..header[:]) or_return

	payload := transmute([]byte)s
	if _, payload_err := append(&x.data, ..payload); payload_err != nil {
		// Undo the header we just appended; shrinking never needs to allocate.
		resize(&x.data, entry_start)
		return atom, payload_err
	}

	x.names[s] = atom
	return atom, nil
}
// atomize is the convenience form of atomize_safe: it returns only the Atom
// and asserts that no allocation error occurred.
atomize :: proc(x: ^Atomizer, s: string) -> Atom {
	result, alloc_err := atomize_safe(x, s)
	assert(alloc_err == nil)
	return result
}
// deatomize_safe looks up the string stored at `atom`.
//
// Returns `found = false` (and an empty string) when `atom` is negative, lies
// outside the buffer, does not leave room for a complete 4-byte length header,
// or when the stored length runs past the end of the buffer. It never asserts
// or indexes out of bounds.
//
// The returned string is a view into `x.data`, not a copy.
//
// NOTE: an atom that points into the middle of an entry cannot be detected
// here; only atoms previously returned by the atomizer are meaningful.
deatomize_safe :: proc(x: ^Atomizer, atom: Atom) -> (s: string, found: bool) {
	if atom < 0 || int(atom) >= len(x.data) {
		return "", false
	}

	entry := x.data[atom:]
	length, ok := endian.get_u32(entry, .Little)
	if !ok {
		// Fewer than 4 bytes remain: there is no complete length header here.
		return "", false
	}

	payload := entry[size_of(u32le):]
	if u64(length) > u64(len(payload)) {
		// The recorded length claims more bytes than the buffer holds.
		return "", false
	}

	return string(payload[:length]), true
}
// deatomize is the convenience form of deatomize_safe: it returns only the
// string and asserts that the atom was found.
deatomize :: proc(x: ^Atomizer, atom: Atom) -> string {
	text, ok := deatomize_safe(x, atom)
	assert(ok)
	return text
}
// main demonstrates a full round trip: it atomizes three names, writes the
// Person records followed by the atomizer's name buffer into an in-memory
// stream, prints the raw bytes, and then reads both back from the same stream.
main :: proc() {
	atomizer: Atomizer
	persons := []Person {
		{ name = atomize(&atomizer, "Sarah"), race = .DRAGON },
		{ name = atomize(&atomizer, "Stephanie"), race = .CAT },
		{ name = atomize(&atomizer, "Bob"), race = .HUMAN },
	}

	buffer: bytes.Buffer
	w := bytes.buffer_to_stream(&buffer)

	// Check the write results the same way the read results are checked below,
	// rather than silently ignoring a failed write.
	_, write_persons_err := write_persons_to_stream(w, persons)
	assert(write_persons_err == nil)
	_, write_names_err := write_name_data(w, &atomizer)
	assert(write_names_err == nil)

	fmt.printfln("%v", bytes.buffer_to_bytes(&buffer))
	// [3, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 9, 0, 0, 0, 4, 0, 0, 0, 22, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 83, 97, 114, 97, 104, 9, 0, 0, 0, 83, 116, 101, 112, 104, 97, 110, 105, 101, 3, 0, 0, 0, 66, 111, 98]
	// ^^^^^^^^^^^ ^^^^^^^^^^ ^ ^^^^^^^^ ^^^^^^^^^^^ ^ ^^^^^^^^ ^^^^^^^^^^^^ ^ ^^^^^^^^ ^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^ ^^^^^^^^^^^
	// len(persons) atom race | atom race | atom race | len(Sarah) "Sarah" | "Stephanie" len(Bob) "Bob"
	// padding padding padding len(Stephanie)

	// The same stream is now read from: first the persons, then the name data.
	new_persons, _, load_persons_err := read_persons_from_stream(w)
	assert(load_persons_err == nil)

	new_atomizer: Atomizer
	_, load_names_err := read_name_data(w, &new_atomizer)
	assert(load_names_err == nil)

	fmt.printfln("new_persons=%v atoms=%v", new_persons, new_atomizer.names)
	// new_persons=[Person{name = 0, race = "DRAGON"}, Person{name = 9, race = "CAT"}, Person{name = 22, race = "HUMAN"}] atoms=map[Bob=22, Sarah=0, Stephanie=9]
}
// write_persons_to_stream writes `persons` to `w` as a u32le element count
// followed by the in-memory bytes of the slice, unmodified.
//
// Returns the total number of bytes written, including the 4-byte count
// header (mirroring how read_persons_from_stream counts `bytes_read`), and the
// first error encountered. Bytes written before a failure are still counted.
//
// NOTE(review): because the structs are copied byte-for-byte, the output
// contains whatever is in their padding bytes and uses the host's layout for
// the fields; confirm this is acceptable for the intended readers.
write_persons_to_stream :: proc(w: io.Stream, persons: []Person) -> (written: int, err: io.Error) {
	count := u32le(len(persons))
	count_bytes := transmute([4]byte)count

	n: int
	n, err = io.write_full(w, count_bytes[:])
	written += n
	if err != nil {
		return
	}

	n, err = io.write_full(w, slice.to_bytes(persons))
	written += n
	return
}
// write_name_data writes the atomizer's entire name buffer to `w` exactly as
// it is held in memory, returning the byte count and error from the write.
write_name_data :: proc(w: io.Stream, x: ^Atomizer) -> (written: int, err: io.Error) {
	return io.write_full(w, x.data[:])
}
// read_persons_from_stream reads a u32le element count from `r`, followed by
// that many Person records as raw bytes (the format produced by
// write_persons_to_stream).
//
// Parameters:
//   r         - stream to read from.
//   allocator - allocator used for the returned slice.
//
// Returns the loaded persons, the number of bytes consumed from the stream
// (count header included), and the first error encountered. On any error the
// partially filled array is freed and `persons` is nil, so nothing leaks.
// If the array cannot be allocated at the requested size, `.Unknown` is
// returned because io.Error has no allocation-specific value.
read_persons_from_stream :: proc(r: io.Stream, allocator := context.allocator) -> (persons: []Person, bytes_read: int, err: io.Error) {
	count_bytes: [4]byte
	n: int
	n, err = io.read_full(r, count_bytes[:])
	bytes_read += n
	if err != nil {
		return
	}
	count, _ := endian.get_u32(count_bytes[:], .Little)

	people: [dynamic]Person
	people.allocator = allocator
	resize(&people, count)
	if len(people) != int(count) {
		// The allocation failed; do not return a silently shortened list.
		delete(people)
		err = .Unknown
		return
	}

	for &person in people {
		n, err = io.read_full(r, mem.ptr_to_bytes(&person))
		bytes_read += n
		if err != nil {
			delete(people)
			return
		}
	}

	persons = people[:]
	return
}
// read_name_data replaces the contents of `x` with the name entries read from
// `r` until end of stream. Each entry is a u32le byte length followed by that
// many string bytes (the format produced by write_name_data).
//
// Returns the total number of bytes consumed (headers and string bytes) and
// the first error encountered. Reaching end of stream exactly at an entry
// boundary is the normal termination and is not an error. A stream that ends
// after a header but before its string bytes yields `.Unexpected_EOF`. If the
// buffer cannot be grown, `.Unknown` is returned because io.Error has no
// allocation-specific value. In every error case the incomplete entry is
// removed from `x.data`, and the entries read before it remain usable.
//
// The keys stored in `x.names` are views into `x.data`. They are created only
// after all data has been loaded, so they are not invalidated by the buffer
// growing during the read. NOTE: growing `x.data` later (for example by
// atomizing a new string) may move the buffer and invalidate these keys.
read_name_data :: proc(r: io.Stream, x: ^Atomizer) -> (bytes_read: int, err: io.Error) {
	clear(&x.data)
	clear(&x.names)

	for {
		header: [4]byte
		header_n, header_err := io.read_full(r, header[:])
		bytes_read += header_n
		if header_err == .EOF {
			// Clean end of stream at an entry boundary.
			break
		} else if header_err != nil {
			err = header_err
			break
		}
		length, _ := endian.get_u32(header[:], .Little)

		// Grow the buffer once for the header and the string bytes together.
		entry_start := len(x.data)
		payload_start := entry_start + size_of(u32le)
		entry_end := payload_start + int(length)
		resize(&x.data, entry_end)
		if len(x.data) != entry_end {
			resize(&x.data, entry_start)
			err = .Unknown
			break
		}
		copy(x.data[entry_start:payload_start], header[:])

		payload_n, payload_err := io.read_full(r, x.data[payload_start:entry_end])
		bytes_read += payload_n
		if payload_err != nil {
			// Drop the half-filled entry so the buffer only holds complete entries.
			resize(&x.data, entry_start)
			if payload_err == .EOF {
				err = .Unexpected_EOF
			} else {
				err = payload_err
			}
			break
		}
	}

	// Build the lookup table now that the buffer has stopped growing. Every
	// entry in x.data is complete at this point, so the walk stays in bounds.
	for offset := 0; offset < len(x.data); {
		length, _ := endian.get_u32(x.data[offset:], .Little)
		text := x.data[offset + size_of(u32le):][:length]
		x.names[string(text)] = Atom(offset)
		offset += size_of(u32le) + int(length)
	}
	return
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment