Last active
September 6, 2024 07:01
-
-
Save Validark/dcf8d59caf462af875a388462b7fab63 to your computer and use it in GitHub Desktop.
Interleaved Vector compress on arm/aarch64
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// Byte table lookup across a 64-byte table using the AArch64 NEON `TBL`
/// instruction (four-register form, 8-byte result).
///
/// The four 16-byte parts are treated as one contiguous table: index `k`
/// selects byte `k % 16` of part `k / 16`. Per the Arm definition of `TBL`,
/// an index outside 0..63 produces a zero byte in that lane.
///
/// Only usable when compiling for aarch64 through the LLVM backend, since the
/// call resolves directly to an LLVM intrinsic.
fn tbl4(
    table_part_1: @Vector(16, u8),
    table_part_2: @Vector(16, u8),
    table_part_3: @Vector(16, u8),
    table_part_4: @Vector(16, u8),
    indices: @Vector(8, u8),
) @Vector(8, u8) {
    const TablePart = @Vector(16, u8);
    const Lanes = @Vector(8, u8);

    // The extern declaration is scoped inside a local namespace so the
    // intrinsic's symbol name does not leak into the enclosing file.
    const neon = struct {
        extern fn @"llvm.aarch64.neon.tbl4"(TablePart, TablePart, TablePart, TablePart, Lanes) Lanes;
    };

    return neon.@"llvm.aarch64.neon.tbl4"(
        table_part_1,
        table_part_2,
        table_part_3,
        table_part_4,
        indices,
    );
}
/// Compacts the bytes of a 64-byte block selected by `bitstring` to the front
/// of `dest`, preserving their order.
///
/// Parameters:
///   * `bitstring` - one bit per source byte; bit `j` set means source byte
///     `j` is kept. Reinterpreted as eight mask bytes, so this assumes a
///     little-endian target (true for the aarch64 targets `tbl4` supports).
///   * `chunk0`..`chunk3` - the 64 source bytes in 4-way de-interleaved form:
///     source byte `j` is lane `j / 4` of chunk `j % 4` (the layout the index
///     formula below addresses; presumably produced by an `ld4`-style load).
///   * `dest` - output buffer. Eight 8-byte stores are issued at offsets up
///     to 56, so `dest` must have at least 64 writable bytes. Only the first
///     `@popCount(bitstring)` bytes are meaningful afterwards; the rest is
///     filler.
export fn compress(bitstring: u64, chunk0: @Vector(16, u8), chunk1: @Vector(16, u8), chunk2: @Vector(16, u8), chunk3: @Vector(16, u8), dest: [*]u8) void {
    // lookups[m] lists, in ascending order, the tbl4 indices of the bytes
    // selected by the 8-bit mask `m` within the FIRST group of eight source
    // bytes. Unused trailing lanes hold 255, an out-of-range table index.
    //
    // Built as a `const` from a comptime block (rather than a `comptime var`
    // read at runtime) so the table is an ordinary immutable constant.
    const lookups: [256]@Vector(8, u8) = comptime blk: {
        @setEvalBranchQuota(100000);
        var table: [256]@Vector(8, u8) = undefined;
        for (&table, 0..) |*slot, mask| {
            var entry = [_]u8{255} ** 8;
            var pos: u8 = 0;
            for (0..8) |bit_i| {
                const bit: u1 = @truncate(mask >> bit_i);
                if (bit == 1) {
                    // Source byte `bit_i` lives in chunk `bit_i & 3`, lane
                    // `bit_i / 4`; chunks are 16 bytes apart in the table.
                    entry[pos] = bit_i / 4 + (bit_i & 3) * 16;
                }
                pos += bit;
            }
            slot.* = entry;
        }
        break :blk table;
    };

    // Per-mask-byte popcounts, packed back into a u64 (one count per byte).
    const popcounts: @Vector(8, u8) = @popCount(@as(@Vector(8, u8), @bitCast(bitstring)));

    // Multiplying by 0x0101..01 yields the INCLUSIVE running sum in each byte
    // (byte k = count[0] + ... + count[k]; no carries, since the total <= 64).
    // Group k must be written after the bytes kept by groups 0..k-1 only, so
    // shift by one byte to obtain the EXCLUSIVE prefix sum (byte 0 becomes 0).
    const write_offsets = (@as(u64, @bitCast(popcounts)) *% 0x0101010101010101) << 8;

    inline for (@as([8]u8, @bitCast(bitstring)), @as([8]u8, @bitCast(write_offsets)), 0..) |byte, pos, i| {
        // Group `i` covers source bytes 8*i..8*i+7, whose lanes sit 2*i
        // further into each chunk. The saturating add keeps the 255 filler
        // lanes at 255 so they stay out of range for the table lookup.
        dest[pos..][0..8].* = tbl4(chunk0, chunk1, chunk2, chunk3, lookups[byte] +| @as(@Vector(8, u8), @splat(2 * i)));
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment