Skip to content

Instantly share code, notes, and snippets.

@qti3e
Last active June 18, 2026 22:37
Show Gist options
  • Select an option

  • Save qti3e/6cc4bd37434dc496b61bb3f7cd000548 to your computer and use it in GitHub Desktop.

Select an option

Save qti3e/6cc4bd37434dc496b61bb3f7cd000548 to your computer and use it in GitHub Desktop.
JOE's strlen
// © 2025 Parsa Ghadimi. MIT OR Apache-2.0 License.
//! Runtime CPU feature detection. The `setup` function from this code must be called at the very
//! start of the process initialization.
use core::fmt::Debug;
use core::fmt::Formatter;
use core::sync::atomic::AtomicU64;
use core::sync::atomic::Ordering;
/// Bitmask of the CPU features found by `setup`: bit `i` stands for the `Feature` variant whose
/// discriminant is `i`. It reads as `0` (no features) until `setup` stores a value.
static CACHE: AtomicU64 = AtomicU64::new(0);
/// Probes the CPU with CPUID/XGETBV and publishes the detected features in `CACHE`.
///
/// Until this has run the cache reads as "no features", so it has to be called before anything
/// relies on [`FeatureSet::detected`] or [`FeatureSet::active`].
#[cfg(target_arch = "x86_64")]
pub(super) fn setup() {
    use core::arch::x86_64::__cpuid;
    use core::arch::x86_64::__cpuid_count;
    use core::arch::x86_64::_xgetbv;
    use core::arch::x86_64::CpuidResult;

    /// Sets the bit of `f` in `out` when bit number `bit` of the CPUID register `reg` is set.
    #[inline]
    fn maybe(out: &mut u64, f: Feature, reg: u32, bit: u8) {
        if reg & (1 << bit) != 0 {
            *out |= 1 << (f as u8);
        }
    }

    // SAFETY: the CPUID instruction is always available on x86_64.
    let max_basic_leaf = unsafe { __cpuid(0) }.eax;
    if max_basic_leaf < 1 {
        return;
    }
    let mut out = 0u64;

    // CPUID leaf 1: processor info and feature bits
    // SAFETY: leaf 1 is supported, checked against `max_basic_leaf` above.
    let CpuidResult { ecx, edx, .. } = unsafe { __cpuid(1) };
    // EDX features
    maybe(&mut out, Feature::sse2, edx, 26);
    // ECX features
    maybe(&mut out, Feature::sse3, ecx, 0);
    maybe(&mut out, Feature::sse4_1, ecx, 19);
    maybe(&mut out, Feature::sse4_2, ecx, 20);
    maybe(&mut out, Feature::popcnt, ecx, 23);
    maybe(&mut out, Feature::rdrand, ecx, 30);

    // CPUID leaf 7, sub-leaf 0: structured extended feature flags (all zero if unsupported).
    let leaf7_ebx = if max_basic_leaf >= 7 {
        // SAFETY: leaf 7 is supported, checked against `max_basic_leaf`.
        unsafe { __cpuid_count(7, 0) }.ebx
    } else {
        0
    };

    // BMI2 only operates on general purpose registers, there is no extended register state the
    // OS would have to save for it, so it must not depend on OSXSAVE/XCR0.
    maybe(&mut out, Feature::bmi2, leaf7_ebx, 8);

    // For AVX2/AVX512, we need:
    // 1. OSXSAVE (ECX bit 27) - OS has enabled XSAVE
    // 2. XCR0 bits to verify OS saves the relevant state
    let osxsave = ecx & (1 << 27) != 0;
    if osxsave && max_basic_leaf >= 7 {
        // XCR0 register via XGETBV
        // SAFETY: OSXSAVE is set, so XGETBV is enabled and XCR0 (index 0) is readable.
        let xcr0 = unsafe { _xgetbv(0) };
        // XCR0[2:1] = 11b means OS saves XMM and YMM state (required for AVX)
        let avx_os_support = (xcr0 & 0b110) == 0b110;
        // XCR0[7:5] = 111b means OS saves opmask, ZMM_Hi256, Hi16_ZMM (required for AVX512)
        let avx512_os_support = avx_os_support && (xcr0 & 0b1110_0000) == 0b1110_0000;

        if avx_os_support {
            maybe(&mut out, Feature::avx2, leaf7_ebx, 5);
        }
        if avx512_os_support {
            maybe(&mut out, Feature::avx512f, leaf7_ebx, 16);
            maybe(&mut out, Feature::avx512bw, leaf7_ebx, 30);
            maybe(&mut out, Feature::avx512vl, leaf7_ebx, 31);
        }
    }

    CACHE.store(out, Ordering::Release);
}
/// Feature detection is only implemented for x86_64. On every other architecture this does
/// nothing, so `CACHE` keeps its initial value of zero.
// NOTE(review): this variant is `pub` while the x86_64 one is `pub(super)` — confirm which
// visibility is intended.
#[cfg(not(target_arch = "x86_64"))]
pub fn setup() {
// NOOP for now.
}
/// The x86_64 CPU features this module knows how to detect.
///
/// The discriminant of a variant is the index of its bit inside a `FeatureSet` and inside
/// `CACHE`, so the declaration order defines the bit layout. The comments on the variants name
/// the CPUID bit that `setup` reads for them.
#[cfg(target_arch = "x86_64")]
#[derive(Copy, Clone, Debug)]
// Variants are spelled in lower case, like the `target_feature` names they are checked against.
#[allow(non_camel_case_types)]
pub enum Feature {
// CPUID leaf 1, EDX bit 26.
sse2,
// CPUID leaf 1, ECX bit 0.
sse3,
// CPUID leaf 1, ECX bit 19.
sse4_1,
// CPUID leaf 1, ECX bit 20.
sse4_2,
// CPUID leaf 1, ECX bit 23.
popcnt,
// CPUID leaf 1, ECX bit 30.
rdrand,
// CPUID leaf 7 (sub-leaf 0), EBX bit 5.
avx2,
// CPUID leaf 7 (sub-leaf 0), EBX bit 16.
avx512f,
// CPUID leaf 7 (sub-leaf 0), EBX bit 30.
avx512bw,
// CPUID leaf 7 (sub-leaf 0), EBX bit 31.
avx512vl,
// CPUID leaf 7 (sub-leaf 0), EBX bit 8.
bmi2,
}
/// No features are defined outside of x86_64: the enum has no variants, so no value of this
/// type can be constructed there.
#[derive(Copy, Clone, Debug)]
#[cfg(not(target_arch = "x86_64"))]
pub enum Feature {}
/// A set of features that can be detected either during compile time or runtime.
///
/// The set is stored as a bitmask: bit `i` is set when the `Feature` variant with discriminant
/// `i` is part of the set.
///
/// Note: Try to construct your [`FeatureSet`] as a `const MY_FEATURE_SET: ...` first and then use
/// `.detected()` on it, this will help the compiler with more aggressive dead code elimination in
/// case all of the features are known during compile time.
#[derive(Copy, Clone)]
pub struct FeatureSet(u64);
impl FeatureSet {
    /// The set without any feature that needs a runtime check; `detected()` is always `true`
    /// for it.
    pub const EMPTY: Self = Self(0);

    /// Returns the set of detected features.
    pub fn active() -> Self {
        Self(CACHE.load(Ordering::Acquire))
    }

    /// Returns the set of features that were active during compile time.
    pub const fn compile_time() -> Self {
        /// Adds the bit of `feature` to `n` when the feature is enabled for the whole build.
        const fn visit(n: u64, feature: Feature) -> u64 {
            if feature.check_cfg() {
                n | (1 << (feature as u8))
            } else {
                n
            }
        }

        // Every variant of `Feature` has to be visited here, otherwise a feature that is enabled
        // for the build is silently missing from the result.
        #[cfg(target_arch = "x86_64")]
        let n = {
            let mut n: u64 = 0;
            n = visit(n, Feature::sse2);
            n = visit(n, Feature::sse3);
            n = visit(n, Feature::sse4_1);
            n = visit(n, Feature::sse4_2);
            n = visit(n, Feature::popcnt);
            n = visit(n, Feature::rdrand);
            n = visit(n, Feature::avx2);
            n = visit(n, Feature::avx512f);
            n = visit(n, Feature::avx512bw);
            n = visit(n, Feature::avx512vl);
            n = visit(n, Feature::bmi2);
            n
        };
        #[cfg(not(target_arch = "x86_64"))]
        let n = 0u64;
        Self(n)
    }

    /// Returns the set that requires only `feature` (see [`FeatureSet::add`]).
    #[inline(always)]
    pub const fn new(feature: Feature) -> Self {
        Self::EMPTY.add(feature)
    }

    /// Returns this set extended by `feature`.
    ///
    /// Features that are already enabled at compile time are not recorded, since they never need
    /// a runtime check.
    #[inline(always)]
    pub const fn add(self, feature: Feature) -> Self {
        // Avoid inserting the features we know are present at compile time
        // into the feature set. `detected` for an empty feature set will
        // always return true and inlines into:
        //
        // ```plain
        // let x = unsafe { CACHE };
        // (x & 0) == 0
        // ```
        //
        // And the compiler can trivially see that this is true for any value
        // in the CACHE, so the check is replaced by a constant and the
        // branches depending on it are eliminated.
        //
        // Which is really nice for `-Ctarget-cpu=native` builds.
        if feature.check_cfg() {
            return self;
        }
        let n = 1u64 << (feature as u8);
        Self(self.0 | n)
    }

    /// Returns `true` if all of the features in this feature set are found.
    ///
    /// With the `no-rt-feature-detect` cargo feature enabled the cache is never consulted: only
    /// sets whose features are all known at compile time (i.e. empty sets) are reported.
    #[inline(always)]
    pub fn detected(self) -> bool {
        if cfg!(feature = "no-rt-feature-detect") {
            return self.0 == 0;
        }
        // Note: Currently LLVM is capable to realize that `setup` strictly dominates
        // `detected` and that `CACHE` is a runtime constant and it treats it as such
        // which means `detected()` gets hoisted and loops using it can get unrolled.
        //
        // This is a hack I was going to do if that wasn't the case. I'm gonna leave
        // it here in case this changes.
        //
        // cfg_match! {
        //     // This is a hack
        //     target_arch = "x86_64" => {
        //         unsafe {
        //             asm!("mov {0:r}, [{1}]",
        //                 lateout(reg) x,
        //                 sym CACHE,
        //                 options(nomem, nostack, pure))
        //         }
        //     }
        //     _ => {
        //         x = unsafe { CACHE }
        //     }
        // }
        let x = CACHE.load(Ordering::Acquire);
        (x & self.0) == self.0
    }
}
impl Feature {
#[inline(always)]
#[cfg(target_arch = "x86_64")]
const fn check_cfg(self) -> bool {
match self {
Feature::sse2 => cfg!(target_feature = "sse2"),
Feature::sse3 => cfg!(target_feature = "sse3"),
Feature::sse4_1 => cfg!(target_feature = "sse4.1"),
Feature::sse4_2 => cfg!(target_feature = "sse4.2"),
Feature::popcnt => cfg!(target_feature = "popcnt"),
Feature::rdrand => cfg!(target_feature = "rdrand"),
Feature::avx2 => cfg!(target_feature = "avx2"),
Feature::avx512f => cfg!(target_feature = "avx512f"),
Feature::avx512bw => cfg!(target_feature = "avx512bw"),
Feature::avx512vl => cfg!(target_feature = "avx512vl"),
Feature::bmi2 => cfg!(target_feature = "bmi2"),
}
}
#[inline(always)]
#[cfg(not(target_arch = "x86_64"))]
const fn check_cfg(self) -> bool {
true
}
}
impl Debug for FeatureSet {
fn fmt(&self, f: &mut Formatter<'_>) -> core::fmt::Result {
let mut s = f.debug_set();
#[cfg(target_arch = "x86_64")]
{
if self.0 & (1 << (Feature::sse2 as u8)) != 0 {
s.entry(&Feature::sse2);
}
if self.0 & (1 << (Feature::sse3 as u8)) != 0 {
s.entry(&Feature::sse3);
}
if self.0 & (1 << (Feature::sse4_1 as u8)) != 0 {
s.entry(&Feature::sse4_1);
}
if self.0 & (1 << (Feature::sse4_2 as u8)) != 0 {
s.entry(&Feature::sse4_2);
}
if self.0 & (1 << (Feature::popcnt as u8)) != 0 {
s.entry(&Feature::popcnt);
}
if self.0 & (1 << (Feature::rdrand as u8)) != 0 {
s.entry(&Feature::rdrand);
}
if self.0 & (1 << (Feature::avx2 as u8)) != 0 {
s.entry(&Feature::avx2);
}
if self.0 & (1 << (Feature::avx512f as u8)) != 0 {
s.entry(&Feature::avx512f);
}
if self.0 & (1 << (Feature::avx512bw as u8)) != 0 {
s.entry(&Feature::avx512bw);
}
if self.0 & (1 << (Feature::avx512vl as u8)) != 0 {
s.entry(&Feature::avx512vl);
}
if self.0 & (1 << (Feature::bmi2 as u8)) != 0 {
s.entry(&Feature::bmi2);
}
}
s.finish()
}
}
/// CPU feature-based function dispatch, like GNU ifunc but in pure Rust.
///
/// The generated function forwards every call through a function pointer kept in a static.
/// That pointer initially targets a cold resolver. The resolver picks the first arm whose
/// features are all reported by `FeatureSet::detected()` (or the `_` arm if none matches),
/// stores the choice in the static and forwards the call. Subsequent calls go directly through
/// the cached function pointer.
///
/// The resolver does not probe the CPU itself, it only consults the cached feature set, and the
/// choice made by the first call is kept. The feature cache therefore has to be populated (the
/// `setup` function of the detect module) before the generated function is called for the first
/// time; otherwise arms that need a runtime-detected feature are never selected.
///
/// The expansion refers to `$crate::sys::detect::Feature` and `$crate::sys::detect::FeatureSet`.
///
/// # Example
///
/// ```ignore
/// cpufn! {
/// /// Compute string length with CPU-specific optimizations.
/// pub unsafe fn strlen(s: *const u8) -> usize {
/// #[cfg(target_arch = "x86_64")]
/// [avx512bw, avx512vl] => strlen_avx512,
/// #[cfg(target_arch = "x86_64")]
/// [avx2] => strlen_avx2,
/// _ => strlen_generic,
/// }
/// }
/// ```
#[macro_export]
macro_rules! cpufn {
// Safe fn variant
(
$(#[$fn_attr:meta])*
$vis:vis fn $name:ident ( $($arg:ident : $argty:ty),* $(,)? ) $(-> $ret:ty)? {
$(
$(#[$arm_attr:meta])*
[$($feature:ident),*] => $impl:path,
)*
_ => $default:path $(,)?
}
) => {
$(#[$fn_attr])*
#[allow(unused_attributes)] // #[inline] ignored on #[no_mangle]
#[inline(always)]
$vis fn $name( $($arg : $argty),* ) $(-> $ret)? {
use core::sync::atomic::AtomicPtr;
use core::sync::atomic::Ordering;
type Fn = fn( $($argty),* ) $(-> $ret)?;
// Starts out pointing at the resolver below and is replaced by the selected implementation.
static F: AtomicPtr<()> = AtomicPtr::new(resolve as *const () as *mut ());
// First-call path: select an implementation, cache it in `F`, then forward this call.
#[cold]
#[inline(never)]
#[allow(unused)]
fn resolve( $($arg : $argty),* ) $(-> $ret)? {
use $crate::sys::detect::Feature;
use $crate::sys::detect::FeatureSet;
// Arms are tried in the order they were written; the first fully detected one wins.
let f: Fn = 'select: {
$(
$(#[$arm_attr])*
if FeatureSet::EMPTY $(.add(Feature::$feature))* .detected() {
break 'select $impl;
}
)*
$default
};
F.store(f as *const () as *mut (), Ordering::Release);
f( $($arg),* )
}
// `F` only ever holds `resolve` or a value of type `Fn`, and `resolve` has the same
// signature as `Fn`, so the pointer is converted back to the type it was created from.
let f: Fn = unsafe {
core::mem::transmute::<*mut (), Fn>(F.load(Ordering::Acquire))
};
f( $($arg),* )
}
};
// Unsafe fn variant
(
$(#[$fn_attr:meta])*
$vis:vis unsafe fn $name:ident ( $($arg:ident : $argty:ty),* $(,)? ) $(-> $ret:ty)? {
$(
$(#[$arm_attr:meta])*
[$($feature:ident),*] => $impl:path,
)*
_ => $default:path $(,)?
}
) => {
$(#[$fn_attr])*
#[allow(unused_attributes)] // #[inline] ignored on #[no_mangle]
#[inline(always)]
$vis unsafe fn $name( $($arg : $argty),* ) $(-> $ret)? {
use core::sync::atomic::AtomicPtr;
use core::sync::atomic::Ordering;
type Fn = unsafe fn( $($argty),* ) $(-> $ret)?;
// Starts out pointing at the resolver below and is replaced by the selected implementation.
static F: AtomicPtr<()> = AtomicPtr::new(resolve as *const () as *mut ());
// First-call path: select an implementation, cache it in `F`, then forward this call.
#[cold]
#[inline(never)]
#[allow(unused)]
unsafe fn resolve( $($arg : $argty),* ) $(-> $ret)? {
use $crate::sys::detect::Feature;
use $crate::sys::detect::FeatureSet;
// Arms are tried in the order they were written; the first fully detected one wins.
let f: Fn = 'select: {
$(
$(#[$arm_attr])*
if FeatureSet::EMPTY $(.add(Feature::$feature))* .detected() {
break 'select $impl;
}
)*
$default
};
F.store(f as *const () as *mut (), Ordering::Release);
f( $($arg),* )
}
// `F` only ever holds `resolve` or a value of type `Fn`, and `resolve` has the same
// signature as `Fn`. Unlike the safe variant, the transmute and the call rely on the
// enclosing `unsafe fn` instead of an explicit `unsafe` block.
let f: Fn = core::mem::transmute::<*mut (), Fn>(F.load(Ordering::Acquire));
f( $($arg),* )
}
};
}
#[cfg(test)]
mod tests {
    /// Baseline implementation: plain addition.
    fn add_generic(a: i32, b: i32) -> i32 {
        a + b
    }

    /// "Accelerated" implementation; the extra 1000 is a marker that proves this one was picked.
    #[cfg(target_arch = "x86_64")]
    fn add_avx2(a: i32, b: i32) -> i32 {
        a + b + 1000
    }

    cpufn! {
        /// Test function for cpufn dispatch.
        fn test_add(a: i32, b: i32) -> i32 {
            #[cfg(target_arch = "x86_64")]
            [avx2] => add_avx2,
            _ => add_generic,
        }
    }

    /// The dispatcher must agree with what the feature cache reports for AVX2.
    #[test]
    fn test_cpufn_dispatch() {
        let sum = test_add(2, 3);

        // 2 + 3 + 1000 when the AVX2 arm was selected, 2 + 3 otherwise.
        #[cfg(target_arch = "x86_64")]
        let expected = if super::FeatureSet::new(super::Feature::avx2).detected() {
            1005
        } else {
            5
        };
        #[cfg(not(target_arch = "x86_64"))]
        let expected = 5;

        assert_eq!(sum, expected);
    }
}
// © 2025 Parsa Ghadimi. MIT OR Apache-2.0 License.
use core::ffi::c_char;
use crate::cpufn;
cpufn! {
/// C-compatible `strlen`, exported under its unmangled name and dispatched on CPU features.
///
/// Returns the number of bytes before the first zero byte.
///
/// # Safety
///
/// `s` must point to a zero-terminated string. The implementations read whole aligned chunks
/// of memory around the string, not just the bytes of the string itself.
#[unsafe(no_mangle)]
pub unsafe fn strlen(s: *const c_char) -> usize {
// Arms are tried from top to bottom, the first one whose features are all detected is used.
#[cfg(target_arch = "x86_64")]
[avx512f, avx512bw] => strlen_avx512,
#[cfg(target_arch = "x86_64")]
[avx2] => strlen_avx2,
// Portable word-at-a-time fallback.
_ => strlen_magic,
}
}
/// Portable `strlen` that scans one 64-bit word at a time.
///
/// Bytes are checked one by one until the pointer is 8-byte aligned; after that whole words are
/// loaded and screened with the glibc / Sean Eron Anderson carry trick. The screen can report a
/// word that has no zero byte (e.g. when its high byte is `0x80`), so every flagged word is
/// confirmed byte by byte.
///
/// # Safety
///
/// `str` must point to a zero-terminated string. Whole aligned 8-byte words are read, so up to
/// 7 bytes past the terminator may be loaded.
unsafe fn strlen_magic(str: *const c_char) -> usize {
    // Adding this to a word makes a carry run through every byte that is not zero ...
    const CARRY_MAGIC: u64 = 0x7efefefefefefeff;
    // ... and these are the "hole" bits in which a missing carry becomes visible.
    const HOLES: u64 = 0x8101010101010100;
    // We assume 64-bit systems in Joe's crt.
    const WORD_BYTES: usize = 8;

    // SAFETY: the caller guarantees a terminated string; no byte or word behind the word that
    // holds the terminator is ever accessed.
    unsafe {
        // Byte-wise prologue up to the next word boundary.
        let mut cursor = str;
        while (cursor as usize) % WORD_BYTES != 0 {
            if *cursor == 0 {
                return cursor.offset_from_unsigned(str);
            }
            cursor = cursor.add(1);
        }

        let mut word = cursor.cast::<u64>();
        loop {
            let bits = *word;
            // Check if the word has a zero in it. It may also have a 0x80 at the high byte.
            let suspicious = (bits.wrapping_add(CARRY_MAGIC) ^ !bits) & HOLES;
            if suspicious != 0 {
                let base = word.cast::<c_char>();
                for index in 0..WORD_BYTES {
                    if *base.add(index) == 0 {
                        return base.offset_from_unsigned(str) + index;
                    }
                }
            }
            // False alarm or clean word: load the next word.
            word = word.add(1);
        }
    }
    // TODO: Maybe not a bad idea to do the above loop until ptr is 32-byte aligned and then use
    // the avx code?
}
/// AVX2 implementation of `strlen`; the code itself is the assembly in `strlen_x86_64_avx2.s`.
///
/// # Safety
///
/// `str` must point to a zero-terminated string and the CPU must support the instructions used
/// by the assembly (the `strlen` dispatcher selects this function only when `avx2` is detected).
#[cfg(target_arch = "x86_64")]
unsafe fn strlen_avx2(str: *const c_char) -> usize {
let out: usize;
// The pointer is passed in rdi and the length is returned in rax. All the other registers
// listed below are scratch registers of the assembly and are declared as clobbered.
core::arch::asm!(
include_str!("./strlen_x86_64_avx2.s"),
in("rdi") str,
lateout("rax") out,
lateout("ymm0") _,
lateout("ymm1") _,
lateout("ymm2") _,
lateout("rdi") _,
lateout("rdx") _,
lateout("rcx") _,
// `nostack`: the assembly does not use the stack; `readonly`: it only reads memory.
options(nostack, readonly)
);
out
}
/// AVX-512 implementation of `strlen`; the code itself is the assembly in
/// `strlen_x86_64_avx512.s`.
///
/// # Safety
///
/// `str` must point to a zero-terminated string and the CPU must support the instructions used
/// by the assembly (the `strlen` dispatcher selects this function only when `avx512f` and
/// `avx512bw` are detected).
#[cfg(target_arch = "x86_64")]
unsafe fn strlen_avx512(str: *const c_char) -> usize {
let out: usize;
// The pointer is passed in rdi and the length is returned in rax. All the other registers
// listed below (vector, mask and general purpose) are scratch registers of the assembly and
// are declared as clobbered.
core::arch::asm!(
include_str!("./strlen_x86_64_avx512.s"),
in("rdi") str,
lateout("rax") out,
lateout("zmm0") _,
lateout("zmm1") _,
lateout("zmm2") _,
lateout("rdi") _,
lateout("rdx") _,
lateout("rcx") _,
lateout("k1") _,
lateout("k2") _,
// `nostack`: the assembly does not use the stack; `readonly`: it only reads memory.
options(nostack, readonly)
);
out
}
// Micro benchmark comparing the three `strlen` implementations. Only built when the custom
// `bench` cfg is set, and only on x86_64.
#[cfg(bench)]
#[cfg(target_arch = "x86_64")]
mod bench {
use super::*;
use crate::selftest::micro::*;
#[bench]
fn strlen_bench() {
use crate::sys::detect::Feature;
use crate::sys::detect::FeatureSet;
// 8 KiB of non-zero bytes with a terminator in the very last byte.
let mut buffer = [1; 1024 * 8];
buffer[buffer.len() - 1] = 0;
let mut size = 1;
const AVX2: FeatureSet = FeatureSet::new(Feature::avx2);
const AVX512: FeatureSet = FeatureSet::EMPTY
.add(Feature::avx512f)
.add(Feature::avx512bw);
// One benchmark input: a string of `len` bytes starting at `ptr`.
struct Input {
len: usize,
ptr: *const i8,
}
// Every call of the closure doubles `size`: the previous terminator is overwritten with a
// non-zero byte and a new one is placed at `size - 1`, which yields strings of length
// 1, 3, 7, ... up to `buffer.len() - 1`.
// NOTE(review): `buffer` is captured by value (`move`) and every `Input` carries a raw pointer
// into that captured array, while later calls overwrite the earlier terminators. This
// presumably relies on the harness using each input before asking for the next one and on the
// closure not being moved in between — confirm against `chain_with`.
let input = empty::<Input>()
.set_throughput("byte", |v| v.len)
.chain_with(move || {
buffer[size - 1] = 32;
size *= 2;
if size <= buffer.len() {
buffer[size - 1] = 0;
Some(Input {
len: size - 1,
ptr: buffer.as_ptr(),
})
} else {
None
}
});
// The SIMD variants are skipped when the CPU does not report the required features.
let fns = &[
BenchFn::new("avx2", |input: &Input| unsafe { strlen_avx2(input.ptr) })
.skip_if(!AVX2.detected()),
BenchFn::new("avx512", |input: &Input| unsafe {
strlen_avx512(input.ptr)
})
.skip_if(!AVX512.detected()),
BenchFn::new("magic", |input| unsafe { strlen_magic(input.ptr) }),
];
BenchConfig::default().set_name("strlen").run(input, fns);
}
}
#[cfg(test)]
mod test {
    use core::ops::Deref;
    use core::ops::DerefMut;

    use super::*;

    /// Size of the scratch buffer the test strings are laid out in.
    const CAP: usize = 1024 * 5;

    /// Over-aligned scratch buffer: with a known base alignment, `align_offset` alone decides
    /// the alignment of the string handed to the implementation under test.
    #[repr(align(512))]
    struct Buffer([u8; CAP]);

    impl Deref for Buffer {
        type Target = [u8];

        fn deref(&self) -> &[u8] {
            &self.0
        }
    }

    impl DerefMut for Buffer {
        fn deref_mut(&mut self) -> &mut [u8] {
            &mut self.0
        }
    }

    /// Runs `f` against strings of many lengths at every start offset in `0..=512` and checks
    /// the reported length; `name` identifies the implementation in the final check.
    #[inline(never)]
    fn test_strlen_impl(name: &str, f: impl Fn(*const c_char) -> usize) {
        let mut alloc = Buffer([0; CAP]);
        assert_eq!((alloc.0.as_ptr() as usize) % 64, 0);

        // Lengths around the chunk sizes of the vectorized loops, on top of every length up to
        // 256.
        let test_cases = [
            257, 511, 512, 513, 1023, 1024, 1025, 2047, 2048, 2049, 4095, 4096, 4097, 4098, 5000,
        ];

        for align_offset in 0..=512 {
            for len in (0..=256).chain(test_cases) {
                let terminator = align_offset + len;
                if terminator + 1 >= alloc.len() {
                    continue; // Skip out-of-bound cases
                }

                let start_addr = unsafe { alloc.0.as_ptr().add(align_offset) as usize };
                assert_eq!(
                    start_addr % 64,
                    align_offset % 64,
                    "failed for align={align_offset}"
                );

                // Fill with non-zero data
                alloc.fill(1);
                // Zero everything in front of the input: implementations that load an entire
                // aligned chunk in their first step must not be fooled by those zeros.
                alloc[..align_offset].fill(0);
                // Null terminator at the expected location, followed by a second one.
                alloc[terminator] = 0;
                alloc[terminator + 1] = 0;

                // Pointer to the beginning of the test string
                let s = unsafe { alloc.as_ptr().add(align_offset) as *const c_char };
                let result = f(s);
                assert_eq!(
                    result,
                    len,
                    "Failed for align={} align_offset={}, len={}, got={}",
                    unsafe { (alloc.0.as_ptr().add(align_offset) as usize) % 512 },
                    align_offset,
                    len,
                    result
                );
            }
        }

        // Empty strings: a terminator directly at every start offset.
        for i in 0..512 {
            alloc[i] = 0;
            let s = unsafe { alloc.as_ptr().add(i) as *const c_char };
            let result = f(s);
            assert_eq!(
                result, 0,
                "{name}: Failed zero-terminator check at offset {}: got {}",
                i, result
            );
        }
    }

    #[test]
    fn test_strlen_magic() {
        test_strlen_impl("magic", |p| unsafe { strlen_magic(p) });
    }

    #[test]
    #[cfg(target_arch = "x86_64")]
    fn test_strlen_x86_avx2() {
        use crate::sys::detect::Feature;
        use crate::sys::detect::FeatureSet;

        const AVX2: FeatureSet = FeatureSet::new(Feature::avx2);
        if !AVX2.detected() {
            return crate::selftest::ignore();
        }
        test_strlen_impl("x86_64-avx2", |p| unsafe { strlen_avx2(p) });
    }

    #[test]
    #[cfg(target_arch = "x86_64")]
    fn test_strlen_x86_avx512() {
        use crate::sys::detect::Feature;
        use crate::sys::detect::FeatureSet;

        const AVX512: FeatureSet = FeatureSet::EMPTY
            .add(Feature::avx512f)
            .add(Feature::avx512bw);
        if !AVX512.detected() {
            return crate::selftest::ignore();
        }
        test_strlen_impl("x86_64-avx512", |p| unsafe { strlen_avx512(p) });
    }

    #[test]
    fn test_strlen_default() {
        test_strlen_impl("default", |p| unsafe { strlen(p) });
    }
}
# strlen using AVX2 (Intel syntax), used as the body of an inline assembly block.
#
# Input:  rdi -> pointer to the zero-terminated string
# Output: rax -> number of bytes in front of the first zero byte
#
# The code first handles the unaligned head with one aligned 32-byte load, then steps through
# 32-byte and 64-byte pieces until the pointer is 128-byte aligned, and finally loops over
# 128 bytes per iteration. All loads are aligned loads.
#
# Registers used:
# ymm0 -> constant zero
# ymm1 -> loaded value AND result of compare against zero
# ymm2 -> loaded value AND result of compare against zero
#
# rax -> the original input ptr AND the return value
# rdi -> our 32-byte aligned pointer which moves forward
#
# edx -> the finalized simd mask AND tzcnt of the mask
# ecx -> alignment padding = original_ptr % 32, later the mask of the second vector
# Keep the original input in rax we don't need it until computing the return value.
mov rax, rdi
# n = rcx, ecx = (original ptr) & 31 ; our initial alignment padding.
mov rcx, rdi
and rcx, 31
# ymm0 = ZERO
vpxor ymm0, ymm0, ymm0
# If already 32-byte aligned skip the partial step
test rcx, rcx
jz 20f
# align the ptr to 32-byte boundary
and rdi, -32
# mask = to_mask(cmp(*ptr, ZERO))
vmovdqa ymm1, [rdi]
vpcmpeqb ymm1, ymm1, ymm0
vpmovmskb edx, ymm1
# now we have the first 32-bytes in the memory but this may include some other
# bytes that appear before the `str`. So we need to clear some of the LSBs
# mask >>= rcx
shr edx, cl
# if the mask is zero then go to the loop there were no zeros here
test edx, edx
jz 21f
# there is at least one bit set so we found our zero now we have to calculate the length.
# The mask was shifted so that bit 0 is the first byte of `str`, which makes the number of
# trailing zeros the length itself.
tzcnt eax, edx
jmp 90f
# if we're here the first 32-byte did not contain a zero.
# and rdi is 32-byte aligned.
21:
add rdi, 32
# ====================== 32-byte aligned
# One 32-byte step, only taken when rdi is not yet 64-byte aligned.
20:
test rdi, 63
jz 30f
# mask = to_mask(cmp(*ptr, ZERO))
vmovdqa ymm1, [rdi]
vpcmpeqb ymm1, ymm1, ymm0
vpmovmskb edx, ymm1
test edx, edx
jnz 70f
add rdi, 32
# ===================== 64-byte aligned
# One 64-byte step, only taken when rdi is not yet 128-byte aligned.
30:
test rdi, 127
jz 40f
vmovdqa ymm1, [rdi]
vmovdqa ymm2, [rdi + 32]
vpcmpeqb ymm1, ymm1, ymm0
vpcmpeqb ymm2, ymm2, ymm0
vpmovmskb edx, ymm1
vpmovmskb ecx, ymm2
# the flags of the `or` tell whether either of the two masks has a bit set
or edx, ecx
jnz 60f
add rdi, 64
# ===================== 128-byte aligned
# Main loop: four 32-byte chunks per iteration, checked in pairs.
40:
# Chunk 1 or 2
vmovdqa ymm1, [rdi]
vmovdqa ymm2, [rdi + 32]
vpcmpeqb ymm1, ymm1, ymm0
vpcmpeqb ymm2, ymm2, ymm0
vpmovmskb edx, ymm1
vpmovmskb ecx, ymm2
or edx, ecx
jnz 60f
# Chunk 3 or 4
vmovdqa ymm1, [rdi + 64]
vmovdqa ymm2, [rdi + 96]
vpcmpeqb ymm1, ymm1, ymm0
vpcmpeqb ymm2, ymm2, ymm0
vpmovmskb edx, ymm1
vpmovmskb ecx, ymm2
or edx, ecx
jnz 50f
# (label 45 is not the target of any jump in this file)
45:
add rdi, 128
jmp 40b
# ================================== END
# ==== Found in ymm1 or ymm2 in chunk 3 or 4
50:
# Re-extract masks since we lost them in the 'or'
vmovdqa ymm1, [rdi + 64]
vpcmpeqb ymm1, ymm1, ymm0
vpmovmskb edx, ymm1
test edx, edx
jz 52f
# Found in chunk 3
add rdi, 64
jmp 70f
# Found in chunk 4
52:
vmovdqa ymm2, [rdi + 96]
vpcmpeqb ymm2, ymm2, ymm0
vpmovmskb edx, ymm2
tzcnt edx, edx
add rdi, 96
jmp 80f
# ==== Found in ymm1 or ymm2 in chunk 1 or 2
60:
# Re-extract masks since we lost them in the 'or'
vmovdqa ymm1, [rdi]
vpcmpeqb ymm1, ymm1, ymm0
vpmovmskb edx, ymm1
test edx, edx
jz 62f
# Found in chunk 1
jmp 70f
# Found in chunk 2
62:
vmovdqa ymm2, [rdi + 32]
vpcmpeqb ymm2, ymm2, ymm0
vpmovmskb edx, ymm2
tzcnt edx, edx
add rdi, 32
jmp 80f
# ==== Found in ymm1 in chunk 1
# edx holds the non-zero mask of the chunk that rdi points to
70:
tzcnt edx, edx
# length = (ptr:rdi - original_ptr:rax) + pos_one:rdx
80:
sub rdi, rax
add rdi, rdx
mov rax, rdi
# Common exit: rax holds the length. Clear the upper halves of the ymm registers.
90:
vzeroupper
# strlen using AVX-512 (Intel syntax), used as the body of an inline assembly block.
#
# Input:  rdi -> pointer to the zero-terminated string
# Output: rax -> number of bytes in front of the first zero byte
#
# The code first handles the unaligned head with one aligned 64-byte load, then steps through
# 64-byte and 128-byte pieces until the pointer is 256-byte aligned, and finally loops over
# 256 bytes per iteration. All loads are aligned loads.
#
# Registers used:
# zmm0 -> constant zero
# zmm1 -> loaded value
# zmm2 -> loaded value
# k1 -> result of comparing zmm1 against zero
# k2 -> result of comparing zmm2 against zero
#
# rax -> the original input ptr AND the return value
# rdi -> our 64-byte aligned pointer which moves forward
#
# rdx -> the finalized simd mask AND tzcnt of the mask
# ecx -> alignment padding = original_ptr % 64
# Keep the original input in rax we don't need it until computing the return value.
mov rax, rdi
# n = rcx, ecx = (original ptr) & 63 ; our initial alignment padding.
mov rcx, rdi
and rcx, 63
# zmm0 = ZERO
vpxord zmm0, zmm0, zmm0
# If already 64-byte aligned skip the partial step, which could lead us
# to the 256-byte aligned sooner.
test rcx, rcx
jz 20f
# TODO: This step could be improved to avoid loading too much
# garbage data.
# align the ptr to 64-byte boundary
and rdi, -64
# mask = to_mask(cmp(*ptr, ZERO))
vmovdqa32 zmm1, [rdi]
vpcmpb k1, zmm1, zmm0, 0
kmovq rdx, k1
# now we have the first 64-bytes in the memory but this may include some other
# bytes that appear before the `str`. So we need to clear some of the LSBs
# mask >>= rcx
shr rdx, cl
# if the mask is zero then go to the loop there were no zeros here
test rdx, rdx
# NOTE we don't do runtime target detection for tzcnt, and if not supported it decodes
# to `bsf`, and bsf and tzcnt give the same result if the input is not zero.
# so we do this explicit check against zero here in this branch before jumping to the
# tzcnt.
jz 21f
# there is at least one bit set so we found our zero now we have to calculate the length.
# we don't need the mask anymore just need to know how many trailing zeros it has.
# The mask was shifted so that bit 0 is the first byte of `str`, which makes the number of
# trailing zeros the length itself.
tzcnt rax, rdx
jmp 90f
# if we're here the first 64-byte did not contain a zero.
# and rdi is 64-byte aligned.
21:
add rdi, 64
# ====================== 64-byte aligned
# One 64-byte step, only taken when rdi is not yet 128-byte aligned.
20:
test rdi, 127
jz 30f
# mask = to_mask(cmp(*ptr, ZERO))
vmovdqa32 zmm1, [rdi]
vpcmpb k1, zmm1, zmm0, 0
kortestq k1, k1
jnz 70f
add rdi, 64
# ===================== 128-byte aligned
# One 128-byte step, only taken when rdi is not yet 256-byte aligned.
30:
test rdi, 255
jz 40f
vmovdqa32 zmm1, [rdi]
vmovdqa32 zmm2, [rdi + 64]
vpcmpb k1, zmm1, zmm0, 0
vpcmpb k2, zmm2, zmm0, 0
# ZF is cleared when k1 | k2 has any bit set; unlike the AVX2 version both masks survive
kortestq k1, k2
jnz 60f
add rdi, 128
# ===================== 256-byte aligned
# Main loop: four 64-byte chunks per iteration, checked in pairs.
40:
# Chunk 1 or 2
vmovdqa32 zmm1, [rdi]
vmovdqa32 zmm2, [rdi + 64]
vpcmpb k1, zmm1, zmm0, 0
vpcmpb k2, zmm2, zmm0, 0
kortestq k1, k2
jnz 60f
# Chunk 3 or 4
vmovdqa32 zmm1, [rdi + 128]
vmovdqa32 zmm2, [rdi + 192]
vpcmpb k1, zmm1, zmm0, 0
vpcmpb k2, zmm2, zmm0, 0
kortestq k1, k2
jnz 50f
# (label 45 is not the target of any jump in this file)
45:
add rdi, 256
jmp 40b
# ================================== END
# ==== Found in k1 or k2 in chunk 3 or 4
50:
kortestq k1, k1
jz 52f
# Found in chunk 3
add rdi, 128
jmp 70f
# Found in chunk 4
52:
kmovq rdx, k2
tzcnt rdx, rdx
add rdi, 192
jmp 80f
# ==== Found in k1 or k2 in chunk 1 or 2
60:
kortestq k1, k1
jz 62f
# Found in chunk 1
jmp 70f
# Found in chunk 2
62:
kmovq rdx, k2
tzcnt rdx, rdx
add rdi, 64
jmp 80f
# ==== Found in k1 in chunk 1
# k1 holds the non-zero mask of the chunk that rdi points to
70:
kmovq rdx, k1
tzcnt rdx, rdx
# length = (ptr:rdi - original_ptr:rax) + pos_one:rdx
80:
sub rdi, rax
add rdi, rdx
mov rax, rdi
# Common exit: rax holds the length. Clear the upper parts of the vector registers.
90:
vzeroupper
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment