qti3e · June 18, 2026 22:37
diff --git a/detect.rs b/detect.rs
 // © 2025 Parsa Ghadimi. MIT OR Apache-2.0 License.

 //! Runtime CPU feature detection. The `setup` function from this code must be called at the very
 //! start of the process initialization.

 use core::fmt::Debug;
 use core::fmt::Formatter;
 use core::sync::atomic::AtomicU64;
 use core::sync::atomic::Ordering;

 static CACHE: AtomicU64 = AtomicU64::new(0);

 /// Detects the supported CPU features and publishes them to `CACHE`.
 ///
 /// Every detected [`Feature`] sets the bit `1 << (feature as u8)` in the stored mask. When the
 /// CPU does not implement CPUID leaf 1 the function returns without storing anything.
 ///
 /// AVX2 and the AVX-512 features are reported only when the OS has enabled the matching
 /// register state in XCR0. BMI2 operates on general-purpose registers only, so it is reported
 /// straight from CPUID leaf 7 without any OS-support requirement.
 #[cfg(target_arch = "x86_64")]
 pub(super) fn setup() {
    use core::arch::x86_64::__cpuid;
    use core::arch::x86_64::__cpuid_count;
    use core::arch::x86_64::_xgetbv;
    use core::arch::x86_64::CpuidResult;

    /// Sets the bit of `f` in `out` when bit number `bit` of the CPUID register `reg` is set.
    #[inline]
    fn maybe(out: &mut u64, f: Feature, reg: u32, bit: u8) {
        if reg & (1 << bit) != 0 {
            *out |= 1 << (f as u8);
        }
    }

    // SAFETY: the `cpuid` instruction is available on every x86_64 CPU.
    let max_basic_leaf = unsafe { __cpuid(0) }.eax;

    if max_basic_leaf < 1 {
        return;
    }

    let mut out = 0u64;

    // CPUID leaf 1: processor info and feature bits
    // SAFETY: leaf 1 is implemented, as checked above.
    let CpuidResult { ecx, edx, .. } = unsafe { __cpuid(1) };

    // EDX features
    maybe(&mut out, Feature::sse2, edx, 26);

    // ECX features
    maybe(&mut out, Feature::sse3, ecx, 0);
    maybe(&mut out, Feature::sse4_1, ecx, 19);
    maybe(&mut out, Feature::sse4_2, ecx, 20);
    maybe(&mut out, Feature::popcnt, ecx, 23);
    maybe(&mut out, Feature::rdrand, ecx, 30);

    if max_basic_leaf >= 7 {
        // CPUID leaf 7, sub-leaf 0: structured extended feature flags
        // SAFETY: leaf 7 is implemented, as checked above.
        let CpuidResult { ebx, .. } = unsafe { __cpuid_count(7, 0) };

        // No extended register state is involved, so no XCR0 check is needed.
        maybe(&mut out, Feature::bmi2, ebx, 8);

        // For AVX/AVX2/AVX512, we need:
        // 1. OSXSAVE (ECX bit 27) - OS has enabled XSAVE
        // 2. XCR0 bits to verify OS saves the relevant state
        let osxsave = ecx & (1 << 27) != 0;

        if osxsave {
            // XCR0 register via XGETBV
            // SAFETY: OSXSAVE is set, so `xgetbv` is enabled and XCR0 can be read.
            let xcr0 = unsafe { _xgetbv(0) };

            // XCR0[2:1] = 11b means OS saves XMM and YMM state (required for AVX)
            let avx_os_support = (xcr0 & 0b110) == 0b110;

            // XCR0[7:5] = 111b means OS saves opmask, ZMM_Hi256, Hi16_ZMM (required for AVX512)
            let avx512_os_support = avx_os_support && (xcr0 & 0b1110_0000) == 0b1110_0000;

            if avx_os_support {
                maybe(&mut out, Feature::avx2, ebx, 5);
            }

            if avx512_os_support {
                maybe(&mut out, Feature::avx512f, ebx, 16);
                maybe(&mut out, Feature::avx512bw, ebx, 30);
                maybe(&mut out, Feature::avx512vl, ebx, 31);
            }
        }
    }

    CACHE.store(out, Ordering::Release);
 }

 /// Feature-detection entry point for targets other than x86_64.
 ///
 /// Nothing is probed on these targets, so the body is intentionally empty.
 #[cfg(not(target_arch = "x86_64"))]
 pub fn setup() {}

 /// CPU features this module knows about on x86_64.
 ///
 /// A variant's discriminant is the index of the bit that represents it in a feature mask
 /// (`1 << (feature as u8)`), so the numbering below must stay stable.
 #[allow(non_camel_case_types)]
 #[derive(Copy, Clone, Debug)]
 #[cfg(target_arch = "x86_64")]
 pub enum Feature {
    sse2 = 0,
    sse3 = 1,
    sse4_1 = 2,
    sse4_2 = 3,
    popcnt = 4,
    rdrand = 5,
    avx2 = 6,
    avx512f = 7,
    avx512bw = 8,
    avx512vl = 9,
    bmi2 = 10,
 }

 /// On targets other than x86_64 no features are defined, so the enum has no variants.
 #[cfg(not(target_arch = "x86_64"))]
 #[derive(Copy, Clone, Debug)]
 pub enum Feature {}

 /// A set of [`Feature`]s stored as a bit mask: bit `feature as u8` is set when the feature is a
 /// member.
 ///
 /// Features that are already enabled at compile time are never stored by [`FeatureSet::add`],
 /// so a set only holds the features that still have to be checked at runtime.
 ///
 /// Note: Prefer constructing your [`FeatureSet`] as a `const MY_FEATURE_SET: ...` and calling
 /// `.detected()` on that constant. This helps the compiler with more aggressive dead code
 /// elimination when all of the features are known at compile time.
 #[derive(Copy, Clone)]
 pub struct FeatureSet(u64);

 impl FeatureSet {
    /// The set with no members; [`FeatureSet::detected`] is always `true` for it.
    pub const EMPTY: Self = Self(0);

    /// Returns the set of detected features.
    ///
    /// This only loads `CACHE`, so the result is empty until `setup` has stored its findings.
    pub fn active() -> Self {
        Self(CACHE.load(Ordering::Acquire))
    }

    /// Returns the set of features that were active during compile time.
    ///
    /// Every [`Feature`] variant is visited, so the result agrees with `Feature::check_cfg`.
    /// On targets other than x86_64 the set is empty.
    pub const fn compile_time() -> Self {
        /// Adds the bit of `feature` to `n` when the feature is enabled at compile time.
        #[cfg(target_arch = "x86_64")]
        const fn visit(n: u64, feature: Feature) -> u64 {
            if feature.check_cfg() {
                n | (1 << (feature as u8))
            } else {
                n
            }
        }

        #[cfg(target_arch = "x86_64")]
        let n = {
            let mut n: u64 = 0;
            n = visit(n, Feature::sse2);
            n = visit(n, Feature::sse3);
            n = visit(n, Feature::sse4_1);
            n = visit(n, Feature::sse4_2);
            n = visit(n, Feature::popcnt);
            n = visit(n, Feature::rdrand);
            n = visit(n, Feature::avx2);
            n = visit(n, Feature::avx512f);
            n = visit(n, Feature::avx512bw);
            n = visit(n, Feature::avx512vl);
            n = visit(n, Feature::bmi2);
            n
        };

        #[cfg(not(target_arch = "x86_64"))]
        let n = 0u64;

        Self(n)
    }

    /// Creates a set from a single feature; shorthand for `FeatureSet::EMPTY.add(feature)`.
    #[inline(always)]
    pub const fn new(feature: Feature) -> Self {
        Self::EMPTY.add(feature)
    }

    /// Returns this set extended with `feature`.
    ///
    /// A feature that is enabled at compile time is not stored and `self` is returned as is.
    #[inline(always)]
    pub const fn add(self, feature: Feature) -> Self {
        // Avoid inserting the features we know are present at compile time
        // into the feature set. `detected` for an empty feature set will
        // always return true and inlines into:
        //
        // ```plain
        // let x = CACHE.load(Ordering::Acquire);
        // (x & 0) == 0
        // ```
        //
        // And the compiler can trivially see that this will be true for any
        // value in the CACHE which will basically be replaced by a constant
        // and nice things will happen and branches will be eliminated.
        //
        // Which is really nice for `-Ctarget-cpu=native` builds.
        if feature.check_cfg() {
            return self;
        }

        let n = 1u64 << (feature as u8);
        Self(self.0 | n)
    }

    /// Returns `true` if all of the features in this feature set are found.
    ///
    /// With the `no-rt-feature-detect` crate feature enabled the runtime cache is not consulted
    /// and only an empty set (all members known at compile time) counts as detected.
    #[inline(always)]
    pub fn detected(self) -> bool {
        if cfg!(feature = "no-rt-feature-detect") {
            return self.0 == 0;
        }

        // Note: Currently LLVM is capable to realize that `setup` strictly dominates
        // `detected` and that `CACHE` is a runtime constant and it treats it as such
        // which means `detected()` gets hoisted and loops using it can get unrolled.
        //
        // This is a hack I was going to do if that wasn't the case. I'm gonna leave
        // it here in case this changes.
        //
        // cfg_match! {
        //     // This is a hack
        //     target_arch = "x86_64" => {
        //         unsafe {
        //             asm!("mov {0:r}, [{1}]",
        //             lateout(reg) x,
        //             sym CACHE,
        //             options(nomem, nostack, pure))
        //         }
        //     }
        //     _ => {
        //         x = unsafe { CACHE }
        //     }
        // }

        let x = CACHE.load(Ordering::Acquire);
        (x & self.0) == self.0
    }
 }

 impl Feature {
    #[inline(always)]
    #[cfg(target_arch = "x86_64")]
    const fn check_cfg(self) -> bool {
        match self {
            Feature::sse2 => cfg!(target_feature = "sse2"),
            Feature::sse3 => cfg!(target_feature = "sse3"),
            Feature::sse4_1 => cfg!(target_feature = "sse4.1"),
            Feature::sse4_2 => cfg!(target_feature = "sse4.2"),
            Feature::popcnt => cfg!(target_feature = "popcnt"),
            Feature::rdrand => cfg!(target_feature = "rdrand"),
            Feature::avx2 => cfg!(target_feature = "avx2"),
            Feature::avx512f => cfg!(target_feature = "avx512f"),
            Feature::avx512bw => cfg!(target_feature = "avx512bw"),
            Feature::avx512vl => cfg!(target_feature = "avx512vl"),
            Feature::bmi2 => cfg!(target_feature = "bmi2"),
        }
    }

    #[inline(always)]
    #[cfg(not(target_arch = "x86_64"))]
    const fn check_cfg(self) -> bool {
        true
    }
 }

 impl Debug for FeatureSet {
    fn fmt(&self, f: &mut Formatter<'_>) -> core::fmt::Result {
        let mut s = f.debug_set();

        #[cfg(target_arch = "x86_64")]
        {
            if self.0 & (1 << (Feature::sse2 as u8)) != 0 {
                s.entry(&Feature::sse2);
            }
            if self.0 & (1 << (Feature::sse3 as u8)) != 0 {
                s.entry(&Feature::sse3);
            }
            if self.0 & (1 << (Feature::sse4_1 as u8)) != 0 {
                s.entry(&Feature::sse4_1);
            }
            if self.0 & (1 << (Feature::sse4_2 as u8)) != 0 {
                s.entry(&Feature::sse4_2);
            }
            if self.0 & (1 << (Feature::popcnt as u8)) != 0 {
                s.entry(&Feature::popcnt);
            }
            if self.0 & (1 << (Feature::rdrand as u8)) != 0 {
                s.entry(&Feature::rdrand);
            }
            if self.0 & (1 << (Feature::avx2 as u8)) != 0 {
                s.entry(&Feature::avx2);
            }
            if self.0 & (1 << (Feature::avx512f as u8)) != 0 {
                s.entry(&Feature::avx512f);
            }
            if self.0 & (1 << (Feature::avx512bw as u8)) != 0 {
                s.entry(&Feature::avx512bw);
            }
            if self.0 & (1 << (Feature::avx512vl as u8)) != 0 {
                s.entry(&Feature::avx512vl);
            }
            if self.0 & (1 << (Feature::bmi2 as u8)) != 0 {
                s.entry(&Feature::bmi2);
            }
        }

        s.finish()
    }
 }

 /// CPU feature-based function dispatch, like GNU ifunc but in pure Rust.
 ///
 /// The generated function forwards every call through a function pointer kept in a
 /// function-local atomic. The pointer initially targets an internal resolver, which on the
 /// first call picks the first arm whose features are all reported by `FeatureSet::detected`
 /// (falling back to the `_` arm), caches that choice and forwards the call. Subsequent calls go
 /// directly through the cached function pointer.
 ///
 /// No CPU probing happens here: the choice reflects whatever the feature cache holds at the
 /// time of the first call, so `setup` has to run before the generated function is first used.
 ///
 /// Arms are tried top to bottom and may carry attributes such as `#[cfg(...)]`. The
 /// `unsafe fn` form generates an `unsafe fn`; its caller must uphold the safety contract of
 /// whichever implementation gets selected.
 ///
 /// # Example
 ///
 /// ```ignore
 /// cpufn! {
 ///     /// Compute string length with CPU-specific optimizations.
 ///     pub unsafe fn strlen(s: *const u8) -> usize {
 ///         #[cfg(target_arch = "x86_64")]
 ///         [avx512bw, avx512vl] => strlen_avx512,
 ///         #[cfg(target_arch = "x86_64")]
 ///         [avx2] => strlen_avx2,
 ///         _ => strlen_generic,
 ///     }
 /// }
 /// ```
 #[macro_export]
 macro_rules! cpufn {
    // Safe fn variant
    (
        $(#[$fn_attr:meta])*
        $vis:vis fn $name:ident ( $($arg:ident : $argty:ty),* $(,)? ) $(-> $ret:ty)? {
            $(
                $(#[$arm_attr:meta])*
                [$($feature:ident),*] => $impl:path,
            )*
            _ => $default:path $(,)?
        }
    ) => {
        $(#[$fn_attr])*
        #[allow(unused_attributes)] // #[inline] ignored on #[no_mangle]
        #[inline(always)]
        $vis fn $name( $($arg : $argty),* ) $(-> $ret)? {
            use core::sync::atomic::AtomicPtr;
            use core::sync::atomic::Ordering;

            type Fn = fn( $($argty),* ) $(-> $ret)?;

            // Starts out pointing at `resolve`, which replaces it with the selected function.
            static F: AtomicPtr<()> = AtomicPtr::new(resolve as *const () as *mut ());

            #[cold]
            #[inline(never)]
            #[allow(unused)]
            fn resolve( $($arg : $argty),* ) $(-> $ret)? {
                use $crate::sys::detect::Feature;
                use $crate::sys::detect::FeatureSet;

                let f: Fn = 'select: {
                    $(
                        $(#[$arm_attr])*
                        if FeatureSet::EMPTY $(.add(Feature::$feature))* .detected() {
                            break 'select $impl;
                        }
                    )*
                    $default
                };

                F.store(f as *const () as *mut (), Ordering::Release);
                f( $($arg),* )
            }

            // SAFETY: `F` only ever holds the address of `resolve` or of a function that was
            // coerced to `Fn` before being stored, so the pointer is a valid `Fn`.
            let f: Fn = unsafe {
                core::mem::transmute::<*mut (), Fn>(F.load(Ordering::Acquire))
            };
            f( $($arg),* )
        }
    };

    // Unsafe fn variant
    (
        $(#[$fn_attr:meta])*
        $vis:vis unsafe fn $name:ident ( $($arg:ident : $argty:ty),* $(,)? ) $(-> $ret:ty)? {
            $(
                $(#[$arm_attr:meta])*
                [$($feature:ident),*] => $impl:path,
            )*
            _ => $default:path $(,)?
        }
    ) => {
        $(#[$fn_attr])*
        #[allow(unused_attributes)] // #[inline] ignored on #[no_mangle]
        #[inline(always)]
        $vis unsafe fn $name( $($arg : $argty),* ) $(-> $ret)? {
            use core::sync::atomic::AtomicPtr;
            use core::sync::atomic::Ordering;

            type Fn = unsafe fn( $($argty),* ) $(-> $ret)?;

            // Starts out pointing at `resolve`, which replaces it with the selected function.
            static F: AtomicPtr<()> = AtomicPtr::new(resolve as *const () as *mut ());

            #[cold]
            #[inline(never)]
            #[allow(unused)]
            unsafe fn resolve( $($arg : $argty),* ) $(-> $ret)? {
                use $crate::sys::detect::Feature;
                use $crate::sys::detect::FeatureSet;

                let f: Fn = 'select: {
                    $(
                        $(#[$arm_attr])*
                        if FeatureSet::EMPTY $(.add(Feature::$feature))* .detected() {
                            break 'select $impl;
                        }
                    )*
                    $default
                };

                F.store(f as *const () as *mut (), Ordering::Release);
                // SAFETY: the caller's obligations are forwarded unchanged to the selected
                // implementation.
                unsafe { f( $($arg),* ) }
            }

            // SAFETY: `F` only ever holds the address of `resolve` or of a function that was
            // coerced to `Fn` before being stored, so the pointer is a valid `Fn`.
            let f: Fn = unsafe {
                core::mem::transmute::<*mut (), Fn>(F.load(Ordering::Acquire))
            };
            // SAFETY: the caller's obligations are forwarded unchanged to `f`.
            unsafe { f( $($arg),* ) }
        }
    };
 }

 #[cfg(test)]
 mod tests {
    /// Offset added by `add_avx2` so the test can tell which implementation was dispatched.
    #[cfg(target_arch = "x86_64")]
    const AVX2_MARKER: i32 = 1000;

    /// Baseline implementation.
    fn add_generic(a: i32, b: i32) -> i32 {
        a + b
    }

    /// Stand-in for an AVX2 implementation; its result is shifted by `AVX2_MARKER`.
    #[cfg(target_arch = "x86_64")]
    fn add_avx2(a: i32, b: i32) -> i32 {
        a + b + AVX2_MARKER
    }

    cpufn! {
        /// Test function for cpufn dispatch.
        fn test_add(a: i32, b: i32) -> i32 {
            #[cfg(target_arch = "x86_64")]
            [avx2] => add_avx2,
            _ => add_generic,
        }
    }

    /// The dispatched function must agree with what the feature cache reports for AVX2.
    #[test]
    fn test_cpufn_dispatch() {
        let sum = test_add(2, 3);

        #[cfg(target_arch = "x86_64")]
        let expected = if super::FeatureSet::new(super::Feature::avx2).detected() {
            5 + AVX2_MARKER // 2 + 3 + 1000
        } else {
            5
        };

        #[cfg(not(target_arch = "x86_64"))]
        let expected = 5;

        assert_eq!(sum, expected);
    }
 }
diff --git a/strlen.rs b/strlen.rs
 // © 2025 Parsa Ghadimi. MIT OR Apache-2.0 License.

 use core::ffi::c_char;

 use crate::cpufn;

 cpufn! {
    /// Exported, unmangled `strlen` that dispatches to the implementation selected by `cpufn!`.
    ///
    /// # Safety
    ///
    /// `s` is handed unchanged to the selected implementation, so the caller must satisfy that
    /// implementation's requirements.
    #[unsafe(no_mangle)]
    pub unsafe fn strlen(s: *const c_char) -> usize {
        // Arms are tried top to bottom; the first one whose features are all detected wins.
        #[cfg(target_arch = "x86_64")]
        [avx512f, avx512bw] => strlen_avx512,
        #[cfg(target_arch = "x86_64")]
        [avx2] => strlen_avx2,
        // Fallback when no arm above applies.
        _ => strlen_magic,
    }
 }

 /// Portable `strlen` that scans one aligned 64-bit word at a time.
 ///
 /// # Safety
 ///
 /// `str` must point to a NUL-terminated byte string. Once the cursor is 8-byte aligned whole
 /// words are loaded, so the bytes that follow the terminator inside its aligned word are read
 /// as well.
 unsafe fn strlen_magic(str: *const c_char) -> usize {
    // Constants of the glibc/Sean Eron Anderson trick. A non-zero result of the test below
    // means the word may contain a zero byte (it may also fire for a 0x80 in the high byte),
    // so every hit is confirmed byte by byte.
    const MAGIC_BITS: u64 = 0x7efefefefefefeff;
    const HOLE_BITS: u64 = 0x8101010101010100;

    // SAFETY (applies to every unsafe block below): all accesses stay within the words that
    // hold the string up to and including its terminator, per this function's contract.

    // Step byte by byte until the cursor is 8-byte aligned.
    let mut cursor = str;
    while (cursor as usize) % 8 != 0 {
        if unsafe { *cursor } == 0 {
            return unsafe { cursor.offset_from_unsigned(str) };
        }

        cursor = unsafe { cursor.add(1) };
    }

    // TODO: Maybe not a bad idea to do the loop below until ptr is 32-byte aligned and then use
    // the avx code?

    // Now scan a word at a time. We assume 64-bit systems in Joe's crt.
    let mut word_ptr = cursor as *const u64;

    loop {
        let word = unsafe { *word_ptr };
        let suspect = (word.wrapping_add(MAGIC_BITS) ^ !word) & HOLE_BITS;

        if suspect != 0 {
            // Confirm the hit by looking at the eight bytes of this word in order.
            let base = word_ptr as *const c_char;

            for i in 0..8 {
                if unsafe { *base.add(i) } == 0 {
                    return unsafe { base.offset_from_unsigned(str) } + i;
                }
            }
        }

        // False positive or no hit: move on to the next word.
        word_ptr = unsafe { word_ptr.add(1) };
    }
 }

 /// `strlen` implemented by the assembly in `strlen_x86_64_avx2.s`.
 ///
 /// # Safety
 ///
 /// The included assembly uses `ymm` vector instructions and aligned 32-byte loads, so the CPU
 /// must support them, and memory is read in whole aligned vectors, which can cover bytes
 /// located before `str` and after the terminator. `str` is presumably required to be
 /// NUL-terminated, as for C `strlen` (the assembly scans until it finds a zero byte).
 #[cfg(target_arch = "x86_64")]
 unsafe fn strlen_avx2(str: *const c_char) -> usize {
    let out: usize;
    core::arch::asm!(
        include_str!("./strlen_x86_64_avx2.s"),
        // Input pointer goes in through rdi, the length comes back in rax.
        in("rdi") str,
        lateout("rax") out,
        // Registers the assembly overwrites.
        lateout("ymm0") _,
        lateout("ymm1") _,
        lateout("ymm2") _,
        lateout("rdi") _,
        lateout("rdx") _,
        lateout("rcx") _,
        // The assembly uses no stack and only reads memory.
        options(nostack, readonly)
    );
    out
 }

 /// `strlen` implemented by the assembly in `strlen_x86_64_avx512.s`.
 ///
 /// # Safety
 ///
 /// The included assembly uses `zmm` vector and `k` mask instructions and aligned 64-byte
 /// loads, so the CPU must support them, and memory is read in whole aligned vectors, which can
 /// cover bytes located before `str` and after the terminator. `str` is presumably required to
 /// be NUL-terminated, as for C `strlen` (the assembly scans until it finds a zero byte).
 #[cfg(target_arch = "x86_64")]
 unsafe fn strlen_avx512(str: *const c_char) -> usize {
    let out: usize;
    core::arch::asm!(
        include_str!("./strlen_x86_64_avx512.s"),
        // Input pointer goes in through rdi, the length comes back in rax.
        in("rdi") str,
        lateout("rax") out,
        // Registers the assembly overwrites.
        lateout("zmm0") _,
        lateout("zmm1") _,
        lateout("zmm2") _,
        lateout("rdi") _,
        lateout("rdx") _,
        lateout("rcx") _,
        lateout("k1") _,
        lateout("k2") _,
        // The assembly uses no stack and only reads memory.
        options(nostack, readonly)
    );
    out
 }

 /// Micro-benchmarks comparing the `strlen` implementations (x86_64 only, `cfg(bench)` builds).
 #[cfg(bench)]
 #[cfg(target_arch = "x86_64")]
 mod bench {
    use super::*;
    use crate::selftest::micro::*;

    /// Benchmarks each implementation on strings whose length roughly doubles per input.
    // NOTE(review): `#[bench]`, `empty`, `BenchFn` and `BenchConfig` presumably come from
    // `crate::selftest::micro` — confirm their exact semantics there.
    #[bench]
    fn strlen_bench() {
        use crate::sys::detect::Feature;
        use crate::sys::detect::FeatureSet;

        // 8 KiB of non-zero bytes with a terminator in the last slot.
        let mut buffer = [1; 1024 * 8];
        buffer[buffer.len() - 1] = 0;
        let mut size = 1;

        const AVX2: FeatureSet = FeatureSet::new(Feature::avx2);
        const AVX512: FeatureSet = FeatureSet::EMPTY
            .add(Feature::avx512f)
            .add(Feature::avx512bw);

        /// One benchmark input: a pointer to the string and its expected length.
        struct Input {
            len: usize,
            ptr: *const i8,
        }

        let input = empty::<Input>()
            .set_throughput("byte", |v| v.len)
            .chain_with(move || {
                // Overwrite the previous terminator, double the size and terminate again.
                buffer[size - 1] = 32;
                size *= 2;

                if size <= buffer.len() {
                    buffer[size - 1] = 0;
                    Some(Input {
                        len: size - 1,
                        ptr: buffer.as_ptr(),
                    })
                } else {
                    None
                }
            });

        // The SIMD variants are skipped when their features are not detected.
        let fns = &[
            BenchFn::new("avx2", |input: &Input| unsafe { strlen_avx2(input.ptr) })
                .skip_if(!AVX2.detected()),
            BenchFn::new("avx512", |input: &Input| unsafe {
                strlen_avx512(input.ptr)
            })
            .skip_if(!AVX512.detected()),
            BenchFn::new("magic", |input| unsafe { strlen_magic(input.ptr) }),
        ];

        BenchConfig::default().set_name("strlen").run(input, fns);
    }
 }

 #[cfg(test)]
 mod test {
    use core::ops::Deref;
    use core::ops::DerefMut;

    use super::*;

    /// Size in bytes of the scratch buffer the test strings are laid out in.
    const CAP: usize = 1024 * 5;

    /// Scratch buffer declared with 512-byte alignment, so that the start offset chosen by the
    /// test fully determines the alignment of the string under test.
    #[repr(align(512))]
    struct Buffer([u8; CAP]);

    impl Deref for Buffer {
        type Target = [u8];
        fn deref(&self) -> &Self::Target {
            &self.0
        }
    }

    impl DerefMut for Buffer {
        fn deref_mut(&mut self) -> &mut Self::Target {
            &mut self.0
        }
    }

    /// Exercises one `strlen` implementation.
    ///
    /// `f` is run against strings of many lengths at every start offset in `0..=512`, then it
    /// is checked that empty strings are measured as 0. `name` identifies the implementation
    /// in every failure message.
    #[inline(never)]
    fn test_strlen_impl(name: &str, f: impl Fn(*const c_char) -> usize) {
        let mut alloc = Buffer([0; CAP]);
        assert_eq!(
            (alloc.0.as_ptr() as usize) % 64,
            0,
            "{name}: buffer is not 64-byte aligned"
        );

        let test_cases = [
            257, 511, 512, 513, 1023, 1024, 1025, 2047, 2048, 2049, 4095, 4096, 4097, 4098, 5000,
        ];

        for align_offset in 0..=512 {
            for len in (0..=256).chain(test_cases) {
                if align_offset + len + 1 >= alloc.len() {
                    continue; // Skip out-of-bound cases
                }

                // Fill with non-zero data
                alloc.fill(1);

                // Set the bytes before the input to zero, this tests implementations that
                // load an entire aligned chunk of data in initial step.
                alloc[..align_offset].fill(0);

                // Set null terminator at the correct location
                alloc[align_offset + len] = 0;
                alloc[align_offset + len + 1] = 0;

                // Pointer to the beginning of the test string
                // SAFETY: `align_offset` is within the buffer, as checked above.
                let start = unsafe { alloc.as_ptr().add(align_offset) };

                assert_eq!(
                    (start as usize) % 64,
                    align_offset % 64,
                    "{name}: failed for align={align_offset}"
                );

                let result = f(start as *const c_char);

                assert_eq!(
                    result,
                    len,
                    "{name}: Failed for align={} align_offset={}, len={}, got={}",
                    (start as usize) % 512,
                    align_offset,
                    len,
                    result
                );
            }
        }

        // Empty strings: zero one more leading byte per step and start right on it.
        for i in 0..512 {
            alloc[i] = 0;
            // SAFETY: `i < 512 < CAP`, so the pointer stays inside the buffer.
            let s = unsafe { alloc.as_ptr().add(i) as *const c_char };
            let result = f(s);
            assert_eq!(
                result, 0,
                "{name}: Failed zero-terminator check at offset {}: got {}",
                i, result
            );
        }
    }

    #[test]
    fn test_strlen_magic() {
        test_strlen_impl("magic", |x| unsafe { strlen_magic(x) });
    }

    #[test]
    #[cfg(target_arch = "x86_64")]
    fn test_strlen_x86_avx2() {
        use crate::sys::detect::Feature;
        use crate::sys::detect::FeatureSet;

        const AVX2: FeatureSet = FeatureSet::new(Feature::avx2);

        if !AVX2.detected() {
            return crate::selftest::ignore();
        }

        test_strlen_impl("x86_64-avx2", |x| unsafe { strlen_avx2(x) });
    }

    #[test]
    #[cfg(target_arch = "x86_64")]
    fn test_strlen_x86_avx512() {
        use crate::sys::detect::Feature;
        use crate::sys::detect::FeatureSet;

        const AVX512: FeatureSet = FeatureSet::EMPTY
            .add(Feature::avx512f)
            .add(Feature::avx512bw);

        if !AVX512.detected() {
            return crate::selftest::ignore();
        }

        test_strlen_impl("x86_64-avx512", |x| unsafe { strlen_avx512(x) });
    }

    #[test]
    fn test_strlen_default() {
        test_strlen_impl("default", |x| unsafe { strlen(x) });
    }
 }
diff --git a/strlen_x86_64_avx2.s b/strlen_x86_64_avx2.s
 # strlen using 256-bit vector compares. The input pointer arrives in rdi and the length is
 # left in rax. Numeric labels are local labels (`20f` = next `20:`, `40b` = previous `40:`).
 #
 # Registers used:
 # ymm0  -> constant zero
 # ymm1  -> loaded value AND result of compare against zero
 # ymm2  -> loaded value AND result of compare against zero
 #
 # rax   -> the original input ptr AND the return value
 # rdi   -> our 32-byte aligned pointer which moves forward
 #
 # edx   -> the finalized simd mask AND tzcnt of the mask
 # ecx   -> alignment padding = original_ptr % 32 AND the mask of the second vector in the
 #          two-vector steps

 # Keep the original input in rax; we don't need it until computing the return value.
 mov         rax, rdi
 # n = rcx, ecx = (original ptr) & 31 ; our initial alignment padding.
 mov         rcx, rdi
 and         rcx, 31
 # ymm0 = ZERO
 vpxor       ymm0, ymm0, ymm0
 # If already 32-byte aligned skip the partial step
 test        rcx, rcx
 jz          20f
 # align the ptr to 32-byte boundary
 and         rdi, -32
 # mask = to_mask(cmp(*ptr, ZERO))
 vmovdqa     ymm1, [rdi]
 vpcmpeqb    ymm1, ymm1, ymm0
 vpmovmskb   edx, ymm1
 # now we have the first 32 bytes in the mask but this may include some other
 # bytes that appear before the `str`. So we need to drop some of the LSBs
 # mask >>= rcx
 shr         edx, cl
 # if the mask is zero then go to the loop, there were no zeros here
 test        edx, edx
 jz          21f
 # there is at least one bit set so we found our zero; the mask was shifted so that bit 0 is
 # the first byte of `str`, which makes the tzcnt the length itself.
 tzcnt       eax, edx
 jmp         90f
 # if we're here the first 32 bytes did not contain a zero.
 # and rdi is 32-byte aligned.
 21:
  add         rdi, 32
 # ====================== 32-byte aligned
 20:
  # One single-vector step unless rdi is already 64-byte aligned.
  test        rdi, 63
  jz          30f
  # mask = to_mask(cmp(*ptr, ZERO))
  vmovdqa     ymm1, [rdi]
  vpcmpeqb    ymm1, ymm1, ymm0
  vpmovmskb   edx, ymm1
  test        edx, edx
  jnz         70f
  add         rdi, 32
 # ===================== 64-byte aligned
 30:
  # One two-vector step unless rdi is already 128-byte aligned.
  test        rdi, 127
  jz          40f
  vmovdqa     ymm1, [rdi]
  vmovdqa     ymm2, [rdi + 32]
  vpcmpeqb    ymm1, ymm1, ymm0
  vpcmpeqb    ymm2, ymm2, ymm0
  vpmovmskb   edx, ymm1
  vpmovmskb   ecx, ymm2
  or          edx, ecx
  jnz         60f
  add         rdi, 64
 # ===================== 128-byte aligned
 40:
  # Main loop: 128 bytes per iteration, checked as two pairs of vectors.
  # Chunk 1 or 2
  vmovdqa     ymm1, [rdi]
  vmovdqa     ymm2, [rdi + 32]
  vpcmpeqb    ymm1, ymm1, ymm0
  vpcmpeqb    ymm2, ymm2, ymm0
  vpmovmskb   edx, ymm1
  vpmovmskb   ecx, ymm2
  or          edx, ecx
  jnz         60f
  # Chunk 3 or 4
  vmovdqa     ymm1, [rdi + 64]
  vmovdqa     ymm2, [rdi + 96]
  vpcmpeqb    ymm1, ymm1, ymm0
  vpcmpeqb    ymm2, ymm2, ymm0
  vpmovmskb   edx, ymm1
  vpmovmskb   ecx, ymm2
  or          edx, ecx
  jnz         50f
 45:
  add         rdi, 128
  jmp         40b
 # ================================== END
 # ==== Found in ymm1 or ymm2 in chunk 3 or 4
 50:
  # Re-extract masks since we lost them in the 'or'
  vmovdqa     ymm1, [rdi + 64]
  vpcmpeqb    ymm1, ymm1, ymm0
  vpmovmskb   edx, ymm1
  test        edx, edx
  jz          52f
  # Found in chunk 3
  add         rdi, 64
  jmp         70f
  # Found in chunk 4
  52:
  vmovdqa     ymm2, [rdi + 96]
  vpcmpeqb    ymm2, ymm2, ymm0
  vpmovmskb   edx, ymm2
  tzcnt       edx, edx
  add         rdi, 96
  jmp         80f
 # ==== Found in ymm1 or ymm2 in chunk 1 or 2
 60:
  # Re-extract masks since we lost them in the 'or'
  vmovdqa     ymm1, [rdi]
  vpcmpeqb    ymm1, ymm1, ymm0
  vpmovmskb   edx, ymm1
  test        edx, edx
  jz          62f
  # Found in chunk 1
  jmp         70f
  # Found in chunk 2
  62:
  vmovdqa     ymm2, [rdi + 32]
  vpcmpeqb    ymm2, ymm2, ymm0
  vpmovmskb   edx, ymm2
  tzcnt       edx, edx
  add         rdi, 32
  jmp         80f
 # ==== Found in ymm1 in chunk 1
 70:
  tzcnt       edx, edx
 # length = (ptr:rdi - original_ptr:rax) + pos_one:rdx
 80:
  sub         rdi, rax
  add         rdi, rdx
  mov         rax, rdi
 # Common exit: rax holds the length.
 90:
  vzeroupper
diff --git a/strlen_x86_64_avx512.s b/strlen_x86_64_avx512.s
 # strlen using 512-bit vector compares. The input pointer arrives in rdi and the length is
 # left in rax. Numeric labels are local labels (`20f` = next `20:`, `40b` = previous `40:`).
 #
 # Registers used:
 # zmm0  -> constant zero
 # zmm1  -> loaded value that is compared against zero
 # zmm2  -> loaded value that is compared against zero
 # k1    -> compare result (byte mask) for zmm1
 # k2    -> compare result (byte mask) for zmm2
 #
 # rax   -> the original input ptr AND the return value
 # rdi   -> our 64-byte aligned pointer which moves forward
 #
 # rdx   -> the finalized simd mask AND tzcnt of the mask
 # ecx   -> alignment padding = original_ptr % 64

 # Keep the original input in rax; we don't need it until computing the return value.
 mov         rax, rdi
 # n = rcx, ecx = (original ptr) & 63 ; our initial alignment padding.
 mov         rcx, rdi
 and         rcx, 63
 # zmm0 = ZERO
 vpxord      zmm0, zmm0, zmm0

 # If already 64-byte aligned skip the partial step, which could lead us
 # to the 256-byte aligned loop sooner.
 test        rcx, rcx
 jz          20f

 # TODO: This step could be improved to avoid loading too much
 # garbage data.
 # align the ptr to 64-byte boundary
 and         rdi, -64
 # mask = to_mask(cmp(*ptr, ZERO))
 vmovdqa32   zmm1, [rdi]
 vpcmpb      k1, zmm1, zmm0, 0
 kmovq       rdx, k1
 # now we have the first 64 bytes in the mask but this may include some other
 # bytes that appear before the `str`. So we need to drop some of the LSBs
 # mask >>= rcx
 shr         rdx, cl
 # if the mask is zero then go to the loop, there were no zeros here
 test        rdx, rdx
 # NOTE we don't do runtime target detection for tzcnt, and if not supported it decodes
 # to `bsf`, and bsf and tzcnt give the same result if the input is not zero.
 # so we do this explicit check against zero here in this branch before jumping to the
 # tzcnt.
 jz          21f
 # there is at least one bit set so we found our zero now we have to calculate the length.
 # we don't need the mask anymore just need to know how many trailing zeros it has.
 tzcnt       rax, rdx
 jmp         90f

 # if we're here the first 64 bytes did not contain a zero.
 # and rdi is 64-byte aligned.

 21:
  add         rdi, 64

 # ====================== 64-byte aligned
 20:
  # One single-vector step unless rdi is already 128-byte aligned.
  test        rdi, 127
  jz          30f
  # mask = to_mask(cmp(*ptr, ZERO))
  vmovdqa32   zmm1, [rdi]
  vpcmpb      k1, zmm1, zmm0, 0
  kortestq    k1, k1
  jnz         70f
  add         rdi, 64

 # ===================== 128-byte aligned
 30:
  # One two-vector step unless rdi is already 256-byte aligned.
  test        rdi, 255
  jz          40f
  vmovdqa32   zmm1, [rdi]
  vmovdqa32   zmm2, [rdi + 64]
  vpcmpb      k1, zmm1, zmm0, 0
  vpcmpb      k2, zmm2, zmm0, 0
  kortestq    k1, k2
  jnz         60f
  add         rdi, 128

 # ===================== 256-byte aligned
 40:
  # Main loop: 256 bytes per iteration, checked as two pairs of vectors.
  # Chunk 1 or 2
  vmovdqa32   zmm1, [rdi]
  vmovdqa32   zmm2, [rdi + 64]
  vpcmpb      k1, zmm1, zmm0, 0
  vpcmpb      k2, zmm2, zmm0, 0
  kortestq    k1, k2
  jnz         60f
  # Chunk 3 or 4
  vmovdqa32   zmm1, [rdi + 128]
  vmovdqa32   zmm2, [rdi + 192]
  vpcmpb      k1, zmm1, zmm0, 0
  vpcmpb      k2, zmm2, zmm0, 0
  kortestq    k1, k2
  jnz         50f

 45:
  add         rdi, 256
  jmp         40b

 # ================================== END
 # ==== Found in k1 or k2 in chunk 3 or 4
 50:
  # k1/k2 still hold the individual masks, so nothing has to be reloaded here.
  kortestq    k1, k1
  jz          52f
  # Found in chunk 3
  add         rdi, 128
  jmp         70f
  # Found in chunk 4
  52:
  kmovq       rdx, k2
  tzcnt       rdx, rdx
  add         rdi, 192
  jmp         80f

 # ==== Found in k1 or k2 in chunk 1 or 2
 60:
  kortestq    k1, k1
  jz          62f
  # Found in chunk 1
  jmp         70f
  # Found in chunk 2
  62:
  kmovq       rdx, k2
  tzcnt       rdx, rdx
  add         rdi, 64
  jmp         80f

 # ==== Found in k1 in chunk 1
 70:
  kmovq       rdx, k1
  tzcnt       rdx, rdx

 # length = (ptr:rdi - original_ptr:rax) + pos_one:rdx
 80:
  sub         rdi, rax
  add         rdi, rdx
  mov         rax, rdi

 # Common exit: rax holds the length.
 90:
  vzeroupper
	// © 2025 Parsa Ghadimi. MIT OR Apache-2.0 License.

	//! Runtime CPU feature detection. The `setup` function from this code must be called at the very
	//! start of the process initialization.

	use core::fmt::Debug;
	use core::fmt::Formatter;
	use core::sync::atomic::AtomicU64;
	use core::sync::atomic::Ordering;

	static CACHE: AtomicU64 = AtomicU64::new(0);

	/// Detects the supported CPU features and publishes them to `CACHE`.
	///
	/// Every detected [`Feature`] sets the bit `1 << (feature as u8)` in the stored mask. When the
	/// CPU does not implement CPUID leaf 1 the function returns without storing anything.
	///
	/// AVX2 and the AVX-512 features are reported only when the OS has enabled the matching
	/// register state in XCR0. BMI2 operates on general-purpose registers only, so it is reported
	/// straight from CPUID leaf 7 without any OS-support requirement.
	#[cfg(target_arch = "x86_64")]
	pub(super) fn setup() {
	    use core::arch::x86_64::__cpuid;
	    use core::arch::x86_64::__cpuid_count;
	    use core::arch::x86_64::_xgetbv;
	    use core::arch::x86_64::CpuidResult;

	    /// Sets the bit of `f` in `out` when bit number `bit` of the CPUID register `reg` is set.
	    #[inline]
	    fn maybe(out: &mut u64, f: Feature, reg: u32, bit: u8) {
	        if reg & (1 << bit) != 0 {
	            *out |= 1 << (f as u8);
	        }
	    }

	    // SAFETY: the `cpuid` instruction is available on every x86_64 CPU.
	    let max_basic_leaf = unsafe { __cpuid(0) }.eax;

	    if max_basic_leaf < 1 {
	        return;
	    }

	    let mut out = 0u64;

	    // CPUID leaf 1: processor info and feature bits
	    // SAFETY: leaf 1 is implemented, as checked above.
	    let CpuidResult { ecx, edx, .. } = unsafe { __cpuid(1) };

	    // EDX features
	    maybe(&mut out, Feature::sse2, edx, 26);

	    // ECX features
	    maybe(&mut out, Feature::sse3, ecx, 0);
	    maybe(&mut out, Feature::sse4_1, ecx, 19);
	    maybe(&mut out, Feature::sse4_2, ecx, 20);
	    maybe(&mut out, Feature::popcnt, ecx, 23);
	    maybe(&mut out, Feature::rdrand, ecx, 30);

	    if max_basic_leaf >= 7 {
	        // CPUID leaf 7, sub-leaf 0: structured extended feature flags
	        // SAFETY: leaf 7 is implemented, as checked above.
	        let CpuidResult { ebx, .. } = unsafe { __cpuid_count(7, 0) };

	        // No extended register state is involved, so no XCR0 check is needed.
	        maybe(&mut out, Feature::bmi2, ebx, 8);

	        // For AVX/AVX2/AVX512, we need:
	        // 1. OSXSAVE (ECX bit 27) - OS has enabled XSAVE
	        // 2. XCR0 bits to verify OS saves the relevant state
	        let osxsave = ecx & (1 << 27) != 0;

	        if osxsave {
	            // XCR0 register via XGETBV
	            // SAFETY: OSXSAVE is set, so `xgetbv` is enabled and XCR0 can be read.
	            let xcr0 = unsafe { _xgetbv(0) };

	            // XCR0[2:1] = 11b means OS saves XMM and YMM state (required for AVX)
	            let avx_os_support = (xcr0 & 0b110) == 0b110;

	            // XCR0[7:5] = 111b means OS saves opmask, ZMM_Hi256, Hi16_ZMM (required for AVX512)
	            let avx512_os_support = avx_os_support && (xcr0 & 0b1110_0000) == 0b1110_0000;

	            if avx_os_support {
	                maybe(&mut out, Feature::avx2, ebx, 5);
	            }

	            if avx512_os_support {
	                maybe(&mut out, Feature::avx512f, ebx, 16);
	                maybe(&mut out, Feature::avx512bw, ebx, 30);
	                maybe(&mut out, Feature::avx512vl, ebx, 31);
	            }
	        }
	    }

	    CACHE.store(out, Ordering::Release);
	}

	/// Feature-detection entry point for targets other than x86_64.
	///
	/// Nothing is probed on these targets, so the body is intentionally empty.
	#[cfg(not(target_arch = "x86_64"))]
	pub fn setup() {}

	/// CPU features that can be detected on x86_64.
	///
	/// The discriminant of each variant is used as its bit index in `CACHE` and in a
	/// [`FeatureSet`], so the declaration order determines the bit layout.
	#[cfg(target_arch = "x86_64")]
	#[derive(Copy, Clone, Debug)]
	#[allow(non_camel_case_types)]
	pub enum Feature {
	/// SSE2 (CPUID leaf 1, EDX bit 26).
	sse2,
	/// SSE3 (CPUID leaf 1, ECX bit 0).
	sse3,
	/// SSE4.1 (CPUID leaf 1, ECX bit 19).
	sse4_1,
	/// SSE4.2 (CPUID leaf 1, ECX bit 20).
	sse4_2,
	/// POPCNT (CPUID leaf 1, ECX bit 23).
	popcnt,
	/// RDRAND (CPUID leaf 1, ECX bit 30).
	rdrand,
	/// AVX2 (CPUID leaf 7 sub-leaf 0, EBX bit 5); only reported when the OS saves YMM state.
	avx2,
	/// AVX-512 Foundation (CPUID leaf 7 sub-leaf 0, EBX bit 16); only reported when the OS saves
	/// the AVX-512 register state.
	avx512f,
	/// AVX-512 BW (CPUID leaf 7 sub-leaf 0, EBX bit 30); same OS requirement as `avx512f`.
	avx512bw,
	/// AVX-512 VL (CPUID leaf 7 sub-leaf 0, EBX bit 31); same OS requirement as `avx512f`.
	avx512vl,
	/// BMI2 (CPUID leaf 7 sub-leaf 0, EBX bit 8).
	bmi2,
	}

	/// Placeholder for targets other than x86_64: no features are defined there, so the enum has
	/// no variants and no value of it can be constructed.
	#[derive(Copy, Clone, Debug)]
	#[cfg(not(target_arch = "x86_64"))]
	pub enum Feature {}

	/// A set of features that can be detected either during compile time or runtime.
	///
	/// Internally this is a bitmask in which bit `i` stands for the [`Feature`] whose discriminant
	/// is `i`.
	///
	/// Note: Try to construct your [`FeatureSet`] as a `const MY_FEATURE_SET: ...` first and then use
	/// `.detected()` on it, this will help the compiler with more aggressive dead code elimination in
	/// case all of the features are known during compile time.
	#[derive(Copy, Clone)]
	pub struct FeatureSet(u64);

	impl FeatureSet {
	    /// The set with no features in it; [`FeatureSet::detected`] always returns `true` for it.
	    pub const EMPTY: Self = Self(0);

	    /// Returns the set of detected features, i.e. the current content of the runtime cache.
	    pub fn active() -> Self {
	        Self(CACHE.load(Ordering::Acquire))
	    }

	    /// Returns the set of features that were active during compile time.
	    pub const fn compile_time() -> Self {
	        /// Adds the bit of `feature` to `n` when the feature is enabled at compile time.
	        const fn visit(n: u64, feature: Feature) -> u64 {
	            if feature.check_cfg() {
	                n | (1 << (feature as u8))
	            } else {
	                n
	            }
	        }

	        // Every variant of `Feature` has to be visited here, otherwise a feature that is
	        // enabled at compile time would be missing from the result.
	        #[cfg(target_arch = "x86_64")]
	        let n = {
	            let mut n: u64 = 0;
	            n = visit(n, Feature::sse2);
	            n = visit(n, Feature::sse3);
	            n = visit(n, Feature::sse4_1);
	            n = visit(n, Feature::sse4_2);
	            n = visit(n, Feature::popcnt);
	            n = visit(n, Feature::rdrand);
	            n = visit(n, Feature::avx2);
	            n = visit(n, Feature::avx512f);
	            n = visit(n, Feature::avx512bw);
	            n = visit(n, Feature::avx512vl);
	            n = visit(n, Feature::bmi2);
	            n
	        };

	        #[cfg(not(target_arch = "x86_64"))]
	        let n = 0u64;

	        Self(n)
	    }

	    /// Creates a set that requires just `feature`.
	    #[inline(always)]
	    pub const fn new(feature: Feature) -> Self {
	        Self::EMPTY.add(feature)
	    }

	    /// Returns this set extended by `feature`.
	    ///
	    /// Features that are already enabled at compile time are not recorded, so the returned set
	    /// only contains what still has to be checked at runtime.
	    #[inline(always)]
	    pub const fn add(self, feature: Feature) -> Self {
	        // Avoid inserting the features we know are present at compile time
	        // into the feature set. `detected` for an empty feature set will
	        // always return true and inlines into:
	        //
	        // ```plain
	        // let x = unsafe { CACHE };
	        // (x & 0) == 0
	        // ```
	        //
	        // And the compiler can trivially see that this will be true for any
	        // value in the CACHE which will basically be replaced by a constant
	        // and nice things will happen and branches will be eliminated.
	        //
	        // Which is really nice for `-Ctarget-cpu=native` builds.
	        if feature.check_cfg() {
	            return self;
	        }

	        let n = 1u64 << (feature as u8);
	        Self(self.0 | n)
	    }

	    /// Returns `true` if all of the features in this feature set are found.
	    ///
	    /// With the `no-rt-feature-detect` cargo feature enabled the runtime cache is never
	    /// consulted: only the empty set (everything known at compile time) counts as detected.
	    #[inline(always)]
	    pub fn detected(self) -> bool {
	        if cfg!(feature = "no-rt-feature-detect") {
	            return self.0 == 0;
	        }

	        // Note: Currently LLVM is capable to realize that `setup` strictly dominates
	        // `detected` and that `CACHE` is a runtime constant and it treats it as such
	        // which means `detected()` gets hoisted and loops using it can get unrolled.
	        //
	        // This is a hack I was going to do if that wasn't the case. I'm gonna leave
	        // it here in case this changes.
	        //
	        // cfg_match! {
	        //     // This is a hack
	        //     target_arch = "x86_64" => {
	        //         unsafe {
	        //             asm!("mov {0:r}, [{1}]",
	        //                 lateout(reg) x,
	        //                 sym CACHE,
	        //                 options(nomem, nostack, pure))
	        //         }
	        //     }
	        //     _ => {
	        //         x = unsafe { CACHE }
	        //     }
	        // }

	        let x = CACHE.load(Ordering::Acquire);
	        (x & self.0) == self.0
	    }
	}

	impl Feature {
	#[inline(always)]
	#[cfg(target_arch = "x86_64")]
	const fn check_cfg(self) -> bool {
	match self {
	Feature::sse2 => cfg!(target_feature = "sse2"),
	Feature::sse3 => cfg!(target_feature = "sse3"),
	Feature::sse4_1 => cfg!(target_feature = "sse4.1"),
	Feature::sse4_2 => cfg!(target_feature = "sse4.2"),
	Feature::popcnt => cfg!(target_feature = "popcnt"),
	Feature::rdrand => cfg!(target_feature = "rdrand"),
	Feature::avx2 => cfg!(target_feature = "avx2"),
	Feature::avx512f => cfg!(target_feature = "avx512f"),
	Feature::avx512bw => cfg!(target_feature = "avx512bw"),
	Feature::avx512vl => cfg!(target_feature = "avx512vl"),
	Feature::bmi2 => cfg!(target_feature = "bmi2"),
	}
	}

	#[inline(always)]
	#[cfg(not(target_arch = "x86_64"))]
	const fn check_cfg(self) -> bool {
	true
	}
	}

	impl Debug for FeatureSet {
	fn fmt(&self, f: &mut Formatter<'_>) -> core::fmt::Result {
	let mut s = f.debug_set();

	#[cfg(target_arch = "x86_64")]
	{
	if self.0 & (1 << (Feature::sse2 as u8)) != 0 {
	s.entry(&Feature::sse2);
	}
	if self.0 & (1 << (Feature::sse3 as u8)) != 0 {
	s.entry(&Feature::sse3);
	}
	if self.0 & (1 << (Feature::sse4_1 as u8)) != 0 {
	s.entry(&Feature::sse4_1);
	}
	if self.0 & (1 << (Feature::sse4_2 as u8)) != 0 {
	s.entry(&Feature::sse4_2);
	}
	if self.0 & (1 << (Feature::popcnt as u8)) != 0 {
	s.entry(&Feature::popcnt);
	}
	if self.0 & (1 << (Feature::rdrand as u8)) != 0 {
	s.entry(&Feature::rdrand);
	}
	if self.0 & (1 << (Feature::avx2 as u8)) != 0 {
	s.entry(&Feature::avx2);
	}
	if self.0 & (1 << (Feature::avx512f as u8)) != 0 {
	s.entry(&Feature::avx512f);
	}
	if self.0 & (1 << (Feature::avx512bw as u8)) != 0 {
	s.entry(&Feature::avx512bw);
	}
	if self.0 & (1 << (Feature::avx512vl as u8)) != 0 {
	s.entry(&Feature::avx512vl);
	}
	if self.0 & (1 << (Feature::bmi2 as u8)) != 0 {
	s.entry(&Feature::bmi2);
	}
	}

	s.finish()
	}
	}

	/// CPU feature-based function dispatch, like GNU ifunc but in pure Rust.
	///
	/// The generated function keeps a function pointer in a `static`. That pointer initially refers
	/// to an internal resolver, so the first call selects the first arm whose features are all
	/// reported by `FeatureSet::detected` (falling back to the `_` arm), caches the choice and
	/// forwards the call. Subsequent calls go directly through the cached function pointer.
	///
	/// The resolver does not probe the CPU itself, it only consults the feature cache. The
	/// `setup` function of the detect module therefore has to run before the first call for
	/// runtime-detected arms to be selected.
	///
	/// Arms are tried in the order they are written, so list the most specialized one first.
	///
	/// # Example
	///
	/// ```ignore
	/// cpufn! {
	///     /// Compute string length with CPU-specific optimizations.
	///     pub unsafe fn strlen(s: *const u8) -> usize {
	///         #[cfg(target_arch = "x86_64")]
	///         [avx512bw, avx512vl] => strlen_avx512,
	///         #[cfg(target_arch = "x86_64")]
	///         [avx2] => strlen_avx2,
	///         _ => strlen_generic,
	///     }
	/// }
	/// ```
	#[macro_export]
	macro_rules! cpufn {
	    // Safe fn variant
	    (
	        $(#[$fn_attr:meta])*
	        $vis:vis fn $name:ident ( $($arg:ident : $argty:ty),* $(,)? ) $(-> $ret:ty)? {
	            $(
	                $(#[$arm_attr:meta])*
	                [$($feature:ident),*] => $impl:path,
	            )*
	            _ => $default:path $(,)?
	        }
	    ) => {
	        $(#[$fn_attr])*
	        #[allow(unused_attributes)] // #[inline] ignored on #[no_mangle]
	        #[inline(always)]
	        $vis fn $name( $($arg : $argty),* ) $(-> $ret)? {
	            use core::sync::atomic::AtomicPtr;
	            use core::sync::atomic::Ordering;

	            type Fn = fn( $($argty),* ) $(-> $ret)?;

	            // Starts out pointing at `resolve`, which replaces it with the selected
	            // implementation on the first call.
	            static F: AtomicPtr<()> = AtomicPtr::new(resolve as *const () as *mut ());

	            #[cold]
	            #[inline(never)]
	            #[allow(unused)]
	            fn resolve( $($arg : $argty),* ) $(-> $ret)? {
	                use $crate::sys::detect::Feature;
	                use $crate::sys::detect::FeatureSet;

	                let f: Fn = 'select: {
	                    $(
	                        $(#[$arm_attr])*
	                        if FeatureSet::EMPTY $(.add(Feature::$feature))* .detected() {
	                            break 'select $impl;
	                        }
	                    )*
	                    $default
	                };

	                F.store(f as *const () as *mut (), Ordering::Release);
	                f( $($arg),* )
	            }

	            // SAFETY: `F` only ever holds pointers that were created from values of type `Fn`
	            // (`resolve` or one of the listed implementations).
	            let f: Fn = unsafe {
	                core::mem::transmute::<*mut (), Fn>(F.load(Ordering::Acquire))
	            };
	            f( $($arg),* )
	        }
	    };

	    // Unsafe fn variant
	    (
	        $(#[$fn_attr:meta])*
	        $vis:vis unsafe fn $name:ident ( $($arg:ident : $argty:ty),* $(,)? ) $(-> $ret:ty)? {
	            $(
	                $(#[$arm_attr:meta])*
	                [$($feature:ident),*] => $impl:path,
	            )*
	            _ => $default:path $(,)?
	        }
	    ) => {
	        $(#[$fn_attr])*
	        #[allow(unused_attributes)] // #[inline] ignored on #[no_mangle]
	        #[inline(always)]
	        $vis unsafe fn $name( $($arg : $argty),* ) $(-> $ret)? {
	            use core::sync::atomic::AtomicPtr;
	            use core::sync::atomic::Ordering;

	            type Fn = unsafe fn( $($argty),* ) $(-> $ret)?;

	            // Starts out pointing at `resolve`, which replaces it with the selected
	            // implementation on the first call.
	            static F: AtomicPtr<()> = AtomicPtr::new(resolve as *const () as *mut ());

	            #[cold]
	            #[inline(never)]
	            #[allow(unused)]
	            unsafe fn resolve( $($arg : $argty),* ) $(-> $ret)? {
	                use $crate::sys::detect::Feature;
	                use $crate::sys::detect::FeatureSet;

	                let f: Fn = 'select: {
	                    $(
	                        $(#[$arm_attr])*
	                        if FeatureSet::EMPTY $(.add(Feature::$feature))* .detected() {
	                            break 'select $impl;
	                        }
	                    )*
	                    $default
	                };

	                F.store(f as *const () as *mut (), Ordering::Release);
	                f( $($arg),* )
	            }

	            // `F` only ever holds pointers that were created from values of type `Fn`
	            // (`resolve` or one of the listed implementations), so the transmute restores the
	            // original function pointer. The caller's obligations are passed on unchanged.
	            let f: Fn = core::mem::transmute::<*mut (), Fn>(F.load(Ordering::Acquire));
	            f( $($arg),* )
	        }
	    };
	}

	#[cfg(test)]
	mod tests {
	    /// Portable fallback: plain addition.
	    fn add_generic(a: i32, b: i32) -> i32 {
	        a + b
	    }

	    /// Stand-in for an AVX2 implementation; the extra 1000 proves that this one was picked.
	    #[cfg(target_arch = "x86_64")]
	    fn add_avx2(a: i32, b: i32) -> i32 {
	        a + b + 1000
	    }

	    cpufn! {
	        /// Test function for cpufn dispatch.
	        fn test_add(a: i32, b: i32) -> i32 {
	            #[cfg(target_arch = "x86_64")]
	            [avx2] => add_avx2,
	            _ => add_generic,
	        }
	    }

	    /// The dispatcher must pick the AVX2 arm exactly when AVX2 is reported as detected.
	    #[test]
	    fn test_cpufn_dispatch() {
	        let sum = test_add(2, 3);

	        #[cfg(target_arch = "x86_64")]
	        let avx2_selected = super::FeatureSet::new(super::Feature::avx2).detected();

	        #[cfg(not(target_arch = "x86_64"))]
	        let avx2_selected = false;

	        // 2 + 3 + 1000 when the AVX2 arm was taken, plain 2 + 3 otherwise.
	        let expected = if avx2_selected { 1005 } else { 5 };
	        assert_eq!(sum, expected);
	    }
	}
	// © 2025 Parsa Ghadimi. MIT OR Apache-2.0 License.

	use core::ffi::c_char;

	use crate::cpufn;

	cpufn! {
	/// `strlen` exported under its unmangled name and dispatched on CPU features.
	///
	/// Uses `strlen_avx512` when both `avx512f` and `avx512bw` are detected, otherwise
	/// `strlen_avx2` when `avx2` is detected (both x86_64 only), and `strlen_magic` as the
	/// fallback.
	///
	/// # Safety
	///
	/// `s` is passed unchanged to the selected implementation.
	/// NOTE(review): presumably `s` must point to a NUL-terminated string -- confirm.
	#[unsafe(no_mangle)]
	pub unsafe fn strlen(s: *const c_char) -> usize {
	#[cfg(target_arch = "x86_64")]
	[avx512f, avx512bw] => strlen_avx512,
	#[cfg(target_arch = "x86_64")]
	[avx2] => strlen_avx2,
	_ => strlen_magic,
	}
	}

	/// Portable `strlen` that scans one 8-byte word at a time.
	///
	/// Bytes are checked one by one until the pointer is 8-byte aligned; after that whole aligned
	/// words are loaded and tested with a bit trick that flags every word containing a zero byte.
	/// The trick can also flag a word without a zero byte, so each flagged word is verified byte by
	/// byte before a length is returned.
	///
	/// # Safety
	///
	/// `str` must point to a NUL-terminated string. Because whole aligned words are loaded, up to 7
	/// bytes after the terminator (within the same aligned 8-byte word) are read as well.
	unsafe fn strlen_magic(str: *const c_char) -> usize {
	    let mut s = str;

	    // Byte-wise prologue until `s` is 8-byte aligned.
	    while (s as usize) & 7 != 0 {
	        if *s == 0 {
	            return s.offset_from_unsigned(str);
	        }

	        s = s.add(1);
	    }

	    // Now use the glibc/Sean Eron Anderson trick to scan word at a time.
	    // We assume 64-bit systems in Joe's crt.
	    //
	    // TODO: Maybe not a bad idea to do the loop below until ptr is 32-byte aligned and then use
	    // the avx code?
	    let mut w: *const u64 = s.cast();

	    loop {
	        let v = *w;

	        // Check if the word has a zero in it. It may also have a 0x80 at the high byte.
	        let maybe_has_zero: u64 = (v.wrapping_add(0x7efefefefefefeff) ^ !v) & 0x8101010101010100;

	        if maybe_has_zero != 0 {
	            let ch: *const c_char = w.cast();

	            // Confirm the hit and locate the first zero byte inside this word, if there is one.
	            for i in 0..8 {
	                if *ch.add(i) == 0 {
	                    return ch.offset_from_unsigned(str) + i;
	                }
	            }
	        }

	        // False positive or no zero at all: load the next word.
	        w = w.add(1);
	    }
	}

	/// `strlen` implemented with AVX2; the body is the assembly in `strlen_x86_64_avx2.s`.
	///
	/// The pointer is passed to the assembly in `rdi` and the length is returned in `rax`.
	///
	/// # Safety
	///
	/// The CPU must support the instructions used by the assembly (the dispatcher selects this
	/// function only when `avx2` is detected).
	/// NOTE(review): presumably `str` must point to a NUL-terminated string -- confirm.
	#[cfg(target_arch = "x86_64")]
	unsafe fn strlen_avx2(str: *const c_char) -> usize {
	    let out: usize;
	    core::arch::asm!(
	        include_str!("./strlen_x86_64_avx2.s"),
	        in("rdi") str,
	        lateout("rax") out,
	        // Vector registers the routine computes with.
	        lateout("ymm0") _,
	        lateout("ymm1") _,
	        lateout("ymm2") _,
	        // The routine ends with `vzeroupper`, which zeroes the upper half of every one of
	        // ymm0-ymm15, so the remaining vector registers are clobbered as well.
	        lateout("ymm3") _,
	        lateout("ymm4") _,
	        lateout("ymm5") _,
	        lateout("ymm6") _,
	        lateout("ymm7") _,
	        lateout("ymm8") _,
	        lateout("ymm9") _,
	        lateout("ymm10") _,
	        lateout("ymm11") _,
	        lateout("ymm12") _,
	        lateout("ymm13") _,
	        lateout("ymm14") _,
	        lateout("ymm15") _,
	        // General-purpose scratch registers.
	        lateout("rdi") _,
	        lateout("rdx") _,
	        lateout("rcx") _,
	        options(nostack, readonly)
	    );
	    out
	}

	/// `strlen` implemented with AVX-512; the body is the assembly in `strlen_x86_64_avx512.s`.
	///
	/// The pointer is passed to the assembly in `rdi` and the length is returned in `rax`.
	///
	/// # Safety
	///
	/// The CPU must support the instructions used by the assembly (the dispatcher selects this
	/// function only when `avx512f` and `avx512bw` are detected).
	/// NOTE(review): presumably `str` must point to a NUL-terminated string -- confirm.
	#[cfg(target_arch = "x86_64")]
	unsafe fn strlen_avx512(str: *const c_char) -> usize {
	    let out: usize;
	    core::arch::asm!(
	        include_str!("./strlen_x86_64_avx512.s"),
	        in("rdi") str,
	        lateout("rax") out,
	        // Vector registers the routine computes with.
	        lateout("zmm0") _,
	        lateout("zmm1") _,
	        lateout("zmm2") _,
	        // The routine ends with `vzeroupper`, which zeroes the upper bits of every one of
	        // zmm0-zmm15, so the remaining low vector registers are clobbered as well.
	        lateout("zmm3") _,
	        lateout("zmm4") _,
	        lateout("zmm5") _,
	        lateout("zmm6") _,
	        lateout("zmm7") _,
	        lateout("zmm8") _,
	        lateout("zmm9") _,
	        lateout("zmm10") _,
	        lateout("zmm11") _,
	        lateout("zmm12") _,
	        lateout("zmm13") _,
	        lateout("zmm14") _,
	        lateout("zmm15") _,
	        // General-purpose scratch registers.
	        lateout("rdi") _,
	        lateout("rdx") _,
	        lateout("rcx") _,
	        // Mask registers holding the compare results.
	        lateout("k1") _,
	        lateout("k2") _,
	        options(nostack, readonly)
	    );
	    out
	}

	#[cfg(bench)]
	#[cfg(target_arch = "x86_64")]
	mod bench {
	    use super::*;
	    use crate::selftest::micro::*;

	    /// Benchmarks the three `strlen` implementations on strings whose length doubles with every
	    /// input (1, 3, 7, ... bytes) inside one 8 KiB buffer.
	    #[bench]
	    fn strlen_bench() {
	        use crate::sys::detect::Feature;
	        use crate::sys::detect::FeatureSet;

	        let mut buffer = [1; 1024 * 8];
	        buffer[buffer.len() - 1] = 0;
	        let mut size = 1;

	        const AVX2: FeatureSet = FeatureSet::new(Feature::avx2);
	        const AVX512: FeatureSet = FeatureSet::EMPTY
	            .add(Feature::avx512f)
	            .add(Feature::avx512bw);

	        struct Input {
	            len: usize,
	            ptr: *const i8,
	        }

	        let input = empty::<Input>()
	            .set_throughput("byte", |v| v.len)
	            .chain_with(move || {
	                // Overwrite the previous terminator and move it to the doubled position.
	                buffer[size - 1] = 32;
	                size *= 2;

	                if size <= buffer.len() {
	                    buffer[size - 1] = 0;
	                    Some(Input {
	                        len: size - 1,
	                        ptr: buffer.as_ptr(),
	                    })
	                } else {
	                    None
	                }
	            });

	        // The SIMD variants are skipped when the CPU does not report the required features.
	        let fns = &[
	            BenchFn::new("avx2", |input: &Input| unsafe { strlen_avx2(input.ptr) })
	                .skip_if(!AVX2.detected()),
	            BenchFn::new("avx512", |input: &Input| unsafe {
	                strlen_avx512(input.ptr)
	            })
	            .skip_if(!AVX512.detected()),
	            BenchFn::new("magic", |input| unsafe { strlen_magic(input.ptr) }),
	        ];

	        BenchConfig::default().set_name("strlen").run(input, fns);
	    }
	}

	#[cfg(test)]
	mod test {
	    use core::ops::Deref;
	    use core::ops::DerefMut;

	    use super::*;

	    /// Size of the scratch buffer all test strings are placed in.
	    const CAP: usize = 1024 * 5;

	    /// Scratch buffer with a large alignment, so that the offset into it fully determines the
	    /// alignment of the string pointer handed to the implementation under test.
	    #[repr(align(512))]
	    struct Buffer([u8; CAP]);
	    impl Deref for Buffer {
	        type Target = [u8];
	        fn deref(&self) -> &Self::Target {
	            &self.0
	        }
	    }
	    impl DerefMut for Buffer {
	        fn deref_mut(&mut self) -> &mut Self::Target {
	            &mut self.0
	        }
	    }

	    /// Runs `f` against strings of many lengths at every start offset from 0 to 512 and checks
	    /// the reported length. `name` identifies the implementation in failure messages.
	    #[inline(never)]
	    fn test_strlen_impl(name: &str, f: impl Fn(*const c_char) -> usize) {
	        let mut alloc = Buffer([0; CAP]);
	        assert_eq!((alloc.0.as_ptr() as usize) % 64, 0);

	        let test_cases = [
	            257, 511, 512, 513, 1023, 1024, 1025, 2047, 2048, 2049, 4095, 4096, 4097, 4098, 5000,
	        ];

	        for align_offset in 0..=512 {
	            for len in (0..=256).chain(test_cases) {
	                if align_offset + len + 1 >= alloc.len() {
	                    continue; // Skip out-of-bound cases
	                }

	                unsafe {
	                    assert_eq!(
	                        (alloc.0.as_ptr().add(align_offset) as usize) % 64,
	                        align_offset % 64,
	                        "failed for align={align_offset}"
	                    );
	                }

	                // Fill with non-zero data
	                alloc.fill(1);

	                // Set the bytes before the input to zero, this tests implementations that
	                // load an entire aligned chunk of data in initial step.
	                alloc[..align_offset].fill(0);

	                // Set null terminator at the correct location
	                alloc[align_offset + len] = 0;
	                alloc[align_offset + len + 1] = 0;

	                // Pointer to the beginning of the test string
	                let s = unsafe { alloc.as_ptr().add(align_offset) as *const c_char };

	                let result = f(s);

	                assert_eq!(
	                    result,
	                    len,
	                    "{name}: Failed for align={} align_offset={}, len={}, got={}",
	                    unsafe { (alloc.0.as_ptr().add(align_offset) as usize) % 512 },
	                    align_offset,
	                    len,
	                    result
	                );
	            }
	        }

	        // Zero the buffer front to back; a string starting at a zero byte must have length 0.
	        for i in 0..512 {
	            alloc[i] = 0;
	            let s = unsafe { alloc.as_ptr().add(i) as *const c_char };
	            let result = f(s);
	            assert_eq!(
	                result, 0,
	                "{name}: Failed zero-terminator check at offset {}: got {}",
	                i, result
	            );
	        }
	    }

	    #[test]
	    fn test_strlen_magic() {
	        test_strlen_impl("magic", |x| unsafe { strlen_magic(x) });
	    }

	    #[test]
	    #[cfg(target_arch = "x86_64")]
	    fn test_strlen_x86_avx2() {
	        use crate::sys::detect::Feature;
	        use crate::sys::detect::FeatureSet;

	        const AVX2: FeatureSet = FeatureSet::new(Feature::avx2);

	        if !AVX2.detected() {
	            return crate::selftest::ignore();
	        }

	        test_strlen_impl("x86_64-avx2", |x| unsafe { strlen_avx2(x) });
	    }

	    #[test]
	    #[cfg(target_arch = "x86_64")]
	    fn test_strlen_x86_avx512() {
	        use crate::sys::detect::Feature;
	        use crate::sys::detect::FeatureSet;

	        const AVX512: FeatureSet = FeatureSet::EMPTY
	            .add(Feature::avx512f)
	            .add(Feature::avx512bw);
	        if !AVX512.detected() {
	            return crate::selftest::ignore();
	        }

	        test_strlen_impl("x86_64-avx512", |x| unsafe { strlen_avx512(x) });
	    }

	    /// Exercises the exported, dispatched `strlen`.
	    #[test]
	    fn test_strlen_default() {
	        test_strlen_impl("default", |x| unsafe { strlen(x) });
	    }
	}
	# strlen for x86-64 using AVX2 (Intel syntax, no prologue/epilogue or `ret`).
	# NOTE(review): presumably this is the body that `strlen_avx2` splices into its `asm!`
	# block via `include_str!`; that wrapper passes the pointer in rdi and reads rax -- confirm.
	#
	# Outline: the first load is aligned down to 32 bytes and the bytes in front of the
	# string are shifted out of the mask. The pointer is then stepped to 64-byte and to
	# 128-byte alignment, after which the main loop scans 128 aligned bytes per iteration.
	#
	# Registers used:
	# ymm0 -> constant zero
	# ymm1 -> loaded value AND result of compare against zero
	# ymm2 -> loaded value AND result of compare against zero
	#
	# rax -> the original input ptr AND the return value
	# rdi -> our 32-byte aligned pointer which moves forward
	#
	# edx -> the finalized simd mask AND tzcnt of the mask
	# ecx -> alignment padding = original_ptr % 32; later reused as the mask of the second
	#        loaded chunk

	# Keep the original input in rax we don't need it until computing the return value.
	mov rax, rdi
	# n = rcx, ecx = (original ptr) & 31 ; our initial alignment padding.
	mov rcx, rdi
	and rcx, 31
	# ymm0 = ZERO
	vpxor ymm0, ymm0, ymm0
	# If already 32-byte aligned skip the partial step
	test rcx, rcx
	jz 20f
	# align the ptr to 32-byte boundary
	and rdi, -32
	# mask = to_mask(cmp(*ptr, ZERO))
	vmovdqa ymm1, [rdi]
	vpcmpeqb ymm1, ymm1, ymm0
	vpmovmskb edx, ymm1
	# now we have the first 32-bytes in the memory but this may include some other
	# bytes that appear before the `str`. So we need to clear some of the LSBs
	# mask >>= rcx
	shr edx, cl
	# if the mask is zero then go to the loop there were no zeros here
	test edx, edx
	jz 21f
	# there is at least one bit set so we found our zero now we have to calculate the length.
	# The mask was shifted by the padding, so the index of its lowest set bit already is the
	# length; it is written straight to rax and the pointer arithmetic at 80 is skipped.
	tzcnt eax, edx
	jmp 90f
	# if we're here the first 32-byte did not contain a zero.
	# and rdi is 32-byte aligned.
	21:
	add rdi, 32
	# ====================== 32-byte aligned
	# Scan a single 32-byte chunk unless rdi is already 64-byte aligned.
	20:
	test rdi, 63
	jz 30f
	# mask = to_mask(cmp(*ptr, ZERO))
	vmovdqa ymm1, [rdi]
	vpcmpeqb ymm1, ymm1, ymm0
	vpmovmskb edx, ymm1
	test edx, edx
	jnz 70f
	add rdi, 32
	# ===================== 64-byte aligned
	# Scan two 32-byte chunks unless rdi is already 128-byte aligned.
	30:
	test rdi, 127
	jz 40f
	vmovdqa ymm1, [rdi]
	vmovdqa ymm2, [rdi + 32]
	vpcmpeqb ymm1, ymm1, ymm0
	vpcmpeqb ymm2, ymm2, ymm0
	vpmovmskb edx, ymm1
	vpmovmskb ecx, ymm2
	# Combine both masks; the flags of the `or` tell whether either chunk has a zero byte.
	or edx, ecx
	jnz 60f
	add rdi, 64
	# ===================== 128-byte aligned
	# Main loop: four 32-byte chunks (128 bytes) per iteration, tested in pairs.
	40:
	# Chunk 1 or 2
	vmovdqa ymm1, [rdi]
	vmovdqa ymm2, [rdi + 32]
	vpcmpeqb ymm1, ymm1, ymm0
	vpcmpeqb ymm2, ymm2, ymm0
	vpmovmskb edx, ymm1
	vpmovmskb ecx, ymm2
	or edx, ecx
	jnz 60f
	# Chunk 3 or 4
	vmovdqa ymm1, [rdi + 64]
	vmovdqa ymm2, [rdi + 96]
	vpcmpeqb ymm1, ymm1, ymm0
	vpcmpeqb ymm2, ymm2, ymm0
	vpmovmskb edx, ymm1
	vpmovmskb ecx, ymm2
	or edx, ecx
	jnz 50f
	# (label 45 is not referenced by any jump in this file)
	45:
	add rdi, 128
	jmp 40b
	# ================================== END
	# ==== Found in ymm1 or ymm2 in chunk 3 or 4
	50:
	# Re-extract masks since we lost them in the 'or'
	vmovdqa ymm1, [rdi + 64]
	vpcmpeqb ymm1, ymm1, ymm0
	vpmovmskb edx, ymm1
	test edx, edx
	jz 52f
	# Found in chunk 3
	add rdi, 64
	jmp 70f
	# Found in chunk 4
	52:
	vmovdqa ymm2, [rdi + 96]
	vpcmpeqb ymm2, ymm2, ymm0
	vpmovmskb edx, ymm2
	tzcnt edx, edx
	add rdi, 96
	jmp 80f
	# ==== Found in ymm1 or ymm2 in chunk 1 or 2
	60:
	# Re-extract masks since we lost them in the 'or'
	vmovdqa ymm1, [rdi]
	vpcmpeqb ymm1, ymm1, ymm0
	vpmovmskb edx, ymm1
	test edx, edx
	jz 62f
	# Found in chunk 1
	jmp 70f
	# Found in chunk 2
	62:
	vmovdqa ymm2, [rdi + 32]
	vpcmpeqb ymm2, ymm2, ymm0
	vpmovmskb edx, ymm2
	tzcnt edx, edx
	add rdi, 32
	jmp 80f
	# ==== Found in the 32-byte chunk at rdi; edx holds its (non-zero) mask
	70:
	tzcnt edx, edx
	# length = (ptr:rdi - original_ptr:rax) + pos_one:rdx
	80:
	sub rdi, rax
	add rdi, rdx
	mov rax, rdi
	# Common exit: rax holds the length.
	90:
	vzeroupper
	# strlen for x86-64 using AVX-512 (Intel syntax, no prologue/epilogue or `ret`).
	# NOTE(review): presumably this is the body that `strlen_avx512` splices into its `asm!`
	# block via `include_str!`; that wrapper passes the pointer in rdi and reads rax -- confirm.
	#
	# Outline: the first load is aligned down to 64 bytes and the bytes in front of the
	# string are shifted out of the mask. The pointer is then stepped to 128-byte and to
	# 256-byte alignment, after which the main loop scans 256 aligned bytes per iteration.
	#
	# Registers used:
	# zmm0 -> constant zero
	# zmm1 -> loaded value (the compare result goes to k1)
	# zmm2 -> loaded value (the compare result goes to k2)
	#
	# k1, k2 -> byte masks of the compares against zero
	#
	# rax -> the original input ptr AND the return value
	# rdi -> our 64-byte aligned pointer which moves forward
	#
	# rdx -> the finalized simd mask AND tzcnt of the mask
	# ecx -> alignment padding = original_ptr % 64

	# Keep the original input in rax we don't need it until computing the return value.
	mov rax, rdi
	# n = rcx, ecx = (original ptr) & 63 ; our initial alignment padding.
	mov rcx, rdi
	and rcx, 63
	# zmm0 = ZERO
	vpxord zmm0, zmm0, zmm0

	# If already 64-byte aligned skip the partial step, which could lead us
	# to the 256-byte aligned sooner.
	test rcx, rcx
	jz 20f

	# TODO: This step could be improved to avoid loading too much
	# garbage data.
	# align the ptr to 64-byte boundary
	and rdi, -64
	# mask = to_mask(cmp(*ptr, ZERO))
	vmovdqa32 zmm1, [rdi]
	vpcmpb k1, zmm1, zmm0, 0
	kmovq rdx, k1
	# now we have the first 64-bytes in the memory but this may include some other
	# bytes that appear before the `str`. So we need to clear some of the LSBs
	# mask >>= rcx
	shr rdx, cl
	# if the mask is zero then go to the loop there were no zeros here
	test rdx, rdx
	# NOTE we don't do runtime target detection for tzcnt, and if not supported it decodes
	# to `bsf`; bsf and tzcnt give the same result if the input is not zero.
	# so we do this explicit check against zero here in this branch before jumping to the
	# tzcnt.
	jz 21f
	# there is at least one bit set so we found our zero now we have to calculate the length.
	# we don't need the mask anymore just need to know how many trailing zeros it has.
	# The mask was shifted by the padding, so that count already is the length; it is written
	# straight to rax and the pointer arithmetic at 80 is skipped.
	tzcnt rax, rdx
	jmp 90f

	# if we're here the first 64-byte did not contain a zero.
	# and rdi is 64-byte aligned.

	21:
	add rdi, 64

	# ====================== 64-byte aligned
	# Scan a single 64-byte chunk unless rdi is already 128-byte aligned.
	20:
	test rdi, 127
	jz 30f
	# mask = to_mask(cmp(*ptr, ZERO))
	vmovdqa32 zmm1, [rdi]
	vpcmpb k1, zmm1, zmm0, 0
	kortestq k1, k1
	jnz 70f
	add rdi, 64

	# ===================== 128-byte aligned
	# Scan two 64-byte chunks unless rdi is already 256-byte aligned.
	30:
	test rdi, 255
	jz 40f
	vmovdqa32 zmm1, [rdi]
	vmovdqa32 zmm2, [rdi + 64]
	vpcmpb k1, zmm1, zmm0, 0
	vpcmpb k2, zmm2, zmm0, 0
	# ZF is cleared when k1 | k2 has any bit set, i.e. either chunk contains a zero byte.
	kortestq k1, k2
	jnz 60f
	add rdi, 128

	# ===================== 256-byte aligned
	# Main loop: four 64-byte chunks (256 bytes) per iteration, tested in pairs.
	40:
	# Chunk 1 or 2
	vmovdqa32 zmm1, [rdi]
	vmovdqa32 zmm2, [rdi + 64]
	vpcmpb k1, zmm1, zmm0, 0
	vpcmpb k2, zmm2, zmm0, 0
	kortestq k1, k2
	jnz 60f
	# Chunk 3 or 4
	vmovdqa32 zmm1, [rdi + 128]
	vmovdqa32 zmm2, [rdi + 192]
	vpcmpb k1, zmm1, zmm0, 0
	vpcmpb k2, zmm2, zmm0, 0
	kortestq k1, k2
	jnz 50f

	# (label 45 is not referenced by any jump in this file)
	45:
	add rdi, 256
	jmp 40b

	# ================================== END
	# ==== Found in k1 or k2 in chunk 3 or 4
	# Unlike a combined integer mask, k1 and k2 are still intact here, so no reload is needed.
	50:
	kortestq k1, k1
	jz 52f
	# Found in chunk 3
	add rdi, 128
	jmp 70f
	# Found in chunk 4
	52:
	kmovq rdx, k2
	tzcnt rdx, rdx
	add rdi, 192
	jmp 80f

	# ==== Found in k1 or k2 in chunk 1 or 2
	60:
	kortestq k1, k1
	jz 62f
	# Found in chunk 1
	jmp 70f
	# Found in chunk 2
	62:
	kmovq rdx, k2
	tzcnt rdx, rdx
	add rdi, 64
	jmp 80f

	# ==== Found in the 64-byte chunk at rdi; k1 holds its (non-zero) mask
	70:
	kmovq rdx, k1
	tzcnt rdx, rdx

	# length = (ptr:rdi - original_ptr:rax) + pos_one:rdx
	80:
	sub rdi, rax
	add rdi, rdx
	mov rax, rdi

	# Common exit: rax holds the length.
	90:
	vzeroupper