Last active
June 18, 2026 22:37
-
-
Save qti3e/6cc4bd37434dc496b61bb3f7cd000548 to your computer and use it in GitHub Desktop.
JOE's strlen
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // © 2025 Parsa Ghadimi. MIT OR Apache-2.0 License. | |
| //! Runtime CPU feature detection. The `setup` function from this code must be called at the very | |
| //! start of the process initialization. | |
| use core::fmt::Debug; | |
| use core::fmt::Formatter; | |
| use core::sync::atomic::AtomicU64; | |
| use core::sync::atomic::Ordering; | |
| static CACHE: AtomicU64 = AtomicU64::new(0); | |
| #[cfg(target_arch = "x86_64")] | |
| pub(super) fn setup() { | |
| use core::arch::x86_64::__cpuid; | |
| use core::arch::x86_64::__cpuid_count; | |
| use core::arch::x86_64::_xgetbv; | |
| use core::arch::x86_64::CpuidResult; | |
| #[inline] | |
| fn maybe(out: &mut u64, f: Feature, reg: u32, bit: u8) { | |
| if reg & (1 << bit) != 0 { | |
| *out |= 1 << (f as u8); | |
| } | |
| } | |
| let max_basic_leaf = unsafe { __cpuid(0) }.eax; | |
| if max_basic_leaf < 1 { | |
| return; | |
| } | |
| let mut out = 0u64; | |
| // CPUID leaf 1: processor info and feature bits | |
| let CpuidResult { ecx, edx, .. } = unsafe { __cpuid(1) }; | |
| // EDX features | |
| maybe(&mut out, Feature::sse2, edx, 26); | |
| // ECX features | |
| maybe(&mut out, Feature::sse3, ecx, 0); | |
| maybe(&mut out, Feature::sse4_1, ecx, 19); | |
| maybe(&mut out, Feature::sse4_2, ecx, 20); | |
| maybe(&mut out, Feature::popcnt, ecx, 23); | |
| maybe(&mut out, Feature::rdrand, ecx, 30); | |
| // For AVX/AVX2/AVX512, we need: | |
| // 1. OSXSAVE (ECX bit 27) - OS has enabled XSAVE | |
| // 2. XCR0 bits to verify OS saves the relevant state | |
| let osxsave = ecx & (1 << 27) != 0; | |
| if osxsave && max_basic_leaf >= 7 { | |
| // XCR0 register via XGETBV | |
| let xcr0 = unsafe { _xgetbv(0) }; | |
| // XCR0[2:1] = 11b means OS saves XMM and YMM state (required for AVX) | |
| let avx_os_support = (xcr0 & 0b110) == 0b110; | |
| // XCR0[7:5] = 111b means OS saves opmask, ZMM_Hi256, Hi16_ZMM (required for AVX512) | |
| let avx512_os_support = avx_os_support && (xcr0 & 0b1110_0000) == 0b1110_0000; | |
| let CpuidResult { ebx, .. } = unsafe { __cpuid_count(7, 0) }; | |
| maybe(&mut out, Feature::bmi2, ebx, 8); | |
| if avx_os_support { | |
| maybe(&mut out, Feature::avx2, ebx, 5); | |
| } | |
| if avx512_os_support { | |
| maybe(&mut out, Feature::avx512f, ebx, 16); | |
| maybe(&mut out, Feature::avx512bw, ebx, 30); | |
| maybe(&mut out, Feature::avx512vl, ebx, 31); | |
| } | |
| } | |
| CACHE.store(out, Ordering::Release); | |
| } | |
| #[cfg(not(target_arch = "x86_64"))] | |
| pub fn setup() { | |
| // NOOP for now. | |
| } | |
/// CPU features this module knows how to detect on x86_64.
///
/// Each variant's discriminant is used as a bit index (`1 << (feature as u8)`) by the code in
/// this file, so the values are spelled out explicitly.
#[cfg(target_arch = "x86_64")]
#[derive(Copy, Clone, Debug)]
#[allow(non_camel_case_types)] // variants mirror the `target_feature` names
pub enum Feature {
    sse2 = 0,
    sse3 = 1,
    sse4_1 = 2,
    sse4_2 = 3,
    popcnt = 4,
    rdrand = 5,
    avx2 = 6,
    avx512f = 7,
    avx512bw = 8,
    avx512vl = 9,
    bmi2 = 10,
}

/// No detectable features are defined for non-x86_64 targets yet.
#[derive(Copy, Clone, Debug)]
#[cfg(not(target_arch = "x86_64"))]
pub enum Feature {}
| /// A set of features that can be detected either during compile time or runtime. | |
| /// | |
| /// Note: Try to construct your [`FeatureSet`] as a `const MY_FEATURE_SET: ...` first and then use | |
| /// `.detected()` on it, this will help the compiler with more aggressive dead code elimination in | |
| /// case all of the features are known during compile time. | |
| #[derive(Copy, Clone)] | |
| pub struct FeatureSet(u64); | |
| impl FeatureSet { | |
| pub const EMPTY: Self = Self(0); | |
| /// Returns the set of detected features. | |
| pub fn active() -> Self { | |
| Self(CACHE.load(Ordering::Acquire)) | |
| } | |
| /// Returns the set of features that were active during compile time. | |
| pub const fn compile_time() -> Self { | |
| const fn visit(n: u64, feature: Feature) -> u64 { | |
| if feature.check_cfg() { | |
| n | (1 << (feature as u8)) | |
| } else { | |
| n | |
| } | |
| } | |
| #[cfg(target_arch = "x86_64")] | |
| let n = { | |
| let mut n: u64 = 0; | |
| n = visit(n, Feature::sse2); | |
| n = visit(n, Feature::sse3); | |
| n = visit(n, Feature::sse4_1); | |
| n = visit(n, Feature::sse4_2); | |
| n = visit(n, Feature::popcnt); | |
| n = visit(n, Feature::rdrand); | |
| n = visit(n, Feature::avx2); | |
| n = visit(n, Feature::avx512f); | |
| n = visit(n, Feature::avx512bw); | |
| n = visit(n, Feature::avx512vl); | |
| n | |
| }; | |
| #[cfg(not(target_arch = "x86_64"))] | |
| let n = 0u64; | |
| Self(n) | |
| } | |
| #[inline(always)] | |
| pub const fn new(feature: Feature) -> Self { | |
| Self::EMPTY.add(feature) | |
| } | |
| #[inline(always)] | |
| pub const fn add(self, feature: Feature) -> Self { | |
| // Avoid inserting the features we know are present at compile time | |
| // into the feature set. `detected` for an empty feature set will | |
| // always return true and inlines into: | |
| // | |
| // ```plain | |
| // let x = unsafe { CACHE }; | |
| // (x & 0) == 0 | |
| // ``` | |
| // | |
| // And the compiler can trivially see that this will be true for any | |
| // value in the CACHE which will basically be replaced by a constant | |
| // and nice things will have and branches will be eliminated. | |
| // | |
| // Which is really nice for `-Ctarget-cpu=native` builds. | |
| if feature.check_cfg() { | |
| return self; | |
| } | |
| let n = 1u64 << (feature as u8); | |
| Self(self.0 | n) | |
| } | |
| /// Returns `true` if all of the features in this feature set are found. | |
| #[inline(always)] | |
| pub fn detected(self) -> bool { | |
| if cfg!(feature = "no-rt-feature-detect") { | |
| return self.0 == 0; | |
| } | |
| // Note: Currently LLVM is capable to realize that `setup` strictly dominates | |
| // `detected` and that `CACHE` is a runtime constant and it treats it as such | |
| // which means `detected()` gets hoisted and loops using it can get unrolled. | |
| // | |
| // This is a hack I was going to do if that wasn't the case. I'm gonna leave | |
| // it here in case this changes. | |
| // | |
| // cfg_match! { | |
| // // This is a hack | |
| // target_arch = "x86_64" => { | |
| // unsafe { | |
| // asm!("mov {0:r}, [{1}]", | |
| // lateout(reg) x, | |
| // sym CACHE, | |
| // options(nomem, nostack, pure)) | |
| // } | |
| // } | |
| // _ => { | |
| // x = unsafe { CACHE } | |
| // } | |
| // } | |
| let x = CACHE.load(Ordering::Acquire); | |
| (x & self.0) == self.0 | |
| } | |
| } | |
| impl Feature { | |
| #[inline(always)] | |
| #[cfg(target_arch = "x86_64")] | |
| const fn check_cfg(self) -> bool { | |
| match self { | |
| Feature::sse2 => cfg!(target_feature = "sse2"), | |
| Feature::sse3 => cfg!(target_feature = "sse3"), | |
| Feature::sse4_1 => cfg!(target_feature = "sse4.1"), | |
| Feature::sse4_2 => cfg!(target_feature = "sse4.2"), | |
| Feature::popcnt => cfg!(target_feature = "popcnt"), | |
| Feature::rdrand => cfg!(target_feature = "rdrand"), | |
| Feature::avx2 => cfg!(target_feature = "avx2"), | |
| Feature::avx512f => cfg!(target_feature = "avx512f"), | |
| Feature::avx512bw => cfg!(target_feature = "avx512bw"), | |
| Feature::avx512vl => cfg!(target_feature = "avx512vl"), | |
| Feature::bmi2 => cfg!(target_feature = "bmi2"), | |
| } | |
| } | |
| #[inline(always)] | |
| #[cfg(not(target_arch = "x86_64"))] | |
| const fn check_cfg(self) -> bool { | |
| true | |
| } | |
| } | |
| impl Debug for FeatureSet { | |
| fn fmt(&self, f: &mut Formatter<'_>) -> core::fmt::Result { | |
| let mut s = f.debug_set(); | |
| #[cfg(target_arch = "x86_64")] | |
| { | |
| if self.0 & (1 << (Feature::sse2 as u8)) != 0 { | |
| s.entry(&Feature::sse2); | |
| } | |
| if self.0 & (1 << (Feature::sse3 as u8)) != 0 { | |
| s.entry(&Feature::sse3); | |
| } | |
| if self.0 & (1 << (Feature::sse4_1 as u8)) != 0 { | |
| s.entry(&Feature::sse4_1); | |
| } | |
| if self.0 & (1 << (Feature::sse4_2 as u8)) != 0 { | |
| s.entry(&Feature::sse4_2); | |
| } | |
| if self.0 & (1 << (Feature::popcnt as u8)) != 0 { | |
| s.entry(&Feature::popcnt); | |
| } | |
| if self.0 & (1 << (Feature::rdrand as u8)) != 0 { | |
| s.entry(&Feature::rdrand); | |
| } | |
| if self.0 & (1 << (Feature::avx2 as u8)) != 0 { | |
| s.entry(&Feature::avx2); | |
| } | |
| if self.0 & (1 << (Feature::avx512f as u8)) != 0 { | |
| s.entry(&Feature::avx512f); | |
| } | |
| if self.0 & (1 << (Feature::avx512bw as u8)) != 0 { | |
| s.entry(&Feature::avx512bw); | |
| } | |
| if self.0 & (1 << (Feature::avx512vl as u8)) != 0 { | |
| s.entry(&Feature::avx512vl); | |
| } | |
| if self.0 & (1 << (Feature::bmi2 as u8)) != 0 { | |
| s.entry(&Feature::bmi2); | |
| } | |
| } | |
| s.finish() | |
| } | |
| } | |
/// CPU feature-based function dispatch, like GNU ifunc but in pure Rust.
///
/// The generated function keeps a cached function pointer that initially points at an internal
/// resolver. On the first call the resolver walks the arms top to bottom, picks the first one
/// whose feature list is reported by `FeatureSet::detected`, stores it in the cache and forwards
/// the call. Subsequent calls go directly through the cached function pointer.
///
/// Note that the resolver does not run feature detection itself: it only consults what the
/// detection module has recorded so far. If detection has not run before the first call, the
/// choice made at that point is what gets cached.
///
/// Two forms are accepted: `fn` and `unsafe fn`. Arms may carry attributes (typically `#[cfg]`),
/// and the final `_ => path` arm is the mandatory fallback.
///
/// # Example
///
/// ```ignore
/// cpufn! {
///     /// Compute string length with CPU-specific optimizations.
///     pub unsafe fn strlen(s: *const u8) -> usize {
///         #[cfg(target_arch = "x86_64")]
///         [avx512bw, avx512vl] => strlen_avx512,
///         #[cfg(target_arch = "x86_64")]
///         [avx2] => strlen_avx2,
///         _ => strlen_generic,
///     }
/// }
/// ```
#[macro_export]
macro_rules! cpufn {
    // Safe fn variant
    (
        $(#[$fn_attr:meta])*
        $vis:vis fn $name:ident ( $($arg:ident : $argty:ty),* $(,)? ) $(-> $ret:ty)? {
            $(
                $(#[$arm_attr:meta])*
                [$($feature:ident),*] => $impl:path,
            )*
            _ => $default:path $(,)?
        }
    ) => {
        $(#[$fn_attr])*
        #[allow(unused_attributes)] // #[inline] ignored on #[no_mangle]
        #[inline(always)]
        $vis fn $name( $($arg : $argty),* ) $(-> $ret)? {
            use core::sync::atomic::AtomicPtr;
            use core::sync::atomic::Ordering;

            type Fn = fn( $($argty),* ) $(-> $ret)?;

            // Starts out pointing at `resolve`, which replaces it with the selected
            // implementation on the first call.
            static F: AtomicPtr<()> = AtomicPtr::new(resolve as *const () as *mut ());

            #[cold]
            #[inline(never)]
            #[allow(unused)]
            fn resolve( $($arg : $argty),* ) $(-> $ret)? {
                use $crate::sys::detect::Feature;
                use $crate::sys::detect::FeatureSet;

                // First arm whose features are all detected wins; otherwise the fallback.
                let f: Fn = 'select: {
                    $(
                        $(#[$arm_attr])*
                        if FeatureSet::EMPTY $(.add(Feature::$feature))* .detected() {
                            break 'select $impl;
                        }
                    )*
                    $default
                };

                F.store(f as *const () as *mut (), Ordering::Release);
                f( $($arg),* )
            }

            // SAFETY: `F` only ever holds `resolve` or a value of type `Fn` stored by
            // `resolve`, so the pointer is a valid function pointer of type `Fn`.
            let f: Fn = unsafe {
                core::mem::transmute::<*mut (), Fn>(F.load(Ordering::Acquire))
            };
            f( $($arg),* )
        }
    };

    // Unsafe fn variant
    (
        $(#[$fn_attr:meta])*
        $vis:vis unsafe fn $name:ident ( $($arg:ident : $argty:ty),* $(,)? ) $(-> $ret:ty)? {
            $(
                $(#[$arm_attr:meta])*
                [$($feature:ident),*] => $impl:path,
            )*
            _ => $default:path $(,)?
        }
    ) => {
        $(#[$fn_attr])*
        #[allow(unused_attributes)] // #[inline] ignored on #[no_mangle]
        #[inline(always)]
        $vis unsafe fn $name( $($arg : $argty),* ) $(-> $ret)? {
            use core::sync::atomic::AtomicPtr;
            use core::sync::atomic::Ordering;

            type Fn = unsafe fn( $($argty),* ) $(-> $ret)?;

            // Starts out pointing at `resolve`, which replaces it with the selected
            // implementation on the first call.
            static F: AtomicPtr<()> = AtomicPtr::new(resolve as *const () as *mut ());

            #[cold]
            #[inline(never)]
            #[allow(unused)]
            unsafe fn resolve( $($arg : $argty),* ) $(-> $ret)? {
                use $crate::sys::detect::Feature;
                use $crate::sys::detect::FeatureSet;

                // First arm whose features are all detected wins; otherwise the fallback.
                let f: Fn = 'select: {
                    $(
                        $(#[$arm_attr])*
                        if FeatureSet::EMPTY $(.add(Feature::$feature))* .detected() {
                            break 'select $impl;
                        }
                    )*
                    $default
                };

                F.store(f as *const () as *mut (), Ordering::Release);
                // SAFETY: the arguments are forwarded unchanged, so the contract the caller
                // upheld for the generated function is passed on to the implementation.
                unsafe { f( $($arg),* ) }
            }

            // SAFETY: `F` only ever holds `resolve` or a value of type `Fn` stored by
            // `resolve`, so the pointer is a valid function pointer of type `Fn`.
            let f: Fn = unsafe {
                core::mem::transmute::<*mut (), Fn>(F.load(Ordering::Acquire))
            };
            // SAFETY: the arguments are forwarded unchanged; the caller of this `unsafe fn` is
            // responsible for the implementation's requirements.
            unsafe { f( $($arg),* ) }
        }
    };
}
#[cfg(test)]
mod tests {
    /// Fallback implementation: plain addition.
    fn add_generic(a: i32, b: i32) -> i32 {
        a + b
    }

    /// Stand-in for an AVX2 implementation; the offset makes the selection observable.
    #[cfg(target_arch = "x86_64")]
    fn add_avx2(a: i32, b: i32) -> i32 {
        a + b + 1000 // marker to prove we picked this one
    }

    cpufn! {
        /// Test function for cpufn dispatch.
        fn test_add(a: i32, b: i32) -> i32 {
            #[cfg(target_arch = "x86_64")]
            [avx2] => add_avx2,
            _ => add_generic,
        }
    }

    /// The dispatcher must agree with what `FeatureSet::detected` reports for AVX2.
    #[test]
    fn test_cpufn_dispatch() {
        let result = test_add(2, 3);

        // 2 + 3 + 1000 when the AVX2 arm is selected, plain 2 + 3 otherwise.
        #[cfg(target_arch = "x86_64")]
        let expected = if super::FeatureSet::new(super::Feature::avx2).detected() {
            1005
        } else {
            5
        };

        #[cfg(not(target_arch = "x86_64"))]
        let expected = 5;

        assert_eq!(result, expected);
    }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // © 2025 Parsa Ghadimi. MIT OR Apache-2.0 License. | |
| use core::ffi::c_char; | |
| use crate::cpufn; | |
| cpufn! { | |
| #[unsafe(no_mangle)] | |
| pub unsafe fn strlen(s: *const c_char) -> usize { | |
| #[cfg(target_arch = "x86_64")] | |
| [avx512f, avx512bw] => strlen_avx512, | |
| #[cfg(target_arch = "x86_64")] | |
| [avx2] => strlen_avx2, | |
| _ => strlen_magic, | |
| } | |
| } | |
/// Portable `strlen` that scans one 64-bit word at a time.
///
/// The head of the string is scanned byte by byte until the pointer is 8-byte aligned; after
/// that each aligned word is tested with a carry-propagation trick and only inspected byte by
/// byte when the test says it may contain a zero.
///
/// # Safety
///
/// `str` must point to a NUL-terminated string. Whole aligned 8-byte words are loaded, so bytes
/// after the terminator within the same aligned word are read as well.
unsafe fn strlen_magic(str: *const c_char) -> usize {
    const WORD_BYTES: usize = 8;
    // Adding this to a word lets carries fall into the "hole" bits when a byte is zero.
    const MAGIC: u64 = 0x7efefefefefefeff;
    // The hole bits inspected after the addition.
    const HOLES: u64 = 0x8101010101010100;

    // Byte-wise scan until the cursor sits on a word boundary.
    let mut cursor = str;
    while (cursor as usize) & (WORD_BYTES - 1) != 0 {
        if *cursor == 0 {
            return cursor.offset_from_unsigned(str);
        }
        cursor = cursor.add(1);
    }

    // Now use the glibc/Sean Eron Anderson trick to scan word at a time.
    // We assume 64-bit systems in Joe's crt.
    let mut word = cursor as *const u64;
    loop {
        let v = *word;

        // The test may also fire for a word without a zero (e.g. 0x80 in the high byte), so a
        // hit is always confirmed below and the scan continues if no byte is actually zero.
        let maybe_has_zero = (v.wrapping_add(MAGIC) ^ !v) & HOLES;
        if maybe_has_zero != 0 {
            let bytes = word as *const c_char;
            for i in 0..WORD_BYTES {
                if *bytes.add(i) == 0 {
                    return bytes.offset_from_unsigned(str) + i;
                }
            }
        }

        // load the next word.
        word = word.add(1);
    }

    // TODO: Maybe not a bad idea to do the above loop until ptr is 32-byte aligned and then use
    // the avx code?
}
| #[cfg(target_arch = "x86_64")] | |
| unsafe fn strlen_avx2(str: *const c_char) -> usize { | |
| let out: usize; | |
| core::arch::asm!( | |
| include_str!("./strlen_x86_64_avx2.s"), | |
| in("rdi") str, | |
| lateout("rax") out, | |
| lateout("ymm0") _, | |
| lateout("ymm1") _, | |
| lateout("ymm2") _, | |
| lateout("rdi") _, | |
| lateout("rdx") _, | |
| lateout("rcx") _, | |
| options(nostack, readonly) | |
| ); | |
| out | |
| } | |
| #[cfg(target_arch = "x86_64")] | |
| unsafe fn strlen_avx512(str: *const c_char) -> usize { | |
| let out: usize; | |
| core::arch::asm!( | |
| include_str!("./strlen_x86_64_avx512.s"), | |
| in("rdi") str, | |
| lateout("rax") out, | |
| lateout("zmm0") _, | |
| lateout("zmm1") _, | |
| lateout("zmm2") _, | |
| lateout("rdi") _, | |
| lateout("rdx") _, | |
| lateout("rcx") _, | |
| lateout("k1") _, | |
| lateout("k2") _, | |
| options(nostack, readonly) | |
| ); | |
| out | |
| } | |
#[cfg(bench)]
#[cfg(target_arch = "x86_64")]
mod bench {
    use super::*;
    use crate::selftest::micro::*;

    /// Compares the `strlen` implementations on strings whose length doubles on every step.
    #[bench]
    fn strlen_bench() {
        use crate::sys::detect::Feature;
        use crate::sys::detect::FeatureSet;

        const AVX2: FeatureSet = FeatureSet::new(Feature::avx2);
        const AVX512: FeatureSet = FeatureSet::EMPTY
            .add(Feature::avx512f)
            .add(Feature::avx512bw);

        /// One benchmark input: a NUL-terminated string of `len` bytes starting at `ptr`.
        struct Input {
            len: usize,
            ptr: *const i8,
        }

        // Backing storage: all non-zero, terminated at the very end.
        let mut storage = [1; 1024 * 8];
        storage[storage.len() - 1] = 0;
        // Position (plus one) of the terminator currently placed in `storage`.
        let mut end = 1;

        let inputs = empty::<Input>()
            .set_throughput("byte", |v| v.len)
            .chain_with(move || {
                // Undo the previous terminator, then place one twice as far out.
                storage[end - 1] = 32;
                end *= 2;
                if end > storage.len() {
                    return None;
                }
                storage[end - 1] = 0;
                Some(Input {
                    len: end - 1,
                    ptr: storage.as_ptr(),
                })
            });

        let candidates = &[
            BenchFn::new("avx2", |input: &Input| unsafe { strlen_avx2(input.ptr) })
                .skip_if(!AVX2.detected()),
            BenchFn::new("avx512", |input: &Input| unsafe {
                strlen_avx512(input.ptr)
            })
            .skip_if(!AVX512.detected()),
            BenchFn::new("magic", |input| unsafe { strlen_magic(input.ptr) }),
        ];

        BenchConfig::default()
            .set_name("strlen")
            .run(inputs, candidates);
    }
}
#[cfg(test)]
mod test {
    use core::ops::Deref;
    use core::ops::DerefMut;

    use super::*;

    const CAP: usize = 1024 * 5;

    /// Backing storage for the test strings, over-aligned so that `align_offset` directly
    /// controls the alignment of the string handed to the implementation under test.
    #[repr(align(512))]
    struct Buffer([u8; CAP]);

    impl Deref for Buffer {
        type Target = [u8];

        fn deref(&self) -> &Self::Target {
            &self.0
        }
    }

    impl DerefMut for Buffer {
        fn deref_mut(&mut self) -> &mut Self::Target {
            &mut self.0
        }
    }

    /// Exercises `f` over many (alignment, length) combinations and checks the reported length.
    ///
    /// The bytes in front of each test string are zeroed so that implementations which load a
    /// whole aligned chunk first must mask them out correctly.
    #[inline(never)]
    fn test_strlen_impl(name: &str, f: impl Fn(*const c_char) -> usize) {
        // Lengths around the block sizes the implementations care about, on top of 0..=256.
        const EXTRA_LENGTHS: [usize; 15] = [
            257, 511, 512, 513, 1023, 1024, 1025, 2047, 2048, 2049, 4095, 4096, 4097, 4098, 5000,
        ];

        let mut alloc = Buffer([0; CAP]);
        let base_addr = alloc.0.as_ptr() as usize;
        assert_eq!(base_addr % 64, 0);

        for align_offset in 0..=512 {
            for len in (0..=256).chain(EXTRA_LENGTHS) {
                let terminator = align_offset + len;
                if terminator + 1 >= alloc.len() {
                    continue; // Skip out-of-bound cases
                }

                assert_eq!(
                    (base_addr + align_offset) % 64,
                    align_offset % 64,
                    "failed for align={align_offset}"
                );

                // Fill with non-zero data
                alloc.fill(1);

                // Set the bytes before the input to zero, this tests implementations that
                // load an entire aligned chunk of data in initial step.
                alloc[..align_offset].fill(0);

                // Set null terminator at the correct location
                alloc[terminator] = 0;
                alloc[terminator + 1] = 0;

                // Pointer to the beginning of the test string
                let s = unsafe { alloc.as_ptr().add(align_offset) as *const c_char };
                let result = f(s);

                assert_eq!(
                    result,
                    len,
                    "Failed for align={} align_offset={}, len={}, got={}",
                    (base_addr + align_offset) % 512,
                    align_offset,
                    len,
                    result
                );
            }
        }

        // Empty strings at every offset of the first 512 bytes.
        for i in 0..512 {
            alloc[i] = 0;
            let s = unsafe { alloc.as_ptr().add(i) as *const c_char };
            let result = f(s);
            assert_eq!(
                result, 0,
                "{name}: Failed zero-terminator check at offset {}: got {}",
                i, result
            );
        }
    }

    #[test]
    fn test_strlen_magic() {
        test_strlen_impl("magic", |p| unsafe { strlen_magic(p) });
    }

    #[test]
    #[cfg(target_arch = "x86_64")]
    fn test_strlen_x86_avx2() {
        use crate::sys::detect::Feature;
        use crate::sys::detect::FeatureSet;

        const AVX2: FeatureSet = FeatureSet::new(Feature::avx2);

        // Only run the assembly on CPUs reported to support it.
        if !AVX2.detected() {
            return crate::selftest::ignore();
        }
        test_strlen_impl("x86_64-avx2", |p| unsafe { strlen_avx2(p) });
    }

    #[test]
    #[cfg(target_arch = "x86_64")]
    fn test_strlen_x86_avx512() {
        use crate::sys::detect::Feature;
        use crate::sys::detect::FeatureSet;

        const AVX512: FeatureSet = FeatureSet::EMPTY
            .add(Feature::avx512f)
            .add(Feature::avx512bw);

        // Only run the assembly on CPUs reported to support it.
        if !AVX512.detected() {
            return crate::selftest::ignore();
        }
        test_strlen_impl("x86_64-avx512", |p| unsafe { strlen_avx512(p) });
    }

    /// Whatever the dispatcher selects must behave like the individual implementations.
    #[test]
    fn test_strlen_default() {
        test_strlen_impl("default", |p| unsafe { strlen(p) });
    }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Registers used: | |
| # ymm0 -> constant zero | |
| # ymm1 -> loaded value AND result of compare against zero | |
| # ymm2 -> loaded value AND result of compare against zero | |
| # | |
| # rax -> the original input ptr AND the return value | |
| # rdi -> our 32-byte aligned pointer which moves forward | |
| # | |
| # edx -> the finalized simd mask AND tzcnt of the mask | |
| # ecx -> alignment padding = original_ptr % 32; later reused as scratch for the second chunk's mask | |
| # Keep the original input in rax; we don't need it until computing the return value. | |
| mov rax, rdi | |
| # rcx = (original ptr) & 31 ; our initial alignment padding. | |
| mov rcx, rdi | |
| and rcx, 31 | |
| # ymm0 = ZERO | |
| vpxor ymm0, ymm0, ymm0 | |
| # If already 32-byte aligned, skip the partial step. | |
| test rcx, rcx | |
| jz 20f | |
| # Round the ptr down to the enclosing 32-byte boundary. | |
| and rdi, -32 | |
| # mask = to_mask(cmp(*ptr, ZERO)) | |
| vmovdqa ymm1, [rdi] | |
| vpcmpeqb ymm1, ymm1, ymm0 | |
| vpmovmskb edx, ymm1 | |
| # Now we have the mask for the first 32 bytes in memory, but this may include some other | |
| # bytes that appear before the `str`. So we need to drop that many of the LSBs: | |
| # mask >>= cl | |
| shr edx, cl | |
| # If the mask is zero there were no zeros in this chunk: continue with the aligned steps. | |
| test edx, edx | |
| jz 21f | |
| # At least one bit is set, so we found our zero. After the shift bit 0 is the first byte of `str`, so tzcnt is the length. | |
| tzcnt eax, edx | |
| jmp 90f | |
| # If we're here the first 32 bytes did not contain a zero, | |
| # and rdi is 32-byte aligned. | |
| 21: | |
| add rdi, 32 | |
| # ====================== 32-byte aligned: take one 32-byte step unless already 64-byte aligned | |
| 20: | |
| test rdi, 63 | |
| jz 30f | |
| # mask = to_mask(cmp(*ptr, ZERO)) | |
| vmovdqa ymm1, [rdi] | |
| vpcmpeqb ymm1, ymm1, ymm0 | |
| vpmovmskb edx, ymm1 | |
| test edx, edx | |
| jnz 70f | |
| add rdi, 32 | |
| # ===================== 64-byte aligned: take one 64-byte step unless already 128-byte aligned | |
| 30: | |
| test rdi, 127 | |
| jz 40f | |
| vmovdqa ymm1, [rdi] | |
| vmovdqa ymm2, [rdi + 32] | |
| vpcmpeqb ymm1, ymm1, ymm0 | |
| vpcmpeqb ymm2, ymm2, ymm0 | |
| vpmovmskb edx, ymm1 | |
| vpmovmskb ecx, ymm2 | |
| or edx, ecx | |
| jnz 60f | |
| add rdi, 64 | |
| # ===================== 128-byte aligned: main loop, 128 bytes per iteration | |
| 40: | |
| # Chunk 1 or 2 | |
| vmovdqa ymm1, [rdi] | |
| vmovdqa ymm2, [rdi + 32] | |
| vpcmpeqb ymm1, ymm1, ymm0 | |
| vpcmpeqb ymm2, ymm2, ymm0 | |
| vpmovmskb edx, ymm1 | |
| vpmovmskb ecx, ymm2 | |
| or edx, ecx | |
| jnz 60f | |
| # Chunk 3 or 4 | |
| vmovdqa ymm1, [rdi + 64] | |
| vmovdqa ymm2, [rdi + 96] | |
| vpcmpeqb ymm1, ymm1, ymm0 | |
| vpcmpeqb ymm2, ymm2, ymm0 | |
| vpmovmskb edx, ymm1 | |
| vpmovmskb ecx, ymm2 | |
| or edx, ecx | |
| jnz 50f | |
| 45: | |
| add rdi, 128 | |
| jmp 40b | |
| # ================================== END of the scanning loop; below are the exit paths | |
| # ==== Found in ymm1 or ymm2 in chunk 3 or 4 | |
| 50: | |
| # Re-extract the masks since the individual values were lost in the 'or' | |
| vmovdqa ymm1, [rdi + 64] | |
| vpcmpeqb ymm1, ymm1, ymm0 | |
| vpmovmskb edx, ymm1 | |
| test edx, edx | |
| jz 52f | |
| # Found in chunk 3 | |
| add rdi, 64 | |
| jmp 70f | |
| # Found in chunk 4 | |
| 52: | |
| vmovdqa ymm2, [rdi + 96] | |
| vpcmpeqb ymm2, ymm2, ymm0 | |
| vpmovmskb edx, ymm2 | |
| tzcnt edx, edx | |
| add rdi, 96 | |
| jmp 80f | |
| # ==== Found in ymm1 or ymm2 in chunk 1 or 2 | |
| 60: | |
| # Re-extract the masks since the individual values were lost in the 'or' | |
| vmovdqa ymm1, [rdi] | |
| vpcmpeqb ymm1, ymm1, ymm0 | |
| vpmovmskb edx, ymm1 | |
| test edx, edx | |
| jz 62f | |
| # Found in chunk 1 | |
| jmp 70f | |
| # Found in chunk 2 | |
| 62: | |
| vmovdqa ymm2, [rdi + 32] | |
| vpcmpeqb ymm2, ymm2, ymm0 | |
| vpmovmskb edx, ymm2 | |
| tzcnt edx, edx | |
| add rdi, 32 | |
| jmp 80f | |
| # ==== Found in the chunk at rdi; its non-zero mask is in edx | |
| 70: | |
| tzcnt edx, edx | |
| # length = (ptr:rdi - original_ptr:rax) + pos_one:rdx | |
| 80: | |
| sub rdi, rax | |
| add rdi, rdx | |
| mov rax, rdi | |
| 90: | |
| vzeroupper | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Registers used: | |
| # zmm0 -> constant zero | |
| # zmm1 -> loaded value compared against zero (result goes to k1) | |
| # zmm2 -> loaded value compared against zero (result goes to k2) | |
| # | |
| # rax -> the original input ptr AND the return value | |
| # rdi -> our 64-byte aligned pointer which moves forward | |
| # | |
| # rdx -> the finalized simd mask AND tzcnt of the mask | |
| # rcx -> alignment padding = original_ptr % 64 | |
| # Keep the original input in rax; we don't need it until computing the return value. | |
| mov rax, rdi | |
| # rcx = (original ptr) & 63 ; our initial alignment padding. | |
| mov rcx, rdi | |
| and rcx, 63 | |
| # zmm0 = ZERO | |
| vpxord zmm0, zmm0, zmm0 | |
| # If already 64-byte aligned skip the partial step, which could lead us | |
| # to the 256-byte aligned loop sooner. | |
| test rcx, rcx | |
| jz 20f | |
| # TODO: This step could be improved to avoid loading too much | |
| # garbage data. | |
| # Round the ptr down to the enclosing 64-byte boundary. | |
| and rdi, -64 | |
| # mask = to_mask(cmp(*ptr, ZERO)) | |
| vmovdqa32 zmm1, [rdi] | |
| vpcmpb k1, zmm1, zmm0, 0 | |
| kmovq rdx, k1 | |
| # Now we have the mask for the first 64 bytes in memory, but this may include some other | |
| # bytes that appear before the `str`. So we need to drop that many of the LSBs: | |
| # mask >>= cl | |
| shr rdx, cl | |
| # If the mask is zero there were no zeros in this chunk: continue with the aligned steps. | |
| test rdx, rdx | |
| # NOTE we don't do runtime target detection for tzcnt, and if not supported it decodes | |
| # to `bsf`; bsf and tzcnt give the same result if the input is not zero. | |
| # So we do this explicit check against zero here in this branch before jumping to the | |
| # tzcnt. | |
| jz 21f | |
| # At least one bit is set, so we found our zero; now we have to calculate the length. | |
| # After the shift bit 0 is the first byte of `str`, so the number of trailing zeros is the length. | |
| tzcnt rax, rdx | |
| jmp 90f | |
| # If we're here the first 64 bytes did not contain a zero, | |
| # and rdi is 64-byte aligned. | |
| 21: | |
| add rdi, 64 | |
| # ====================== 64-byte aligned: take one 64-byte step unless already 128-byte aligned | |
| 20: | |
| test rdi, 127 | |
| jz 30f | |
| # mask = to_mask(cmp(*ptr, ZERO)) | |
| vmovdqa32 zmm1, [rdi] | |
| vpcmpb k1, zmm1, zmm0, 0 | |
| kortestq k1, k1 | |
| jnz 70f | |
| add rdi, 64 | |
| # ===================== 128-byte aligned: take one 128-byte step unless already 256-byte aligned | |
| 30: | |
| test rdi, 255 | |
| jz 40f | |
| vmovdqa32 zmm1, [rdi] | |
| vmovdqa32 zmm2, [rdi + 64] | |
| vpcmpb k1, zmm1, zmm0, 0 | |
| vpcmpb k2, zmm2, zmm0, 0 | |
| kortestq k1, k2 | |
| jnz 60f | |
| add rdi, 128 | |
| # ===================== 256-byte aligned: main loop, 256 bytes per iteration | |
| 40: | |
| # Chunk 1 or 2 | |
| vmovdqa32 zmm1, [rdi] | |
| vmovdqa32 zmm2, [rdi + 64] | |
| vpcmpb k1, zmm1, zmm0, 0 | |
| vpcmpb k2, zmm2, zmm0, 0 | |
| kortestq k1, k2 | |
| jnz 60f | |
| # Chunk 3 or 4 | |
| vmovdqa32 zmm1, [rdi + 128] | |
| vmovdqa32 zmm2, [rdi + 192] | |
| vpcmpb k1, zmm1, zmm0, 0 | |
| vpcmpb k2, zmm2, zmm0, 0 | |
| kortestq k1, k2 | |
| jnz 50f | |
| 45: | |
| add rdi, 256 | |
| jmp 40b | |
| # ================================== END of the scanning loop; below are the exit paths | |
| # ==== Found in k1 or k2 in chunk 3 or 4 | |
| 50: | |
| kortestq k1, k1 | |
| jz 52f | |
| # Found in chunk 3 | |
| add rdi, 128 | |
| jmp 70f | |
| # Found in chunk 4 | |
| 52: | |
| kmovq rdx, k2 | |
| tzcnt rdx, rdx | |
| add rdi, 192 | |
| jmp 80f | |
| # ==== Found in k1 or k2 in chunk 1 or 2 | |
| 60: | |
| kortestq k1, k1 | |
| jz 62f | |
| # Found in chunk 1 | |
| jmp 70f | |
| # Found in chunk 2 | |
| 62: | |
| kmovq rdx, k2 | |
| tzcnt rdx, rdx | |
| add rdi, 64 | |
| jmp 80f | |
| # ==== Found in the chunk at rdi; its non-zero mask is in k1 | |
| 70: | |
| kmovq rdx, k1 | |
| tzcnt rdx, rdx | |
| # length = (ptr:rdi - original_ptr:rax) + pos_one:rdx | |
| 80: | |
| sub rdi, rax | |
| add rdi, rdx | |
| mov rax, rdi | |
| 90: | |
| vzeroupper | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment