memchr implementation with portable SIMD instructions using packed_simd. Nightly-only.
// This is derived from BurntSushi's rust-memchr, and is re-released under
// the same licensing conditions as rust-memchr.
//
// Dual-licensed under MIT or the UNLICENSE.
//
// LICENSE
// =======
//
// This is free and unencumbered software released into the public domain.
//
// Anyone is free to copy, modify, publish, use, compile, sell, or
// distribute this software, either in source code form or as a compiled
// binary, for any purpose, commercial or non-commercial, and by any
// means.
//
// In jurisdictions that recognize copyright laws, the author or authors
// of this software dedicate any and all copyright interest in the
// software to the public domain. We make this dedication for the benefit
// of the public at large and to the detriment of our heirs and
// successors. We intend this dedication to be an overt act of
// relinquishment in perpetuity of all present and future rights to this
// software under copyright law.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
// IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//
// For more information, please refer to <http://unlicense.org/>
mod private {
    use core::cmp;
    use core::mem::size_of;
    use core::ops::BitOr;
    use core::slice;

    use packed_simd::*;

    const VECTOR_SIZE: usize = size_of::<i8x16>();
    const VECTOR_ALIGN: usize = VECTOR_SIZE - 1;

    // The number of bytes to loop at in one iteration of memchr/memrchr.
    const LOOP_SIZE: usize = 4 * VECTOR_SIZE;

    // The number of bytes to loop at in one iteration of memchr2/memrchr2 and
    // memchr3/memrchr3. There was no observable difference between 64 and 32
    // bytes in benchmarks. memchr3 in particular only gets a very slight speed
    // up from the loop unrolling.
    const LOOP_SIZE2: usize = 2 * VECTOR_SIZE;

    pub unsafe fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> {
        let vn1 = i8x16::splat(n1 as i8);
        let len = haystack.len();
        let loop_size = cmp::min(LOOP_SIZE, len);
        let start_ptr = haystack.as_ptr();
        let end_ptr = haystack[haystack.len()..].as_ptr();
        let mut ptr = start_ptr;

        if haystack.len() < VECTOR_SIZE {
            while ptr < end_ptr {
                if *ptr == n1 {
                    return Some(sub(ptr, start_ptr));
                }
                ptr = ptr.offset(1);
            }
            return None;
        }

        if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) {
            return Some(i);
        }

        ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
        debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr);
        while loop_size == LOOP_SIZE && ptr <= end_ptr.sub(loop_size) {
            debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

            let a = i8x16_load(ptr);
            let b = i8x16_load(ptr.add(VECTOR_SIZE));
            let c = i8x16_load(ptr.add(2 * VECTOR_SIZE));
            let d = i8x16_load(ptr.add(3 * VECTOR_SIZE));
            let eqa = vn1.eq(a);
            let eqb = vn1.eq(b);
            let eqc = vn1.eq(c);
            let eqd = vn1.eq(d);
            let or1 = eqa.bitor(eqb);
            let or2 = eqc.bitor(eqd);
            let or3 = or1.bitor(or2);
            if or3.any() {
                let mut at = sub(ptr, start_ptr);
                if eqa.any() {
                    return Some(at + forward_pos(eqa.bitmask()));
                }

                at += VECTOR_SIZE;
                if eqb.any() {
                    return Some(at + forward_pos(eqb.bitmask()));
                }

                at += VECTOR_SIZE;
                if eqc.any() {
                    return Some(at + forward_pos(eqc.bitmask()));
                }

                at += VECTOR_SIZE;
                debug_assert!(eqd.bitmask() != 0);
                return Some(at + forward_pos(eqd.bitmask()));
            }
            ptr = ptr.add(loop_size);
        }
        while ptr <= end_ptr.sub(VECTOR_SIZE) {
            debug_assert!(sub(end_ptr, ptr) >= VECTOR_SIZE);

            if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) {
                return Some(i);
            }
            ptr = ptr.add(VECTOR_SIZE);
        }
        if ptr < end_ptr {
            debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
            ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
            debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);
            return forward_search1(start_ptr, end_ptr, ptr, vn1);
        }
        None
    }

    pub unsafe fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
        let vn1 = i8x16::splat(n1 as i8);
        let vn2 = i8x16::splat(n2 as i8);
        let len = haystack.len();
        let loop_size = cmp::min(LOOP_SIZE2, len);
        let start_ptr = haystack.as_ptr();
        let end_ptr = haystack[haystack.len()..].as_ptr();
        let mut ptr = start_ptr;

        if haystack.len() < VECTOR_SIZE {
            while ptr < end_ptr {
                if *ptr == n1 || *ptr == n2 {
                    return Some(sub(ptr, start_ptr));
                }
                ptr = ptr.offset(1);
            }
            return None;
        }

        if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
            return Some(i);
        }

        ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
        debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr);
        while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) {
            debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

            let a = i8x16_load(ptr);
            let b = i8x16_load(ptr.add(VECTOR_SIZE));
            let eqa1 = vn1.eq(a);
            let eqb1 = vn1.eq(b);
            let eqa2 = vn2.eq(a);
            let eqb2 = vn2.eq(b);
            let or1 = eqa1.bitor(eqb1);
            let or2 = eqa2.bitor(eqb2);
            let or3 = or1.bitor(or2);
            if or3.any() {
                let mut at = sub(ptr, start_ptr);
                if eqa1.any() || eqa2.any() {
                    return Some(at + forward_pos2(eqa1.bitmask(), eqa2.bitmask()));
                }

                at += VECTOR_SIZE;
                if eqb1.any() || eqb2.any() {
                    return Some(at + forward_pos2(eqb1.bitmask(), eqb2.bitmask()));
                }
            }
            ptr = ptr.add(loop_size);
        }
        while ptr <= end_ptr.sub(VECTOR_SIZE) {
            if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
                return Some(i);
            }
            ptr = ptr.add(VECTOR_SIZE);
        }
        if ptr < end_ptr {
            debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
            ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
            debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);
            return forward_search2(start_ptr, end_ptr, ptr, vn1, vn2);
        }
        None
    }

    pub unsafe fn memchr3(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> {
        let vn1 = i8x16::splat(n1 as i8);
        let vn2 = i8x16::splat(n2 as i8);
        let vn3 = i8x16::splat(n3 as i8);
        let len = haystack.len();
        let loop_size = cmp::min(LOOP_SIZE2, len);
        let start_ptr = haystack.as_ptr();
        let end_ptr = haystack[haystack.len()..].as_ptr();
        let mut ptr = start_ptr;

        if haystack.len() < VECTOR_SIZE {
            while ptr < end_ptr {
                if *ptr == n1 || *ptr == n2 || *ptr == n3 {
                    return Some(sub(ptr, start_ptr));
                }
                ptr = ptr.offset(1);
            }
            return None;
        }

        if let Some(i) = forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) {
            return Some(i);
        }

        ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
        debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr);
        while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) {
            debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

            let a = i8x16_load(ptr);
            let b = i8x16_load(ptr.add(VECTOR_SIZE));
            let eqa1 = vn1.eq(a);
            let eqb1 = vn1.eq(b);
            let eqa2 = vn2.eq(a);
            let eqb2 = vn2.eq(b);
            let eqa3 = vn3.eq(a);
            let eqb3 = vn3.eq(b);
            let or1 = eqa1.bitor(eqb1);
            let or2 = eqa2.bitor(eqb2);
            let or3 = eqa3.bitor(eqb3);
            let or4 = or1.bitor(or2);
            let or5 = or3.bitor(or4);
            if or5.any() {
                let mut at = sub(ptr, start_ptr);
                if eqa1.any() || eqa2.any() || eqa3.any() {
                    return Some(at + forward_pos3(eqa1.bitmask(), eqa2.bitmask(), eqa3.bitmask()));
                }

                at += VECTOR_SIZE;
                if eqb1.any() || eqb2.any() || eqb3.any() {
                    return Some(at + forward_pos3(eqb1.bitmask(), eqb2.bitmask(), eqb3.bitmask()));
                }
            }
            ptr = ptr.add(loop_size);
        }
        while ptr <= end_ptr.sub(VECTOR_SIZE) {
            if let Some(i) = forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) {
                return Some(i);
            }
            ptr = ptr.add(VECTOR_SIZE);
        }
        if ptr < end_ptr {
            debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
            ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
            debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);
            return forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3);
        }
        None
    }

    pub unsafe fn memrchr(n1: u8, haystack: &[u8]) -> Option<usize> {
        let vn1 = i8x16::splat(n1 as i8);
        let len = haystack.len();
        let loop_size = cmp::min(LOOP_SIZE, len);
        let start_ptr = haystack.as_ptr();
        let end_ptr = haystack[haystack.len()..].as_ptr();
        let mut ptr = end_ptr;

        if haystack.len() < VECTOR_SIZE {
            while ptr > start_ptr {
                ptr = ptr.offset(-1);
                if *ptr == n1 {
                    return Some(sub(ptr, start_ptr));
                }
            }
            return None;
        }

        ptr = ptr.sub(VECTOR_SIZE);
        if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) {
            return Some(i);
        }

        ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8;
        debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
        while loop_size == LOOP_SIZE && ptr >= start_ptr.add(loop_size) {
            debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

            ptr = ptr.sub(loop_size);
            let a = i8x16_load(ptr);
            let b = i8x16_load(ptr.add(VECTOR_SIZE));
            let c = i8x16_load(ptr.add(2 * VECTOR_SIZE));
            let d = i8x16_load(ptr.add(3 * VECTOR_SIZE));
            let eqa = vn1.eq(a);
            let eqb = vn1.eq(b);
            let eqc = vn1.eq(c);
            let eqd = vn1.eq(d);
            let or1 = eqa.bitor(eqb);
            let or2 = eqc.bitor(eqd);
            let or3 = or1.bitor(or2);
            if or3.any() {
                let mut at = sub(ptr.add(3 * VECTOR_SIZE), start_ptr);
                if eqd.any() {
                    return Some(at + reverse_pos(eqd.bitmask()));
                }

                at -= VECTOR_SIZE;
                if eqc.any() {
                    return Some(at + reverse_pos(eqc.bitmask()));
                }

                at -= VECTOR_SIZE;
                if eqb.any() {
                    return Some(at + reverse_pos(eqb.bitmask()));
                }

                at -= VECTOR_SIZE;
                debug_assert!(eqa.bitmask() != 0);
                return Some(at + reverse_pos(eqa.bitmask()));
            }
        }
        while ptr >= start_ptr.add(VECTOR_SIZE) {
            ptr = ptr.sub(VECTOR_SIZE);
            if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) {
                return Some(i);
            }
        }
        if ptr > start_ptr {
            debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE);
            return reverse_search1(start_ptr, end_ptr, start_ptr, vn1);
        }
        None
    }

    pub unsafe fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
        let vn1 = i8x16::splat(n1 as i8);
        let vn2 = i8x16::splat(n2 as i8);
        let len = haystack.len();
        let loop_size = cmp::min(LOOP_SIZE2, len);
        let start_ptr = haystack.as_ptr();
        let end_ptr = haystack[haystack.len()..].as_ptr();
        let mut ptr = end_ptr;

        if haystack.len() < VECTOR_SIZE {
            while ptr > start_ptr {
                ptr = ptr.offset(-1);
                if *ptr == n1 || *ptr == n2 {
                    return Some(sub(ptr, start_ptr));
                }
            }
            return None;
        }

        ptr = ptr.sub(VECTOR_SIZE);
        if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
            return Some(i);
        }

        ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8;
        debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
        while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) {
            debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

            ptr = ptr.sub(loop_size);
            let a = i8x16_load(ptr);
            let b = i8x16_load(ptr.add(VECTOR_SIZE));
            let eqa1 = vn1.eq(a);
            let eqb1 = vn1.eq(b);
            let eqa2 = vn2.eq(a);
            let eqb2 = vn2.eq(b);
            let or1 = eqa1.bitor(eqb1);
            let or2 = eqa2.bitor(eqb2);
            let or3 = or1.bitor(or2);
            if or3.any() {
                let mut at = sub(ptr.add(VECTOR_SIZE), start_ptr);
                if eqb1.any() || eqb2.any() {
                    return Some(at + reverse_pos2(eqb1.bitmask(), eqb2.bitmask()));
                }

                at -= VECTOR_SIZE;
                return Some(at + reverse_pos2(eqa1.bitmask(), eqa2.bitmask()));
            }
        }
        while ptr >= start_ptr.add(VECTOR_SIZE) {
            ptr = ptr.sub(VECTOR_SIZE);
            if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
                return Some(i);
            }
        }
        if ptr > start_ptr {
            debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE);
            return reverse_search2(start_ptr, end_ptr, start_ptr, vn1, vn2);
        }
        None
    }

    pub unsafe fn memrchr3(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> {
        let vn1 = i8x16::splat(n1 as i8);
        let vn2 = i8x16::splat(n2 as i8);
        let vn3 = i8x16::splat(n3 as i8);
        let len = haystack.len();
        let loop_size = cmp::min(LOOP_SIZE2, len);
        let start_ptr = haystack.as_ptr();
        let end_ptr = haystack[haystack.len()..].as_ptr();
        let mut ptr = end_ptr;

        if haystack.len() < VECTOR_SIZE {
            while ptr > start_ptr {
                ptr = ptr.offset(-1);
                if *ptr == n1 || *ptr == n2 || *ptr == n3 {
                    return Some(sub(ptr, start_ptr));
                }
            }
            return None;
        }

        ptr = ptr.sub(VECTOR_SIZE);
        if let Some(i) = reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) {
            return Some(i);
        }

        ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8;
        debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
        while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) {
            debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

            ptr = ptr.sub(loop_size);
            let a = i8x16_load(ptr);
            let b = i8x16_load(ptr.add(VECTOR_SIZE));
            let eqa1 = vn1.eq(a);
            let eqb1 = vn1.eq(b);
            let eqa2 = vn2.eq(a);
            let eqb2 = vn2.eq(b);
            let eqa3 = vn3.eq(a);
            let eqb3 = vn3.eq(b);
            let or1 = eqa1.bitor(eqb1);
            let or2 = eqa2.bitor(eqb2);
            let or3 = eqa3.bitor(eqb3);
            let or4 = or1.bitor(or2);
            let or5 = or3.bitor(or4);
            if or5.any() {
                let mut at = sub(ptr.add(VECTOR_SIZE), start_ptr);
                if eqb1.any() || eqb2.any() || eqb3.any() {
                    return Some(at + reverse_pos3(eqb1.bitmask(), eqb2.bitmask(), eqb3.bitmask()));
                }

                at -= VECTOR_SIZE;
                return Some(at + reverse_pos3(eqa1.bitmask(), eqa2.bitmask(), eqa3.bitmask()));
            }
        }
        while ptr >= start_ptr.add(VECTOR_SIZE) {
            ptr = ptr.sub(VECTOR_SIZE);
            if let Some(i) = reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) {
                return Some(i);
            }
        }
        if ptr > start_ptr {
            debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE);
            return reverse_search3(start_ptr, end_ptr, start_ptr, vn1, vn2, vn3);
        }
        None
    }

    pub unsafe fn forward_search1(
        start_ptr: *const u8,
        end_ptr: *const u8,
        ptr: *const u8,
        vn1: i8x16,
    ) -> Option<usize> {
        debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
        debug_assert!(start_ptr <= ptr);
        debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

        let chunk = i8x16_load(ptr);
        let eq = vn1.eq(chunk);
        if eq.any() {
            Some(sub(ptr, start_ptr) + forward_pos(eq.bitmask()))
        } else {
            None
        }
    }

    unsafe fn forward_search2(
        start_ptr: *const u8,
        end_ptr: *const u8,
        ptr: *const u8,
        vn1: i8x16,
        vn2: i8x16,
    ) -> Option<usize> {
        debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
        debug_assert!(start_ptr <= ptr);
        debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

        let chunk = i8x16_load(ptr);
        let eq1 = vn1.eq(chunk);
        let eq2 = vn2.eq(chunk);
        let or = eq1.bitor(eq2);
        if or.any() {
            Some(sub(ptr, start_ptr) + forward_pos2(eq1.bitmask(), eq2.bitmask()))
        } else {
            None
        }
    }

    unsafe fn forward_search3(
        start_ptr: *const u8,
        end_ptr: *const u8,
        ptr: *const u8,
        vn1: i8x16,
        vn2: i8x16,
        vn3: i8x16,
    ) -> Option<usize> {
        debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
        debug_assert!(start_ptr <= ptr);
        debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

        let chunk = i8x16_load(ptr);
        let eq1 = vn1.eq(chunk);
        let eq2 = vn2.eq(chunk);
        let eq3 = vn3.eq(chunk);
        let or1 = eq1.bitor(eq2);
        let or2 = or1.bitor(eq3);
        if or2.any() {
            Some(sub(ptr, start_ptr) + forward_pos3(eq1.bitmask(), eq2.bitmask(), eq3.bitmask()))
        } else {
            None
        }
    }

    unsafe fn reverse_search1(
        start_ptr: *const u8,
        end_ptr: *const u8,
        ptr: *const u8,
        vn1: i8x16,
    ) -> Option<usize> {
        debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
        debug_assert!(start_ptr <= ptr);
        debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

        let chunk = i8x16_load(ptr);
        let eq = vn1.eq(chunk);
        if eq.any() {
            Some(sub(ptr, start_ptr) + reverse_pos(eq.bitmask()))
        } else {
            None
        }
    }

    unsafe fn reverse_search2(
        start_ptr: *const u8,
        end_ptr: *const u8,
        ptr: *const u8,
        vn1: i8x16,
        vn2: i8x16,
    ) -> Option<usize> {
        debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
        debug_assert!(start_ptr <= ptr);
        debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

        let chunk = i8x16_load(ptr);
        let eq1 = vn1.eq(chunk);
        let eq2 = vn2.eq(chunk);
        let or = eq1.bitor(eq2);
        if or.any() {
            Some(sub(ptr, start_ptr) + reverse_pos2(eq1.bitmask(), eq2.bitmask()))
        } else {
            None
        }
    }

    unsafe fn reverse_search3(
        start_ptr: *const u8,
        end_ptr: *const u8,
        ptr: *const u8,
        vn1: i8x16,
        vn2: i8x16,
        vn3: i8x16,
    ) -> Option<usize> {
        debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
        debug_assert!(start_ptr <= ptr);
        debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

        let chunk = i8x16_load(ptr);
        let eq1 = vn1.eq(chunk);
        let eq2 = vn2.eq(chunk);
        let eq3 = vn3.eq(chunk);
        let or1 = eq1.bitor(eq2);
        let or2 = or1.bitor(eq3);
        if or2.any() {
            Some(sub(ptr, start_ptr) + reverse_pos3(eq1.bitmask(), eq2.bitmask(), eq3.bitmask()))
        } else {
            None
        }
    }

    /// Compute the position of the first matching byte from the given mask. The
    /// position returned is always in the range [0, 15].
    ///
    /// The mask given is expected to be the result of the vector comparison's
    /// `bitmask()` (the portable analogue of `_mm_movemask_epi8`).
    fn forward_pos(mask: u16) -> usize {
        // We are dealing with little endian here, where the most significant byte
        // is at a higher address. That means the least significant bit that is set
        // corresponds to the position of our first matching byte. That position
        // corresponds to the number of zeros after the least significant bit.
        mask.trailing_zeros() as usize
    }
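    // Illustrative example: if lanes 2 and 5 of a chunk matched, the mask is
    // 0b0000_0000_0010_0100, so `forward_pos` returns `trailing_zeros()` = 2,
    // the index of the first match within that chunk.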

    /// Compute the position of the first matching byte from the given masks. The
    /// position returned is always in the range [0, 15]. Each mask corresponds to
    /// the equality comparison of a single byte.
    ///
    /// The masks given are expected to be the result of `bitmask()`, where at
    /// least one of the masks is non-zero (i.e., indicates a match).
    fn forward_pos2(mask1: u16, mask2: u16) -> usize {
        debug_assert!(mask1 != 0 || mask2 != 0);

        forward_pos(mask1 | mask2)
    }

    /// Compute the position of the first matching byte from the given masks. The
    /// position returned is always in the range [0, 15]. Each mask corresponds to
    /// the equality comparison of a single byte.
    ///
    /// The masks given are expected to be the result of `bitmask()`, where at
    /// least one of the masks is non-zero (i.e., indicates a match).
    fn forward_pos3(mask1: u16, mask2: u16, mask3: u16) -> usize {
        debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0);

        forward_pos(mask1 | mask2 | mask3)
    }

    /// Compute the position of the last matching byte from the given mask. The
    /// position returned is always in the range [0, 15].
    ///
    /// The mask given is expected to be the result of the vector comparison's
    /// `bitmask()` (the portable analogue of `_mm_movemask_epi8`).
    fn reverse_pos(mask: u16) -> usize {
        // We are dealing with little endian here, where the most significant byte
        // is at a higher address. That means the most significant bit that is set
        // corresponds to the position of our last matching byte. The position from
        // the end of the mask is therefore the number of leading zeros in a 16
        // bit integer, and the position from the start of the mask is therefore
        // 16 - (leading zeros) - 1.
        VECTOR_SIZE - mask.leading_zeros() as usize - 1
    }
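    // Illustrative example: with matches in lanes 2 and 5 the mask is
    // 0b0000_0000_0010_0100, `leading_zeros()` is 10, and `reverse_pos`
    // returns 16 - 10 - 1 = 5, the index of the last match within the chunk.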

    /// Compute the position of the last matching byte from the given masks. The
    /// position returned is always in the range [0, 15]. Each mask corresponds to
    /// the equality comparison of a single byte.
    ///
    /// The masks given are expected to be the result of `bitmask()`, where at
    /// least one of the masks is non-zero (i.e., indicates a match).
    fn reverse_pos2(mask1: u16, mask2: u16) -> usize {
        debug_assert!(mask1 != 0 || mask2 != 0);

        reverse_pos(mask1 | mask2)
    }

    /// Compute the position of the last matching byte from the given masks. The
    /// position returned is always in the range [0, 15]. Each mask corresponds to
    /// the equality comparison of a single byte.
    ///
    /// The masks given are expected to be the result of `bitmask()`, where at
    /// least one of the masks is non-zero (i.e., indicates a match).
    fn reverse_pos3(mask1: u16, mask2: u16, mask3: u16) -> usize {
        debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0);

        reverse_pos(mask1 | mask2 | mask3)
    }

    /// Subtract `b` from `a` and return the difference. `a` should be greater than
    /// or equal to `b`.
    fn sub(a: *const u8, b: *const u8) -> usize {
        debug_assert!(a >= b);
        (a as usize) - (b as usize)
    }

    // Load 128 bits from a ptr into i8x16.
    unsafe fn i8x16_load(ptr: *const u8) -> i8x16 {
        let slc = slice::from_raw_parts(ptr as *const i8, 16);
        i8x16::from_slice_unaligned_unchecked(slc)
    }
} // private

#[inline(always)]
pub fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> {
    unsafe { private::memchr(n1, haystack) }
}

#[inline(always)]
pub fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
    unsafe { private::memchr2(n1, n2, haystack) }
}

#[inline(always)]
pub fn memchr3(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> {
    unsafe { private::memchr3(n1, n2, n3, haystack) }
}

#[inline(always)]
pub fn memrchr(n1: u8, haystack: &[u8]) -> Option<usize> {
    unsafe { private::memrchr(n1, haystack) }
}

#[inline(always)]
pub fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
    unsafe { private::memrchr2(n1, n2, haystack) }
}

#[inline(always)]
pub fn memrchr3(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> {
    unsafe { private::memrchr3(n1, n2, n3, haystack) }
}
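
// A minimal usage sketch: sanity checks for the public wrappers above against
// straightforward iterator-based searches over arbitrary test data. The test
// names and inputs here are illustrative only.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn forward_matches_naive_search() {
        let haystack = b"the quick brown fox jumps over the lazy dog";
        // First occurrence of a single needle.
        assert_eq!(memchr(b'q', haystack), haystack.iter().position(|&b| b == b'q'));
        assert_eq!(memchr(b'!', haystack), None);
        // First occurrence of either of two needles.
        assert_eq!(
            memchr2(b'x', b'z', haystack),
            haystack.iter().position(|&b| b == b'x' || b == b'z')
        );
        // First occurrence of any of three needles.
        assert_eq!(
            memchr3(b'j', b'l', b'd', haystack),
            haystack.iter().position(|&b| b == b'j' || b == b'l' || b == b'd')
        );
    }

    #[test]
    fn reverse_matches_naive_search() {
        let haystack = b"abcabcabc";
        // Last occurrence of a single needle.
        assert_eq!(memrchr(b'a', haystack), Some(6));
        assert_eq!(memrchr(b'z', haystack), None);
        // Last occurrence of either of two / any of three needles.
        assert_eq!(memrchr2(b'a', b'b', haystack), Some(7));
        assert_eq!(memrchr3(b'a', b'b', b'c', haystack), Some(8));
    }
}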