Created
July 17, 2024 13:41
-
-
Save hnakamur/12f88264c8dcdc611231e0b755269ba5 to your computer and use it in GitHub Desktop.
SIMD experiment with Odin
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import "core:fmt" | |
import "core:simd" | |
// Demo entry point: runs index_any over a fixed 16-byte sample that contains
// '\r' at index 13 and '&' at index 14, then prints the returned index.
main :: proc() {
	sample := []u8{'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', '\r', '&', 'a'}
	first_hit := index_any(sample, 16)
	fmt.println(first_hit)
}
// The two bytes index_any searches for.
ch1: u8 = '&'
ch2: u8 = '\r'

// Number of u8 lanes handled per SIMD step.
lane :: 16

// Each needle is one search byte repeated across all `lane` lanes, so a single
// lane-wise comparison tests a whole block against that byte.
needle1 := simd.from_array([lane]u8{0..<lane = ch1})
needle2 := simd.from_array([lane]u8{0..<lane = ch2})
// index_any returns the index of the first byte among the first `len` bytes of
// `haystack` that equals ch1 or ch2, or -1 when neither byte occurs.
//
// Inputs:
// - haystack: the bytes to search
// - len:      how many leading bytes to examine; the procedure indexes
//             haystack up to len-1, so it must not exceed the slice length
//
// Returns:
// - the index of the first matching byte, or -1 if there is none
//
// Full `lane`-byte blocks are tested with SIMD comparisons against needle1 and
// needle2. The first block that contains a match, and any tail shorter than
// `lane`, is then scanned byte by byte to locate the exact position.
index_any :: proc(haystack: []u8, len: int) -> int {
	offset := 0
	for offset + lane <= len {
		block := simd.from_slice(simd.u8x16, haystack[offset:offset+lane])
		// A matching lane compares to 0xff and a non-matching lane to 0x00.
		// OR the two masks first so that a single horizontal reduction answers
		// "does this block contain either byte?" instead of reducing twice.
		hits := simd.bit_or(simd.lanes_eq(block, needle1), simd.lanes_eq(block, needle2))
		if simd.reduce_or(hits) != 0 {
			break
		}
		offset += lane
	}

	// Scalar scan: pinpoints the match inside the block found above, or covers
	// the remaining bytes when no full block matched.
	for i in offset..<len {
		b := haystack[i]
		if b == ch1 || b == ch2 {
			return i
		}
	}
	return -1
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ...(snip)... | |
# main.index_any: generated x86-64 (AT&T syntax) code for the Odin procedure
# index_any(haystack: []u8, len: int) -> int.
#
# The prologue spills the four incoming registers to the stack:
#   %rdi -> 64(%rsp), 200(%rsp)    %rsi -> 56(%rsp), 208(%rsp)
#   %rdx -> 48(%rsp), 192(%rsp)    %rcx -> 40(%rsp)
# NOTE(review): presumably %rdi/%rsi are the haystack data pointer and slice
# length, %rdx is `len`, and %rcx is Odin's implicit context pointer; confirm
# against the Odin calling convention.
# Every local lives in a stack slot and is reloaded on each use; the 216-byte
# frame holds them all.
.p2align 4, 0x90 | |
.type main.index_any,@function | |
main.index_any: | |
.cfi_startproc | |
subq $216, %rsp | |
.cfi_def_cfa_offset 224 | |
movq %rcx, 40(%rsp) | |
movq %rdx, 48(%rsp) | |
movq %rsi, 56(%rsp) | |
movq %rdi, 64(%rsp) | |
movq 48(%rsp), %rax | |
movq 56(%rsp), %rcx | |
movq 64(%rsp), %rdx | |
movq %rdx, 200(%rsp) | |
movq %rcx, 208(%rsp) | |
movq %rax, 192(%rsp) | |
# 184(%rsp) is the block offset, starting at 0.
movq $0, 184(%rsp) | |
# SIMD loop head: continue only while offset + 16 <= 48(%rsp).
.LBB28_2: | |
movq 48(%rsp), %rcx | |
movq 184(%rsp), %rax | |
addq $16, %rax | |
cmpq %rcx, %rax | |
setle %al | |
andb $1, %al | |
cmpb $0, %al | |
je .LBB28_7 | |
# Slice bounds check for [offset, offset+16): %r8 = low, %r9 = high, and the
# value from 208(%rsp) is passed on the stack. The call is unconditional here.
# NOTE(review): presumably the runtime routine does the comparison itself and
# returns when the range is valid, and the immediates are the source file
# string, its length, line and column; confirm.
movq 184(%rsp), %r8 | |
movq %r8, 32(%rsp) | |
movq %r8, %r9 | |
addq $16, %r9 | |
movq 208(%rsp), %rcx | |
movq %rsp, %rax | |
movq %rcx, (%rax) | |
movl $.Lcsbs$da, %edi | |
movl $42, %esi | |
movl $20, %edx | |
movl $50, %ecx | |
callq runtime.slice_expr_error_lo_hi | |
# Build the sub-slice (200(%rsp) + offset, length 16) in 168/176(%rsp) and pass
# it to simd.from_slice in %rdi/%rsi. This is a real call, not inlined.
movq 32(%rsp), %rcx | |
movq 40(%rsp), %rdx | |
movq 200(%rsp), %rax | |
addq %rcx, %rax | |
movq %rax, 168(%rsp) | |
movq $16, 176(%rsp) | |
movq 168(%rsp), %rdi | |
movq 176(%rsp), %rsi | |
callq "simd.from_slice-19570" | |
# The result arrives split across %xmm0 (low 8 bytes) and %xmm1 (high 8 bytes);
# it is reassembled at 144(%rsp) and copied to 128(%rsp) as the loaded block.
movsd %xmm1, 152(%rsp) | |
movq %xmm0, 144(%rsp) | |
movaps 144(%rsp), %xmm0 | |
movaps %xmm0, 128(%rsp) | |
# Byte-wise compare against needle1; mask stored at 112(%rsp).
movaps 128(%rsp), %xmm0 | |
movaps main.needle1(%rip), %xmm1 | |
pcmpeqb %xmm1, %xmm0 | |
movaps %xmm0, 112(%rsp) | |
# Byte-wise compare against needle2; mask stored at 96(%rsp).
movaps 128(%rsp), %xmm0 | |
movaps main.needle2(%rip), %xmm1 | |
pcmpeqb %xmm1, %xmm0 | |
movaps %xmm0, 96(%rsp) | |
# Horizontal OR of the needle1 mask: fold the upper 64 bits onto the lower, then
# dword 1 onto dword 0, then the upper 16 bits, then the upper 8 bits, so the
# low byte of %xmm0 is the OR of all 16 lanes.
movaps 112(%rsp), %xmm0 | |
pshufd $238, %xmm0, %xmm1 | |
por %xmm1, %xmm0 | |
pshufd $85, %xmm0, %xmm1 | |
por %xmm1, %xmm0 | |
movaps %xmm0, %xmm1 | |
psrld $16, %xmm1 | |
por %xmm1, %xmm0 | |
movaps %xmm0, %xmm1 | |
psrlw $8, %xmm1 | |
por %xmm1, %xmm0 | |
movd %xmm0, %eax | |
# Low byte == 0xff means some lane matched needle1: leave the SIMD loop.
cmpb $-1, %al | |
sete %al | |
andb $1, %al | |
cmpb $0, %al | |
jne .LBB28_5 | |
# Same horizontal OR, repeated in full for the needle2 mask.
movaps 96(%rsp), %xmm0 | |
pshufd $238, %xmm0, %xmm1 | |
por %xmm1, %xmm0 | |
pshufd $85, %xmm0, %xmm1 | |
por %xmm1, %xmm0 | |
movaps %xmm0, %xmm1 | |
psrld $16, %xmm1 | |
por %xmm1, %xmm0 | |
movaps %xmm0, %xmm1 | |
psrlw $8, %xmm1 | |
por %xmm1, %xmm0 | |
movd %xmm0, %eax | |
# No needle2 match either: go advance the offset.
cmpb $-1, %al | |
sete %al | |
andb $1, %al | |
cmpb $0, %al | |
je .LBB28_6 | |
# A match was found in this block: break out to the scalar scan.
.LBB28_5: | |
jmp .LBB28_7 | |
# No match in this block: offset += 16 and loop.
.LBB28_6: | |
movq 184(%rsp), %rax | |
addq $16, %rax | |
movq %rax, 184(%rsp) | |
jmp .LBB28_2 | |
# Scalar scan setup: 88(%rsp) = current index (starts at offset),
# 80(%rsp) = a second counter starting at 0 that is only ever incremented.
.LBB28_7: | |
movq 184(%rsp), %rax | |
movq %rax, 88(%rsp) | |
movq $0, 80(%rsp) | |
# Scalar loop head: exit with -1 once the index reaches 48(%rsp).
.LBB28_8: | |
movq 48(%rsp), %rax | |
cmpq %rax, 88(%rsp) | |
jge .LBB28_14 | |
# Copy the index to 72(%rsp), run the index bounds check (%r8 = index,
# %r9 = value from 56(%rsp)), then load the byte at 64(%rsp) + index.
movq 56(%rsp), %r9 | |
movq 88(%rsp), %rax | |
movq %rax, 72(%rsp) | |
movq 72(%rsp), %r8 | |
movq %r8, 24(%rsp) | |
movl $.Lcsbs$da, %edi | |
movl $42, %esi | |
movl $32, %edx | |
movl $21, %ecx | |
callq runtime.bounds_check_error | |
movq 24(%rsp), %rcx | |
movq 64(%rsp), %rax | |
movb (%rax,%rcx), %al | |
# Byte equals ch1: return this index.
cmpb main.ch1, %al | |
sete %al | |
andb $1, %al | |
cmpb $0, %al | |
jne .LBB28_11 | |
# Second bounds check and reload of the same byte for the ch2 comparison.
movq 56(%rsp), %r9 | |
movq 72(%rsp), %r8 | |
movq %r8, 16(%rsp) | |
movl $.Lcsbs$da, %edi | |
movl $42, %esi | |
movl $32, %edx | |
movl $43, %ecx | |
callq runtime.bounds_check_error | |
movq 16(%rsp), %rcx | |
movq 64(%rsp), %rax | |
movb (%rax,%rcx), %al | |
cmpb main.ch2, %al | |
sete %al | |
andb $1, %al | |
cmpb $0, %al | |
je .LBB28_12 | |
# Match: return the index saved in 72(%rsp).
.LBB28_11: | |
movq 72(%rsp), %rax | |
addq $216, %rsp | |
.cfi_def_cfa_offset 8 | |
retq | |
.LBB28_12: | |
.cfi_def_cfa_offset 224 | |
jmp .LBB28_13 | |
# No match at this index: increment both counters and loop.
.LBB28_13: | |
movq 88(%rsp), %rax | |
addq $1, %rax | |
movq %rax, 88(%rsp) | |
movq 80(%rsp), %rax | |
addq $1, %rax | |
movq %rax, 80(%rsp) | |
jmp .LBB28_8 | |
# Not found: return -1.
.LBB28_14: | |
movq $-1, %rax | |
addq $216, %rsp | |
.cfi_def_cfa_offset 8 | |
retq | |
.Lfunc_end28: | |
.size main.index_any, .Lfunc_end28-main.index_any | |
.cfi_endproc | |
# ...(snip)... |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
My attempt to port the example at https://x.com/orisano/status/1813187886910697632 to Odin.
odin-simd-experiment.S is built with the following command: