Created
May 21, 2018 18:10
-
-
Save gottesmm/08b5d69e7bf1dc2a83d75b1b592dec45 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// xcrun clang++ test.cpp -O3 -o - -S -std=c++11 -mavx | |
#include <simd/simd.h> | |
namespace { | |
static constexpr simd_packed_uchar16 zero = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, | |
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, | |
0x0, 0x0, 0x0, 0x0}; | |
} // end anonymous namespace | |
/// Interleaves lanes 0..7 of \p input with zero bytes and reinterprets the
/// 16 resulting bytes as eight 16-bit lanes.
///
/// Shuffle indices 0..15 select from \p input and 16..31 select from `zero`,
/// so the byte sequence is in[0], 0, in[1], 0, ... in[7], 0. On a
/// little-endian target each 16-bit lane therefore holds the zero-extended
/// value of the corresponding input byte.
static simd::ushort8 unpackLower(simd::uchar16 input) {
  const simd::uchar16 interleaved = __builtin_shufflevector(
      input, zero,
      0, 16, 1, 17, 2, 18, 3, 19,
      4, 20, 5, 21, 6, 22, 7, 23);
  return (simd::ushort8)interleaved;
}
/// Interleaves lanes 8..15 of \p input with zero bytes and reinterprets the
/// 16 resulting bytes as eight 16-bit lanes.
///
/// Shuffle indices 8..15 select from \p input and 24..31 select from `zero`,
/// so the byte sequence is in[8], 0, in[9], 0, ... in[15], 0. On a
/// little-endian target each 16-bit lane therefore holds the zero-extended
/// value of the corresponding input byte.
static simd::ushort8 unpackUpper(simd::uchar16 input) {
  const simd::uchar16 interleaved = __builtin_shufflevector(
      input, zero,
      8, 24, 9, 25, 10, 26, 11, 27,
      12, 28, 13, 29, 14, 30, 15, 31);
  return (simd::ushort8)interleaved;
}
/// Widens the low eight bytes of \p lhs to 16 bits and compares them
/// lane-wise against \p rhs.
///
/// \returns a mask whose lane is all-ones where the widened byte DIFFERS from
/// the matching lane of \p rhs, and zero where they are equal.
static simd::short8 performComparisonLow(simd::uchar16 lhs, simd::ushort8 rhs) {
  const simd::ushort8 widened = unpackLower(lhs);
  return widened != rhs;
}
/// Widens the high eight bytes of \p lhs to 16 bits and compares them
/// lane-wise against \p rhs.
///
/// \returns a mask whose lane is all-ones where the widened byte DIFFERS from
/// the matching lane of \p rhs, and zero where they are equal.
static simd::short8 performComparisonHigh(simd::uchar16 lhs, simd::ushort8 rhs) {
  const simd::ushort8 widened = unpackUpper(lhs);
  return widened != rhs;
}
/// Compares sixteen 16-bit values at \p lhs (read as lhs[0] and lhs[1])
/// against the sixteen bytes of \p rhs after widening the bytes to 16 bits.
///
/// The two eight-lane mismatch masks are OR-ed together, so 16-bit lane i of
/// the result is all-ones when lhs[0][i] differs from rhs[i] OR lhs[1][i]
/// differs from rhs[i + 8]. That eight-lane mask is then bit-cast to char16.
///
/// NOTE(review): despite the name, the result is a combined "not equal" mask
/// with only eight independent lanes; it is not lane-for-lane comparable to
/// the per-byte equality mask returned by equalP. Confirm this is intended.
simd::char16 equalP2(simd::ushort8 *lhs, simd::uchar16 rhs) {
  const simd::short8 lowMismatch = performComparisonLow(rhs, lhs[0]);
  const simd::short8 highMismatch = performComparisonHigh(rhs, lhs[1]);
  return (simd::char16)(lowMismatch | highMismatch);
}
/// Compares sixteen 16-bit values at \p lhs against the sixteen bytes of
/// \p rhs without widening \p rhs.
///
/// The 32 bytes at \p lhs are reinterpreted as a byte vector. A 16-bit value
/// equals a byte exactly when its low byte matches and its high byte is zero;
/// the even byte lanes are the low bytes and the odd lanes the high bytes
/// (this presumes a little-endian layout).
///
/// \returns a per-byte mask: lane i is all-ones when lhs element i equals
/// rhs[i], zero otherwise.
simd::char16 equalP(simd::ushort8 *lhs, simd::uchar16 rhs) {
  const simd::uchar32 bytes = *(simd::uchar32 *)lhs;
  // Low byte of every 16-bit element must equal the candidate byte...
  const simd::char16 lowMatches = (bytes.even == rhs);
  // ...and the high byte must be zero for the element to fit in a byte.
  const simd::char16 highIsZero = (bytes.odd == 0);
  return lowMatches & highIsZero;
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## // xcrun clang++ test.cpp -O3 -o - -S -std=c++11 -mavx | |
.section __TEXT,__text,regular,pure_instructions | |
.macosx_version_min 10, 13 | |
## equalP2(simd::ushort8 *lhs, simd::uchar16 rhs): the code reads lhs through
## %rdi and rhs from %xmm0, and leaves the result mask in %xmm0.
.globl __Z7equalP2PDv8_tDv16_h ## -- Begin function _Z7equalP2PDv8_tDv16_h | |
.p2align 4, 0x90 | |
__Z7equalP2PDv8_tDv16_h: ## @_Z7equalP2PDv8_tDv16_h | |
.cfi_startproc | |
## BB#0: | |
## Frame-pointer prologue.
pushq %rbp | |
Lcfi0: | |
.cfi_def_cfa_offset 16 | |
Lcfi1: | |
.cfi_offset %rbp, -16 | |
movq %rsp, %rbp | |
Lcfi2: | |
.cfi_def_cfa_register %rbp | |
## Zero-extend the low eight bytes of rhs to eight 16-bit words.
vpmovzxbw %xmm0, %xmm1 ## xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero | |
## Word-wise equality against the first 16 bytes at lhs.
vpcmpeqw (%rdi), %xmm1, %xmm1 | |
## Comparing a register with itself yields all-ones; XOR with it inverts the
## equality mask into a "not equal" mask.
vpcmpeqd %xmm2, %xmm2, %xmm2 | |
vpxor %xmm2, %xmm1, %xmm1 | |
## Zero register used as the interleave partner for the high bytes of rhs.
vpxor %xmm3, %xmm3, %xmm3 | |
vpunpckhbw %xmm3, %xmm0, %xmm0 ## xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] | |
## Word-wise equality against the second 16 bytes at lhs, then invert.
vpcmpeqw 16(%rdi), %xmm0, %xmm0 | |
vpxor %xmm2, %xmm0, %xmm0 | |
## OR the low and high "not equal" masks into the result.
vpor %xmm1, %xmm0, %xmm0 | |
popq %rbp | |
retq | |
.cfi_endproc | |
## -- End function | |
## Literal pool for equalP: two 16-byte vpshufb control vectors.
.section __TEXT,__literal16,16byte_literals | |
.p2align 4 ## -- Begin function _Z6equalPPDv8_tDv16_h | |
## LCPI1_0 selects the even-indexed bytes (0,2,...,14) into the low eight
## result bytes; the remaining eight control bytes are padding (shown as 'u'
## in the annotation where this constant is loaded).
LCPI1_0: | |
.byte 0 ## 0x0 | |
.byte 2 ## 0x2 | |
.byte 4 ## 0x4 | |
.byte 6 ## 0x6 | |
.byte 8 ## 0x8 | |
.byte 10 ## 0xa | |
.byte 12 ## 0xc | |
.byte 14 ## 0xe | |
.space 1 | |
.space 1 | |
.space 1 | |
.space 1 | |
.space 1 | |
.space 1 | |
.space 1 | |
.space 1 | |
## LCPI1_1 selects the odd-indexed bytes (1,3,...,15) into the low eight
## result bytes; the remaining eight control bytes are padding.
LCPI1_1: | |
.byte 1 ## 0x1 | |
.byte 3 ## 0x3 | |
.byte 5 ## 0x5 | |
.byte 7 ## 0x7 | |
.byte 9 ## 0x9 | |
.byte 11 ## 0xb | |
.byte 13 ## 0xd | |
.byte 15 ## 0xf | |
.space 1 | |
.space 1 | |
.space 1 | |
.space 1 | |
.space 1 | |
.space 1 | |
.space 1 | |
.space 1 | |
## equalP(simd::ushort8 *lhs, simd::uchar16 rhs): the code reads lhs through
## %rdi and rhs from %xmm0, and leaves the result mask in %xmm0.
.section __TEXT,__text,regular,pure_instructions | |
.globl __Z6equalPPDv8_tDv16_h | |
.p2align 4, 0x90 | |
__Z6equalPPDv8_tDv16_h: ## @_Z6equalPPDv8_tDv16_h | |
.cfi_startproc | |
## BB#0: | |
## Frame-pointer prologue.
pushq %rbp | |
Lcfi3: | |
.cfi_def_cfa_offset 16 | |
Lcfi4: | |
.cfi_offset %rbp, -16 | |
movq %rsp, %rbp | |
Lcfi5: | |
.cfi_def_cfa_register %rbp | |
## Load all 32 bytes at lhs with an unaligned move.
vmovdqu (%rdi), %ymm1 | |
vmovdqa LCPI1_0(%rip), %xmm2 ## xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> | |
## Split off the upper 16 bytes, gather the even bytes of each half, and
## join the two 8-byte results into one 16-byte vector of even bytes.
vextractf128 $1, %ymm1, %xmm3 | |
vpshufb %xmm2, %xmm3, %xmm4 | |
vpshufb %xmm2, %xmm1, %xmm2 | |
vpunpcklqdq %xmm4, %xmm2, %xmm2 ## xmm2 = xmm2[0],xmm4[0] | |
## Byte-wise equality of the even bytes against rhs.
vpcmpeqb %xmm0, %xmm2, %xmm0 | |
vmovdqa LCPI1_1(%rip), %xmm2 ## xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> | |
## Gather the odd bytes of each half and join them the same way.
vpshufb %xmm2, %xmm3, %xmm3 | |
vpshufb %xmm2, %xmm1, %xmm1 | |
vpunpcklqdq %xmm3, %xmm1, %xmm1 ## xmm1 = xmm1[0],xmm3[0] | |
## Byte-wise equality of the odd bytes against zero.
vpxor %xmm2, %xmm2, %xmm2 | |
vpcmpeqb %xmm2, %xmm1, %xmm1 | |
## AND the two masks into the result.
vpand %xmm0, %xmm1, %xmm0 | |
popq %rbp | |
## Clear the upper halves of the ymm registers before returning.
vzeroupper | |
retq | |
.cfi_endproc | |
## -- End function | |
.subsections_via_symbols |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment