Last active
August 4, 2020 16:00
-
-
Save clausecker/3de220b146f51e5f7bc9f4500c62847f to your computer and use it in GitHub Desktop.
8 bit positional popcount with AVX2 without too much code
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#define _XOPEN_SOURCE 700 | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include <time.h> | |
extern void pospopcnt_reg(int accum[8], const char *buf, size_t len); | |
extern void pospopcnt_mem(int accum[8], const char *buf, size_t len); | |
extern void | |
pospopcnt_naive(int accum[8], const char *buf, size_t len) | |
{ | |
size_t i; | |
int j; | |
for (i = 0; i < len; i++) | |
for (j = 0; j < 8; j++) | |
accum[j] += !!(buf[i] & 1 << j); | |
} | |
/* | |
* Compute the difference of two struct timespec. | |
*/ | |
static struct timespec | |
tsdiff(struct timespec a, struct timespec b) | |
{ | |
a.tv_sec -= b.tv_sec; | |
a.tv_nsec -= b.tv_nsec; | |
if (a.tv_nsec < 0) { | |
a.tv_sec -= 1; | |
a.tv_nsec += 1000000000; | |
} | |
return (a); | |
} | |
/* perform a benchmark */ | |
static void benchmark(const char *buf, size_t len, const char *name, void (*pospopcnt)(int[8], const char *, size_t)) | |
{ | |
struct timespec diff, start, end; | |
double dur; | |
long i, n = 1; | |
int accum[8] = {0}; | |
do { | |
clock_gettime(CLOCK_REALTIME, &start); | |
for (i = 0; i < n; i++) | |
pospopcnt(accum, buf, len); | |
clock_gettime(CLOCK_REALTIME, &end); | |
diff = tsdiff(end, start); | |
n <<= 1; | |
} while (diff.tv_sec == 0); | |
n >>= 1; | |
dur = diff.tv_sec + diff.tv_nsec / 1000000000.0; | |
dur /= n; | |
printf("%s\t%g B/s\n", name, len / dur); | |
} | |
extern int | |
main(int argc, char *argv[]) | |
{ | |
size_t len = 8192; | |
int naive_accum[8], asm_accum[8]; | |
FILE *random; | |
char *buf; | |
if (argc > 1) | |
len = atoll(argv[1]) + 31 & ~31LL; | |
buf = malloc(len); | |
if (buf == NULL) { | |
perror("malloc"); | |
return (EXIT_FAILURE); | |
} | |
random = fopen("/dev/urandom", "rb"); | |
if (random == NULL) { | |
perror("/dev/urandom"); | |
return (EXIT_FAILURE); | |
} | |
fread(buf, 1, len, random); | |
memset(naive_accum, 0, sizeof naive_accum); | |
memset(asm_accum, 0, sizeof asm_accum); | |
pospopcnt_reg(asm_accum, buf, len); | |
pospopcnt_naive(naive_accum, buf, len); | |
if (memcmp(asm_accum, naive_accum, 8*4) != 0) | |
printf("mismatch\n"); | |
benchmark(buf, len, "naive", pospopcnt_naive); | |
benchmark(buf, len, "addmem", pospopcnt_mem); | |
benchmark(buf, len, "addreg", pospopcnt_reg); | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# pospopcnt(accum, buf, len) | |
# len must be a multiple of 32 | |
.globl pospopcnt_mem | |
.type pospopcnt_mem, @function | |
pospopcnt_mem: | |
# rdi: accum | |
# rsi: buf | |
# rdx: len | |
test %rdx, %rdx # any work to do? | |
jnz 0f | |
ret | |
.align 16 | |
0: vmovdqu (%rsi), %ymm0 | |
add $32, %rsi | |
vpmovmskb %ymm0, %eax | |
popcnt %eax, %eax | |
add %eax, 7*4(%rdi) | |
vpaddd %ymm0, %ymm0, %ymm0 | |
vpmovmskb %ymm0, %eax | |
popcnt %eax, %eax | |
add %eax, 6*4(%rdi) | |
vpaddd %ymm0, %ymm0, %ymm0 | |
vpmovmskb %ymm0, %eax | |
popcnt %eax, %eax | |
add %eax, 5*4(%rdi) | |
vpaddd %ymm0, %ymm0, %ymm0 | |
vpmovmskb %ymm0, %eax | |
popcnt %eax, %eax | |
add %eax, 4*4(%rdi) | |
vpaddd %ymm0, %ymm0, %ymm0 | |
vpmovmskb %ymm0, %eax | |
popcnt %eax, %eax | |
add %eax, 3*4(%rdi) | |
vpaddd %ymm0, %ymm0, %ymm0 | |
vpmovmskb %ymm0, %eax | |
popcnt %eax, %eax | |
add %eax, 2*4(%rdi) | |
vpaddd %ymm0, %ymm0, %ymm0 | |
vpmovmskb %ymm0, %eax | |
popcnt %eax, %eax | |
add %eax, 1*4(%rdi) | |
vpaddd %ymm0, %ymm0, %ymm0 | |
vpmovmskb %ymm0, %eax | |
popcnt %eax, %eax | |
add %eax, 0*4(%rdi) | |
sub $32, %rdx | |
jnz 0b | |
vzeroupper | |
ret | |
.globl pospopcnt_reg | |
.type pospopcnt_reg, @function | |
pospopcnt_reg: | |
# rdi: accum | |
# rsi: buf | |
# rdx: len | |
test %rdx, %rdx # any work to do? | |
jnz 1f | |
ret | |
1: push %rbp | |
push %rbx | |
push %r12 | |
mov 0*4(%rdi), %r8d | |
mov 1*4(%rdi), %r9d | |
mov 2*4(%rdi), %r10d | |
mov 3*4(%rdi), %r11d | |
mov 4*4(%rdi), %r12d | |
mov 5*4(%rdi), %ebx | |
mov 6*4(%rdi), %ecx | |
mov 7*4(%rdi), %ebp | |
.align 16 | |
0: vmovdqu (%rsi), %ymm0 | |
add $32, %rsi | |
vpmovmskb %ymm0, %eax | |
popcnt %eax, %eax | |
add %eax, %ebp | |
vpaddd %ymm0, %ymm0, %ymm0 | |
vpmovmskb %ymm0, %eax | |
popcnt %eax, %eax | |
add %eax, %ecx | |
vpaddd %ymm0, %ymm0, %ymm0 | |
vpmovmskb %ymm0, %eax | |
popcnt %eax, %eax | |
add %eax, %ebx | |
vpaddd %ymm0, %ymm0, %ymm0 | |
vpmovmskb %ymm0, %eax | |
popcnt %eax, %eax | |
add %eax, %r12d | |
vpaddd %ymm0, %ymm0, %ymm0 | |
vpmovmskb %ymm0, %eax | |
popcnt %eax, %eax | |
add %eax, %r11d | |
vpaddd %ymm0, %ymm0, %ymm0 | |
vpmovmskb %ymm0, %eax | |
popcnt %eax, %eax | |
add %eax, %r10d | |
vpaddd %ymm0, %ymm0, %ymm0 | |
vpmovmskb %ymm0, %eax | |
popcnt %eax, %eax | |
add %eax, %r9d | |
vpaddd %ymm0, %ymm0, %ymm0 | |
vpmovmskb %ymm0, %eax | |
popcnt %eax, %eax | |
add %eax, %r8d | |
sub $32, %rdx | |
jnz 0b | |
mov %r8d, 0*4(%rdi) | |
mov %r9d, 1*4(%rdi) | |
mov %r10d, 2*4(%rdi) | |
mov %r11d, 3*4(%rdi) | |
mov %r12d, 4*4(%rdi) | |
mov %ebx, 5*4(%rdi) | |
mov %ecx, 6*4(%rdi) | |
mov %ebp, 7*4(%rdi) | |
vzeroupper | |
pop %r12 | |
pop %rbx | |
pop %rbp | |
ret |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment