Created
March 2, 2021 03:21
-
-
Save alexey-milovidov/2c3b4e532b11bfb1708c89bccc7d6576 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
memcpy.S: | |
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│ | |
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│ | |
╞══════════════════════════════════════════════════════════════════════════════╡ | |
│ Copyright 2020 Justine Alexandra Roberts Tunney │ | |
│ │ | |
│ Permission to use, copy, modify, and/or distribute this software for │ | |
│ any purpose with or without fee is hereby granted, provided that the │ | |
│ above copyright notice and this permission notice appear in all copies. │ | |
│ │ | |
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │ | |
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │ | |
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │ | |
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │ | |
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │ | |
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │ | |
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ | |
│ PERFORMANCE OF THIS SOFTWARE. │ | |
╚──────────────────────────────────────────────────────────────────────────────╝ | |
@fileoverview Cosmopolitan Memory Copying | |
Of all the functions in the technology industry, none are more | |
critical than the Kernighan & Ritchie Memory Copy API for the C | |
Language, 1972 model: more commonly known as memcpy(). It's the | |
world's most popular function──one all programmers love. | |
This implementation is the fastest and nearly the tiniest too. | |
It doesn't break when copying backwards or on misaligned data. | |
It's so easy that even a child could use it, and they do. | |
*/ | |
#include "libc/nexgen32e/x86feature.h" | |
// Ends function definition. | |
// @cost saves 1-3 lines of code | |
.macro .endfn name:req bnd vis | |
.size \name,.-\name | |
.type \name,@function | |
.ifnb \bnd | |
.\bnd \name | |
.endif | |
.ifnb \vis | |
.\vis \name | |
.endif | |
.endm | |
// Ends variable definition. | |
// @cost saves 1-3 lines of code | |
.macro .endobj name:req bnd vis | |
.size \name,.-\name | |
.type \name,@object | |
.ifnb \bnd | |
.\bnd \name | |
.endif | |
.ifnb \vis | |
.\vis \name | |
.endif | |
.endm | |
// Helpers for Cosmopolitan _init() amalgamation magic. | |
// @param name should be consistent across macros for a module | |
// @see libc/runtime/_init.S | |
.macro .initro number:req name:req | |
.section .initro.\number\().\name,"a",@progbits | |
.align 8 | |
.endm | |
.macro .initbss number:req name:req | |
.section .piro.bss.init.2.\number\().\name,"aw",@nobits | |
.align 8 | |
.endm | |
.macro .init.start number:req name:req | |
.section .init.\number\().\name,"ax",@progbits | |
\name: | |
.endm | |
.macro .init.end number:req name:req bnd=globl vis | |
.endfn \name,\bnd,\vis | |
.previous | |
.endm | |
// LOOP Instruction Replacement. | |
// With its mop-Fusion Mexican equivalent. | |
// Thus avoiding 3x legacy pipeline slowdown. | |
.macro .loop label:req | |
.byte 0x83,0xe9,0x01 # sub $1,%ecx | |
jnz \label | |
.endm | |
// TODO(jart): delete | |
// Loads Effective Address | |
// Supporting security blankets | |
.macro ezlea symbol:req reg:req | |
#if __pic__ + __pie__ + __code_model_medium__ + __code_model_large__ + 0 > 1 | |
// lea \symbol(%rip),%r\reg | |
mov $\symbol,%e\reg | |
#else | |
mov $\symbol,%e\reg | |
#endif | |
.endm | |
// Copies memory. | |
// | |
// DEST and SRC must not overlap, unless DEST≤SRC. | |
// | |
// @param rdi is dest | |
// @param rsi is src | |
// @param rdx is number of bytes | |
// @return original rdi copied to rax | |
// @mode long | |
// @asyncsignalsafe | |
memcpy: | |
/// memcpy function returns pointer to dst (its first argument), it's required by standard but rarely needed. | |
/// That's why we also provide a function MemCpy that returns nothing and is one instruction less. | |
mov %rdi, %rax | |
// 𝑠𝑙𝑖𝑑𝑒 | |
.align 16 | |
.endfn memcpy,globl | |
// Copies memory with minimal impact ABI. | |
// | |
// @param rdi is dest | |
// @param rsi is src | |
// @param rdx is number of bytes | |
// @clob flags,rcx,xmm3,xmm4 | |
// @mode long | |
MemCpy: | |
push %rbp | |
mov %rsp, %rbp | |
mov $.Lmemcpytab_ro.size, %ecx | |
/// rcx - the size of the static part of memcpytab_ro (a small value) | |
/// rdx - the number of bytes to copy | |
/// If the number of bytes is smaller than the size of static table - jump to memcpytab[size * 8] | |
/// If it is larger - jump to the end of memcpytab | |
cmp %rcx, %rdx | |
cmovb %rdx, %rcx | |
jmp *memcpytab(,%rcx,8) | |
/// This label is needed only to calculate differencies in the memcpytab_ro | |
.Lanchorpoint: | |
/// For large sizes we will branch to ERMS and Non-Temporary stores implementations. | |
.L32r: | |
cmp $1024,%rdx | |
jae .Lerms | |
.L32: | |
/// We will save the latest 32 bytes before the loop | |
vmovdqu -32(%rsi,%rdx),%ymm4 | |
/// rcx will be the number of bytes processed so far | |
mov $32,%rcx | |
/// The main AVX loop: copy by 32 byte portions from the beginning up to the latest 32 bytes. | |
0: | |
add $32,%rcx | |
vmovdqu -64(%rsi,%rcx),%ymm3 | |
vmovdqu %ymm3,-64(%rdi,%rcx) | |
cmp %rcx,%rdx | |
ja 0b | |
/// Now copy the latest 32 bytes. They can overlap previously copied 32 bytes if the size is not a multiple of 32. | |
/// Example: when copying 33 bytes, the sequence is the following: | |
/// 1. Save the latest 32 bytes. | |
/// 2. Copy first 32 bytes. | |
/// 3. Store the latest 32 bytes. | |
vmovdqu %ymm4,-32(%rdi,%rdx) | |
vxorps %ymm4,%ymm4,%ymm4 | |
vxorps %ymm3,%ymm3,%ymm3 | |
jmp .L0 | |
/// For large sizes we will branch to ERMS and Non-Temporary stores implementations. | |
.L16r: | |
cmp $1024,%rdx | |
jae .Lerms | |
/// This is very similar to AVX but for SSE | |
.L16: | |
movdqu -16(%rsi,%rdx),%xmm4 | |
mov $16,%rcx | |
0: | |
add $16,%rcx | |
movdqu -32(%rsi,%rcx),%xmm3 | |
movdqu %xmm3,-32(%rdi,%rcx) | |
cmp %rcx,%rdx | |
ja 0b | |
movdqu %xmm4,-16(%rdi,%rdx) | |
pxor %xmm4,%xmm4 | |
pxor %xmm3,%xmm3 | |
jmp .L0 | |
/// 9..16 remaining bytes. | |
.L8: | |
/// We will use rbx, so save the previous value on stack. | |
push %rbx | |
/// Copy them by two halves to allow overlapping regions. | |
mov (%rsi),%rcx | |
mov -8(%rsi,%rdx),%rbx | |
mov %rcx,(%rdi) | |
mov %rbx,-8(%rdi,%rdx) | |
/// Restore the value of rbx and return | |
1: | |
pop %rbx | |
/// Zero bytes remaining, exit from function | |
.L0: | |
pop %rbp | |
ret | |
/// 5..8 remaining bytes | |
.L4: | |
/// We will use rbx, so save the previous value on stack. | |
push %rbx | |
/// The current four bytes | |
mov (%rsi),%ecx | |
/// The latest four bytes | |
mov -4(%rsi,%rdx),%ebx | |
mov %ecx,(%rdi) | |
mov %ebx,-4(%rdi,%rdx) | |
/// Restore the value of rbx and return | |
jmp 1b | |
/// 3..4 remaining bytes | |
.L3: | |
/// We will use rbx, so save the previous value on stack. | |
push %rbx | |
/// The current two bytes | |
mov (%rsi),%cx | |
/// The latest two bytes | |
mov -2(%rsi,%rdx),%bx | |
mov %cx,(%rdi) | |
mov %bx,-2(%rdi,%rdx) | |
/// Restore the value of rbx and return | |
jmp 1b | |
/// Two remaining bytes | |
.L2: | |
mov (%rsi),%cx | |
mov %cx,(%rdi) | |
jmp .L0 | |
/// One remaining byte | |
.L1: | |
mov (%rsi),%cl | |
mov %cl,(%rdi) | |
jmp .L0 | |
/// The case when CPU has "erms" flag. It means "Enhanced REP MOVSB/STOSB", | |
/// and literally: "rep movsb" is a good way for memcpy. | |
/// But in fact it's profitable only for sizes larger than 1024 bytes but less than half of L3 cache size. | |
.Lerms: | |
/// For large size, go to Non-Temporary stores implementation. | |
cmp kHalfCache3(%rip),%rdx | |
ja .Lnts | |
/// "rep movsb" will clobber these registers, but our function should not. Save them and restore before return. | |
push %rdi | |
push %rsi | |
/// This is where "rep movsb" expect to have its counter. | |
mov %rdx,%rcx | |
rep movsb | |
pop %rsi | |
pop %rdi | |
/// Return | |
jmp .L0 | |
/// The case for non-temporary stores. | |
/// Non-temporary stores are operations that bypass CPU cache (store directly to memory). | |
/// They are profitable if size is larger than half of L3 cache size. | |
/// NOTE: when doing memcpy from multiple threads, most likely it will be better to lower | |
/// the threshold, because the L3 cache is shared among cores. | |
.Lnts: | |
/// Copy the first 16 bytes. | |
movdqu (%rsi),%xmm3 | |
movdqu %xmm3,(%rdi) | |
/// Align the address of dest + 16 down to 16 bytes boundary and store to rcx. | |
lea 16(%rdi),%rcx | |
and $-16,%rcx | |
/// Align both source and destination and set size to the number of remaining bytes | |
sub %rdi,%rcx | |
add %rcx,%rdi | |
add %rcx,%rsi | |
sub %rcx,%rdx | |
/// rcx will be the number of bytes processed so far | |
mov $16,%rcx | |
/// Main loop | |
0: | |
add $16,%rcx | |
movdqu -32(%rsi,%rcx),%xmm3 | |
movntdq %xmm3,-32(%rdi,%rcx) | |
cmp %rcx,%rdx | |
ja 0b | |
/// Copy the latest 16 bytes (that may overlap already copied bytes). | |
/// We need to make the previous stores visible for this operation. | |
sfence | |
movdqu -16(%rsi,%rdx),%xmm3 | |
movdqu %xmm3,-16(%rdi,%rdx) | |
pxor %xmm3,%xmm3 | |
jmp .L0 | |
.endfn MemCpy,globl,hidden | |
.initro 300,_init_memcpy | |
memcpytab_ro: | |
/// This table is indexed by the number of remaining bytes. | |
.byte .L0-.Lanchorpoint | |
.byte .L1-.Lanchorpoint | |
.byte .L2-.Lanchorpoint | |
.byte .L3-.Lanchorpoint | |
.byte .L4-.Lanchorpoint | |
.byte .L4-.Lanchorpoint | |
.byte .L4-.Lanchorpoint | |
.byte .L4-.Lanchorpoint | |
.byte .L8-.Lanchorpoint | |
.byte .L8-.Lanchorpoint | |
.byte .L8-.Lanchorpoint | |
.byte .L8-.Lanchorpoint | |
.byte .L8-.Lanchorpoint | |
.byte .L8-.Lanchorpoint | |
.byte .L8-.Lanchorpoint | |
.byte .L8-.Lanchorpoint /// For example, 15 bytes can be copied by two 8-byte unaligned load/stores. | |
.byte .L16-.Lanchorpoint | |
.byte .L16-.Lanchorpoint | |
.byte .L16-.Lanchorpoint | |
.byte .L16-.Lanchorpoint | |
.byte .L16-.Lanchorpoint | |
.byte .L16-.Lanchorpoint | |
.byte .L16-.Lanchorpoint | |
.byte .L16-.Lanchorpoint | |
.byte .L16-.Lanchorpoint | |
.byte .L16-.Lanchorpoint | |
.byte .L16-.Lanchorpoint | |
.byte .L16-.Lanchorpoint | |
.byte .L16-.Lanchorpoint | |
.byte .L16-.Lanchorpoint | |
.byte .L16-.Lanchorpoint | |
.byte .L16-.Lanchorpoint | |
.equ .Lmemcpytab_ro.size,.-memcpytab_ro | |
.endobj memcpytab_ro | |
.if .Lmemcpytab_ro.size % 8 | |
.error "moar jmptab" | |
.endif | |
.byte .L16-.Lanchorpoint # SSE2 | |
.byte .L16r-.Lanchorpoint # SSE2 + ERMS | |
.byte .L32-.Lanchorpoint # AVX | |
.byte .L32r-.Lanchorpoint # AVX + ERMS | |
.byte 0,0,0,0 | |
.endobj memcpytab_ro, globl | |
.previous /// This means: "undo" section declaration: https://stackoverflow.com/questions/2416879/what-does-asm-previous-mean | |
/// It is filled at startup with pointers according to the offsets in memcpytab_ro | |
.initbss 300,_init_memcpy | |
memcpytab: | |
.rept .Lmemcpytab_ro.size | |
.quad 0 /// Pointers for small sizes. | |
.endr | |
.quad 0 /// A pointer to the label for large size. | |
.endobj memcpytab, globl | |
.previous | |
.init.start 300,_init_memcpy | |
ezlea .Lmemcpytab_ro.size, cx | |
ezlea .Lanchorpoint, dx | |
testb X86_HAVE(AVX) + kCpuids(%rip) | |
call memjmpinit | |
ret | |
.init.end 300,_init_memcpy | |
// Initializes jump table for memset() and memcpy(). | |
// | |
// @param !ZF if required cpu vector extensions are available | |
// @param rdi is address of 64-bit jump table | |
// @param rsi is address of 8-bit jump initializers | |
// @param rdx is address of indirect branch (Anchor point) | |
// @param ecx is size of jump table | |
memjmpinit: | |
push %rbp | |
mov %rsp, %rbp | |
setnz %r8b | |
shl %r8b | |
/// Transform memcpytab_ro with byte offsets to pointers. | |
0: xor %eax,%eax | |
lodsb | |
add %rdx,%rax | |
stosq | |
.loop 0b | |
xor %eax,%eax | |
testb X86_HAVE(ERMS) + kCpuids(%rip) | |
setnz %al | |
or %r8b,%al | |
mov (%rsi,%rax),%al | |
add %rdx,%rax | |
stosq | |
lodsq | |
pop %rbp | |
ret | |
.endfn memjmpinit,globl,hidden | |
// Globally precomputed CPUID. | |
// | |
// This module lets us check CPUID in 0.06ns rather than 51.00ns. | |
// If every piece of native software linked this module, then the | |
// world would be a much better place; since all the alternatives | |
// are quite toilsome. | |
// | |
// @see www.felixcloutier.com/x86/cpuid | |
.initbss 201,_init_kCpuids | |
kCpuids: | |
.long 0,0,0,0 # EAX=0 (Basic Processor Info) | |
.long 0,0,0,0 # EAX=1 (Processor Info) | |
.long 0,0,0,0 # EAX=2 | |
.long 0,0,0,0 # EAX=7 (Extended Features) | |
.long 0,0,0,0 # EAX=0x80000001 (NexGen32e) | |
.long 0,0,0,0 # EAX=0x80000007 (APM) | |
.long 0,0,0,0 # EAX=16h (CPU Frequency) | |
.endobj kCpuids,globl | |
.previous | |
.init.start 201,_init_kCpuids | |
push %rbx | |
push $0 | |
push $0x16 | |
push $0xffffffff80000007 | |
push $0xffffffff80000001 | |
push $7 | |
push $2 | |
push $1 | |
mov %rdi,%r8 | |
xor %eax,%eax | |
1: xor %ecx,%ecx | |
cpuid | |
stosl | |
xchg %eax,%ebx | |
stosl | |
xchg %eax,%ecx | |
stosl | |
xchg %eax,%edx | |
stosl | |
2: pop %rax | |
test %eax,%eax # EAX = stacklist->pop() | |
jz 3f # EAX ≠ 0 (EOL sentinel) | |
cmp KCPUIDS(0H,EAX)(%r8),%al # EAX ≤ CPUID.0 max leaf | |
jbe 1b # CPUID too new to probe | |
add $4*4,%rdi | |
jmp 2b | |
3: nop | |
/* | |
#if !X86_NEED(AVX2) | |
testb X86_HAVE(AVX)(%r8) | |
jz 5f | |
testb X86_HAVE(OSXSAVE)(%r8) | |
jz 4f | |
xor %ecx,%ecx | |
xgetbv | |
and $XCR0_SSE|XCR0_AVX,%eax | |
cmp $XCR0_SSE|XCR0_AVX,%eax | |
je 5f | |
4: btr $X86_BIT(AVX),X86_WORD(AVX)(%r8) | |
btr $X86_BIT(AVX2),X86_WORD(AVX2)(%r8) | |
#endif*/ | |
5: pop %rbx | |
retq | |
.init.end 201,_init_kCpuids | |
.initbss 202,_init_kHalfCache3 | |
// Half size of level 3 cache in bytes. | |
kHalfCache3: | |
.quad 0 | |
.endobj kHalfCache3,globl | |
.previous | |
.init.start 202,_init_kHalfCache3 | |
cmpl $3,kCpuids(%rip) | |
jbe 3f | |
xor %r8d,%r8d | |
mov $4,%r8d | |
1: mov %r8d,%eax | |
mov %r8d,%ecx | |
push %rbx | |
cpuid | |
mov %ebx,%r9d | |
pop %rbx | |
test $31,%al | |
je 3f | |
cmp $99,%al | |
jne 2f | |
mov %r9d,%eax | |
mov %r9d,%edx | |
inc %ecx | |
shr $12,%r9d | |
shr $22,%eax | |
and $0x0fff,%edx | |
and $0x03ff,%r9d | |
inc %eax | |
inc %edx | |
imul %edx,%eax | |
imul %ecx,%eax | |
lea 1(%r9),%ecx | |
imul %ecx,%eax | |
jmp 4f | |
2: inc %r8d | |
jmp 1b | |
3: mov $0x00400000,%eax | |
4: shr %eax | |
stosq | |
retq | |
.init.end 202,_init_kHalfCache3 | |
/* your memcpy() 375 bytes | |
bionic memcpy() 1,429 bytes | |
glibc memcpy() 27,216 bytes | |
musl memcpy() 49 bytes | |
newlib memcpy() 300 bytes | |
benchmarks on intel core i7-6700 @ 3.40GHz (skylake) | |
includes function call overhead (unless marked otherwise) | |
your memcpy(𝑛) for #c per n where c ≈ 0.293ns | |
N x1 x8 x64 mBps | |
------------------------------------------------------------ | |
1 297.000 35.125 35.203 92 | |
1 35.000 35.625 35.016 93 | |
2 27.500 17.438 17.555 185 | |
3 21.000 11.875 12.057 270 | |
4 16.250 8.719 8.809 369 | |
7 5.000 4.946 5.069 641 | |
8 7.375 4.422 4.365 745 | |
15 4.067 2.342 2.336 1391 | |
16 4.188 2.242 2.257 1440 « | |
31 8.032 1.157 1.147 2835 | |
32 2.031 1.723 1.325 2454 | |
63 1.000 0.589 0.589 5523 | |
64 0.578 0.580 0.577 5630 « | |
127 0.638 0.377 0.320 10151 | |
128 0.289 0.296 0.307 10605 | |
255 0.404 0.202 0.194 16741 | |
256 0.160 0.165 0.166 19574 « | |
511 0.159 0.123 0.110 29458 | |
512 0.139 0.098 0.097 33571 « | |
1023 0.107 0.086 0.074 44111 | |
1024 0.103 0.084 0.082 39489 | |
2047 0.057 0.056 0.057 57450 | |
2048 0.055 0.055 0.055 59269 | |
4095 0.044 0.044 0.044 74051 | |
4096 0.043 0.043 0.043 75300 « | |
8191 0.036 0.036 0.036 91301 | |
8192 0.036 0.035 0.035 92411 | |
16383 0.033 0.032 0.032 102163 | |
16384 0.034 0.032 0.032 102145 « (L1)/2 | |
32767 0.098 0.081 0.077 42271 | |
32768 0.077 0.077 0.076 42781 | |
65535 0.088 0.075 0.072 44973 | |
65536 0.074 0.072 0.071 45520 | |
131071 0.086 0.075 0.072 44869 | |
131072 0.077 0.073 0.072 45076 « (L2)/2 | |
262143 0.095 0.096 0.095 34116 | |
262144 0.096 0.096 0.095 34160 | |
524287 0.102 0.109 0.111 29359 | |
524288 0.107 0.109 0.108 30033 | |
1048575 0.102 0.103 0.104 31112 | |
1048576 0.101 0.103 0.103 31605 | |
2097151 0.104 0.103 0.109 29929 | |
2097152 0.108 0.110 0.103 31652 | |
4194303 0.192 0.172 0.172 18950 | |
4194304 0.168 0.161 0.160 20311 « (L3)/2 | |
8388607 0.339 0.329 0.344 9461 « RAM | |
8388608 0.384 0.369 0.341 9545 | |
Bionic memcpy() for #c per n where c ≈ 0.293ns | |
N x1 x8 x64 mBps | |
------------------------------------------------------------ | |
1 347.000 40.625 35.984 90 | |
1 37.000 35.625 36.734 89 | |
2 28.500 18.688 18.383 177 | |
3 11.667 12.375 12.359 263 | |
4 12.250 9.406 9.020 361 | |
7 5.000 5.018 5.118 636 | |
8 11.625 5.828 4.779 681 | |
15 3.533 3.158 2.620 1243 | |
16 4.688 2.742 2.884 1129 « | |
31 1.903 1.262 1.172 2778 | |
32 1.344 1.113 1.125 2895 | |
63 1.444 0.633 0.591 5513 | |
64 0.766 0.580 0.581 5605 « | |
127 0.512 0.383 0.318 10229 | |
128 0.461 0.315 0.311 10463 | |
255 0.475 0.216 0.193 16840 | |
256 0.371 0.236 0.199 16397 « | |
511 0.295 0.144 0.120 27223 | |
512 0.240 0.151 0.126 25937 « | |
1023 0.142 0.101 0.088 36947 | |
1024 0.126 0.108 0.091 35889 | |
2047 0.088 0.074 0.072 45475 | |
2048 0.089 0.077 0.073 44380 | |
4095 0.081 0.065 0.064 50766 | |
4096 0.068 0.066 0.065 50246 « | |
8191 0.063 0.061 0.060 54075 | |
8192 0.065 0.061 0.061 53731 | |
16383 0.082 0.066 0.061 53765 | |
16384 0.067 0.063 0.062 52765 « (L1)/2 | |
32767 0.102 0.085 0.085 38406 | |
32768 0.086 0.085 0.085 38473 | |
65535 0.098 0.085 0.085 38292 | |
65536 0.086 0.085 0.085 38369 | |
131071 0.438 0.177 0.089 36716 | |
131072 0.092 0.090 0.093 34880 « (L2)/2 | |
262143 0.306 0.146 0.127 25601 | |
262144 0.126 0.168 0.127 25704 | |
524287 0.213 0.152 0.136 23993 | |
524288 0.132 0.159 0.133 24570 | |
1048575 0.127 0.129 0.130 25117 | |
1048576 0.128 0.129 0.130 25107 | |
2097151 0.127 0.127 0.129 25199 | |
2097152 0.127 0.136 0.134 24274 | |
4194303 0.216 0.192 0.228 14237 | |
4194304 0.351 0.351 0.356 9139 « (L3)/2 | |
8388607 0.323 0.293 0.298 10903 « RAM | |
8388608 0.365 0.296 0.300 10844 | |
GCC builtin (Inline REP MOVSB) for #c per n where c ≈ 0.293ns | |
N x1 x8 x64 mBps | |
------------------------------------------------------------ | |
1 53.000 50.625 50.453 64 | |
1 47.000 49.375 49.141 66 | |
2 23.500 25.062 24.898 131 | |
3 15.667 16.792 16.880 193 | |
4 11.750 12.531 12.957 251 | |
7 7.000 7.125 7.190 452 | |
8 6.125 7.578 6.322 514 | |
15 3.133 3.325 3.372 964 | |
16 3.062 3.117 3.132 1038 « | |
31 1.645 1.601 1.620 2007 | |
32 1.531 1.559 1.585 2051 | |
63 0.778 0.796 0.802 4056 | |
64 0.766 0.768 0.767 4238 « | |
127 0.480 0.446 0.448 7259 | |
128 0.445 0.419 0.423 7693 | |
255 0.239 0.239 0.236 13781 | |
256 0.238 0.225 0.225 14466 « | |
511 0.127 0.133 0.132 24555 | |
512 0.123 0.127 0.128 25377 « | |
1023 0.079 0.081 0.081 40346 | |
1024 0.075 0.077 0.078 41714 | |
2047 0.053 0.055 0.055 59575 | |
2048 0.053 0.053 0.053 60795 | |
4095 0.042 0.043 0.043 75843 | |
4096 0.042 0.042 0.042 77153 | |
8191 0.035 0.036 0.036 91518 | |
8192 0.035 0.035 0.035 92603 | |
16383 0.032 0.032 0.032 102407 | |
16384 0.033 0.032 0.032 102864 « (L1)/2 | |
32767 0.106 0.082 0.078 41486 | |
32768 0.079 0.078 0.079 41290 | |
65535 0.090 0.077 0.075 43565 | |
65536 0.074 0.074 0.073 44299 | |
131071 0.091 0.078 0.075 43196 | |
131072 0.078 0.076 0.074 43673 « (L2)/2 | |
262143 0.097 0.099 0.098 33192 | |
262144 0.098 0.098 0.098 33193 | |
524287 0.105 0.111 0.111 29212 | |
524288 0.109 0.111 0.111 29211 | |
1048575 0.107 0.108 0.108 30069 | |
1048576 0.106 0.112 0.105 30886 | |
2097151 0.105 0.103 0.103 31621 | |
2097152 0.102 0.103 0.104 31280 | |
4194303 0.180 0.158 0.176 18456 | |
4194304 0.167 0.155 0.154 21098 « (L3)/2 | |
8388607 0.538 0.576 0.557 5834 « RAM | |
8388608 0.750 0.579 0.552 5893 | |
glibc memcpy() for #c per n where c ≈ 0.293ns | |
N x1 x8 x64 mBps | |
------------------------------------------------------------ | |
1 139.000 90.125 84.891 38 | |
1 83.000 82.125 84.359 39 | |
2 61.500 46.438 45.164 72 | |
3 41.667 32.458 31.245 104 | |
4 32.750 26.156 24.410 133 | |
7 20.143 16.732 16.033 203 | |
8 13.375 8.328 6.908 471 | |
15 8.200 6.408 5.753 565 | |
16 4.438 3.570 3.466 938 « | |
31 3.258 2.891 2.786 1167 | |
32 2.281 1.801 1.732 1878 | |
63 1.635 1.431 1.374 2367 | |
64 1.109 0.896 0.868 3747 « | |
127 0.921 0.792 0.779 4176 | |
128 0.508 0.511 0.494 6589 | |
255 0.451 0.407 0.402 8081 | |
256 0.324 0.269 0.260 12498 « | |
511 0.249 0.218 0.212 15335 | |
512 0.178 0.149 0.146 22297 « | |
1023 0.138 0.124 0.121 26947 | |
1024 0.087 0.089 0.087 37238 | |
2047 0.084 0.077 0.076 43046 | |
2048 0.066 0.059 0.058 56120 | |
4095 0.058 0.054 0.054 60706 | |
4096 0.050 0.046 0.046 71092 « | |
8191 0.043 0.042 0.042 78259 | |
8192 0.037 0.037 0.037 87409 | |
16383 0.037 0.036 0.035 92065 | |
16384 0.034 0.034 0.033 97942 « (L1)/2 | |
32767 0.104 0.084 0.080 40572 | |
32768 0.079 0.079 0.079 41055 | |
65535 0.094 0.080 0.076 42885 | |
65536 0.077 0.075 0.075 43423 | |
131071 0.092 0.080 0.078 41498 | |
131072 0.082 0.078 0.077 42350 « (L2)/2 | |
262143 0.100 0.101 0.287 11342 | |
262144 0.099 0.099 0.098 33177 | |
524287 0.106 0.111 0.110 29609 | |
524288 0.107 0.119 0.110 29608 | |
1048575 0.104 0.105 0.106 30626 | |
1048576 0.104 0.111 0.105 30878 | |
2097151 0.103 0.103 0.103 31606 | |
2097152 0.102 0.103 0.103 31644 | |
4194303 0.174 0.160 0.165 19714 | |
4194304 0.166 0.157 0.154 21110 « (L3)/2 | |
8388607 0.537 0.554 0.565 5750 « RAM | |
8388608 0.701 0.537 0.552 5884 | |
musl memcpy() for #c per n where c ≈ 0.293ns | |
N x1 x8 x64 mBps | |
------------------------------------------------------------ | |
1 97.000 80.625 79.891 41 | |
1 77.000 78.875 78.266 42 | |
2 49.500 44.062 42.102 77 | |
3 33.667 32.792 30.651 106 | |
4 29.750 24.281 24.137 135 | |
7 19.000 16.161 15.734 207 | |
8 12.125 7.766 6.721 484 | |
15 8.867 5.892 5.714 569 | |
16 5.062 3.742 3.458 940 | |
31 3.645 2.915 2.715 1198 | |
32 2.156 1.723 1.663 1956 | |
63 1.540 1.367 1.333 2440 | |
64 1.078 0.873 0.833 3905 | |
127 0.874 0.771 0.737 4415 | |
128 0.617 0.487 0.481 6766 | |
255 0.443 0.390 0.382 8504 | |
256 0.316 0.259 0.259 12545 | |
511 0.245 0.232 0.237 13742 | |
512 0.174 0.159 0.208 15668 | |
1023 0.181 0.193 0.182 17821 | |
1024 0.155 0.123 0.114 28579 | |
2047 0.102 0.092 0.085 38219 | |
2048 0.064 0.073 0.070 46577 | |
4095 0.058 0.067 0.065 50272 | |
4096 0.049 0.055 0.055 59467 | |
8191 0.057 0.052 0.049 66468 | |
8192 0.053 0.050 0.051 63557 | |
16383 0.082 0.065 0.064 50897 | |
16384 0.066 0.065 0.061 53697 « (L1)/2 | |
32767 0.121 0.100 0.114 28555 | |
32768 0.093 0.091 0.114 28615 | |
65535 0.118 0.102 0.142 22858 | |
65536 0.108 0.274 0.097 33432 | |
131071 0.117 0.109 0.109 29905 | |
131072 0.110 0.195 0.113 28692 « (L2)/2 | |
262143 0.283 0.166 0.122 26638 | |
262144 0.130 0.144 0.123 26544 | |
524287 0.210 0.153 0.130 25079 | |
524288 0.126 0.128 0.123 26422 | |
1048575 0.139 0.107 0.106 30803 | |
1048576 0.104 0.105 0.106 30683 | |
2097151 0.103 0.103 0.103 31564 | |
2097152 0.102 0.103 0.103 31531 | |
4194303 0.242 0.158 0.169 19238 | |
4194304 0.166 0.161 0.154 21072 « (L3)/2 | |
8388607 0.533 0.549 0.599 5422 « RAM | |
8388608 0.768 0.630 0.560 5801 | |
newlib (aka. cygwin) memcpy() for #c per n where c ≈ 0.293ns | |
N x1 x8 x64 mBps | |
------------------------------------------------------------ | |
1 61.000 52.875 53.141 61 | |
1 49.000 49.875 50.328 65 | |
2 24.500 24.812 26.727 122 | |
3 15.667 20.125 16.943 192 | |
4 12.750 15.281 13.090 248 | |
7 7.000 7.375 7.431 438 | |
8 5.875 6.422 6.377 510 | |
15 3.267 3.375 3.447 943 | |
16 10.062 6.945 6.386 509 | |
31 2.548 2.488 2.545 1278 | |
32 3.156 3.207 3.201 1016 | |
63 1.190 1.220 1.229 2646 | |
64 1.578 1.588 1.599 2033 | |
127 0.717 0.690 0.685 4744 | |
128 0.820 0.856 0.857 3795 | |
255 0.357 0.359 0.358 9077 | |
256 0.629 0.461 0.426 7630 | |
511 0.260 0.219 0.204 15947 | |
512 0.330 0.299 0.268 12113 | |
1023 0.269 0.175 0.162 20042 | |
1024 0.315 0.201 0.196 16633 | |
2047 0.349 0.241 0.236 13790 | |
2048 0.332 0.269 0.264 12295 | |
4095 0.349 0.295 0.287 11348 | |
4096 0.361 0.313 0.303 10748 | |
8191 0.361 0.317 0.322 10110 | |
8192 0.369 0.326 0.319 10201 | |
16383 0.321 0.322 0.327 9940 | |
16384 0.309 0.330 0.329 9878 « (L1)/2 | |
32767 0.291 0.303 0.307 10599 | |
32768 0.314 0.304 0.305 10667 | |
65535 0.373 0.311 0.313 10396 | |
65536 0.305 0.750 0.421 7729 | |
131071 0.329 0.427 0.384 8470 | |
131072 0.329 0.388 0.361 9020 « (L2)/2 | |
262143 0.520 0.389 0.425 7646 | |
262144 0.364 0.400 0.368 8843 | |
524287 0.449 0.389 0.389 8353 | |
524288 0.384 0.379 0.384 8466 | |
1048575 0.436 0.397 0.401 8107 | |
1048576 0.431 0.397 0.401 8112 | |
2097151 0.417 0.567 0.434 7498 | |
2097152 0.457 0.503 0.427 7621 | |
4194303 0.328 0.348 0.368 8822 | |
4194304 0.343 0.352 0.352 9221 « (L3)/2 | |
8388607 0.313 0.319 0.326 9957 « RAM | |
8388608 0.366 0.320 0.328 9910 | |
openbsd memcpy() for #c per n where c ≈ 0.293ns | |
N x1 x8 x64 mBps | |
------------------------------------------------------------ | |
1 73.000 41.375 41.484 78 | |
1 39.000 39.875 41.641 78 | |
2 28.500 20.688 21.227 153 | |
3 27.000 15.875 15.557 209 | |
4 16.750 12.656 12.520 260 | |
7 20.429 10.982 10.292 316 | |
8 8.625 5.234 5.576 583 | |
15 7.267 4.758 4.920 661 | |
16 4.312 2.742 2.747 1183 | |
31 4.613 2.891 2.555 1272 | |
32 2.844 1.520 1.441 2256 | |
63 2.397 1.268 1.328 2449 | |
64 1.547 0.822 0.769 4226 | |
127 1.189 0.782 0.671 4842 | |
128 0.727 0.532 0.460 7066 | |
255 0.631 0.463 0.414 7856 | |
256 0.543 0.374 0.302 10775 | |
511 0.542 0.316 0.276 11785 | |
512 0.354 0.260 0.224 14494 | |
1023 0.267 0.245 0.229 14201 | |
1024 0.251 0.200 0.197 16496 | |
2047 0.214 0.226 0.181 17941 | |
2048 0.189 0.167 0.166 19575 | |
4095 0.200 0.168 0.163 19957 | |
4096 0.165 0.155 0.153 21219 | |
8191 0.158 0.153 0.151 21578 | |
8192 0.153 0.148 0.147 22138 | |
16383 0.173 0.148 0.146 22319 | |
16384 0.153 0.487 0.188 17298 « (L1)/2 | |
32767 0.161 0.151 0.192 16893 | |
32768 0.151 0.314 0.213 15275 | |
65535 0.157 0.154 0.148 21969 | |
65536 0.147 0.145 0.145 22493 | |
131071 0.152 0.151 0.154 21145 | |
131072 0.148 0.229 0.158 20564 « (L2)/2 | |
262143 0.320 0.183 0.162 20031 | |
262144 0.330 0.205 0.167 19503 | |
524287 0.159 0.171 0.163 19913 | |
524288 0.250 0.189 0.162 20120 | |
1048575 0.157 0.164 0.161 20182 | |
1048576 0.155 0.156 0.157 20672 | |
2097151 0.161 0.158 0.157 20644 | |
2097152 0.158 0.157 0.165 19727 | |
4194303 0.327 0.256 0.238 13684 | |
4194304 0.232 0.220 0.236 13749 « (L3)/2 | |
8388607 0.721 0.689 0.586 5549 « RAM | |
8388608 0.943 0.569 0.593 5481 */ | |
memcpy_test.cpp: | |
#include <string.h> | |
#include <iostream> | |
extern "C" int kCpuids; | |
extern "C" int kHalfCache3; | |
extern "C" int memcpytab; | |
extern "C" int memcpytab_ro; | |
extern "C" void _init_kCpuids(void *); | |
extern "C" void _init_kHalfCache3(void *); | |
extern "C" void _init_memcpy(void *, void *); | |
__attribute__((__constructor__)) void memcpy_init() | |
{ | |
_init_kCpuids(&kCpuids); | |
_init_kHalfCache3(&kHalfCache3); | |
_init_memcpy(&memcpytab, &memcpytab_ro); | |
} | |
__attribute__((__noinline__)) void memcpy_noinline(void * __restrict dst, const void * __restrict src, size_t size) | |
{ | |
memcpy(dst, src, size); | |
} | |
int main(int, char **) | |
{ | |
constexpr size_t buf_size = 100; | |
char buf[buf_size]{}; | |
memcpy_noinline(buf, "abc", 3); | |
size_t bytes_to_copy = 3; | |
while (bytes_to_copy * 2 < buf_size) | |
{ | |
memcpy_noinline(&buf[bytes_to_copy], buf, bytes_to_copy); | |
bytes_to_copy *= 2; | |
} | |
std::cerr << buf << "\n"; | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment