alexey-milovidov · March 2, 2021 03:21
diff --git a/memcpy_test.txt b/memcpy_test.txt
 memcpy.S:

 /*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8     -*-│
 │vi: set et ft=asm ts=8 tw=8 fenc=utf-8                                     :vi│
 ╞══════════════════════════════════════════════════════════════════════════════╡
 │ Copyright 2020 Justine Alexandra Roberts Tunney                              │
 │                                                                              │
 │ Permission to use, copy, modify, and/or distribute this software for         │
 │ any purpose with or without fee is hereby granted, provided that the         │
 │ above copyright notice and this permission notice appear in all copies.      │
 │                                                                              │
 │ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
 │ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
 │ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
 │ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
 │ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
 │ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚──────────────────────────────────────────────────────────────────────────────╝
    @fileoverview Cosmopolitan Memory Copying

    Of all the functions in the technology industry, none are more
    critical than the Kernighan & Ritchie Memory Copy API for the C
    Language, 1972 model: more commonly known as memcpy(). It's the
    world's most popular function──one all programmers love.

    This implementation is the fastest and nearly the tiniest too.
    It doesn't break when copying backwards or on misaligned data.
    It's so easy that even a child could use it, and they do.
 */
 #include "libc/nexgen32e/x86feature.h"


 // Ends function definition.
 //
 // Sets the ELF size of NAME to the distance from its label to the
 // current location, marks NAME as a function symbol, and optionally
 // emits up to two further directives for it (e.g. globl, hidden).
 //
 // @param name is the symbol whose definition ends here
 // @param bnd is an optional directive name, emitted as ".BND NAME"
 // @param vis is an optional directive name, emitted as ".VIS NAME"
 // @cost saves 1-3 lines of code
 .macro .endfn name:req bnd vis
 .size \name,.-\name
 .type \name,@function
 .ifnb \bnd
  .\bnd \name
 .endif
 .ifnb \vis
  .\vis \name
 .endif
 .endm

 // Ends variable definition.
 //
 // Same as the function variant, except that NAME is marked as a data
 // object: its ELF size becomes the distance from its label to the
 // current location, and up to two optional directives are emitted.
 //
 // @param name is the symbol whose definition ends here
 // @param bnd is an optional directive name, emitted as ".BND NAME"
 // @param vis is an optional directive name, emitted as ".VIS NAME"
 // @cost saves 1-3 lines of code
 .macro .endobj name:req bnd vis
 .size \name,.-\name
 .type \name,@object
 .ifnb \bnd
  .\bnd \name
 .endif
 .ifnb \vis
  .\vis \name
 .endif
 .endm

 // Helpers for Cosmopolitan _init() amalgamation magic.
 //
 // Every macro below places its content in a section whose name embeds
 // NUMBER and NAME. NOTE(review): the ordering of these sections is
 // presumably decided by the linker script using NUMBER; confirm there.
 //
 // @param number is spliced into the section name
 // @param name should be consistent across macros for a module
 // @see libc/runtime/_init.S
 //
 // .initro: switches to read-only section .initro.NUMBER.NAME, aligned to 8.
 .macro .initro number:req name:req
    .section .initro.\number\().\name,"a",@progbits
    .align 8
 .endm
 // .initbss: switches to writable zero-fill section
 // .piro.bss.init.2.NUMBER.NAME, aligned to 8.
 .macro .initbss number:req name:req
    .section .piro.bss.init.2.\number\().\name,"aw",@nobits
    .align 8
 .endm
 // .init.start: switches to executable section .init.NUMBER.NAME and
 // defines the label NAME at its start.
 .macro .init.start number:req name:req
    .section .init.\number\().\name,"ax",@progbits
 \name:
 .endm
 // .init.end: closes the function NAME (binding defaults to globl) and
 // switches back to the section that was active before .init.start.
 .macro .init.end number:req name:req bnd=globl vis
    .endfn \name,\bnd,\vis
    .previous
 .endm

 // LOOP Instruction Replacement.
 // With its mop-Fusion Mexican equivalent.
 // Thus avoiding 3x legacy pipeline slowdown.
 //
 // Subtracts one from ecx and jumps to LABEL while the result is
 // nonzero. The subtraction is emitted as raw bytes so that the short
 // 3-byte encoding (83 E9 01) is always used. Unlike the real LOOP
 // instruction, this sequence modifies the flags and writes the 32-bit
 // register ecx (which clears the upper half of rcx).
 //
 // @param label is the branch target taken while ecx is not zero
 .macro .loop label:req
    .byte 0x83,0xe9,0x01   # sub $1,%ecx
    jnz \label
 .endm

 // TODO(jart): delete
 // Loads Effective Address
 // Supporting security blankets
 //
 // Loads SYMBOL as a 32-bit immediate into the register e<REG>, which
 // zero-extends into the full 64-bit register.
 //
 // NOTE(review): both preprocessor branches currently emit the same
 // absolute mov; the RIP-relative lea intended for the PIC / PIE /
 // medium / large code model case is commented out, so the #if below
 // has no effect on the generated code.
 //
 // @param symbol is the symbol (or assemble-time constant) to load
 // @param reg is the register name without its size prefix, e.g. cx
 .macro ezlea symbol:req reg:req
 #if __pic__ + __pie__ + __code_model_medium__ + __code_model_large__ + 0 > 1
 // lea \symbol(%rip),%r\reg
    mov $\symbol,%e\reg
 #else
    mov $\symbol,%e\reg
 #endif
 .endm


 // Copies memory.
 //
 // DEST and SRC must not overlap, unless DEST≤SRC.
 //
 // This entry point only sets the return value. It contains no ret or
 // jmp: execution continues past the alignment padding into the code
 // that follows, which performs the copy and returns to the caller.
 //
 // @param rdi is dest
 // @param rsi is src
 // @param rdx is number of bytes
 // @return original rdi copied to rax
 // @mode long
 // @asyncsignalsafe
 memcpy:
    /// The memcpy function returns a pointer to dst (its first argument). This is required by the standard but rarely needed.
    /// That is why we also provide the function MemCpy, which returns nothing and is one instruction shorter.

    mov %rdi, %rax

 // 𝑠𝑙𝑖𝑑𝑒
    /// Pad to a 16-byte boundary so that the next entry point is aligned.
    /// The symbol size recorded below covers only the mov and this padding.
    .align 16
    .endfn memcpy,globl

 // Copies memory with minimal impact ABI.
 //
 // Dispatches through memcpytab using min(rdx, size of the static
 // table) as the index: each small size has a dedicated fixed-size
 // copy, and every larger size goes to the single vector routine whose
 // address was stored in the last table slot at startup.
 //
 // Apart from the registers listed under @clob, all registers
 // (including rdi, rsi and rdx) hold their original values on return.
 //
 // @param rdi is dest
 // @param rsi is src
 // @param rdx is number of bytes
 // @clob flags,rcx,xmm3,xmm4
 // @mode long
 MemCpy:
    push %rbp
    mov  %rsp, %rbp

    mov $.Lmemcpytab_ro.size, %ecx

    /// rcx - the size of the static part of memcpytab_ro (a small value)
    /// rdx - the number of bytes to copy

    /// If the number of bytes is smaller than the size of the static table - jump to memcpytab[size]
    /// (each entry is an 8-byte pointer, hence the scale of 8).
    /// If it is larger or equal - jump through the last entry of memcpytab.

    cmp %rcx, %rdx
    cmovb %rdx, %rcx
    jmp *memcpytab(,%rcx,8)

 /// This label is needed only to calculate the differences stored in memcpytab_ro.
 /// Every label referenced by that table must stay within 255 bytes after this point.
 .Lanchorpoint:

 /// AVX entry used when ERMS is available:
 /// for large sizes we branch to the ERMS and non-temporal store implementations.
 .L32r:
    cmp $1024,%rdx
    jae .Lerms

 /// AVX copy; requires at least 32 bytes.
 .L32:
    /// Load the last 32 bytes before the loop.
    vmovdqu -32(%rsi,%rdx),%ymm4

    /// rcx is the number of bytes processed so far, biased by 32
    mov $32,%rcx

    /// The main AVX loop: copy 32-byte portions from the beginning until at most 32 bytes remain.
 0:
    add $32,%rcx
    vmovdqu -64(%rsi,%rcx),%ymm3
    vmovdqu %ymm3,-64(%rdi,%rcx)
    cmp %rcx,%rdx
    ja 0b

    /// Now store the last 32 bytes. They can overlap the previously copied 32 bytes if the size is not a multiple of 32.
    /// Example: when copying 33 bytes, the sequence is the following:
    /// 1. Load the last 32 bytes.
    /// 2. Copy the first 32 bytes.
    /// 3. Store the last 32 bytes.

    vmovdqu %ymm4,-32(%rdi,%rdx)

    /// Clear the scratch vector registers.
    vxorps %ymm4,%ymm4,%ymm4
    vxorps %ymm3,%ymm3,%ymm3
    jmp .L0

 /// SSE entry used when ERMS is available:
 /// for large sizes we branch to the ERMS and non-temporal store implementations.
 .L16r:
    cmp $1024,%rdx
    jae .Lerms

 /// This is the same scheme as the AVX copy but with 16-byte SSE registers; requires at least 16 bytes.
 /// The table also routes sizes 16..31 here directly.
 .L16:
    movdqu -16(%rsi,%rdx),%xmm4
    mov $16,%rcx

 0:
    add $16,%rcx
    movdqu -32(%rsi,%rcx),%xmm3
    movdqu %xmm3,-32(%rdi,%rcx)
    cmp %rcx,%rdx
    ja 0b

    movdqu %xmm4,-16(%rdi,%rdx)

    /// Clear the scratch vector registers.
    pxor %xmm4,%xmm4
    pxor %xmm3,%xmm3
    jmp .L0

 /// 8..15 bytes.
 .L8:
    /// We will use rbx, so save the previous value on stack.
    push %rbx

    /// Copy the first and the last 8 bytes; the two halves may overlap.
    /// Both loads happen before both stores.
    mov (%rsi),%rcx
    mov -8(%rsi,%rdx),%rbx
    mov %rcx,(%rdi)
    mov %rbx,-8(%rdi,%rdx)

 /// Restore the value of rbx and return
 1:
    pop %rbx

 /// Zero bytes remaining, exit from function
 .L0:
    pop %rbp
    ret

 /// 4..7 bytes
 .L4:
    /// We will use rbx, so save the previous value on stack.
    push %rbx

    /// The first four bytes
    mov (%rsi),%ecx
    /// The last four bytes
    mov -4(%rsi,%rdx),%ebx
    mov %ecx,(%rdi)
    mov %ebx,-4(%rdi,%rdx)

    /// Restore the value of rbx and return
    jmp 1b

 /// Exactly 3 bytes (the only size the table routes here)
 .L3:
    /// We will use rbx, so save the previous value on stack.
    push %rbx

    /// The first two bytes
    mov (%rsi),%cx
    /// The last two bytes (overlapping the first two by one byte)
    mov -2(%rsi,%rdx),%bx
    mov %cx,(%rdi)
    mov %bx,-2(%rdi,%rdx)

    /// Restore the value of rbx and return
    jmp 1b

 /// Exactly two bytes
 .L2:
    mov (%rsi),%cx
    mov %cx,(%rdi)
    jmp .L0

 /// Exactly one byte
 .L1:
    mov (%rsi),%cl
    mov %cl,(%rdi)
    jmp .L0

 /// The case when CPU has "erms" flag. It means "Enhanced REP MOVSB/STOSB",
 /// and literally: "rep movsb" is a good way for memcpy.
 /// But in fact it is profitable only for sizes of at least 1024 bytes and at most half of the L3 cache size.

 .Lerms:
    /// For larger sizes, go to the non-temporal store implementation.
    cmp kHalfCache3(%rip),%rdx
    ja .Lnts

    /// "rep movsb" will clobber these registers, but our function should not. Save them and restore before return.
    push %rdi
    push %rsi

    /// This is where "rep movsb" expects to have its counter.
    mov %rdx,%rcx

    rep movsb

    pop %rsi
    pop %rdi

    /// Return
    jmp .L0

 /// The case for non-temporal stores.
 /// Non-temporal stores are operations that bypass the CPU cache (store directly to memory).
 /// They are profitable if the size is larger than half of the L3 cache size.
 /// NOTE: when doing memcpy from multiple threads, most likely it will be better to lower
 /// the threshold, because the L3 cache is shared among cores.

 .Lnts:
    /// The code below advances rdi and rsi and shrinks rdx, but our function must not
    /// clobber them. Save them and restore before return.
    push %rdi
    push %rsi
    push %rdx

    /// Copy the first 16 bytes.
    movdqu (%rsi),%xmm3
    movdqu %xmm3,(%rdi)

    /// Align the address of dest + 16 down to 16 bytes boundary and store to rcx.
    lea 16(%rdi),%rcx
    and $-16,%rcx

    /// rcx becomes the distance (1..16) to that boundary. Advance source and destination by it,
    /// so that the destination is 16-byte aligned, and set size to the number of remaining bytes.
    sub %rdi,%rcx
    add %rcx,%rdi
    add %rcx,%rsi
    sub %rcx,%rdx

    /// rcx is the number of bytes processed so far, biased by 16
    mov $16,%rcx

    /// Main loop: unaligned loads, aligned non-temporal stores.
 0:
    add $16,%rcx
    movdqu -32(%rsi,%rcx),%xmm3
    movntdq %xmm3,-32(%rdi,%rcx)
    cmp %rcx,%rdx
    ja 0b

    /// Copy the last 16 bytes (that may overlap already copied bytes).
    /// We need to make the previous stores visible for this operation.
    sfence
    movdqu -16(%rsi,%rdx),%xmm3
    movdqu %xmm3,-16(%rdi,%rdx)

    /// Clear the scratch vector register.
    pxor %xmm3,%xmm3

    /// Restore the original arguments.
    pop %rdx
    pop %rsi
    pop %rdi
    jmp .L0

    .endfn MemCpy,globl,hidden


 .initro 300,_init_memcpy
 memcpytab_ro:
    /// One byte per copy size, from 0 up to (but not including) the table size.
    /// Each byte is the distance from .Lanchorpoint to the routine handling that size.
    .byte .L0-.Lanchorpoint         /// size 0
    .byte .L1-.Lanchorpoint         /// size 1
    .byte .L2-.Lanchorpoint         /// size 2
    .byte .L3-.Lanchorpoint         /// size 3
    .rept 4                         /// sizes 4..7
        .byte .L4-.Lanchorpoint
    .endr
    .rept 8                         /// sizes 8..15; e.g. 15 bytes are copied by two 8-byte unaligned load/stores
        .byte .L8-.Lanchorpoint
    .endr
    .rept 16                        /// sizes 16..31
        .byte .L16-.Lanchorpoint
    .endr

    .equ .Lmemcpytab_ro.size,.-memcpytab_ro
    .endobj memcpytab_ro

    /// The static part must be a whole number of 8-byte groups.
    .if .Lmemcpytab_ro.size % 8
        .error "moar jmptab"
    .endif

    /// Candidates for the final "large size" slot; exactly one of them is picked at startup
    /// by index (2 if AVX) | (1 if ERMS). Followed by four bytes of padding.
    .byte .L16-.Lanchorpoint  # SSE2
    .byte .L16r-.Lanchorpoint  # SSE2 + ERMS
    .byte .L32-.Lanchorpoint  # AVX
    .byte .L32r-.Lanchorpoint  # AVX + ERMS
    .byte 0,0,0,0

    /// Re-close the object so that its size includes the eight trailing bytes, and export it.
    .endobj memcpytab_ro, globl
    .previous   /// Switch back to the section that was active before .initro: https://stackoverflow.com/questions/2416879/what-does-asm-previous-mean


 /// Jump table used by the indirect branch at the start of MemCpy.
 /// It is zero-filled in the image and filled at startup with pointers according to the offsets in memcpytab_ro.
 /// Layout: one 8-byte pointer per small size (as many as there are bytes in the static part
 /// of memcpytab_ro), followed by one extra pointer used for every larger size.
 .initbss 300,_init_memcpy
 memcpytab:
    .rept .Lmemcpytab_ro.size
        .quad 0 /// Pointers for small sizes.
    .endr
    .quad 0 /// A pointer to the label for large size.

    .endobj memcpytab, globl
    .previous


 /// Startup hook that builds memcpytab from memcpytab_ro.
 /// Sets up the arguments of memjmpinit that are specific to memcpy and calls it.
 /// NOTE(review): rdi (destination table) and rsi (byte offsets) are not set here;
 /// presumably the _init() caller provides them - confirm against the caller.
 .init.start 300,_init_memcpy
    /// ecx = number of small-size entries, edx = address that the byte offsets are relative to.
    ezlea .Lmemcpytab_ro.size, cx
    ezlea .Lanchorpoint, dx

    /// Leave ZF clear if the AVX bit is set in kCpuids; memjmpinit reads this flag on entry.
    /// X86_HAVE comes from the included header and presumably expands to a mask and an offset.
    testb X86_HAVE(AVX) + kCpuids(%rip)
    call memjmpinit
    ret
 .init.end 300,_init_memcpy


 // Initializes jump table for memset() and memcpy().
 //
 // Converts ecx one-byte offsets at rsi into 64-bit pointers at rdi by
 // adding rdx to each, then appends one more pointer chosen among four
 // candidate offsets according to the available CPU features. On
 // return rdi points past the written table and rsi past the eight
 // bytes that follow the offsets.
 //
 // @param !ZF if required cpu vector extensions are available
 // @param rdi is address of 64-bit jump table
 // @param rsi is address of 8-bit jump initializers
 // @param rdx is address of indirect branch (Anchor point)
 // @param ecx is size of jump table
 memjmpinit:
    push %rbp
    mov  %rsp, %rbp

    /// Capture the incoming flag first (push and mov above do not change flags):
    /// r8b = 2 if the vector extension is available, otherwise 0.
    setnz %r8b
    shl %r8b

    /// Transform memcpytab_ro with byte offsets to pointers.
    /// Each iteration: rax = zero-extended byte from (rsi), plus the anchor address, stored as 8 bytes at (rdi).
    /// lodsb and stosq advance rsi and rdi (this assumes the direction flag is clear).
 0: xor %eax,%eax
    lodsb
    add %rdx,%rax
    stosq
    .loop 0b

    /// Select the routine for large sizes: index = r8b | (1 if ERMS is available),
    /// i.e. 0..3 into the four candidate offsets that follow the static table.
    xor %eax,%eax
    testb X86_HAVE(ERMS) + kCpuids(%rip)
    setnz %al
    or %r8b,%al
    mov (%rsi,%rax),%al
    add %rdx,%rax
    stosq
    /// Skip the eight trailing bytes (four candidates plus four bytes of padding);
    /// the value loaded into rax is not used.
    lodsq

    pop %rbp
    ret

    .endfn memjmpinit,globl,hidden


 // Globally precomputed CPUID.
 //
 // This module lets us check CPUID in 0.06ns rather than 51.00ns.
 // If every piece of native software linked this module, then the
 // world would be a much better place; since all the alternatives
 // are quite toilsome.
 //
 // The table has one row of four 32-bit words per probed leaf. The
 // startup code fills each row in the order EAX, EBX, ECX, EDX. Rows
 // for leaves that are not probed keep their initial zero value.
 //
 // @see www.felixcloutier.com/x86/cpuid
 .initbss 201,_init_kCpuids
 kCpuids:
    .long 0,0,0,0    # EAX=0 (Basic Processor Info)
    .long 0,0,0,0    # EAX=1 (Processor Info)
    .long 0,0,0,0    # EAX=2
    .long 0,0,0,0    # EAX=7 (Extended Features)
    .long 0,0,0,0    # EAX=0x80000001 (NexGen32e)
    .long 0,0,0,0    # EAX=0x80000007 (APM)
    .long 0,0,0,0    # EAX=16h (CPU Frequency)
    .endobj kCpuids,globl
    .previous

 /// Startup hook that fills kCpuids.
 /// The leaves to probe after leaf 0 are pushed on the stack in reverse order, terminated by a
 /// zero sentinel, and then popped one at a time.
 /// NOTE(review): rdi is used as the output pointer but is not set here; presumably the
 /// _init() caller makes it point at kCpuids - confirm against the caller.
 .init.start 201,_init_kCpuids
    /// cpuid overwrites rbx, so keep the original value.
    push %rbx
    push $0
    push $0x16
    push $0xffffffff80000007
    push $0xffffffff80000001
    push $7
    push $2
    push $1
    /// r8 = start of the table, used to look up results of leaf 0 later.
    mov %rdi,%r8
    /// The first probed leaf is 0.
    xor %eax,%eax
    /// Probe leaf EAX with subleaf 0 and store EAX, EBX, ECX, EDX as one row.
 1: xor %ecx,%ecx
    cpuid
    stosl
    xchg %eax,%ebx
    stosl
    xchg %eax,%ecx
    stosl
    xchg %eax,%edx
    stosl
 2: pop %rax
    test %eax,%eax   # EAX = stacklist->pop()
    jz 3f    # EAX = 0 is the end-of-list sentinel
    /// KCPUIDS comes from the included header; presumably it is the offset of the EAX word of leaf 0.
    /// NOTE(review): only the low byte of the leaf number is compared.
    cmp KCPUIDS(0H,EAX)(%r8),%al # EAX ≤ CPUID.0 max leaf
    jbe 1b    # leaf is in range: probe it
    add $4*4,%rdi    # leaf is too new for this CPU: skip its row
    jmp 2b
 3: nop
 /// The block below is disabled: when AVX is reported, it would verify via xgetbv that
 /// SSE and AVX state is enabled in XCR0 and otherwise clear the AVX and AVX2 bits.
 /// With it commented out, the AVX bit in the table is whatever cpuid reported.
 /*
 #if !X86_NEED(AVX2)
    testb X86_HAVE(AVX)(%r8)
    jz 5f
    testb X86_HAVE(OSXSAVE)(%r8)
    jz 4f
    xor %ecx,%ecx
    xgetbv
    and $XCR0_SSE|XCR0_AVX,%eax
    cmp $XCR0_SSE|XCR0_AVX,%eax
    je 5f
 4: btr $X86_BIT(AVX),X86_WORD(AVX)(%r8)
    btr $X86_BIT(AVX2),X86_WORD(AVX2)(%r8)
 #endif*/
 5: pop %rbx
    retq
    .init.end 201,_init_kCpuids


 .initbss 202,_init_kHalfCache3
 // Half size of level 3 cache in bytes.
 //
 // Zero in the image; written once by the startup code. MemCpy compares
 // the copy size against this value to choose between "rep movsb" and
 // non-temporal stores.
 kHalfCache3:
    .quad 0
    .endobj kHalfCache3,globl
    .previous

 /// Startup hook that computes kHalfCache3.
 /// Walks the cache descriptors of CPUID leaf 4 (EAX = 4, ECX = cache index 0, 1, 2, ...)
 /// until it finds the unified level 3 cache, computes its size as
 /// ways * partitions * line size * sets, and stores half of that.
 /// If leaf 4 is not available or no such cache is listed, a 4 MiB cache is assumed.
 /// NOTE(review): the result is stored through rdi, which is not set here; presumably the
 /// _init() caller makes it point at kHalfCache3 - confirm against the caller.
 .init.start 202,_init_kHalfCache3
    /// The first word of kCpuids is the highest supported basic leaf; leaf 4 needs at least 4.
    cmpl $3,kCpuids(%rip)
    jbe 3f
    /// r8d = cache index (the subleaf number), starting from zero.
    xor %r8d,%r8d
    /// Query CPUID leaf 4 for cache number r8d. The leaf stays 4; only the subleaf advances.
 1: mov $4,%eax
    mov %r8d,%ecx
    /// cpuid overwrites rbx; keep its EBX result in r9d and restore the original rbx.
    push %rbx
    cpuid
    mov %ebx,%r9d
    pop %rbx
    /// Bits 4:0 of EAX are the cache type; zero means there are no more caches.
    test $31,%al
    je 3f
    /// 99 = 0x63: cache level 3 (bits 7:5) of unified type 3 (bits 4:0).
    cmp $99,%al
    jne 2f
    /// EBX packs (ways - 1) in bits 31:22, (partitions - 1) in bits 21:12 and
    /// (line size - 1) in bits 11:0; ECX is (sets - 1).
    mov %r9d,%eax
    mov %r9d,%edx
    inc %ecx
    shr $12,%r9d
    shr $22,%eax
    and $0x0fff,%edx
    and $0x03ff,%r9d
    inc %eax
    inc %edx
    /// eax = ways * line size * sets * partitions
    imul %edx,%eax
    imul %ecx,%eax
    lea 1(%r9),%ecx
    imul %ecx,%eax
    jmp 4f
    /// Not the L3 cache: try the next cache index.
 2: inc %r8d
    jmp 1b
    /// Fallback: assume a 4 MiB cache.
 3: mov $0x00400000,%eax
    /// Halve the size and store it as a 64-bit value (the upper half of rax is zero).
 4: shr %eax
    stosq
    retq
    .init.end 202,_init_kHalfCache3



 /* your memcpy()  375 bytes
    bionic memcpy()  1,429 bytes
    glibc memcpy()  27,216 bytes
    musl memcpy()  49 bytes
    newlib memcpy()  300 bytes

    benchmarks on intel core i7-6700 @ 3.40GHz (skylake)
    includes function call overhead (unless marked otherwise)

    your memcpy(𝑛) for #c per n where c ≈ 0.293ns
    N                     x1            x8           x64 mBps
    ------------------------------------------------------------
    1                297.000        35.125        35.203      92
    1                 35.000        35.625        35.016      93
    2                 27.500        17.438        17.555     185
    3                 21.000        11.875        12.057     270
    4                 16.250         8.719         8.809     369
    7                  5.000         4.946         5.069     641
    8                  7.375         4.422         4.365     745
    15                 4.067         2.342         2.336    1391
    16                 4.188         2.242         2.257    1440 «
    31                 8.032         1.157         1.147    2835
    32                 2.031         1.723         1.325    2454
    63                 1.000         0.589         0.589    5523
    64                 0.578         0.580         0.577    5630 «
    127                0.638         0.377         0.320   10151
    128                0.289         0.296         0.307   10605
    255                0.404         0.202         0.194   16741
    256                0.160         0.165         0.166   19574 «
    511                0.159         0.123         0.110   29458
    512                0.139         0.098         0.097   33571 «
    1023               0.107         0.086         0.074   44111
    1024               0.103         0.084         0.082   39489
    2047               0.057         0.056         0.057   57450
    2048               0.055         0.055         0.055   59269
    4095               0.044         0.044         0.044   74051
    4096               0.043         0.043         0.043   75300 «
    8191               0.036         0.036         0.036   91301
    8192               0.036         0.035         0.035   92411
    16383              0.033         0.032         0.032  102163
    16384              0.034         0.032         0.032  102145 « (L1)/2
    32767              0.098         0.081         0.077   42271
    32768              0.077         0.077         0.076   42781
    65535              0.088         0.075         0.072   44973
    65536              0.074         0.072         0.071   45520
    131071             0.086         0.075         0.072   44869
    131072             0.077         0.073         0.072   45076 « (L2)/2
    262143             0.095         0.096         0.095   34116
    262144             0.096         0.096         0.095   34160
    524287             0.102         0.109         0.111   29359
    524288             0.107         0.109         0.108   30033
    1048575            0.102         0.103         0.104   31112
    1048576            0.101         0.103         0.103   31605
    2097151            0.104         0.103         0.109   29929
    2097152            0.108         0.110         0.103   31652
    4194303            0.192         0.172         0.172   18950
    4194304            0.168         0.161         0.160   20311 « (L3)/2
    8388607            0.339         0.329         0.344    9461 « RAM
    8388608            0.384         0.369         0.341    9545

    Bionic memcpy() for #c per n where c ≈ 0.293ns
    N                     x1            x8           x64 mBps
    ------------------------------------------------------------
    1                347.000        40.625        35.984      90
    1                 37.000        35.625        36.734      89
    2                 28.500        18.688        18.383     177
    3                 11.667        12.375        12.359     263
    4                 12.250         9.406         9.020     361
    7                  5.000         5.018         5.118     636
    8                 11.625         5.828         4.779     681
    15                 3.533         3.158         2.620    1243
    16                 4.688         2.742         2.884    1129 «
    31                 1.903         1.262         1.172    2778
    32                 1.344         1.113         1.125    2895
    63                 1.444         0.633         0.591    5513
    64                 0.766         0.580         0.581    5605 «
    127                0.512         0.383         0.318   10229
    128                0.461         0.315         0.311   10463
    255                0.475         0.216         0.193   16840
    256                0.371         0.236         0.199   16397 «
    511                0.295         0.144         0.120   27223
    512                0.240         0.151         0.126   25937 «
    1023               0.142         0.101         0.088   36947
    1024               0.126         0.108         0.091   35889
    2047               0.088         0.074         0.072   45475
    2048               0.089         0.077         0.073   44380
    4095               0.081         0.065         0.064   50766
    4096               0.068         0.066         0.065   50246 «
    8191               0.063         0.061         0.060   54075
    8192               0.065         0.061         0.061   53731
    16383              0.082         0.066         0.061   53765
    16384              0.067         0.063         0.062   52765 « (L1)/2
    32767              0.102         0.085         0.085   38406
    32768              0.086         0.085         0.085   38473
    65535              0.098         0.085         0.085   38292
    65536              0.086         0.085         0.085   38369
    131071             0.438         0.177         0.089   36716
    131072             0.092         0.090         0.093   34880 « (L2)/2
    262143             0.306         0.146         0.127   25601
    262144             0.126         0.168         0.127   25704
    524287             0.213         0.152         0.136   23993
    524288             0.132         0.159         0.133   24570
    1048575            0.127         0.129         0.130   25117
    1048576            0.128         0.129         0.130   25107
    2097151            0.127         0.127         0.129   25199
    2097152            0.127         0.136         0.134   24274
    4194303            0.216         0.192         0.228   14237
    4194304            0.351         0.351         0.356    9139 « (L3)/2
    8388607            0.323         0.293         0.298   10903 « RAM
    8388608            0.365         0.296         0.300   10844

    GCC builtin (Inline REP MOVSB) for #c per n where c ≈ 0.293ns
    N                     x1            x8           x64 mBps
    ------------------------------------------------------------
    1                 53.000        50.625        50.453      64
    1                 47.000        49.375        49.141      66
    2                 23.500        25.062        24.898     131
    3                 15.667        16.792        16.880     193
    4                 11.750        12.531        12.957     251
    7                  7.000         7.125         7.190     452
    8                  6.125         7.578         6.322     514
    15                 3.133         3.325         3.372     964
    16                 3.062         3.117         3.132    1038 «
    31                 1.645         1.601         1.620    2007
    32                 1.531         1.559         1.585    2051
    63                 0.778         0.796         0.802    4056
    64                 0.766         0.768         0.767    4238 «
    127                0.480         0.446         0.448    7259
    128                0.445         0.419         0.423    7693
    255                0.239         0.239         0.236   13781
    256                0.238         0.225         0.225   14466 «
    511                0.127         0.133         0.132   24555
    512                0.123         0.127         0.128   25377 «
    1023               0.079         0.081         0.081   40346
    1024               0.075         0.077         0.078   41714
    2047               0.053         0.055         0.055   59575
    2048               0.053         0.053         0.053   60795
    4095               0.042         0.043         0.043   75843
    4096               0.042         0.042         0.042   77153
    8191               0.035         0.036         0.036   91518
    8192               0.035         0.035         0.035   92603
    16383              0.032         0.032         0.032  102407
    16384              0.033         0.032         0.032  102864 « (L1)/2
    32767              0.106         0.082         0.078   41486
    32768              0.079         0.078         0.079   41290
    65535              0.090         0.077         0.075   43565
    65536              0.074         0.074         0.073   44299
    131071             0.091         0.078         0.075   43196
    131072             0.078         0.076         0.074   43673 « (L2)/2
    262143             0.097         0.099         0.098   33192
    262144             0.098         0.098         0.098   33193
    524287             0.105         0.111         0.111   29212
    524288             0.109         0.111         0.111   29211
    1048575            0.107         0.108         0.108   30069
    1048576            0.106         0.112         0.105   30886
    2097151            0.105         0.103         0.103   31621
    2097152            0.102         0.103         0.104   31280
    4194303            0.180         0.158         0.176   18456
    4194304            0.167         0.155         0.154   21098 « (L3)/2
    8388607            0.538         0.576         0.557    5834 « RAM
    8388608            0.750         0.579         0.552    5893

    glibc memcpy() for #c per n where c ≈ 0.293ns
    N                     x1            x8           x64 mBps
    ------------------------------------------------------------
    1                139.000        90.125        84.891      38
    1                 83.000        82.125        84.359      39
    2                 61.500        46.438        45.164      72
    3                 41.667        32.458        31.245     104
    4                 32.750        26.156        24.410     133
    7                 20.143        16.732        16.033     203
    8                 13.375         8.328         6.908     471
    15                 8.200         6.408         5.753     565
    16                 4.438         3.570         3.466     938 «
    31                 3.258         2.891         2.786    1167
    32                 2.281         1.801         1.732    1878
    63                 1.635         1.431         1.374    2367
    64                 1.109         0.896         0.868    3747 «
    127                0.921         0.792         0.779    4176
    128                0.508         0.511         0.494    6589
    255                0.451         0.407         0.402    8081
    256                0.324         0.269         0.260   12498 «
    511                0.249         0.218         0.212   15335
    512                0.178         0.149         0.146   22297 «
    1023               0.138         0.124         0.121   26947
    1024               0.087         0.089         0.087   37238
    2047               0.084         0.077         0.076   43046
    2048               0.066         0.059         0.058   56120
    4095               0.058         0.054         0.054   60706
    4096               0.050         0.046         0.046   71092 «
    8191               0.043         0.042         0.042   78259
    8192               0.037         0.037         0.037   87409
    16383              0.037         0.036         0.035   92065
    16384              0.034         0.034         0.033   97942 « (L1)/2
    32767              0.104         0.084         0.080   40572
    32768              0.079         0.079         0.079   41055
    65535              0.094         0.080         0.076   42885
    65536              0.077         0.075         0.075   43423
    131071             0.092         0.080         0.078   41498
    131072             0.082         0.078         0.077   42350 « (L2)/2
    262143             0.100         0.101         0.287   11342
    262144             0.099         0.099         0.098   33177
    524287             0.106         0.111         0.110   29609
    524288             0.107         0.119         0.110   29608
    1048575            0.104         0.105         0.106   30626
    1048576            0.104         0.111         0.105   30878
    2097151            0.103         0.103         0.103   31606
    2097152            0.102         0.103         0.103   31644
    4194303            0.174         0.160         0.165   19714
    4194304            0.166         0.157         0.154   21110 « (L3)/2
    8388607            0.537         0.554         0.565    5750 « RAM
    8388608            0.701         0.537         0.552    5884

    musl memcpy() for #c per n where c ≈ 0.293ns
    N                     x1            x8           x64 mBps
    ------------------------------------------------------------
    1                 97.000        80.625        79.891      41
    1                 77.000        78.875        78.266      42
    2                 49.500        44.062        42.102      77
    3                 33.667        32.792        30.651     106
    4                 29.750        24.281        24.137     135
    7                 19.000        16.161        15.734     207
    8                 12.125         7.766         6.721     484
    15                 8.867         5.892         5.714     569
    16                 5.062         3.742         3.458     940
    31                 3.645         2.915         2.715    1198
    32                 2.156         1.723         1.663    1956
    63                 1.540         1.367         1.333    2440
    64                 1.078         0.873         0.833    3905
    127                0.874         0.771         0.737    4415
    128                0.617         0.487         0.481    6766
    255                0.443         0.390         0.382    8504
    256                0.316         0.259         0.259   12545
    511                0.245         0.232         0.237   13742
    512                0.174         0.159         0.208   15668
    1023               0.181         0.193         0.182   17821
    1024               0.155         0.123         0.114   28579
    2047               0.102         0.092         0.085   38219
    2048               0.064         0.073         0.070   46577
    4095               0.058         0.067         0.065   50272
    4096               0.049         0.055         0.055   59467
    8191               0.057         0.052         0.049   66468
    8192               0.053         0.050         0.051   63557
    16383              0.082         0.065         0.064   50897
    16384              0.066         0.065         0.061   53697 « (L1)/2
    32767              0.121         0.100         0.114   28555
    32768              0.093         0.091         0.114   28615
    65535              0.118         0.102         0.142   22858
    65536              0.108         0.274         0.097   33432
    131071             0.117         0.109         0.109   29905
    131072             0.110         0.195         0.113   28692 « (L2)/2
    262143             0.283         0.166         0.122   26638
    262144             0.130         0.144         0.123   26544
    524287             0.210         0.153         0.130   25079
    524288             0.126         0.128         0.123   26422
    1048575            0.139         0.107         0.106   30803
    1048576            0.104         0.105         0.106   30683
    2097151            0.103         0.103         0.103   31564
    2097152            0.102         0.103         0.103   31531
    4194303            0.242         0.158         0.169   19238
    4194304            0.166         0.161         0.154   21072 « (L3)/2
    8388607            0.533         0.549         0.599    5422 « RAM
    8388608            0.768         0.630         0.560    5801

    newlib (aka. cygwin) memcpy() for #c per n where c ≈ 0.293ns
    N                     x1            x8           x64 mBps
    ------------------------------------------------------------
    1                 61.000        52.875        53.141      61
    1                 49.000        49.875        50.328      65
    2                 24.500        24.812        26.727     122
    3                 15.667        20.125        16.943     192
    4                 12.750        15.281        13.090     248
    7                  7.000         7.375         7.431     438
    8                  5.875         6.422         6.377     510
    15                 3.267         3.375         3.447     943
    16                10.062         6.945         6.386     509
    31                 2.548         2.488         2.545    1278
    32                 3.156         3.207         3.201    1016
    63                 1.190         1.220         1.229    2646
    64                 1.578         1.588         1.599    2033
    127                0.717         0.690         0.685    4744
    128                0.820         0.856         0.857    3795
    255                0.357         0.359         0.358    9077
    256                0.629         0.461         0.426    7630
    511                0.260         0.219         0.204   15947
    512                0.330         0.299         0.268   12113
    1023               0.269         0.175         0.162   20042
    1024               0.315         0.201         0.196   16633
    2047               0.349         0.241         0.236   13790
    2048               0.332         0.269         0.264   12295
    4095               0.349         0.295         0.287   11348
    4096               0.361         0.313         0.303   10748
    8191               0.361         0.317         0.322   10110
    8192               0.369         0.326         0.319   10201
    16383              0.321         0.322         0.327    9940
    16384              0.309         0.330         0.329    9878 « (L1)/2
    32767              0.291         0.303         0.307   10599
    32768              0.314         0.304         0.305   10667
    65535              0.373         0.311         0.313   10396
    65536              0.305         0.750         0.421    7729
    131071             0.329         0.427         0.384    8470
    131072             0.329         0.388         0.361    9020 « (L2)/2
    262143             0.520         0.389         0.425    7646
    262144             0.364         0.400         0.368    8843
    524287             0.449         0.389         0.389    8353
    524288             0.384         0.379         0.384    8466
    1048575            0.436         0.397         0.401    8107
    1048576            0.431         0.397         0.401    8112
    2097151            0.417         0.567         0.434    7498
    2097152            0.457         0.503         0.427    7621
    4194303            0.328         0.348         0.368    8822
    4194304            0.343         0.352         0.352    9221 « (L3)/2
    8388607            0.313         0.319         0.326    9957 « RAM
    8388608            0.366         0.320         0.328    9910

    openbsd memcpy() for #c per n where c ≈ 0.293ns
    N                     x1            x8           x64 mBps
    ------------------------------------------------------------
    1                 73.000        41.375        41.484      78
    1                 39.000        39.875        41.641      78
    2                 28.500        20.688        21.227     153
    3                 27.000        15.875        15.557     209
    4                 16.750        12.656        12.520     260
    7                 20.429        10.982        10.292     316
    8                  8.625         5.234         5.576     583
    15                 7.267         4.758         4.920     661
    16                 4.312         2.742         2.747    1183
    31                 4.613         2.891         2.555    1272
    32                 2.844         1.520         1.441    2256
    63                 2.397         1.268         1.328    2449
    64                 1.547         0.822         0.769    4226
    127                1.189         0.782         0.671    4842
    128                0.727         0.532         0.460    7066
    255                0.631         0.463         0.414    7856
    256                0.543         0.374         0.302   10775
    511                0.542         0.316         0.276   11785
    512                0.354         0.260         0.224   14494
    1023               0.267         0.245         0.229   14201
    1024               0.251         0.200         0.197   16496
    2047               0.214         0.226         0.181   17941
    2048               0.189         0.167         0.166   19575
    4095               0.200         0.168         0.163   19957
    4096               0.165         0.155         0.153   21219
    8191               0.158         0.153         0.151   21578
    8192               0.153         0.148         0.147   22138
    16383              0.173         0.148         0.146   22319
    16384              0.153         0.487         0.188   17298 « (L1)/2
    32767              0.161         0.151         0.192   16893
    32768              0.151         0.314         0.213   15275
    65535              0.157         0.154         0.148   21969
    65536              0.147         0.145         0.145   22493
    131071             0.152         0.151         0.154   21145
    131072             0.148         0.229         0.158   20564 « (L2)/2
    262143             0.320         0.183         0.162   20031
    262144             0.330         0.205         0.167   19503
    524287             0.159         0.171         0.163   19913
    524288             0.250         0.189         0.162   20120
    1048575            0.157         0.164         0.161   20182
    1048576            0.155         0.156         0.157   20672
    2097151            0.161         0.158         0.157   20644
    2097152            0.158         0.157         0.165   19727
    4194303            0.327         0.256         0.238   13684
    4194304            0.232         0.220         0.236   13749 « (L3)/2
    8388607            0.721         0.689         0.586    5549 « RAM
    8388608            0.943         0.569         0.593    5481 */


 memcpy_test.cpp:

 #include <string.h>
 #include <iostream>


 // Symbols with C linkage that are defined outside this translation unit.
 // This file never reads or writes them directly; it only takes their addresses
 // and hands them to the _init_* routines declared below.
 // NOTE(review): they are declared as `int` here, but the real object layout is
 // presumably defined by the assembly sources (memcpy.S and friends) -- confirm.
 extern "C" int kCpuids;
 extern "C" int kHalfCache3;
 extern "C" int memcpytab;
 extern "C" int memcpytab_ro;

 // External initialisation routines with C linkage. Each receives the address(es)
 // of the symbol(s) above as opaque pointers; what they store there is not
 // visible from this file.
 extern "C" void _init_kCpuids(void *);
 extern "C" void _init_kHalfCache3(void *);
 extern "C" void _init_memcpy(void *, void *);

 // Start-up hook: the constructor attribute makes the runtime call this function
 // before main(). It invokes the three external _init_* routines, passing each
 // the address of the global(s) it is paired with.
 // NOTE(review): presumably this sets up the memcpy implementation under test
 // (CPU feature data, a cache-size threshold, and the dispatch table) -- confirm
 // against the assembly sources.
 // NOTE(review): the calls are kept in this exact order because later
 // initialisers may depend on the results of earlier ones -- TODO confirm.
 __attribute__((__constructor__)) void memcpy_init()
 {
    _init_kCpuids(&kCpuids);
    _init_kHalfCache3(&kHalfCache3);
    _init_memcpy(&memcpytab, &memcpytab_ro);
 }


 /// Copies `size` bytes from `src` to `dst` by forwarding to memcpy().
 ///
 /// The function is marked noinline so that callers reach memcpy() through a
 /// real out-of-line call. Both pointers are declared __restrict, so the
 /// caller must pass non-overlapping ranges.
 ///
 /// @param dst   destination buffer, at least `size` bytes long
 /// @param src   source buffer, at least `size` bytes long
 /// @param size  number of bytes to copy
 __attribute__((__noinline__)) void memcpy_noinline(
    void * __restrict dst,
    const void * __restrict src,
    size_t size)
 {
    // The return value of memcpy() (the destination pointer) is not needed.
    static_cast<void>(memcpy(dst, src, size));
 }


 /// Smoke test for the memcpy under test.
 ///
 /// Seeds a zero-initialised buffer with "abc", then repeatedly appends a copy
 /// of the already-filled prefix to itself, doubling the filled length each
 /// round, for as long as the doubled length stays below the buffer capacity.
 /// The untouched zero bytes at the end keep the buffer NUL-terminated, so it
 /// can be printed as a C string to stderr.
 int main(int, char **)
 {
    constexpr size_t capacity = 100;
    char text[capacity]{};

    // Seed the pattern.
    memcpy_noinline(text, "abc", 3);

    // Source [0, filled) and destination [filled, 2 * filled) never overlap,
    // which satisfies the __restrict contract of memcpy_noinline.
    for (size_t filled = 3; filled * 2 < capacity; filled *= 2)
        memcpy_noinline(text + filled, text, filled);

    std::cerr << text << "\n";
    return 0;
 }