Created
May 8, 2019 10:32
-
-
Save cocowalla/bb7e735b988a8cfec9770f1a0ee6a4a6 to your computer and use it in GitHub Desktop.
SIMD XOR Optimisation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
; Core CLR v4.6.27615.73 (coreclr.dll) on amd64. | |
; JIT output for the MyClass instance constructor: it only chains to System.Object..ctor().
MyClass..ctor() | |
; Prologue: save rbp, reserve 0x20 bytes of stack, point rbp at the top of that area.
L0000: push rbp | |
L0001: sub rsp, 0x20 | |
L0005: lea rbp, [rsp+0x20] | |
; Spill the incoming rcx to [rbp+0x10] (presumably `this` - confirm against the calling convention).
L000a: mov [rbp+0x10], rcx | |
; If the dword at this rip-relative address is non-zero, call the helper at 0x7ffc61fad9e0.
; NOTE(review): looks like a debugger/instrumentation hook emitted in unoptimised code - confirm.
L000e: cmp dword [rip+0xbffb], 0x0 | |
L0015: jz L001c | |
L0017: call 0x7ffc61fad9e0 | |
; Reload the spilled rcx and call the base-class constructor.
L001c: mov rcx, [rbp+0x10] | |
L0020: call System.Object..ctor() | |
L0025: nop | |
L0026: nop | |
; Epilogue: restore rsp and rbp, then return.
L0027: lea rsp, [rbp] | |
L002b: pop rbp | |
L002c: ret | |
; JIT output for MyClass.MyMethod(byte[]): every local lives in a stack slot and is reloaded
; before each use. The slot-to-local names in the comments below are inferred by matching
; this listing against the C# source in this file - treat them as a reading aid, not as fact.
MyClass.MyMethod(Byte[]) | |
; Prologue: save rbp/rdi/rsi, reserve 0x170 bytes, set rbp = rsp+0x180.
L0000: push rbp | |
L0001: push rdi | |
L0002: push rsi | |
L0003: sub rsp, 0x170 | |
L000a: vzeroupper | |
L000d: lea rbp, [rsp+0x180] | |
; Zero 0x2c dwords (176 bytes) starting at rbp-0xc0; rcx is parked in rsi because
; rep stosd uses ecx as its count.
L0015: mov rsi, rcx | |
L0018: lea rdi, [rbp-0xc0] | |
L001f: mov ecx, 0x2c | |
L0024: xor eax, eax | |
L0026: rep stosd | |
L0028: mov rcx, rsi | |
; Spill the incoming rcx (the byte[] argument) to [rbp+0x10].
L002b: mov [rbp+0x10], rcx | |
; Same flag test + helper call as in the constructor listing.
; NOTE(review): looks like a debugger/instrumentation hook - confirm.
L002f: cmp dword [rip+0xbf8a], 0x0 | |
L0036: jz L003d | |
L0038: call 0x7ffc61fad9e0 | |
L003d: nop | |
; Call Vector256.Create with the four 64-bit constants: three in rdx/r8/r9, the fourth on the
; stack at [rsp+0x20]. rcx carries the address rbp-0xf0, from which the result is read back
; after the call (presumably a hidden return buffer - confirm).
L003e: mov rcx, 0x1d8e4e27c47d124f | |
L0048: mov [rsp+0x20], rcx | |
L004d: lea rcx, [rbp-0xf0] | |
L0054: mov rdx, 0xe7037ed1a0b428db | |
L005e: mov r8, 0x8ebc6af09c88c6e3 | |
L0068: mov r9, 0x589965cc75374cc3 | |
L0072: call System.Runtime.Intrinsics.Vector256.Create(UInt64, UInt64, UInt64, UInt64) | |
; Copy the 32-byte result into [rbp-0x30] (primeVector).
L0077: vmovupd ymm0, [rbp-0xf0] | |
L007f: vmovupd [rbp-0x30], ymm0 | |
; `fixed` setup: copy the array reference into [rbp-0x40] (the pinned local).
L0084: mov rax, [rbp+0x10] | |
L0088: mov [rbp-0x40], rax | |
; If the array reference is null, or the dword at [array+0x8] (its length) is zero,
; fall through to L009d and use a null pointer; otherwise jump to L00a8.
L008c: cmp qword [rbp+0x10], 0x0 | |
L0091: jz L009d | |
L0093: mov rax, [rbp-0x40] | |
L0097: cmp dword [rax+0x8], 0x0 | |
L009b: jnz L00a8 | |
; Null/empty case: store 0 into [rbp-0x38] (pData).
L009d: xor eax, eax | |
L009f: mov eax, eax | |
L00a1: mov [rbp-0x38], rax | |
L00a5: nop | |
L00a6: jmp L00d1 | |
; Non-empty case: check index 0 against the length (calling the helper at 0x7ffc61fafd40
; when the check fails), then compute array+index+0x10 and store it in [rbp-0x38] (pData).
L00a8: mov rax, [rbp-0x40] | |
L00ac: xor edx, edx | |
L00ae: cmp edx, [rax+0x8] | |
L00b1: jb L00b8 | |
L00b3: call 0x7ffc61fafd40 | |
L00b8: mov ecx, edx | |
L00ba: lea rax, [rax+rcx+0x10] | |
L00bf: mov [rbp-0x158], rax | |
L00c6: mov rax, [rbp-0x158] | |
L00cd: mov [rbp-0x38], rax | |
L00d1: nop | |
; ptr = pData: copy [rbp-0x38] into [rbp-0x48].
L00d2: mov rax, [rbp-0x38] | |
L00d6: mov [rbp-0x48], rax | |
; i = 0 at [rbp-0x4c], then jump straight to the loop condition at L01cc.
L00da: xor eax, eax | |
L00dc: mov [rbp-0x4c], eax | |
L00df: nop | |
L00e0: jmp L01cc | |
; ---- loop body ----
L00e5: nop | |
; Load 32 bytes from ptr + i and store them (via a temporary at rbp-0x110) in [rbp-0x70] (vector).
L00e6: mov rax, [rbp-0x48] | |
L00ea: mov edx, [rbp-0x4c] | |
L00ed: movsxd rdx, edx | |
L00f0: vmovdqu ymm0, [rax+rdx] | |
L00f5: vmovupd [rbp-0x110], ymm0 | |
L00fd: vmovupd ymm0, [rbp-0x110] | |
L0105: vmovupd [rbp-0x70], ymm0 | |
; XOR primeVector with vector and store the result (via rbp-0x130) in [rbp-0x90] (res).
; NOTE(review): the vpxor is printed with xmm operands although the surrounding moves are
; 256-bit; this may be a disassembler display issue - confirm with another tool.
L010a: vmovupd ymm0, [rbp-0x30] | |
L010f: vpxor xmm0, xmm0, [rbp-0x70] | |
L0114: vmovupd [rbp-0x130], ymm0 | |
L011c: vmovupd ymm0, [rbp-0x130] | |
L0124: vmovupd [rbp-0x90], ymm0 | |
; Element 0: move the low 64 bits of res into rax and store them in [rbp-0x98] (xor1).
L012c: vmovdqu ymm0, [rbp-0x90] | |
L0134: vmovd rax, xmm0 | |
L0139: mov [rbp-0x138], rax | |
L0140: mov rax, [rbp-0x138] | |
L0147: mov [rbp-0x98], rax | |
; Element 1: the disassembler could not decode the next bytes ("invalid" + a bogus rol).
; NOTE(review): the byte offsets fit a 6-byte `vpextrq rax, xmm0, 1` followed by a 7-byte
; `mov [rbp-0x140], rax` - confirm with another disassembler. Result lands in [rbp-0xa0] (xor2).
L014e: vmovdqu ymm0, [rbp-0x90] | |
L0156: invalid | |
L015a: rol byte [rcx], 0x48 | |
L015d: mov [rbp-0x140], eax | |
L0163: mov rax, [rbp-0x140] | |
L016a: mov [rbp-0xa0], rax | |
; Element 2: extract the upper 128 bits of res, take their low 64 bits, store in [rbp-0xa8] (xor3).
L0171: vmovupd ymm0, [rbp-0x90] | |
L0179: vextractf128 xmm0, ymm0, 0x1 | |
L017f: vmovd rax, xmm0 | |
L0184: mov [rbp-0x148], rax | |
L018b: mov rax, [rbp-0x148] | |
L0192: mov [rbp-0xa8], rax | |
; Element 3: extract the upper 128 bits again, followed by the same undecodable byte pattern
; as element 1 (see the note there). Result lands in [rbp-0xb0] (xor4).
L0199: vmovupd ymm0, [rbp-0x90] | |
L01a1: vextractf128 xmm0, ymm0, 0x1 | |
L01a7: invalid | |
L01ab: rol byte [rcx], 0x48 | |
L01ae: mov [rbp-0x150], eax | |
L01b4: mov rax, [rbp-0x150] | |
L01bb: mov [rbp-0xb0], rax | |
L01c2: nop | |
; i += 0x20 (32 bytes per iteration).
L01c3: mov eax, [rbp-0x4c] | |
L01c6: add eax, 0x20 | |
L01c9: mov [rbp-0x4c], eax | |
; ---- loop condition: (i + 0x20) <= array length; the boolean is stored in [rbp-0xb4] ----
; The array reference is read from [rbp+0x10] without a null test before [rdx+0x8] is accessed.
L01cc: mov eax, [rbp-0x4c] | |
L01cf: add eax, 0x20 | |
L01d2: mov rdx, [rbp+0x10] | |
L01d6: cmp eax, [rdx+0x8] | |
L01d9: setle al | |
L01dc: movzx eax, al | |
L01df: mov [rbp-0xb4], eax | |
L01e5: cmp dword [rbp-0xb4], 0x0 | |
; NOTE(review): the branch target is printed as the method symbol; by the loop structure it
; is presumably the loop body at L00e5 - confirm.
L01ec: jnz MyClass.MyMethod(Byte[]) | |
L01f2: nop | |
; Clear [rbp-0x40] (the pinned array slot).
L01f3: xor eax, eax | |
L01f5: mov [rbp-0x40], rax | |
; Store the return value 0 in [rbp-0xc0], then load it into rax.
L01f9: xor eax, eax | |
L01fb: movsxd rax, eax | |
L01fe: mov [rbp-0xc0], rax | |
L0205: nop | |
L0206: jmp L0208 | |
L0208: mov rax, [rbp-0xc0] | |
; Epilogue: restore rsp and the saved rsi/rdi/rbp, then return.
L020f: vzeroupper | |
L0212: lea rsp, [rbp-0x10] | |
L0216: pop rsi | |
L0217: pop rdi | |
L0218: pop rbp | |
L0219: ret | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Runtime.Intrinsics; | |
using System.Runtime.Intrinsics.X86; | |
/// <summary>
/// Sample class whose JIT disassembly and IL are listed elsewhere in this file.
/// Keep the statements exactly as written: those listings were produced from this source.
/// </summary>
public class MyClass { | |
/// <summary>
/// XORs every full 32-byte chunk of <paramref name="array"/> with a fixed 256-bit constant
/// and extracts the four 64-bit lanes of each result. The extracted values are never used.
/// </summary>
/// <param name="array">Input bytes. Trailing bytes beyond the last full 32-byte chunk are not read.</param>
/// <returns>Always 0.</returns>
// NOTE(review): a null array yields a null pData from `fixed`, but the loop condition then
// reads array.Length, which would fault - confirm whether callers can pass null.
// NOTE(review): there is no Avx2.IsSupported check before the intrinsics are used - confirm
// that this only runs on AVX2-capable hardware.
public static unsafe ulong MyMethod(byte[] array) { | |
// Four fixed 64-bit constants packed into one 256-bit vector.
var primeVector = Vector256.Create(0xe7037ed1a0b428db, 0x8ebc6af09c88c6e3, 0x589965cc75374cc3, 0x1d8e4e27c47d124f); | |
// Pin the array so a raw pointer to its first byte stays valid inside the block.
fixed (byte* pData = array) | |
{ | |
byte* ptr = pData; | |
// Advance 32 bytes at a time; the loop stops before a partial trailing chunk.
for (int i = 0; i + 32 <= array.Length; i += 32) | |
{ | |
// Load 32 bytes starting at ptr + i as four ulong lanes.
var vector = Avx.LoadVector256((ulong*)(ptr + i)); | |
var res = Avx2.Xor(primeVector, vector); | |
// Pull each 64-bit lane out of the result; none of these locals is read afterwards.
ulong xor1 = res.GetElement(0); | |
ulong xor2 = res.GetElement(1); | |
ulong xor3 = res.GetElement(2); | |
ulong xor4 = res.GetElement(3); | |
} | |
} | |
return 0; | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Empty '<Module>' class: it declares no members in this listing.
.class private auto ansi '<Module>' | |
{ | |
} // end of class <Module> | |
// IL for MyClass: one static method (MyMethod) and the default instance constructor.
// The nop instructions and the bool/return temporaries (locals 11 and 12) suggest an
// unoptimised build - confirm against the build settings.
.class public auto ansi beforefieldinit MyClass | |
extends [System.Private.CoreLib]System.Object | |
{ | |
// Methods | |
// MyMethod(uint8[]) -> uint64: XORs each full 32-byte chunk of the array with a constant
// vector, extracts the four lanes into locals that are never read, and returns 0.
.method public hidebysig static | |
uint64 MyMethod ( | |
uint8[] 'array' | |
) cil managed | |
{ | |
// Method begins at RVA 0x2050 | |
// Code size 176 (0xb0) | |
.maxstack 4 | |
// Local roles, inferred from the stores below and the C# source in this file:
// [0] primeVector, [1] pData, [2] pinned array, [3] ptr, [4] i, [5] vector, [6] res,
// [7]-[10] xor1..xor4, [11] loop-condition flag, [12] return value.
.locals init ( | |
[0] valuetype [System.Private.CoreLib]System.Runtime.Intrinsics.Vector256`1<uint64>, | |
[1] uint8*, | |
[2] uint8[] pinned, | |
[3] uint8*, | |
[4] int32, | |
[5] valuetype [System.Private.CoreLib]System.Runtime.Intrinsics.Vector256`1<uint64>, | |
[6] valuetype [System.Private.CoreLib]System.Runtime.Intrinsics.Vector256`1<uint64>, | |
[7] uint64, | |
[8] uint64, | |
[9] uint64, | |
[10] uint64, | |
[11] bool, | |
[12] uint64 | |
) | |
IL_0000: nop | |
// Push the four 64-bit constants and build the vector; ldc.i8 prints them as signed
// decimals (presumably the same values as the hex literals in the C# source - confirm).
IL_0001: ldc.i8 -1800455987208640293 | |
IL_000a: ldc.i8 -8161530843051276573 | |
IL_0013: ldc.i8 6384245875588680899 | |
IL_001c: ldc.i8 2129725606500045391 | |
IL_0025: call valuetype [System.Private.CoreLib]System.Runtime.Intrinsics.Vector256`1<uint64> [System.Private.CoreLib]System.Runtime.Intrinsics.Vector256::Create(uint64, uint64, uint64, uint64) | |
IL_002a: stloc.0 | |
// `fixed` expansion: store the array in the pinned local; if it is null or has length 0,
// local 1 becomes a null pointer, otherwise the address of element 0.
IL_002b: ldarg.0 | |
IL_002c: dup | |
IL_002d: stloc.2 | |
IL_002e: brfalse.s IL_0035 | |
IL_0030: ldloc.2 | |
IL_0031: ldlen | |
IL_0032: conv.i4 | |
IL_0033: brtrue.s IL_003a | |
IL_0035: ldc.i4.0 | |
IL_0036: conv.u | |
IL_0037: stloc.1 | |
IL_0038: br.s IL_0043 | |
IL_003a: ldloc.2 | |
IL_003b: ldc.i4.0 | |
IL_003c: ldelema [System.Private.CoreLib]System.Byte | |
IL_0041: conv.u | |
IL_0042: stloc.1 | |
IL_0043: nop | |
// Copy local 1 into local 3 and initialise local 4 (the loop counter) to 0.
IL_0044: ldloc.1 | |
IL_0045: stloc.3 | |
IL_0046: ldc.i4.0 | |
IL_0047: stloc.s 4 | |
// sequence point: hidden | |
// Jump to the loop condition first.
IL_0049: br.s IL_0091 | |
// loop start (head: IL_0091) | |
IL_004b: nop | |
// Load a 256-bit vector from (local 3 + local 4) into local 5.
IL_004c: ldloc.3 | |
IL_004d: ldloc.s 4 | |
IL_004f: add | |
IL_0050: call valuetype [System.Private.CoreLib]System.Runtime.Intrinsics.Vector256`1<uint64> [System.Private.CoreLib]System.Runtime.Intrinsics.X86.Avx::LoadVector256(uint64*) | |
IL_0055: stloc.s 5 | |
// XOR local 0 with local 5 and store the result in local 6.
IL_0057: ldloc.0 | |
IL_0058: ldloc.s 5 | |
IL_005a: call valuetype [System.Private.CoreLib]System.Runtime.Intrinsics.Vector256`1<uint64> [System.Private.CoreLib]System.Runtime.Intrinsics.X86.Avx2::Xor(valuetype [System.Private.CoreLib]System.Runtime.Intrinsics.Vector256`1<uint64>, valuetype [System.Private.CoreLib]System.Runtime.Intrinsics.Vector256`1<uint64>) | |
IL_005f: stloc.s 6 | |
// Extract elements 0..3 of local 6 into locals 7..10; these locals are never loaded again.
IL_0061: ldloc.s 6 | |
IL_0063: ldc.i4.0 | |
IL_0064: call !!0 [System.Private.CoreLib]System.Runtime.Intrinsics.Vector256::GetElement<uint64>(valuetype [System.Private.CoreLib]System.Runtime.Intrinsics.Vector256`1<!!0>, int32) | |
IL_0069: stloc.s 7 | |
IL_006b: ldloc.s 6 | |
IL_006d: ldc.i4.1 | |
IL_006e: call !!0 [System.Private.CoreLib]System.Runtime.Intrinsics.Vector256::GetElement<uint64>(valuetype [System.Private.CoreLib]System.Runtime.Intrinsics.Vector256`1<!!0>, int32) | |
IL_0073: stloc.s 8 | |
IL_0075: ldloc.s 6 | |
IL_0077: ldc.i4.2 | |
IL_0078: call !!0 [System.Private.CoreLib]System.Runtime.Intrinsics.Vector256::GetElement<uint64>(valuetype [System.Private.CoreLib]System.Runtime.Intrinsics.Vector256`1<!!0>, int32) | |
IL_007d: stloc.s 9 | |
IL_007f: ldloc.s 6 | |
IL_0081: ldc.i4.3 | |
IL_0082: call !!0 [System.Private.CoreLib]System.Runtime.Intrinsics.Vector256::GetElement<uint64>(valuetype [System.Private.CoreLib]System.Runtime.Intrinsics.Vector256`1<!!0>, int32) | |
IL_0087: stloc.s 10 | |
IL_0089: nop | |
// Increment the loop counter by 32.
IL_008a: ldloc.s 4 | |
IL_008c: ldc.i4.s 32 | |
IL_008e: add | |
IL_008f: stloc.s 4 | |
// Loop condition: !((counter + 32) > array length), stored in local 11.
// The length is read from the argument (ldarg.0) with no null test on this path.
IL_0091: ldloc.s 4 | |
IL_0093: ldc.i4.s 32 | |
IL_0095: add | |
IL_0096: ldarg.0 | |
IL_0097: ldlen | |
IL_0098: conv.i4 | |
IL_0099: cgt | |
IL_009b: ldc.i4.0 | |
IL_009c: ceq | |
IL_009e: stloc.s 11 | |
// sequence point: hidden | |
IL_00a0: ldloc.s 11 | |
IL_00a2: brtrue.s IL_004b | |
// end loop | |
IL_00a4: nop | |
// sequence point: hidden | |
// Clear the pinned local, then return the constant 0 via local 12.
IL_00a5: ldnull | |
IL_00a6: stloc.2 | |
IL_00a7: ldc.i4.0 | |
IL_00a8: conv.i8 | |
IL_00a9: stloc.s 12 | |
IL_00ab: br.s IL_00ad | |
IL_00ad: ldloc.s 12 | |
IL_00af: ret | |
} // end of method MyClass::MyMethod | |
// Default instance constructor: calls System.Object::.ctor() and returns.
.method public hidebysig specialname rtspecialname | |
instance void .ctor () cil managed | |
{ | |
// Method begins at RVA 0x210c | |
// Code size 8 (0x8) | |
.maxstack 8 | |
IL_0000: ldarg.0 | |
IL_0001: call instance void [System.Private.CoreLib]System.Object::.ctor() | |
IL_0006: nop | |
IL_0007: ret | |
} // end of method MyClass::.ctor | |
} // end of class MyClass | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment