x86-64 (AVX512) for matmul_kernel (03-matrix-multiplication-cpu.py) from TTMIR
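
For context: judging by the .loc references into 03-matrix-multiplication-cpu.py (lines 191-227) and triton/language/standard.py (tl.cdiv), the listing below is the AVX-512 code that triton-cpu generated for the grouped-ordering matmul kernel of Triton's tutorial 03. A minimal sketch of that kernel follows, assuming the stock tutorial source; block-size constants and exact line numbers in the actual build may differ.

import triton
import triton.language as tl

@triton.jit
def matmul_kernel(
        a_ptr, b_ptr, c_ptr,
        M, N, K,
        stride_am, stride_ak, stride_bk, stride_bn, stride_cm, stride_cn,
        BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr,
        BLOCK_SIZE_K: tl.constexpr, GROUP_SIZE_M: tl.constexpr):
    # Grouped ordering: map the 1-D program id to a (pid_m, pid_n) tile
    # (the arithmetic the .loc markers attribute to lines 191-196).
    pid = tl.program_id(axis=0)
    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + (pid % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m
    # Pointer blocks for the first K-tile of A and B (lines 205-209).
    offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
    offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
    offs_k = tl.arange(0, BLOCK_SIZE_K)
    a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak)
    b_ptrs = b_ptr + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)
    # K loop: accumulate one BLOCK_M x BLOCK_N tile (the .LBB0_3 loop below).
    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
        a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0)
        b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0)
        accumulator += tl.dot(a, b)
        a_ptrs += BLOCK_SIZE_K * stride_ak
        b_ptrs += BLOCK_SIZE_K * stride_bk
    # Write the tile back with bounds masking.
    offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
    tl.store(c_ptrs, accumulator, mask=c_mask)
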
.text
.file "LLVMDialectModule"
.section .rodata,"a",@progbits
.p2align 6, 0x0 # -- Begin function matmul_kernel
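# The .LCPI0_* tables that follow appear to be constant lane-index vectors:
# for the .long tables, entries 0-15 select lanes from the first source and
# 16-31 from the second (0-7 / 8-15 for the .quad tables), matching AVX-512
# two-source permutes (vpermt2d/vpermt2q and friends) that stitch tile rows
# together; .zero entries are don't-care lanes.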
.LCPI0_0:
.zero 4
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 13 # 0xd
.long 14 # 0xe
.long 15 # 0xf
.LCPI0_15:
.long 3 # 0x3
.long 19 # 0x13
.long 2 # 0x2
.long 3 # 0x3
.long 7 # 0x7
.long 23 # 0x17
.long 6 # 0x6
.long 7 # 0x7
.long 11 # 0xb
.long 27 # 0x1b
.long 10 # 0xa
.long 11 # 0xb
.long 15 # 0xf
.long 31 # 0x1f
.long 14 # 0xe
.long 15 # 0xf
.LCPI0_16:
.quad 2 # 0x2
.quad 10 # 0xa
.quad 2 # 0x2
.quad 10 # 0xa
.quad 6 # 0x6
.quad 15 # 0xf
.quad 6 # 0x6
.quad 14 # 0xe
.LCPI0_17:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 18 # 0x12
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 22 # 0x16
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 26 # 0x1a
.long 12 # 0xc
.long 13 # 0xd
.long 14 # 0xe
.long 30 # 0x1e
.LCPI0_18:
.long 1 # 0x1
.long 17 # 0x11
.long 2 # 0x2
.long 3 # 0x3
.long 5 # 0x5
.long 21 # 0x15
.long 6 # 0x6
.long 7 # 0x7
.long 9 # 0x9
.long 25 # 0x19
.long 10 # 0xa
.long 11 # 0xb
.long 13 # 0xd
.long 29 # 0x1d
.long 14 # 0xe
.long 15 # 0xf
.LCPI0_19:
.long 0 # 0x0
.long 1 # 0x1
.long 0 # 0x0
.long 16 # 0x10
.long 4 # 0x4
.long 5 # 0x5
.long 4 # 0x4
.long 20 # 0x14
.long 8 # 0x8
.long 9 # 0x9
.long 8 # 0x8
.long 24 # 0x18
.long 12 # 0xc
.long 13 # 0xd
.long 12 # 0xc
.long 28 # 0x1c
.LCPI0_20:
.quad 0 # 0x0
.quad 8 # 0x8
.quad 0 # 0x0
.quad 8 # 0x8
.quad 4 # 0x4
.quad 12 # 0xc
.quad 4 # 0x4
.quad 13 # 0xd
.LCPI0_40:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 25 # 0x19
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_47:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
.quad 13 # 0xd
.zero 8
.zero 8
.zero 8
.LCPI0_48:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 26 # 0x1a
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_55:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 27 # 0x1b
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_56:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 27 # 0x1b
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_57:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 27 # 0x1b
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_64:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 28 # 0x1c
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_65:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
.quad 4 # 0x4
.quad 14 # 0xe
.zero 8
.zero 8
.LCPI0_66:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 28 # 0x1c
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_73:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 29 # 0x1d
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_74:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 29 # 0x1d
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_75:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 29 # 0x1d
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_76:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 29 # 0x1d
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_77:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 29 # 0x1d
.zero 4
.zero 4
.zero 4
.LCPI0_84:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
.quad 15 # 0xf
.zero 8
.zero 8
.zero 8
.LCPI0_85:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 30 # 0x1e
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_86:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
.quad 4 # 0x4
.quad 15 # 0xf
.zero 8
.zero 8
.LCPI0_87:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 30 # 0x1e
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_88:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
.quad 4 # 0x4
.quad 5 # 0x5
.quad 15 # 0xf
.zero 8
.LCPI0_89:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 30 # 0x1e
.zero 4
.zero 4
.LCPI0_96:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 31 # 0x1f
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_97:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 31 # 0x1f
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_98:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 31 # 0x1f
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_99:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 31 # 0x1f
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_100:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 31 # 0x1f
.zero 4
.zero 4
.zero 4
.LCPI0_101:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 31 # 0x1f
.zero 4
.zero 4
.LCPI0_102:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 13 # 0xd
.long 31 # 0x1f
.zero 4
.LCPI0_109:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 16 # 0x10
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_110:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 17 # 0x11
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_111:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 17 # 0x11
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_112:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 17 # 0x11
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_113:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
.quad 9 # 0x9
.zero 8
.zero 8
.zero 8
.LCPI0_114:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 18 # 0x12
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_115:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 19 # 0x13
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_116:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 19 # 0x13
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_117:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 19 # 0x13
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_119:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 20 # 0x14
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_123:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 21 # 0x15
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_124:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 21 # 0x15
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_125:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 21 # 0x15
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_126:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 21 # 0x15
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_127:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 21 # 0x15
.zero 4
.zero 4
.zero 4
.LCPI0_128:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 21 # 0x15
.zero 4
.zero 4
.LCPI0_129:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 13 # 0xd
.long 21 # 0x15
.zero 4
.LCPI0_130:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 13 # 0xd
.long 14 # 0xe
.long 21 # 0x15
.LCPI0_134:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
.quad 11 # 0xb
.zero 8
.zero 8
.zero 8
.LCPI0_135:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 22 # 0x16
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_136:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
.quad 4 # 0x4
.quad 11 # 0xb
.zero 8
.zero 8
.LCPI0_137:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 22 # 0x16
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_138:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
.quad 4 # 0x4
.quad 5 # 0x5
.quad 11 # 0xb
.zero 8
.LCPI0_139:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 22 # 0x16
.zero 4
.zero 4
.LCPI0_140:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
.quad 4 # 0x4
.quad 5 # 0x5
.quad 6 # 0x6
.quad 11 # 0xb
.LCPI0_141:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 13 # 0xd
.long 14 # 0xe
.long 22 # 0x16
.LCPI0_145:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 23 # 0x17
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_146:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 23 # 0x17
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_147:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 23 # 0x17
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_148:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 23 # 0x17
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_149:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 23 # 0x17
.zero 4
.zero 4
.zero 4
.LCPI0_150:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 23 # 0x17
.zero 4
.zero 4
.LCPI0_151:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 13 # 0xd
.long 23 # 0x17
.zero 4
.LCPI0_152:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 13 # 0xd
.long 14 # 0xe
.long 23 # 0x17
.LCPI0_159:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 24 # 0x18
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_160:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
.quad 4 # 0x4
.quad 12 # 0xc
.zero 8
.zero 8
.LCPI0_161:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 24 # 0x18
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_162:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
.quad 4 # 0x4
.quad 5 # 0x5
.quad 12 # 0xc
.zero 8
.LCPI0_163:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 24 # 0x18
.zero 4
.zero 4
.LCPI0_164:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
.quad 4 # 0x4
.quad 5 # 0x5
.quad 6 # 0x6
.quad 12 # 0xc
.LCPI0_165:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 13 # 0xd
.long 14 # 0xe
.long 24 # 0x18
.LCPI0_166:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 25 # 0x19
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_167:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 25 # 0x19
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_168:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 25 # 0x19
.zero 4
.zero 4
.zero 4
.LCPI0_169:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 25 # 0x19
.zero 4
.zero 4
.LCPI0_170:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 13 # 0xd
.long 25 # 0x19
.zero 4
.LCPI0_171:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 13 # 0xd
.long 14 # 0xe
.long 25 # 0x19
.LCPI0_172:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 26 # 0x1a
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_173:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
.quad 4 # 0x4
.quad 5 # 0x5
.quad 13 # 0xd
.zero 8
.LCPI0_174:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 26 # 0x1a
.zero 4
.zero 4
.LCPI0_175:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
.quad 4 # 0x4
.quad 5 # 0x5
.quad 6 # 0x6
.quad 13 # 0xd
.LCPI0_176:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 13 # 0xd
.long 14 # 0xe
.long 26 # 0x1a
.LCPI0_177:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 27 # 0x1b
.zero 4
.zero 4
.zero 4
.LCPI0_178:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 27 # 0x1b
.zero 4
.zero 4
.LCPI0_179:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 13 # 0xd
.long 27 # 0x1b
.zero 4
.LCPI0_180:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 13 # 0xd
.long 14 # 0xe
.long 27 # 0x1b
.LCPI0_181:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 28 # 0x1c
.zero 4
.zero 4
.LCPI0_182:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
.quad 4 # 0x4
.quad 5 # 0x5
.quad 6 # 0x6
.quad 14 # 0xe
.LCPI0_183:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 13 # 0xd
.long 14 # 0xe
.long 28 # 0x1c
.LCPI0_184:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 13 # 0xd
.long 29 # 0x1d
.zero 4
.LCPI0_185:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 13 # 0xd
.long 14 # 0xe
.long 29 # 0x1d
.LCPI0_186:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 13 # 0xd
.long 14 # 0xe
.long 30 # 0x1e
.LCPI0_193:
.zero 64
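# .LCPI0_193 is a 64-byte all-zeros vector; plausibly the 0.0 fill value that
# masked loads use for out-of-bounds lanes (the `other=0.0` in tl.load).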
.section .rodata.cst4,"aM",@progbits,4
.p2align 2, 0x0
.LCPI0_1:
.long 14 # 0xe
.LCPI0_2:
.long 13 # 0xd
.LCPI0_3:
.long 12 # 0xc
.LCPI0_4:
.long 11 # 0xb
.LCPI0_5:
.long 10 # 0xa
.LCPI0_6:
.long 8 # 0x8
.LCPI0_7:
.long 6 # 0x6
.LCPI0_8:
.long 4 # 0x4
.LCPI0_9:
.long 2 # 0x2
.LCPI0_103:
.long 15 # 0xf
.LCPI0_104:
.long 9 # 0x9
.LCPI0_105:
.long 7 # 0x7
.LCPI0_106:
.long 5 # 0x5
.LCPI0_107:
.long 3 # 0x3
.LCPI0_108:
.long 1 # 0x1
.section .rodata.cst32,"aM",@progbits,32
.p2align 5, 0x0
.LCPI0_10:
.long 3 # 0x3
.long 11 # 0xb
.long 2 # 0x2
.long 3 # 0x3
.long 7 # 0x7
.long 15 # 0xf
.long 6 # 0x6
.long 7 # 0x7
.LCPI0_14:
.long 3 # 0x3
.long 11 # 0xb
.long 2 # 0x2
.long 3 # 0x3
.long 7 # 0x7
.long 15 # 0xf
.zero 4
.zero 4
.LCPI0_24:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 10 # 0xa
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 14 # 0xe
.LCPI0_26:
.long 0 # 0x0
.long 1 # 0x1
.long 0 # 0x0
.long 8 # 0x8
.long 4 # 0x4
.long 5 # 0x5
.long 4 # 0x4
.long 12 # 0xc
.LCPI0_27:
.long 1 # 0x1
.long 9 # 0x9
.long 2 # 0x2
.long 3 # 0x3
.long 5 # 0x5
.long 13 # 0xd
.long 6 # 0x6
.long 7 # 0x7
.LCPI0_31:
.long 1 # 0x1
.long 9 # 0x9
.long 2 # 0x2
.long 3 # 0x3
.long 5 # 0x5
.long 13 # 0xd
.zero 4
.zero 4
.LCPI0_36:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 25 # 0x19
.zero 4
.zero 4
.zero 4
.LCPI0_37:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 25 # 0x19
.zero 4
.zero 4
.LCPI0_38:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 25 # 0x19
.zero 4
.LCPI0_39:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 25 # 0x19
.LCPI0_43:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 13 # 0xd
.zero 8
.LCPI0_44:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 26 # 0x1a
.zero 4
.zero 4
.LCPI0_45:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 13 # 0xd
.LCPI0_46:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 26 # 0x1a
.LCPI0_51:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 27 # 0x1b
.zero 4
.zero 4
.zero 4
.LCPI0_52:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 27 # 0x1b
.zero 4
.zero 4
.LCPI0_53:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 27 # 0x1b
.zero 4
.LCPI0_54:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 27 # 0x1b
.LCPI0_60:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 14 # 0xe
.zero 8
.LCPI0_61:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 28 # 0x1c
.zero 4
.zero 4
.LCPI0_62:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 14 # 0xe
.LCPI0_63:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 28 # 0x1c
.LCPI0_69:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 29 # 0x1d
.zero 4
.zero 4
.zero 4
.LCPI0_70:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 29 # 0x1d
.zero 4
.zero 4
.LCPI0_71:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 29 # 0x1d
.zero 4
.LCPI0_72:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 29 # 0x1d
.LCPI0_80:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 15 # 0xf
.zero 8
.LCPI0_81:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 30 # 0x1e
.zero 4
.zero 4
.LCPI0_82:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 15 # 0xf
.LCPI0_83:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 30 # 0x1e
.LCPI0_92:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 31 # 0x1f
.zero 4
.zero 4
.zero 4
.LCPI0_93:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 31 # 0x1f
.zero 4
.zero 4
.LCPI0_94:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 31 # 0x1f
.zero 4
.LCPI0_95:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 31 # 0x1f
.LCPI0_118:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 6 # 0x6
.LCPI0_120:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 13 # 0xd
.zero 4
.zero 4
.zero 4
.LCPI0_121:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 13 # 0xd
.zero 4
.LCPI0_122:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 13 # 0xd
.LCPI0_131:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 7 # 0x7
.zero 8
.LCPI0_132:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 14 # 0xe
.zero 4
.zero 4
.LCPI0_133:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 14 # 0xe
.LCPI0_142:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 15 # 0xf
.zero 4
.zero 4
.zero 4
.LCPI0_143:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 15 # 0xf
.zero 4
.zero 4
.LCPI0_144:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 15 # 0xf
.zero 4
.LCPI0_155:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 12 # 0xc
.zero 8
.LCPI0_156:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 24 # 0x18
.zero 4
.zero 4
.LCPI0_157:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 12 # 0xc
.LCPI0_158:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 24 # 0x18
.LCPI0_187:
.long 0 # 0x0
.long 1 # 0x1
.zero 4
.zero 4
.zero 4
.zero 4
.long 4 # 0x4
.long 12 # 0xc
.LCPI0_188:
.long 1 # 0x1
.long 9 # 0x9
.zero 4
.zero 4
.long 5 # 0x5
.long 13 # 0xd
.zero 4
.zero 4
.LCPI0_189:
.zero 4
.zero 4
.zero 4
.zero 4
.long 5 # 0x5
.long 13 # 0xd
.zero 4
.zero 4
.LCPI0_190:
.long 0 # 0x0
.long 1 # 0x1
.zero 4
.zero 4
.zero 4
.zero 4
.long 6 # 0x6
.long 14 # 0xe
.LCPI0_191:
.long 3 # 0x3
.long 11 # 0xb
.zero 4
.zero 4
.long 7 # 0x7
.long 15 # 0xf
.zero 4
.zero 4
.LCPI0_192:
.zero 4
.zero 4
.zero 4
.zero 4
.long 7 # 0x7
.long 15 # 0xf
.zero 4
.zero 4
.section .rodata.cst16,"aM",@progbits,16
.p2align 4, 0x0
.LCPI0_11:
.long 7 # 0x7
.long 23 # 0x17
.long 6 # 0x6
.long 7 # 0x7
.LCPI0_12:
.long 7 # 0x7
.long 15 # 0xf
.long 6 # 0x6
.long 7 # 0x7
.LCPI0_21:
.quad 0 # 0x0
.quad 8 # 0x8
.LCPI0_22:
.zero 4
.zero 4
.long 4 # 0x4
.long 0 # 0x0
.LCPI0_23:
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 22 # 0x16
.LCPI0_25:
.long 4 # 0x4
.long 5 # 0x5
.long 4 # 0x4
.long 20 # 0x14
.LCPI0_28:
.long 5 # 0x5
.long 21 # 0x15
.long 6 # 0x6
.long 7 # 0x7
.LCPI0_29:
.long 5 # 0x5
.long 13 # 0xd
.long 6 # 0x6
.long 7 # 0x7
.LCPI0_33:
.long 3 # 0x3
.long 7 # 0x7
.zero 4
.zero 4
.LCPI0_34:
.long 8 # 0x8
.long 9 # 0x9
.long 25 # 0x19
.zero 4
.LCPI0_35:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 25 # 0x19
.LCPI0_41:
.quad 4 # 0x4
.quad 13 # 0xd
.LCPI0_42:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 26 # 0x1a
.LCPI0_49:
.long 8 # 0x8
.long 9 # 0x9
.long 27 # 0x1b
.zero 4
.LCPI0_50:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 27 # 0x1b
.LCPI0_58:
.quad 6 # 0x6
.quad 14 # 0xe
.LCPI0_59:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 28 # 0x1c
.LCPI0_67:
.long 12 # 0xc
.long 13 # 0xd
.long 29 # 0x1d
.zero 4
.LCPI0_68:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 29 # 0x1d
.LCPI0_78:
.quad 6 # 0x6
.quad 15 # 0xf
.LCPI0_79:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 30 # 0x1e
.LCPI0_90:
.long 12 # 0xc
.long 13 # 0xd
.long 31 # 0x1f
.zero 4
.LCPI0_91:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 31 # 0x1f
.LCPI0_153:
.quad 4 # 0x4
.quad 12 # 0xc
.LCPI0_154:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 24 # 0x18
.LCPI0_195:
.long 3 # 0x3
.long 7 # 0x7
.long 0 # 0x0
.long 0 # 0x0
.LCPI0_196:
.long 0 # 0x0
.long 1 # 0x1
.long 4 # 0x4
.long 12 # 0xc
.LCPI0_197:
.long 0 # 0x0
.long 1 # 0x1
.long 6 # 0x6
.long 14 # 0xe
.section .rodata.cst8,"aM",@progbits,8
.p2align 3, 0x0
.LCPI0_13:
.long 7 # 0x7
.long 15 # 0xf
.LCPI0_30:
.long 5 # 0x5
.long 13 # 0xd
.LCPI0_32:
.long 3 # 0x3
.long 19 # 0x13
.LCPI0_194:
.long 4 # 0x4
.long 0 # 0x0
.text
.globl matmul_kernel
.p2align 4, 0x90
.type matmul_kernel,@function
matmul_kernel: # @matmul_kernel
.Lfunc_begin0:
.file 1 "/data/users/minjang/triton-oss/triton-cpu/python/tutorials" "03-matrix-multiplication-cpu.py"
.loc 1 166 0 # 03-matrix-multiplication-cpu.py:166:0
.cfi_sections .debug_frame
.cfi_startproc
# %bb.0:
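# Prologue: save the six callee-saved GPRs and carve out 3448 bytes of stack;
# the kernel is register-starved, so many vectors and scalars are spilled
# there (see the "Spill"/"Reload" annotations below).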
pushq %rbp | |
.cfi_def_cfa_offset 16 | |
pushq %r15 | |
.cfi_def_cfa_offset 24 | |
pushq %r14 | |
.cfi_def_cfa_offset 32 | |
pushq %r13 | |
.cfi_def_cfa_offset 40 | |
pushq %r12 | |
.cfi_def_cfa_offset 48 | |
pushq %rbx | |
.cfi_def_cfa_offset 56 | |
subq $3448, %rsp # imm = 0xD78 | |
.cfi_def_cfa_offset 3504 | |
.cfi_offset %rbx, -56 | |
.cfi_offset %r12, -48 | |
.cfi_offset %r13, -40 | |
.cfi_offset %r14, -32 | |
.cfi_offset %r15, -24 | |
.cfi_offset %rbp, -16 | |
# kill: def $ecx killed $ecx def $rcx | |
.Ltmp0: | |
.file 2 "/data/users/minjang/triton-oss/triton-cpu/python/triton/language" "standard.py" | |
.loc 2 40 22 prologue_end # standard.py:40:22 | |
leal 15(%rcx), %eax | |
movl 3528(%rsp), %ebp | |
.loc 2 40 28 is_stmt 0 # standard.py:40:28 | |
leal 30(%rcx), %r11d | |
.Ltmp1: | |
# kill: def $r8d killed $r8d def $r8 | |
.loc 2 40 28 # standard.py:40:28 | |
leal 30(%r8), %r10d | |
movq %rdx, 496(%rsp) # 8-byte Spill | |
movl $8, %ebx | |
.Ltmp2: | |
# kill: def $r9d killed $r9d def $r9 | |
.loc 2 40 28 # standard.py:40:28 | |
testl %eax, %eax | |
cmovnsl %eax, %r11d | |
.Ltmp3: | |
.loc 2 40 22 # standard.py:40:22 | |
leal 15(%r8), %eax | |
.Ltmp4: | |
.loc 2 40 28 # standard.py:40:28 | |
sarl $4, %r11d | |
.Ltmp5: | |
.loc 2 40 28 # standard.py:40:28 | |
testl %eax, %eax | |
cmovnsl %eax, %r10d | |
sarl $4, %r10d | |
.Ltmp6: | |
.loc 1 192 22 is_stmt 1 # 03-matrix-multiplication-cpu.py:192:22 | |
movl %ebp, %eax | |
cltd | |
.loc 1 191 38 # 03-matrix-multiplication-cpu.py:191:38 | |
shll $3, %r10d | |
.loc 1 192 22 # 03-matrix-multiplication-cpu.py:192:22 | |
idivl %r10d | |
movl %eax, %r14d | |
.loc 1 193 29 # 03-matrix-multiplication-cpu.py:193:29 | |
leal (,%r14,8), %eax | |
.loc 1 194 35 # 03-matrix-multiplication-cpu.py:194:35 | |
subl %eax, %r11d | |
.loc 1 195 33 # 03-matrix-multiplication-cpu.py:195:33 | |
movl %ebp, %eax | |
cltd | |
.loc 1 194 48 # 03-matrix-multiplication-cpu.py:194:48 | |
cmpl $8, %r11d | |
cmovll %r11d, %ebx | |
.loc 1 196 19 # 03-matrix-multiplication-cpu.py:196:19 | |
imull %r14d, %r10d | |
.loc 1 195 33 # 03-matrix-multiplication-cpu.py:195:33 | |
idivl %ebx | |
# kill: def $edx killed $edx def $rdx | |
.loc 1 195 27 is_stmt 0 # 03-matrix-multiplication-cpu.py:195:27 | |
leal (%rdx,%r14,8), %r15d | |
.loc 1 196 19 is_stmt 1 # 03-matrix-multiplication-cpu.py:196:19 | |
subl %r10d, %ebp | |
.loc 1 205 38 # 03-matrix-multiplication-cpu.py:205:38 | |
vpbroadcastd %r15d, %zmm0 | |
.loc 1 205 23 is_stmt 0 # 03-matrix-multiplication-cpu.py:205:23 | |
shll $4, %r15d | |
.loc 1 196 40 is_stmt 1 # 03-matrix-multiplication-cpu.py:196:40 | |
movl %ebp, %eax | |
cltd | |
.loc 1 205 38 # 03-matrix-multiplication-cpu.py:205:38 | |
vpslld $4, %zmm0, %zmm0 | |
vpord .LCPI0_0(%rip), %zmm0, %zmm2 | |
.loc 1 196 40 # 03-matrix-multiplication-cpu.py:196:40 | |
idivl %ebx | |
.Ltmp7: | |
.loc 2 40 22 # standard.py:40:22 | |
leal 15(%r9), %edx | |
.loc 2 40 28 is_stmt 0 # standard.py:40:28 | |
leal 30(%r9), %ebx | |
movq %r8, 488(%rsp) # 8-byte Spill | |
movq %rcx, 480(%rsp) # 8-byte Spill | |
movl %r15d, -112(%rsp) # 4-byte Spill | |
vextracti32x4 $3, %zmm2, %xmm1 | |
vextracti32x4 $2, %zmm2, %xmm3 | |
vmovdqu64 %zmm2, 3248(%rsp) # 64-byte Spill | |
vmovdqa %xmm1, 1408(%rsp) # 16-byte Spill | |
vmovdqa %xmm3, 1392(%rsp) # 16-byte Spill | |
.Ltmp8: | |
.loc 1 206 23 is_stmt 1 # 03-matrix-multiplication-cpu.py:206:23 | |
shll $4, %eax | |
.Ltmp9: | |
.loc 2 40 28 # standard.py:40:28 | |
testl %edx, %edx | |
cmovnsl %edx, %ebx | |
movl %eax, -108(%rsp) # 4-byte Spill | |
.Ltmp10: | |
.loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22 | |
cmpl $16, %edx | |
jl .LBB0_1 | |
# %bb.2: # %.lr.ph | |
.loc 1 206 68 # 03-matrix-multiplication-cpu.py:206:68 | |
cltd | |
.loc 1 0 0 is_stmt 0 # 03-matrix-multiplication-cpu.py:0:0 | |
sarl $4, %ebx | |
vxorpd %xmm30, %xmm30, %xmm30 | |
vpxord %xmm28, %xmm28, %xmm28 | |
vpxord %xmm29, %xmm29, %xmm29 | |
vpxord %xmm22, %xmm22, %xmm22 | |
.loc 1 206 68 # 03-matrix-multiplication-cpu.py:206:68 | |
idivl %r8d | |
.loc 1 205 68 is_stmt 1 # 03-matrix-multiplication-cpu.py:205:68 | |
vpextrd $3, %xmm1, %eax | |
movl %edx, 96(%rsp) # 4-byte Spill | |
cltd | |
idivl %ecx | |
vpextrd $2, %xmm1, %eax | |
vpinsrd $3, %edx, %xmm0, %xmm0 | |
cltd | |
idivl %ecx | |
vpextrd $1, %xmm1, %eax | |
movl %edx, 288(%rsp) # 4-byte Spill | |
cltd | |
idivl %ecx | |
vmovd %xmm1, %eax | |
vextracti128 $1, %ymm2, %xmm1 | |
movl %edx, 32(%rsp) # 4-byte Spill | |
cltd | |
idivl %ecx | |
vpextrd $3, %xmm3, %eax | |
movl %edx, 224(%rsp) # 4-byte Spill | |
cltd | |
idivl %ecx | |
vpextrd $2, %xmm3, %eax | |
movl %edx, 416(%rsp) # 4-byte Spill | |
cltd | |
idivl %ecx | |
vpextrd $1, %xmm3, %eax | |
movl %edx, 688(%rsp) # 4-byte Spill | |
cltd | |
idivl %ecx | |
vmovd %xmm3, %eax | |
movl %edx, 624(%rsp) # 4-byte Spill | |
cltd | |
idivl %ecx | |
vpextrd $3, %xmm1, %eax | |
movl %edx, 560(%rsp) # 4-byte Spill | |
cltd | |
idivl %ecx | |
vpextrd $2, %xmm1, %eax | |
movl %edx, %ebp | |
cltd | |
idivl %ecx | |
vpextrd $1, %xmm1, %eax | |
movl %edx, %r11d | |
cltd | |
idivl %ecx | |
vmovd %xmm1, %eax | |
movl %edx, %r10d | |
cltd | |
idivl %ecx | |
vpextrd $3, %xmm2, %eax | |
movl %edx, %r14d | |
cltd | |
idivl %ecx | |
vpextrd $2, %xmm2, %eax | |
movl %edx, %r8d | |
cltd | |
idivl %ecx | |
vpextrd $1, %xmm2, %eax | |
movl %edx, %r12d | |
cltd | |
.loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41 | |
vpbroadcastd %r12d, %xmm3 | |
.loc 1 205 68 # 03-matrix-multiplication-cpu.py:205:68 | |
idivl %ecx | |
movl %r15d, %eax | |
movl %edx, %r13d | |
cltd | |
.loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41 | |
vpbroadcastd %r13d, %xmm2 | |
movl 3504(%rsp), %r13d | |
.loc 1 205 68 # 03-matrix-multiplication-cpu.py:205:68 | |
idivl %ecx | |
movl 3512(%rsp), %eax | |
.loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41 | |
vpbroadcastd %edx, %xmm1 | |
.loc 1 209 40 # 03-matrix-multiplication-cpu.py:209:40 | |
vpbroadcastd %eax, %zmm5 | |
vpmulld .LCPI0_1(%rip){1to16}, %zmm5, %zmm9 | |
vpmulld .LCPI0_2(%rip){1to16}, %zmm5, %zmm11 | |
vpmulld .LCPI0_3(%rip){1to16}, %zmm5, %zmm12 | |
vpmulld .LCPI0_4(%rip){1to16}, %zmm5, %zmm13 | |
.loc 1 227 33 # 03-matrix-multiplication-cpu.py:227:33 | |
shll $4, %eax | |
.loc 1 209 40 # 03-matrix-multiplication-cpu.py:209:40 | |
vpmulld .LCPI0_5(%rip){1to16}, %zmm5, %zmm14 | |
vpmulld .LCPI0_6(%rip){1to16}, %zmm5, %zmm15 | |
vpmulld .LCPI0_8(%rip){1to16}, %zmm5, %zmm17 | |
vpmulld .LCPI0_7(%rip){1to16}, %zmm5, %zmm16 | |
vpmulld .LCPI0_9(%rip){1to16}, %zmm5, %zmm10 | |
vpslld $4, %zmm5, %zmm4 | |
movl %eax, 160(%rsp) # 4-byte Spill | |
movl 96(%rsp), %eax # 4-byte Reload | |
vpslld $3, %zmm5, %zmm6 | |
vpslld $2, %zmm5, %zmm8 | |
vpsubd %zmm5, %zmm4, %zmm4 | |
vpaddd %zmm6, %zmm5, %zmm7 | |
vpsubd %zmm5, %zmm6, %zmm6 | |
vpaddd %zmm8, %zmm5, %zmm8 | |
.loc 1 209 52 is_stmt 0 # 03-matrix-multiplication-cpu.py:209:52 | |
valignd $15, %zmm4, %zmm4, %zmm4 # zmm4 = zmm4[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] | |
valignd $15, %zmm6, %zmm6, %zmm19 # zmm19 = zmm6[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] | |
valignd $15, %zmm7, %zmm7, %zmm20 # zmm20 = zmm7[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] | |
valignd $15, %zmm8, %zmm8, %zmm18 # zmm18 = zmm8[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] | |
valignd $15, %zmm9, %zmm9, %zmm6 # zmm6 = zmm9[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] | |
valignd $15, %zmm11, %zmm11, %zmm7 # zmm7 = zmm11[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] | |
valignd $15, %zmm12, %zmm12, %zmm8 # zmm8 = zmm12[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] | |
.loc 1 206 68 is_stmt 1 # 03-matrix-multiplication-cpu.py:206:68 | |
vmovd %eax, %xmm12 | |
.loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52 | |
valignd $15, %zmm13, %zmm13, %zmm13 # zmm13 = zmm13[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] | |
valignd $15, %zmm14, %zmm14, %zmm14 # zmm14 = zmm14[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] | |
valignd $15, %zmm15, %zmm15, %zmm15 # zmm15 = zmm15[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] | |
valignd $15, %zmm17, %zmm17, %zmm21 # zmm21 = zmm17[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] | |
valignd $15, %zmm16, %zmm16, %zmm17 # zmm17 = zmm16[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] | |
valignd $15, %zmm10, %zmm10, %zmm10 # zmm10 = zmm10[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] | |
.loc 1 209 22 is_stmt 0 # 03-matrix-multiplication-cpu.py:209:22 | |
cltq | |
.loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52 | |
vpaddd %xmm4, %xmm12, %xmm9 | |
movq %rax, -96(%rsp) # 8-byte Spill | |
.loc 1 208 41 is_stmt 1 # 03-matrix-multiplication-cpu.py:208:41 | |
vpbroadcastd %r8d, %xmm4 | |
.loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22 | |
shlq $2, -96(%rsp) # 8-byte Folded Spill | |
.loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52 | |
vmovd %xmm9, %r15d | |
.loc 1 209 22 is_stmt 0 # 03-matrix-multiplication-cpu.py:209:22 | |
movslq %r15d, %rcx | |
.loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52 | |
vpaddd %xmm6, %xmm12, %xmm9 | |
.loc 1 208 41 is_stmt 1 # 03-matrix-multiplication-cpu.py:208:41 | |
vpbroadcastd %r14d, %xmm6 | |
movq %rcx, 24(%rsp) # 8-byte Spill | |
.loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52 | |
vpaddd %xmm13, %xmm12, %xmm11 | |
vpaddd %xmm14, %xmm12, %xmm13 | |
vpaddd %xmm10, %xmm12, %xmm10 | |
.loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22 | |
shlq $2, 24(%rsp) # 8-byte Folded Spill | |
.loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52 | |
vmovd %xmm9, %r14d | |
vpaddd %xmm7, %xmm12, %xmm9 | |
.loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41 | |
vpbroadcastd %r10d, %xmm7 | |
.loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52 | |
vmovd %xmm9, %r10d | |
.loc 1 209 22 is_stmt 0 # 03-matrix-multiplication-cpu.py:209:22 | |
movslq %r14d, %rcx | |
.loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52 | |
vpaddd %xmm8, %xmm12, %xmm9 | |
.loc 1 208 41 is_stmt 1 # 03-matrix-multiplication-cpu.py:208:41 | |
vpbroadcastd %r11d, %xmm8 | |
movq %rcx, -8(%rsp) # 8-byte Spill | |
.loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52 | |
vmovd %xmm9, %ecx | |
.loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41 | |
vpbroadcastd %ebp, %xmm9 | |
.loc 1 209 22 # 03-matrix-multiplication-cpu.py:209:22 | |
movslq %r10d, %r10 | |
.loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22 | |
shlq $2, -8(%rsp) # 8-byte Folded Spill | |
.loc 1 209 22 # 03-matrix-multiplication-cpu.py:209:22 | |
movslq %ecx, %rcx | |
.loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22 | |
shlq $2, %r10 | |
movq %rcx, -16(%rsp) # 8-byte Spill | |
.loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52 | |
vmovd %xmm11, %ecx | |
movq %r10, 552(%rsp) # 8-byte Spill | |
movl %ebx, %r10d | |
.loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22 | |
shlq $2, -16(%rsp) # 8-byte Folded Spill | |
.loc 1 209 22 # 03-matrix-multiplication-cpu.py:209:22 | |
movslq %ecx, %r11 | |
movl 560(%rsp), %ecx # 4-byte Reload | |
.loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22 | |
shlq $2, %r11 | |
movq %r11, 544(%rsp) # 8-byte Spill | |
.loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41 | |
vpbroadcastd %ecx, %xmm11 | |
.loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52 | |
vmovd %xmm13, %ecx | |
vpaddd %xmm20, %xmm12, %xmm13 | |
.loc 1 209 22 is_stmt 0 # 03-matrix-multiplication-cpu.py:209:22 | |
movslq %ecx, %rcx | |
movq %rcx, -24(%rsp) # 8-byte Spill | |
movl 624(%rsp), %ecx # 4-byte Reload | |
.loc 1 217 22 is_stmt 1 # 03-matrix-multiplication-cpu.py:217:22 | |
shlq $2, -24(%rsp) # 8-byte Folded Spill | |
.loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41 | |
vpbroadcastd %ecx, %xmm14 | |
.loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52 | |
vmovd %xmm13, %ecx | |
vpaddd %xmm15, %xmm12, %xmm13 | |
.loc 1 209 22 is_stmt 0 # 03-matrix-multiplication-cpu.py:209:22 | |
movslq %ecx, %rcx | |
movq %rcx, 16(%rsp) # 8-byte Spill | |
movl 688(%rsp), %ecx # 4-byte Reload | |
.loc 1 217 22 is_stmt 1 # 03-matrix-multiplication-cpu.py:217:22 | |
shlq $2, 16(%rsp) # 8-byte Folded Spill | |
.loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41 | |
vpbroadcastd %ecx, %xmm15 | |
.loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52 | |
vmovd %xmm13, %ecx | |
vpaddd %xmm19, %xmm12, %xmm13 | |
.loc 1 209 22 is_stmt 0 # 03-matrix-multiplication-cpu.py:209:22 | |
movslq %ecx, %rcx | |
movq %rcx, -40(%rsp) # 8-byte Spill | |
movl 416(%rsp), %ecx # 4-byte Reload | |
.loc 1 217 22 is_stmt 1 # 03-matrix-multiplication-cpu.py:217:22 | |
shlq $2, -40(%rsp) # 8-byte Folded Spill | |
.loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41 | |
vpbroadcastd %ecx, %xmm16 | |
.loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52 | |
vmovd %xmm13, %ecx | |
vpaddd %xmm17, %xmm12, %xmm13 | |
.loc 1 209 22 is_stmt 0 # 03-matrix-multiplication-cpu.py:209:22 | |
movslq %ecx, %rcx | |
movq %rcx, 8(%rsp) # 8-byte Spill | |
movl 224(%rsp), %ecx # 4-byte Reload | |
.loc 1 217 22 is_stmt 1 # 03-matrix-multiplication-cpu.py:217:22 | |
shlq $2, 8(%rsp) # 8-byte Folded Spill | |
.loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41 | |
vpbroadcastd %ecx, %xmm17 | |
.loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52 | |
vmovd %xmm13, %ecx | |
vpaddd %xmm18, %xmm12, %xmm13 | |
.loc 1 209 22 is_stmt 0 # 03-matrix-multiplication-cpu.py:209:22 | |
movslq %ecx, %rcx | |
movq %rcx, -48(%rsp) # 8-byte Spill | |
movl 32(%rsp), %ecx # 4-byte Reload | |
.loc 1 217 22 is_stmt 1 # 03-matrix-multiplication-cpu.py:217:22 | |
shlq $2, -48(%rsp) # 8-byte Folded Spill | |
.loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41 | |
vpbroadcastd %ecx, %xmm18 | |
.loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52 | |
vmovd %xmm13, %ecx | |
vpaddd %xmm21, %xmm12, %xmm13 | |
vpxord %xmm21, %xmm21, %xmm21 | |
.loc 1 209 22 is_stmt 0 # 03-matrix-multiplication-cpu.py:209:22 | |
movslq %ecx, %rcx | |
movq %rcx, (%rsp) # 8-byte Spill | |
movl 288(%rsp), %ecx # 4-byte Reload | |
.loc 1 217 22 is_stmt 1 # 03-matrix-multiplication-cpu.py:217:22 | |
shlq $2, (%rsp) # 8-byte Folded Spill | |
.loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41 | |
vpbroadcastd %ecx, %xmm19 | |
.loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52 | |
vmovd %xmm13, %ecx | |
vpaddd %xmm5, %xmm5, %xmm13 | |
.loc 1 209 22 is_stmt 0 # 03-matrix-multiplication-cpu.py:209:22 | |
movslq %ecx, %rcx | |
.loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52 | |
vpaddd %xmm5, %xmm13, %xmm13 | |
vpaddd %xmm5, %xmm12, %xmm5 | |
movq %rcx, -56(%rsp) # 8-byte Spill | |
movb $-64, %cl | |
vpaddd %xmm13, %xmm12, %xmm20 | |
.loc 1 208 41 is_stmt 1 # 03-matrix-multiplication-cpu.py:208:41 | |
vpbroadcastd %r13d, %xmm13 | |
.loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22 | |
shlq $2, -56(%rsp) # 8-byte Folded Spill | |
vpxor %xmm12, %xmm12, %xmm12 | |
kmovd %ecx, %k4 | |
movw $512, %cx # imm = 0x200 | |
.loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41 | |
vpmulld %xmm13, %xmm0, %xmm0 | |
vpmulld %xmm13, %xmm3, %xmm3 | |
vpmulld %xmm13, %xmm2, %xmm2 | |
kmovd %ecx, %k1 | |
movb $32, %cl | |
kmovw %k1, -114(%rsp) # 2-byte Spill | |
kmovd %ecx, %k1 | |
movw $2048, %cx # imm = 0x800 | |
kmovw %k1, -116(%rsp) # 2-byte Spill | |
kmovd %ecx, %k1 | |
movb $64, %cl | |
vpextrd $3, %xmm3, %ebp | |
vbroadcasti32x4 .LCPI0_11(%rip), %zmm3 # zmm3 = [7,23,6,7,7,23,6,7,7,23,6,7,7,23,6,7] | |
# zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] | |
kmovw %k1, -118(%rsp) # 2-byte Spill | |
kmovd %ecx, %k1 | |
movw $8192, %cx # imm = 0x2000 | |
kmovw %k1, -120(%rsp) # 2-byte Spill | |
kmovd %ecx, %k1 | |
movb $-128, %cl | |
kmovw %k1, -122(%rsp) # 2-byte Spill | |
kmovd %ecx, %k1 | |
.loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52 | |
vmovd %xmm20, %ecx | |
.loc 1 209 22 is_stmt 0 # 03-matrix-multiplication-cpu.py:209:22 | |
movslq %ecx, %r13 | |
.loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52 | |
vmovd %xmm10, %ecx | |
.loc 1 208 41 is_stmt 1 # 03-matrix-multiplication-cpu.py:208:41 | |
vpmulld %xmm13, %xmm19, %xmm10 | |
vpxord %xmm19, %xmm19, %xmm19 | |
kmovw %k1, -124(%rsp) # 2-byte Spill | |
.loc 1 209 22 # 03-matrix-multiplication-cpu.py:209:22 | |
movslq %ecx, %rcx | |
.loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22 | |
shlq $2, %r13 | |
movq %rcx, -64(%rsp) # 8-byte Spill | |
.loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52 | |
vmovd %xmm5, %ecx | |
.loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41 | |
vpmulld %xmm13, %xmm18, %xmm5 | |
movq %r13, 536(%rsp) # 8-byte Spill | |
.loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22 | |
shlq $2, -64(%rsp) # 8-byte Folded Spill | |
vpxord %xmm18, %xmm18, %xmm18 | |
.loc 1 209 22 # 03-matrix-multiplication-cpu.py:209:22 | |
movslq %ecx, %rdx | |
.loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41 | |
vpextrd $3, %xmm0, %ecx | |
vbroadcasti32x4 .LCPI0_28(%rip), %zmm0 # zmm0 = [5,21,6,7,5,21,6,7,5,21,6,7,5,21,6,7] | |
# zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] | |
.loc 1 208 22 is_stmt 0 # 03-matrix-multiplication-cpu.py:208:22 | |
movslq %ecx, %rax | |
.loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41 | |
vpextrd $3, %xmm10, %ecx | |
vpmulld %xmm13, %xmm17, %xmm10 | |
vmovdqu64 %zmm3, 2352(%rsp) # 64-byte Spill | |
vbroadcasti32x4 .LCPI0_23(%rip), %zmm3 # zmm3 = [4,5,6,22,4,5,6,22,4,5,6,22,4,5,6,22] | |
# zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] | |
.loc 1 217 22 is_stmt 1 # 03-matrix-multiplication-cpu.py:217:22 | |
shlq $2, %rdx | |
movq %rax, -72(%rsp) # 8-byte Spill | |
.loc 1 208 22 # 03-matrix-multiplication-cpu.py:208:22 | |
movslq %ecx, %rax | |
.loc 1 208 41 is_stmt 0 # 03-matrix-multiplication-cpu.py:208:41 | |
vpextrd $3, %xmm5, %ecx | |
vpmulld %xmm13, %xmm16, %xmm5 | |
movq %rdx, 528(%rsp) # 8-byte Spill | |
.loc 1 217 22 is_stmt 1 # 03-matrix-multiplication-cpu.py:217:22 | |
shlq $2, -72(%rsp) # 8-byte Folded Spill | |
vpxord %xmm16, %xmm16, %xmm16 | |
movq %rax, -88(%rsp) # 8-byte Spill | |
.loc 1 208 22 # 03-matrix-multiplication-cpu.py:208:22 | |
movslq %ecx, %rax | |
.loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22 | |
shlq $2, -88(%rsp) # 8-byte Folded Spill | |
.loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41 | |
vpextrd $3, %xmm10, %ecx | |
vpmulld %xmm13, %xmm15, %xmm10 | |
movq %rax, -80(%rsp) # 8-byte Spill | |
.loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22 | |
shlq $2, -80(%rsp) # 8-byte Folded Spill | |
vpxor %xmm15, %xmm15, %xmm15 | |
.loc 1 208 22 # 03-matrix-multiplication-cpu.py:208:22 | |
movslq %ecx, %r15 | |
.loc 1 208 41 is_stmt 0 # 03-matrix-multiplication-cpu.py:208:41 | |
vpextrd $3, %xmm5, %ecx | |
vpmulld %xmm13, %xmm14, %xmm5 | |
.loc 1 208 22 # 03-matrix-multiplication-cpu.py:208:22 | |
movslq %ecx, %r12 | |
.loc 1 217 22 is_stmt 1 # 03-matrix-multiplication-cpu.py:217:22 | |
shlq $2, %r15 | |
.loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41 | |
vpextrd $3, %xmm10, %ecx | |
vpmulld %xmm13, %xmm11, %xmm10 | |
movq %r15, 520(%rsp) # 8-byte Spill | |
.loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22 | |
shlq $2, %r12 | |
.loc 1 208 22 # 03-matrix-multiplication-cpu.py:208:22 | |
movslq %ecx, %rax | |
.loc 1 208 41 is_stmt 0 # 03-matrix-multiplication-cpu.py:208:41 | |
vpextrd $3, %xmm5, %ecx | |
vpmulld %xmm13, %xmm9, %xmm5 | |
vmovdqu64 %zmm0, 2096(%rsp) # 64-byte Spill | |
vbroadcasti128 .LCPI0_29(%rip), %ymm0 # ymm0 = [5,13,6,7,5,13,6,7] | |
# ymm0 = mem[0,1,0,1] | |
movq %r12, 512(%rsp) # 8-byte Spill | |
vpxor %xmm9, %xmm9, %xmm9 | |
movl $65535, %r12d # imm = 0xFFFF | |
movq %rax, -32(%rsp) # 8-byte Spill | |
.loc 1 208 22 # 03-matrix-multiplication-cpu.py:208:22 | |
movslq %ecx, %rax | |
.loc 1 217 22 is_stmt 1 # 03-matrix-multiplication-cpu.py:217:22 | |
shlq $2, -32(%rsp) # 8-byte Folded Spill | |
vmovdqu64 %zmm3, 2224(%rsp) # 64-byte Spill | |
.loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41 | |
vpextrd $3, %xmm10, %ecx | |
movq %rax, -104(%rsp) # 8-byte Spill | |
vpmulld %xmm13, %xmm8, %xmm10 | |
.loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22 | |
shlq $2, -104(%rsp) # 8-byte Folded Spill | |
vpxor %xmm8, %xmm8, %xmm8 | |
.loc 1 208 22 # 03-matrix-multiplication-cpu.py:208:22 | |
movslq %ecx, %rax | |
.loc 1 208 41 is_stmt 0 # 03-matrix-multiplication-cpu.py:208:41 | |
vpextrd $3, %xmm5, %ecx | |
vpmulld %xmm13, %xmm7, %xmm5 | |
vpmulld %xmm13, %xmm6, %xmm7 | |
vpxor %xmm6, %xmm6, %xmm6 | |
movq %rax, 96(%rsp) # 8-byte Spill | |
.loc 1 208 22 # 03-matrix-multiplication-cpu.py:208:22 | |
movslq %ecx, %rax | |
.loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41 | |
vpextrd $3, %xmm10, %ecx | |
movq %rax, 32(%rsp) # 8-byte Spill | |
movq 96(%rsp), %rbx # 8-byte Reload | |
vpextrd $3, %xmm5, %r8d | |
vpmulld %xmm13, %xmm4, %xmm5 | |
.loc 1 208 22 # 03-matrix-multiplication-cpu.py:208:22 | |
movslq %ecx, %rax | |
vbroadcasti128 .LCPI0_12(%rip), %ymm4 # ymm4 = [7,15,6,7,7,15,6,7] | |
# ymm4 = mem[0,1,0,1] | |
movq %rax, 224(%rsp) # 8-byte Spill | |
movslq %r8d, %rax | |
.loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41 | |
vpextrd $3, %xmm7, %r8d | |
.loc 1 208 22 # 03-matrix-multiplication-cpu.py:208:22 | |
movslq %r8d, %r14 | |
movq %rax, 416(%rsp) # 8-byte Spill | |
movq 224(%rsp), %r13 # 8-byte Reload | |
vmovdqu %ymm0, 784(%rsp) # 32-byte Spill | |
vpxor %xmm0, %xmm0, %xmm0 | |
.loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41 | |
vpextrd $3, %xmm5, %r8d | |
vpmulld %xmm13, %xmm1, %xmm5 | |
movq 416(%rsp), %r11 # 8-byte Reload | |
.loc 1 217 22 is_stmt 1 # 03-matrix-multiplication-cpu.py:217:22 | |
shlq $2, %r14 | |
vpxor %xmm1, %xmm1, %xmm1 | |
vmovdqu64 %zmm0, 224(%rsp) # 64-byte Spill | |
.loc 1 208 22 # 03-matrix-multiplication-cpu.py:208:22 | |
movslq %r8d, %rax | |
.loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22 | |
shlq $2, %rbx | |
movslq 160(%rsp), %r8 # 4-byte Folded Reload | |
movq %rax, 288(%rsp) # 8-byte Spill | |
.loc 1 208 22 # 03-matrix-multiplication-cpu.py:208:22 | |
movslq %ebp, %rax | |
.loc 1 208 41 is_stmt 0 # 03-matrix-multiplication-cpu.py:208:41 | |
vpextrd $3, %xmm2, %ebp | |
vbroadcasti32x4 .LCPI0_21(%rip), %zmm2 # zmm2 = [0,8,0,8,0,8,0,8] | |
# zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] | |
movq %rbx, %r15 | |
movl %r10d, %ebx | |
movq %rax, 624(%rsp) # 8-byte Spill | |
.loc 1 208 22 # 03-matrix-multiplication-cpu.py:208:22 | |
movslq %ebp, %rax | |
.loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41 | |
vpextrd $3, %xmm5, %ebp | |
.loc 1 208 22 # 03-matrix-multiplication-cpu.py:208:22 | |
movslq %ebp, %rcx | |
movq 624(%rsp), %rbp # 8-byte Reload | |
.loc 1 217 22 is_stmt 1 # 03-matrix-multiplication-cpu.py:217:22 | |
shlq $2, %r13 | |
shlq $2, %rax | |
movq %rcx, 688(%rsp) # 8-byte Spill | |
movq 32(%rsp), %rcx # 8-byte Reload | |
shlq $2, %r11 | |
vmovdqu %ymm4, 752(%rsp) # 32-byte Spill | |
vmovdqu64 %zmm0, 32(%rsp) # 64-byte Spill | |
movq 688(%rsp), %r10 # 8-byte Reload | |
shlq $2, %r8 | |
movq %r8, 504(%rsp) # 8-byte Spill | |
movq %rax, %r8 | |
shlq $2, %rbp | |
vmovdqu64 %zmm2, 2288(%rsp) # 64-byte Spill | |
vbroadcasti32x4 .LCPI0_25(%rip), %zmm2 # zmm2 = [4,5,4,20,4,5,4,20,4,5,4,20,4,5,4,20] | |
# zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] | |
shlq $2, %rcx | |
movq %rcx, %rdx | |
movq 288(%rsp), %rcx # 8-byte Reload | |
shlq $2, %r10 | |
shlq $2, %rcx | |
vmovdqu64 %zmm2, 2160(%rsp) # 64-byte Spill | |
.loc 1 0 22 is_stmt 0 # :0:22 | |
.Ltmp11: | |
.p2align 4, 0x90 | |
.LBB0_3: # =>This Inner Loop Header: Depth=1 | |
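# Hot K-loop of matmul_kernel. The long vpunpck{l,h}dq / vpermt2ps / | |
# vshufpd run that follows looks like an in-register transpose that | |
# interleaves the loop-carried vectors into the lane order expected by | |
# the broadcast-FMA sequence further down. | |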
vmovdqa64 %zmm29, %zmm26 | |
vpunpckldq %ymm19, %ymm26, %ymm0 # ymm0 = ymm26[0],ymm19[0],ymm26[1],ymm19[1],ymm26[4],ymm19[4],ymm26[5],ymm19[5] | |
vmovups 2352(%rsp), %zmm7 # 64-byte Reload | |
vmovdqa64 %ymm26, %ymm2 | |
vmovdqa64 %zmm9, %zmm27 | |
vmovapd %ymm30, %ymm23 | |
vmovdqa64 %zmm6, %zmm17 | |
vmovdqa64 %zmm21, %zmm24 | |
vmovdqa64 %zmm15, %zmm31 | |
vpunpckhdq %ymm31, %ymm24, %ymm15 # ymm15 = ymm24[2],ymm31[2],ymm24[3],ymm31[3],ymm24[6],ymm31[6],ymm24[7],ymm31[7] | |
vpunpckhdq %ymm18, %ymm22, %ymm20 # ymm20 = ymm22[2],ymm18[2],ymm22[3],ymm18[3],ymm22[6],ymm18[6],ymm22[7],ymm18[7] | |
vpunpckhdq %ymm19, %ymm26, %ymm3 # ymm3 = ymm26[2],ymm19[2],ymm26[3],ymm19[3],ymm26[6],ymm19[6],ymm26[7],ymm19[7] | |
vmovdqa64 %zmm29, %zmm4 | |
vmovapd %zmm30, %zmm29 | |
vpunpckldq %ymm27, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm27[0],ymm12[1],ymm27[1],ymm12[4],ymm27[4],ymm12[5],ymm27[5] | |
vmovdqa64 %zmm12, %zmm25 | |
vmovdqa64 %ymm26, %ymm5 | |
vmovdqa64 %zmm12, %zmm11 | |
vmovdqu64 %zmm1, 160(%rsp) # 64-byte Spill | |
vmovdqa64 %zmm12, %zmm13 | |
.loc 1 221 51 is_stmt 1 # 03-matrix-multiplication-cpu.py:221:51 | |
testl %r9d, %r9d | |
.loc 1 221 20 is_stmt 0 # 03-matrix-multiplication-cpu.py:221:20 | |
movl $0, %eax | |
vmovdqu %ymm0, 96(%rsp) # 32-byte Spill | |
vmovaps .LCPI0_10(%rip), %ymm0 # ymm0 = [3,11,2,3,7,15,6,7] | |
cmovgl %r12d, %eax | |
kmovd %eax, %k2 | |
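# k2 = 0xFFFF while the remaining-K count in r9d is still positive, | |
# else 0; it predicates the zero-masked {%k2}{z} tile loads below | |
# (source line 221). | |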
movq -104(%rsp), %rax # 8-byte Reload | |
vpermt2ps %ymm19, %ymm0, %ymm2 | |
vpunpckhdq %zmm9, %zmm12, %zmm0 # zmm0 = zmm12[2],zmm9[2],zmm12[3],zmm9[3],zmm12[6],zmm9[6],zmm12[7],zmm9[7],zmm12[10],zmm9[10],zmm12[11],zmm9[11],zmm12[14],zmm9[14],zmm12[15],zmm9[15] | |
vpunpckhdq %ymm27, %ymm12, %ymm9 # ymm9 = ymm12[2],ymm27[2],ymm12[3],ymm27[3],ymm12[6],ymm27[6],ymm12[7],ymm27[7] | |
vshuff64x2 $85, %zmm15, %zmm15, %zmm15 # zmm15 = zmm15[2,3,2,3,2,3,2,3] | |
vinserti64x4 $1, %ymm20, %zmm0, %zmm20 | |
vmovups %zmm0, 352(%rsp) # 64-byte Spill | |
vinsertf64x4 $1, %ymm2, %zmm0, %zmm14 | |
vmovdqa64 %zmm12, %zmm2 | |
vpermt2ps %zmm27, %zmm7, %zmm2 | |
vmovups 752(%rsp), %ymm7 # 32-byte Reload | |
vmovapd %zmm30, %zmm0 | |
vshuff64x2 $85, %zmm9, %zmm9, %zmm9 # zmm9 = zmm9[2,3,2,3,2,3,2,3] | |
vmovups 224(%rsp), %zmm30 # 64-byte Reload | |
vshuff64x2 $85, %zmm10, %zmm10, %zmm10 # zmm10 = zmm10[2,3,2,3,2,3,2,3] | |
vshufpd $32, %zmm15, %zmm2, %zmm2 # zmm2 = zmm2[0],zmm15[0],zmm2[2],zmm15[2],zmm2[4],zmm15[5],zmm2[6],zmm15[6] | |
vmovups 2224(%rsp), %zmm15 # 64-byte Reload | |
vshufpd $128, %zmm20, %zmm14, %zmm2 {%k4} # zmm2 {%k4} = zmm14[0],zmm20[0],zmm14[2],zmm20[2],zmm14[4],zmm20[4],zmm14[6],zmm20[7] | |
vmovdqa %ymm1, %ymm14 | |
vpermt2ps %ymm6, %ymm7, %ymm23 | |
vmovaps .LCPI0_14(%rip), %ymm6 # ymm6 = [3,11,2,3,7,15,u,u] | |
vextractf32x4 $1, %ymm23, %xmm7 | |
vpermt2ps %ymm8, %ymm6, %ymm14 | |
vpunpckhdq %ymm16, %ymm28, %ymm6 # ymm6 = ymm28[2],ymm16[2],ymm28[3],ymm16[3],ymm28[6],ymm16[6],ymm28[7],ymm16[7] | |
vblendps $192, %ymm6, %ymm14, %ymm6 # ymm6 = ymm14[0,1,2,3,4,5],ymm6[6,7] | |
vmovdqa64 %zmm21, %zmm14 | |
vmovaps %zmm8, %zmm21 | |
vmovaps .LCPI0_24(%rip), %ymm8 # ymm8 = [0,1,2,10,4,5,6,14] | |
vpermt2ps %zmm31, %zmm15, %zmm14 | |
vmovdqa64 %zmm26, %zmm15 | |
vpunpckldq %xmm27, %xmm12, %xmm26 # xmm26 = xmm12[0],xmm27[0],xmm12[1],xmm27[1] | |
vshufpd $32, %zmm14, %zmm9, %zmm20 # zmm20 = zmm9[0],zmm14[0],zmm9[2],zmm14[2],zmm9[4],zmm14[5],zmm9[6],zmm14[6] | |
vmovdqa64 %ymm22, %ymm9 | |
vinsertf64x4 $1, %ymm3, %zmm0, %zmm14 | |
vbroadcastsd .LCPI0_13(%rip), %ymm3 # ymm3 = [7,15,7,15,7,15,7,15] | |
vpermt2ps %ymm18, %ymm8, %ymm9 | |
vmovaps .LCPI0_15(%rip), %zmm8 # zmm8 = [3,19,2,3,7,23,6,7,11,27,10,11,15,31,14,15] | |
vinsertf64x4 $1, %ymm9, %zmm0, %zmm9 | |
vshufpd $128, %zmm9, %zmm14, %zmm20 {%k4} # zmm20 {%k4} = zmm14[0],zmm9[0],zmm14[2],zmm9[2],zmm14[4],zmm9[4],zmm14[6],zmm9[7] | |
vmovups 32(%rsp), %zmm14 # 64-byte Reload | |
vmovupd %zmm20, 288(%rsp) # 64-byte Spill | |
vpermt2ps %zmm27, %zmm8, %zmm25 | |
vmovaps %ymm14, %ymm9 | |
vpermt2ps %ymm30, %ymm3, %ymm9 | |
vmovups 2160(%rsp), %zmm3 # 64-byte Reload | |
vunpckhps %zmm30, %zmm14, %zmm23 # zmm23 = zmm14[2],zmm30[2],zmm14[3],zmm30[3],zmm14[6],zmm30[6],zmm14[7],zmm30[7],zmm14[10],zmm30[10],zmm14[11],zmm30[11],zmm14[14],zmm30[14],zmm14[15],zmm30[15] | |
vmovups %zmm23, 1584(%rsp) # 64-byte Spill | |
vblendps $3, %xmm7, %xmm9, %xmm7 # xmm7 = xmm7[0,1],xmm9[2,3] | |
vmovdqa64 %zmm24, %zmm9 | |
vblendps $15, %ymm7, %ymm6, %ymm6 # ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] | |
vpunpckldq %ymm31, %ymm24, %ymm7 # ymm7 = ymm24[0],ymm31[0],ymm24[1],ymm31[1],ymm24[4],ymm31[4],ymm24[5],ymm31[5] | |
vinsertf64x4 $0, %ymm6, %zmm2, %zmm2 | |
vpermt2ps %zmm31, %zmm3, %zmm9 | |
vmovaps .LCPI0_26(%rip), %ymm3 # ymm3 = [0,1,0,8,4,5,4,12] | |
vmovupd %zmm2, 1072(%rsp) # 64-byte Spill | |
vmovdqa64 %zmm24, %zmm2 | |
vshufpd $32, %zmm9, %zmm10, %zmm20 # zmm20 = zmm10[0],zmm9[0],zmm10[2],zmm9[2],zmm10[4],zmm9[5],zmm10[6],zmm9[6] | |
vinsertf64x4 $1, 96(%rsp), %zmm0, %zmm10 # 32-byte Folded Reload | |
vmovdqa64 %ymm22, %ymm9 | |
vpermt2ps %ymm18, %ymm3, %ymm9 | |
vinsertf64x4 $1, %ymm9, %zmm0, %zmm9 | |
vpermt2ps %zmm17, %zmm8, %zmm0 | |
vshufpd $128, %zmm9, %zmm10, %zmm20 {%k4} # zmm20 {%k4} = zmm10[0],zmm9[0],zmm10[2],zmm9[2],zmm10[4],zmm9[4],zmm10[6],zmm9[7] | |
vpunpckhdq %zmm31, %zmm24, %zmm9 # zmm9 = zmm24[2],zmm31[2],zmm24[3],zmm31[3],zmm24[6],zmm31[6],zmm24[7],zmm31[7],zmm24[10],zmm31[10],zmm24[11],zmm31[11],zmm24[14],zmm31[14],zmm24[15],zmm31[15] | |
vshufpd $32, %zmm9, %zmm25, %zmm3 # zmm3 = zmm25[0],zmm9[0],zmm25[2],zmm9[2],zmm25[4],zmm9[5],zmm25[6],zmm9[6] | |
vmovupd %zmm20, 560(%rsp) # 64-byte Spill | |
vmovdqa64 %zmm15, %zmm20 | |
vmovupd %zmm3, 96(%rsp) # 64-byte Spill | |
vmovapd .LCPI0_16(%rip), %zmm3 # zmm3 = [2,10,2,10,6,15,6,14] | |
vshuff64x2 $85, %zmm7, %zmm7, %zmm7 # zmm7 = zmm7[2,3,2,3,2,3,2,3] | |
vmovups %zmm0, 1776(%rsp) # 64-byte Spill | |
vpermt2pd %zmm9, %zmm3, %zmm25 | |
vmovapd %zmm3, %zmm10 | |
vmovaps .LCPI0_27(%rip), %ymm3 # ymm3 = [1,9,2,3,5,13,6,7] | |
vextractf32x4 $3, %zmm0, %xmm9 | |
vmovupd %zmm25, 416(%rsp) # 64-byte Spill | |
vmovaps %zmm8, %zmm25 | |
vpermt2ps %zmm19, %zmm25, %zmm20 | |
vunpcklps %zmm17, %zmm29, %zmm25 # zmm25 = zmm29[0],zmm17[0],zmm29[1],zmm17[1],zmm29[4],zmm17[4],zmm29[5],zmm17[5],zmm29[8],zmm17[8],zmm29[9],zmm17[9],zmm29[12],zmm17[12],zmm29[13],zmm17[13] | |
vpermt2ps %ymm19, %ymm3, %ymm5 | |
vmovups 2096(%rsp), %zmm3 # 64-byte Reload | |
vinsertf64x4 $1, %ymm5, %zmm0, %zmm5 | |
vpermt2ps %zmm27, %zmm3, %zmm11 | |
vshufpd $32, %zmm7, %zmm11, %zmm3 # zmm3 = zmm11[0],zmm7[0],zmm11[2],zmm7[2],zmm11[4],zmm7[5],zmm11[6],zmm7[6] | |
vpunpckldq %ymm18, %ymm22, %ymm7 # ymm7 = ymm22[0],ymm18[0],ymm22[1],ymm18[1],ymm22[4],ymm18[4],ymm22[5],ymm18[5] | |
vpunpckldq %xmm19, %xmm15, %xmm11 # xmm11 = xmm15[0],xmm19[0],xmm15[1],xmm19[1] | |
vinsertf64x4 $1, %ymm7, %zmm0, %zmm7 | |
vmovups 160(%rsp), %zmm0 # 64-byte Reload | |
vshufpd $128, %zmm7, %zmm5, %zmm3 {%k4} # zmm3 {%k4} = zmm5[0],zmm7[0],zmm5[2],zmm7[2],zmm5[4],zmm7[4],zmm5[6],zmm7[7] | |
vmovdqa64 %zmm24, %zmm7 | |
vextractf32x4 $3, %zmm23, %xmm5 | |
vmovdqa64 %zmm12, %zmm23 | |
vmovupd %zmm3, 1328(%rsp) # 64-byte Spill | |
vbroadcastsd .LCPI0_32(%rip), %zmm3 # zmm3 = [3,19,3,19,3,19,3,19,3,19,3,19,3,19,3,19] | |
vpermt2ps %zmm21, %zmm8, %zmm0 | |
vblendps $3, %xmm9, %xmm5, %xmm8 # xmm8 = xmm9[0,1],xmm5[2,3] | |
vpermt2ps %zmm27, %zmm3, %zmm13 | |
vpermt2ps %zmm31, %zmm3, %zmm7 | |
vpermt2ps %zmm19, %zmm3, %zmm4 | |
vmovups %zmm0, 160(%rsp) # 64-byte Spill | |
vextractf64x4 $1, %zmm0, %ymm9 | |
vpunpckhdq %zmm16, %zmm28, %zmm0 # zmm0 = zmm28[2],zmm16[2],zmm28[3],zmm16[3],zmm28[6],zmm16[6],zmm28[7],zmm16[7],zmm28[10],zmm16[10],zmm28[11],zmm16[11],zmm28[14],zmm16[14],zmm28[15],zmm16[15] | |
vmovdqu64 %zmm0, 3184(%rsp) # 64-byte Spill | |
vshufpd $32, %zmm7, %zmm13, %zmm5 # zmm5 = zmm13[0],zmm7[0],zmm13[2],zmm7[2],zmm13[4],zmm7[5],zmm13[6],zmm7[6] | |
vmovdqa64 %zmm22, %zmm7 | |
vpermt2ps %zmm18, %zmm3, %zmm7 | |
vmovaps .LCPI0_17(%rip), %zmm3 # zmm3 = [0,1,2,18,4,5,6,22,8,9,10,26,12,13,14,30] | |
vshufpd $128, %zmm7, %zmm4, %zmm5 {%k4} # zmm5 {%k4} = zmm4[0],zmm7[0],zmm4[2],zmm7[2],zmm4[4],zmm7[4],zmm4[6],zmm7[7] | |
vextracti64x4 $1, %zmm0, %ymm4 | |
vmovupd 352(%rsp), %zmm0 # 64-byte Reload | |
vmovapd %zmm10, %zmm7 | |
vmovupd %zmm5, 688(%rsp) # 64-byte Spill | |
vblendps $192, %ymm4, %ymm9, %ymm4 # ymm4 = ymm9[0,1,2,3,4,5],ymm4[6,7] | |
vpunpckldq %zmm27, %zmm12, %zmm9 # zmm9 = zmm12[0],zmm27[0],zmm12[1],zmm27[1],zmm12[4],zmm27[4],zmm12[5],zmm27[5],zmm12[8],zmm27[8],zmm12[9],zmm27[9],zmm12[12],zmm27[12],zmm12[13],zmm27[13] | |
vpermt2ps %zmm31, %zmm3, %zmm2 | |
vmovaps %zmm3, %zmm6 | |
vshufpd $32, %zmm2, %zmm0, %zmm5 # zmm5 = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[5],zmm0[6],zmm2[6] | |
vpermt2pd %zmm2, %zmm10, %zmm0 | |
vunpckhps %zmm17, %zmm29, %zmm2 # zmm2 = zmm29[2],zmm17[2],zmm29[3],zmm17[3],zmm29[6],zmm17[6],zmm29[7],zmm17[7],zmm29[10],zmm17[10],zmm29[11],zmm17[11],zmm29[14],zmm17[14],zmm29[15],zmm17[15] | |
vmovupd %zmm5, 624(%rsp) # 64-byte Spill | |
vmovaps %zmm14, %zmm5 | |
vpermt2ps %zmm30, %zmm3, %zmm5 | |
vmovupd %zmm2, 3056(%rsp) # 64-byte Spill | |
vextractf32x4 $3, %zmm2, %xmm2 | |
vmovaps .LCPI0_18(%rip), %zmm3 # zmm3 = [1,17,2,3,5,21,6,7,9,25,10,11,13,29,14,15] | |
vmovupd %zmm0, 1840(%rsp) # 64-byte Spill | |
vextractf32x4 $3, %zmm5, %xmm0 | |
vblendps $3, %xmm2, %xmm0, %xmm0 # xmm0 = xmm2[0,1],xmm0[2,3] | |
vmovups %ymm0, 3120(%rsp) # 32-byte Spill | |
vpunpckhdq %xmm27, %xmm12, %xmm0 # xmm0 = xmm12[2],xmm27[2],xmm12[3],xmm27[3] | |
vpermt2ps %zmm27, %zmm3, %zmm23 | |
vmovdqu64 %zmm0, 1136(%rsp) # 64-byte Spill | |
vinsertps $76, %xmm12, %xmm27, %xmm0 # xmm0 = xmm12[1],xmm27[1],zero,zero | |
vmovdqa64 %zmm15, %zmm27 | |
vpermt2ps %zmm19, %zmm3, %zmm27 | |
vmovdqa64 %zmm9, %zmm12 | |
vmovups %zmm0, 1200(%rsp) # 64-byte Spill | |
vblendps $15, %ymm8, %ymm4, %ymm0 # ymm0 = ymm8[0,1,2,3],ymm4[4,5,6,7] | |
vpunpckhdq %zmm21, %zmm1, %zmm4 # zmm4 = zmm1[2],zmm21[2],zmm1[3],zmm21[3],zmm1[6],zmm21[6],zmm1[7],zmm21[7],zmm1[10],zmm21[10],zmm1[11],zmm21[11],zmm1[14],zmm21[14],zmm1[15],zmm21[15] | |
vmovaps %zmm6, %zmm8 | |
vmovups %ymm0, 880(%rsp) # 32-byte Spill | |
vmovdqa64 %zmm28, %zmm0 | |
vpermt2ps %zmm16, %zmm6, %zmm0 | |
vmovdqu64 %zmm4, 2736(%rsp) # 64-byte Spill | |
vextracti64x4 $1, %zmm4, %ymm4 | |
vextractf64x4 $1, %zmm0, %ymm2 | |
vpblendd $192, %ymm2, %ymm4, %ymm2 # ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] | |
vmovdqu %ymm2, 2800(%rsp) # 32-byte Spill | |
vpunpckldq %zmm31, %zmm24, %zmm2 # zmm2 = zmm24[0],zmm31[0],zmm24[1],zmm31[1],zmm24[4],zmm31[4],zmm24[5],zmm31[5],zmm24[8],zmm31[8],zmm24[9],zmm31[9],zmm24[12],zmm31[12],zmm24[13],zmm31[13] | |
vshufpd $32, %zmm2, %zmm23, %zmm4 # zmm4 = zmm23[0],zmm2[0],zmm23[2],zmm2[2],zmm23[4],zmm2[5],zmm23[6],zmm2[6] | |
vpermt2pd %zmm2, %zmm10, %zmm23 | |
vpunpckhdq %zmm19, %zmm15, %zmm2 # zmm2 = zmm15[2],zmm19[2],zmm15[3],zmm19[3],zmm15[6],zmm19[6],zmm15[7],zmm19[7],zmm15[10],zmm19[10],zmm15[11],zmm19[11],zmm15[14],zmm19[14],zmm15[15],zmm19[15] | |
vunpcklps %zmm30, %zmm14, %zmm10 # zmm10 = zmm14[0],zmm30[0],zmm14[1],zmm30[1],zmm14[4],zmm30[4],zmm14[5],zmm30[5],zmm14[8],zmm30[8],zmm14[9],zmm30[9],zmm14[12],zmm30[12],zmm14[13],zmm30[13] | |
vmovdqu64 %zmm2, 2928(%rsp) # 64-byte Spill | |
vpunpckldq %zmm19, %zmm15, %zmm2 # zmm2 = zmm15[0],zmm19[0],zmm15[1],zmm19[1],zmm15[4],zmm19[4],zmm15[5],zmm19[5],zmm15[8],zmm19[8],zmm15[9],zmm19[9],zmm15[12],zmm19[12],zmm15[13],zmm19[13] | |
vmovupd %zmm4, 1264(%rsp) # 64-byte Spill | |
vmovdqu64 %zmm2, 816(%rsp) # 64-byte Spill | |
vpunpckhdq %xmm19, %xmm15, %xmm2 # xmm2 = xmm15[2],xmm19[2],xmm15[3],xmm19[3] | |
vmovdqu64 %zmm2, 1968(%rsp) # 64-byte Spill | |
vinsertps $76, %xmm15, %xmm19, %xmm2 # xmm2 = xmm15[1],xmm19[1],zero,zero | |
vmovapd %zmm29, %zmm19 | |
vpermt2ps %zmm17, %zmm3, %zmm19 | |
vmovups %zmm2, 1008(%rsp) # 64-byte Spill | |
vextractf32x4 $3, %zmm10, %xmm2 | |
vextractf32x4 $2, %zmm10, %xmm10 | |
vextractf32x4 $3, %zmm19, %xmm4 | |
vblendps $3, %xmm4, %xmm2, %xmm4 # xmm4 = xmm4[0,1],xmm2[2,3] | |
vmovdqa64 %zmm1, %zmm2 | |
vpermt2ps %zmm21, %zmm3, %zmm2 | |
vpunpckldq %zmm16, %zmm28, %zmm3 # zmm3 = zmm28[0],zmm16[0],zmm28[1],zmm16[1],zmm28[4],zmm16[4],zmm28[5],zmm16[5],zmm28[8],zmm16[8],zmm28[9],zmm16[9],zmm28[12],zmm16[12],zmm28[13],zmm16[13] | |
vmovdqu64 %zmm3, 2544(%rsp) # 64-byte Spill | |
vextracti64x4 $1, %zmm3, %ymm3 | |
vextractf64x4 $1, %zmm2, %ymm15 | |
vblendps $192, %ymm3, %ymm15, %ymm3 # ymm3 = ymm15[0,1,2,3,4,5],ymm3[6,7] | |
vextractf32x4 $3, %zmm25, %xmm15 | |
vblendps $15, %ymm4, %ymm3, %ymm3 # ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] | |
vpunpckldq %xmm31, %xmm24, %xmm4 # xmm4 = xmm24[0],xmm31[0],xmm24[1],xmm31[1] | |
vmovdqu64 %zmm4, 2608(%rsp) # 64-byte Spill | |
vpunpckhdq %xmm31, %xmm24, %xmm4 # xmm4 = xmm24[2],xmm31[2],xmm24[3],xmm31[3] | |
vmovups %ymm3, 1712(%rsp) # 32-byte Spill | |
vmovdqa64 %zmm24, %zmm3 | |
vmovdqu64 %zmm4, 1648(%rsp) # 64-byte Spill | |
vinsertps $76, %xmm24, %xmm31, %xmm4 # xmm4 = xmm24[1],xmm31[1],zero,zero | |
vmovaps %zmm14, %zmm24 | |
vmovups %zmm4, 944(%rsp) # 64-byte Spill | |
vmovaps .LCPI0_19(%rip), %zmm4 # zmm4 = [0,1,0,16,4,5,4,20,8,9,8,24,12,13,12,28] | |
vpermt2ps %zmm31, %zmm4, %zmm3 | |
vpermt2ps %zmm30, %zmm4, %zmm24 | |
vmovaps %zmm14, %zmm31 | |
vpunpcklqdq %ymm28, %ymm16, %ymm14 # ymm14 = ymm16[0],ymm28[0],ymm16[2],ymm28[2] | |
vmovdqu %ymm14, 2672(%rsp) # 32-byte Spill | |
vpunpckldq %ymm16, %ymm28, %ymm14 # ymm14 = ymm28[0],ymm16[0],ymm28[1],ymm16[1],ymm28[4],ymm16[4],ymm28[5],ymm16[5] | |
vmovdqu %ymm14, 2864(%rsp) # 32-byte Spill | |
vinsertps $179, %xmm16, %xmm28, %xmm14 # xmm14 = zero,zero,xmm28[2],xmm16[2] | |
vmovaps %xmm14, 2992(%rsp) # 16-byte Spill | |
vunpckhps %xmm16, %xmm28, %xmm14 # xmm14 = xmm28[2],xmm16[2],xmm28[3],xmm16[3] | |
vmovaps %xmm14, 1520(%rsp) # 16-byte Spill | |
vunpcklps %xmm16, %xmm28, %xmm14 # xmm14 = xmm28[0],xmm16[0],xmm28[1],xmm16[1] | |
vshufpd $32, %zmm3, %zmm9, %zmm6 # zmm6 = zmm9[0],zmm3[0],zmm9[2],zmm3[2],zmm9[4],zmm3[5],zmm9[6],zmm3[6] | |
vpermt2pd %zmm3, %zmm7, %zmm12 | |
vmovdqa64 %zmm22, %zmm7 | |
vpunpckldq %zmm18, %zmm22, %zmm9 # zmm9 = zmm22[0],zmm18[0],zmm22[1],zmm18[1],zmm22[4],zmm18[4],zmm22[5],zmm18[5],zmm22[8],zmm18[8],zmm22[9],zmm18[9],zmm22[12],zmm18[12],zmm22[13],zmm18[13] | |
vpunpckhdq %zmm18, %zmm22, %zmm3 # zmm3 = zmm22[2],zmm18[2],zmm22[3],zmm18[3],zmm22[6],zmm18[6],zmm22[7],zmm18[7],zmm22[10],zmm18[10],zmm22[11],zmm18[11],zmm22[14],zmm18[14],zmm22[15],zmm18[15] | |
vpermt2ps %zmm18, %zmm4, %zmm7 | |
vextractf32x4 $3, %zmm24, %xmm13 | |
vmovaps %xmm14, 1456(%rsp) # 16-byte Spill | |
vmovupd %zmm6, 352(%rsp) # 64-byte Spill | |
vpunpckldq %xmm18, %xmm22, %xmm6 # xmm6 = xmm22[0],xmm18[0],xmm22[1],xmm18[1] | |
vshufpd $128, %zmm9, %zmm27, %zmm23 {%k4} # zmm23 {%k4} = zmm27[0],zmm9[0],zmm27[2],zmm9[2],zmm27[4],zmm9[4],zmm27[6],zmm9[7] | |
vmovdqu64 %zmm6, 2480(%rsp) # 64-byte Spill | |
vpunpckhdq %xmm18, %xmm22, %xmm6 # xmm6 = xmm22[2],xmm18[2],xmm22[3],xmm18[3] | |
vblendps $3, %xmm15, %xmm13, %xmm13 # xmm13 = xmm15[0,1],xmm13[2,3] | |
vmovdqu64 %zmm6, 2032(%rsp) # 64-byte Spill | |
vinsertps $76, %xmm22, %xmm18, %xmm6 # xmm6 = xmm22[1],xmm18[1],zero,zero | |
vpermt2ps %zmm18, %zmm8, %zmm22 | |
vmovddup .LCPI0_194(%rip), %xmm8 # xmm8 = [4,0,4,0] | |
# xmm8 = mem[0,0] | |
vunpckhpd %ymm28, %ymm16, %ymm18 # ymm18 = ymm16[1],ymm28[1],ymm16[3],ymm28[3] | |
vmovups %zmm6, 1904(%rsp) # 64-byte Spill | |
vmovaps %zmm4, %zmm6 | |
vmovaps %xmm16, %xmm4 | |
vpermt2ps %xmm28, %xmm8, %xmm4 | |
vmovaps %zmm28, %zmm8 | |
vpermt2ps %zmm16, %zmm6, %zmm8 | |
vpunpckldq %zmm21, %zmm1, %zmm28 # zmm28 = zmm1[0],zmm21[0],zmm1[1],zmm21[1],zmm1[4],zmm21[4],zmm1[5],zmm21[5],zmm1[8],zmm21[8],zmm1[9],zmm21[9],zmm1[12],zmm21[12],zmm1[13],zmm21[13] | |
vmovupd 2288(%rsp), %zmm16 # 64-byte Reload | |
vextracti64x4 $1, %zmm28, %ymm14 | |
vextractf64x4 $1, %zmm8, %ymm15 | |
vpblendd $192, %ymm15, %ymm14, %ymm14 # ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] | |
vpblendd $15, %ymm13, %ymm14, %ymm6 # ymm6 = ymm13[0,1,2,3],ymm14[4,5,6,7] | |
vpermt2pd 2608(%rsp), %zmm16, %zmm26 # 64-byte Folded Reload | |
vpermt2pd 2480(%rsp), %zmm16, %zmm11 # 64-byte Folded Reload | |
vmovdqu %ymm6, 1424(%rsp) # 32-byte Spill | |
vmovupd 1584(%rsp), %zmm6 # 64-byte Reload | |
vextractf32x4 $2, %zmm6, %xmm13 | |
vmovupd 1776(%rsp), %zmm6 # 64-byte Reload | |
vmovapd %zmm11, %zmm26 {%k4} | |
vunpckhps %xmm30, %xmm31, %xmm11 # xmm11 = xmm31[2],xmm30[2],xmm31[3],xmm30[3] | |
vextractf32x4 $2, %zmm6, %xmm14 | |
vmovupd 160(%rsp), %zmm6 # 64-byte Reload | |
vblendpd $1, %xmm14, %xmm13, %xmm13 # xmm13 = xmm14[0],xmm13[1] | |
vshuff64x2 $170, %zmm6, %zmm6, %zmm14 # zmm14 = zmm6[4,5,4,5,4,5,4,5] | |
vmovupd 3184(%rsp), %zmm6 # 64-byte Reload | |
vshuff64x2 $170, %zmm6, %zmm6, %zmm15 # zmm15 = zmm6[4,5,4,5,4,5,4,5] | |
vshuff64x2 $170, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[4,5,4,5,4,5,4,5] | |
vextractf32x4 $2, %zmm24, %xmm6 | |
vmovupd 816(%rsp), %zmm24 # 64-byte Reload | |
vblendpd $8, %ymm15, %ymm14, %ymm14 # ymm14 = ymm14[0,1,2],ymm15[3] | |
vblendpd $3, %ymm13, %ymm14, %ymm15 # ymm15 = ymm13[0,1],ymm14[2,3] | |
vextractf32x4 $2, %zmm5, %xmm13 | |
vmovups 3056(%rsp), %zmm5 # 64-byte Reload | |
vshufpd $128, %zmm7, %zmm24, %zmm12 {%k4} # zmm12 {%k4} = zmm24[0],zmm7[0],zmm24[2],zmm7[2],zmm24[4],zmm7[4],zmm24[6],zmm7[7] | |
vextractf32x4 $2, %zmm5, %xmm14 | |
vmovupd 2736(%rsp), %zmm5 # 64-byte Reload | |
vblendps $3, %xmm14, %xmm13, %xmm13 # xmm13 = xmm14[0,1],xmm13[2,3] | |
vshuff64x2 $170, %zmm5, %zmm5, %zmm14 # zmm14 = zmm5[4,5,4,5,4,5,4,5] | |
vunpcklps %xmm17, %xmm29, %xmm5 # xmm5 = xmm29[0],xmm17[0],xmm29[1],xmm17[1] | |
vblendpd $8, %ymm0, %ymm14, %ymm14 # ymm14 = ymm14[0,1,2],ymm0[3] | |
vmovupd 2800(%rsp), %ymm0 # 32-byte Reload | |
vblendpd $3, %ymm13, %ymm14, %ymm13 # ymm13 = ymm13[0,1],ymm14[2,3] | |
vextractf32x4 $2, %zmm19, %xmm14 | |
vmovupd 416(%rsp), %zmm19 # 64-byte Reload | |
vblendpd $3, 3120(%rsp), %ymm0, %ymm0 # 32-byte Folded Reload | |
# ymm0 = mem[0,1],ymm0[2,3] | |
vblendps $3, %xmm14, %xmm10, %xmm10 # xmm10 = xmm14[0,1],xmm10[2,3] | |
vmovapd .LCPI0_20(%rip), %zmm14 # zmm14 = [0,8,0,8,4,12,4,13] | |
vshufpd $128, %zmm3, %zmm20, %zmm19 {%k4} # zmm19 {%k4} = zmm20[0],zmm3[0],zmm20[2],zmm3[2],zmm20[4],zmm3[4],zmm20[6],zmm3[7] | |
vshuff64x2 $170, %zmm2, %zmm2, %zmm2 # zmm2 = zmm2[4,5,4,5,4,5,4,5] | |
vpermt2pd %zmm3, %zmm14, %zmm20 | |
vmovupd 2544(%rsp), %zmm3 # 64-byte Reload | |
vpermt2pd %zmm9, %zmm14, %zmm27 | |
vpermt2pd %zmm7, %zmm14, %zmm24 | |
vmovups 784(%rsp), %ymm7 # 32-byte Reload | |
vmovaps %ymm31, %ymm9 | |
vshuff64x2 $170, %zmm3, %zmm3, %zmm3 # zmm3 = zmm3[4,5,4,5,4,5,4,5] | |
vblendpd $8, %ymm3, %ymm2, %ymm2 # ymm2 = ymm2[0,1,2],ymm3[3] | |
vextractf32x4 $2, %zmm25, %xmm3 | |
vmovupd 2928(%rsp), %zmm25 # 64-byte Reload | |
vblendpd $3, %ymm10, %ymm2, %ymm2 # ymm2 = ymm10[0,1],ymm2[2,3] | |
vmovupd 1840(%rsp), %zmm10 # 64-byte Reload | |
vblendps $3, %xmm3, %xmm6, %xmm3 # xmm3 = xmm3[0,1],xmm6[2,3] | |
vshuff64x2 $170, %zmm8, %zmm8, %zmm6 # zmm6 = zmm8[4,5,4,5,4,5,4,5] | |
vshuff64x2 $170, %zmm28, %zmm28, %zmm8 # zmm8 = zmm28[4,5,4,5,4,5,4,5] | |
vblendpd $8, %ymm6, %ymm8, %ymm6 # ymm6 = ymm8[0,1,2],ymm6[3] | |
vinsertf128 $1, %xmm4, %ymm0, %ymm8 | |
vblendpd $3, %ymm3, %ymm6, %ymm3 # ymm3 = ymm3[0,1],ymm6[2,3] | |
vmovlhps %xmm31, %xmm30, %xmm6 # xmm6 = xmm30[0],xmm31[0] | |
vshufps $36, %xmm6, %xmm5, %xmm5 # xmm5 = xmm5[0,1],xmm6[2,0] | |
vpunpckldq %xmm21, %xmm1, %xmm6 # xmm6 = xmm1[0],xmm21[0],xmm1[1],xmm21[1] | |
vinsertf128 $1, %xmm6, %ymm0, %ymm6 | |
vblendpd $8, %ymm8, %ymm6, %ymm6 # ymm6 = ymm6[0,1,2],ymm8[3] | |
vshufpd $128, %zmm22, %zmm25, %zmm10 {%k4} # zmm10 {%k4} = zmm25[0],zmm22[0],zmm25[2],zmm22[2],zmm25[4],zmm22[4],zmm25[6],zmm22[7] | |
vpermt2pd %zmm22, %zmm14, %zmm25 | |
vbroadcastsd .LCPI0_30(%rip), %ymm14 # ymm14 = [5,13,5,13,5,13,5,13] | |
vshufps $51, %xmm29, %xmm17, %xmm8 # xmm8 = xmm17[3,0],xmm29[3,0] | |
vblendpd $3, %ymm5, %ymm6, %ymm5 # ymm5 = ymm5[0,1],ymm6[2,3] | |
vunpckhps %ymm30, %ymm31, %ymm6 # ymm6 = ymm31[2],ymm30[2],ymm31[3],ymm30[3],ymm31[6],ymm30[6],ymm31[7],ymm30[7] | |
vinsertf64x4 $0, %ymm0, %zmm10, %zmm0 | |
vinsertps $179, %xmm30, %xmm31, %xmm10 # xmm10 = zero,zero,xmm31[2],xmm30[2] | |
vinsertf64x4 $0, %ymm5, %zmm26, %zmm4 | |
vunpckhps %ymm17, %ymm29, %ymm5 # ymm5 = ymm29[2],ymm17[2],ymm29[3],ymm17[3],ymm29[6],ymm17[6],ymm29[7],ymm17[7] | |
vpermpd $170, %ymm6, %ymm6 # ymm6 = ymm6[2,2,2,2] | |
vextractf128 $1, %ymm5, %xmm5 | |
vmovupd %zmm4, 416(%rsp) # 64-byte Spill | |
vblendps $3, %xmm5, %xmm6, %xmm5 # xmm5 = xmm5[0,1],xmm6[2,3] | |
vpunpckhdq %ymm21, %ymm1, %ymm6 # ymm6 = ymm1[2],ymm21[2],ymm1[3],ymm21[3],ymm1[6],ymm21[6],ymm1[7],ymm21[7] | |
vshufps $36, %ymm18, %ymm6, %ymm6 # ymm6 = ymm6[0,1],ymm18[2,0],ymm6[4,5],ymm18[6,4] | |
vblendps $15, %ymm5, %ymm6, %ymm4 # ymm4 = ymm5[0,1,2,3],ymm6[4,5,6,7] | |
vmovups 288(%rsp), %zmm5 # 64-byte Reload | |
vunpckhps %xmm17, %xmm29, %xmm6 # xmm6 = xmm29[2],xmm17[2],xmm29[3],xmm17[3] | |
vpermt2ps %ymm30, %ymm14, %ymm9 | |
vmovaps .LCPI0_31(%rip), %ymm14 # ymm14 = [1,9,2,3,5,13,u,u] | |
vinsertf64x4 $0, %ymm4, %zmm5, %zmm26 | |
vunpcklps %ymm17, %ymm29, %ymm4 # ymm4 = ymm29[0],ymm17[0],ymm29[1],ymm17[1],ymm29[4],ymm17[4],ymm29[5],ymm17[5] | |
vunpcklps %ymm30, %ymm31, %ymm5 # ymm5 = ymm31[0],ymm30[0],ymm31[1],ymm30[1],ymm31[4],ymm30[4],ymm31[5],ymm30[5] | |
vextractf128 $1, %ymm4, %xmm4 | |
vpermpd $170, %ymm5, %ymm5 # ymm5 = ymm5[2,2,2,2] | |
vblendps $3, %xmm4, %xmm5, %xmm4 # xmm4 = xmm4[0,1],xmm5[2,3] | |
vpunpckldq %ymm21, %ymm1, %ymm5 # ymm5 = ymm1[0],ymm21[0],ymm1[1],ymm21[1],ymm1[4],ymm21[4],ymm1[5],ymm21[5] | |
vshufps $36, 2672(%rsp), %ymm5, %ymm5 # 32-byte Folded Reload | |
# ymm5 = ymm5[0,1],mem[2,0],ymm5[4,5],mem[6,4] | |
vblendps $15, %ymm4, %ymm5, %ymm4 # ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] | |
vmovups 560(%rsp), %zmm5 # 64-byte Reload | |
vmovupd %zmm0, 560(%rsp) # 64-byte Spill | |
vunpcklps %xmm30, %xmm31, %xmm0 # xmm0 = xmm31[0],xmm30[0],xmm31[1],xmm30[1] | |
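# Zero-masked 16-float loads through k2 (line 221): out-of-range lanes | |
# read as 0.0, presumably the lowering of a tl.load with a zero fill. | |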
vmovups (%rdi,%r10), %zmm30 {%k2} {z} | |
vinsertf64x4 $0, %ymm4, %zmm5, %zmm4 | |
vmovaps %ymm29, %ymm5 | |
vpermt2ps %ymm17, %ymm7, %ymm5 | |
vinsertf64x4 $0, 1712(%rsp), %zmm23, %zmm7 # 32-byte Folded Reload | |
.loc 1 224 35 is_stmt 1 # 03-matrix-multiplication-cpu.py:224:35 | |
vbroadcastss %xmm30, %zmm31 | |
vmovups %zmm4, 288(%rsp) # 64-byte Spill | |
vinsertf64x4 $0, 880(%rsp), %zmm19, %zmm4 # 32-byte Folded Reload | |
.loc 1 221 20 # 03-matrix-multiplication-cpu.py:221:20 | |
vmovups (%rdi,%rax), %zmm19 {%k2} {z} | |
movq -32(%rsp), %rax # 8-byte Reload | |
vextractf128 $1, %ymm5, %xmm5 | |
vblendps $3, %xmm5, %xmm9, %xmm5 # xmm5 = xmm5[0,1],xmm9[2,3] | |
vmovdqa %ymm1, %ymm9 | |
vpermt2ps %ymm21, %ymm14, %ymm9 | |
vblendps $192, 2864(%rsp), %ymm9, %ymm9 # 32-byte Folded Reload | |
# ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] | |
vmovupd %zmm7, 32(%rsp) # 64-byte Spill | |
vmovups 1328(%rsp), %zmm7 # 64-byte Reload | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vbroadcastss %xmm19, %zmm23 | |
vmovupd %zmm4, 160(%rsp) # 64-byte Spill | |
vinsertps $76, %xmm29, %xmm17, %xmm4 # xmm4 = xmm29[1],xmm17[1],zero,zero | |
.loc 1 221 20 # 03-matrix-multiplication-cpu.py:221:20 | |
vmovups (%rdi,%rax), %zmm29 {%k2} {z} | |
vblendps $15, %ymm5, %ymm9, %ymm5 # ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] | |
vinsertf128 $1, 2992(%rsp), %ymm0, %ymm9 # 16-byte Folded Reload | |
vinsertf64x4 $0, %ymm5, %zmm7, %zmm28 | |
vmovupd 1136(%rsp), %zmm5 # 64-byte Reload | |
vmovupd 1968(%rsp), %zmm7 # 64-byte Reload | |
vpermt2pd 1648(%rsp), %zmm16, %zmm5 # 64-byte Folded Reload | |
vpermt2pd 2032(%rsp), %zmm16, %zmm7 # 64-byte Folded Reload | |
vmovapd %zmm7, %zmm5 {%k4} | |
vmovapd %zmm5, %zmm7 | |
vblendps $3, %xmm6, %xmm10, %xmm5 # xmm5 = xmm6[0,1],xmm10[2,3] | |
vinsertf64x4 $0, 1424(%rsp), %zmm12, %zmm6 # 32-byte Folded Reload | |
vmovups (%rdi,%rbp), %zmm10 {%k2} {z} | |
vmovupd %zmm6, 224(%rsp) # 64-byte Spill | |
vpunpckhdq %xmm21, %xmm1, %xmm6 # xmm6 = xmm1[2],xmm21[2],xmm1[3],xmm21[3] | |
vinsertf128 $1, %xmm6, %ymm0, %ymm6 | |
vblendps $192, %ymm9, %ymm6, %ymm6 # ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7] | |
vmovupd 96(%rsp), %zmm9 # 64-byte Reload | |
vblendps $15, %ymm5, %ymm6, %ymm5 # ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] | |
vmovdqa %xmm1, %xmm6 | |
vinsertf64x4 $0, %ymm5, %zmm7, %zmm17 | |
vinsertps $76, %xmm1, %xmm21, %xmm7 # xmm7 = xmm1[1],xmm21[1],zero,zero | |
vmovsd .LCPI0_195(%rip), %xmm1 # xmm1 = [3,7,0,0] | |
vshufps $226, %xmm11, %xmm8, %xmm5 # xmm5 = xmm8[2,0],xmm11[2,3] | |
vinsertf128 $1, 1520(%rsp), %ymm0, %ymm8 # 16-byte Folded Reload | |
vmovapd %zmm20, %zmm9 {%k4} | |
vinsertf64x4 $0, %ymm15, %zmm9, %zmm22 | |
vmovups (%rdi,%r13), %zmm15 {%k2} {z} | |
vmovups (%rdi,%r11), %zmm9 {%k2} {z} | |
vpermt2ps %xmm21, %xmm1, %xmm6 | |
vmovups 688(%rsp), %zmm1 # 64-byte Reload | |
vinsertf128 $1, %xmm6, %ymm0, %ymm6 | |
vblendps $3, %xmm4, %xmm0, %xmm0 # xmm0 = xmm4[0,1],xmm0[2,3] | |
vinsertf128 $1, %xmm7, %ymm0, %ymm4 | |
vblendps $192, %ymm8, %ymm6, %ymm6 # ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7] | |
vmovups (%rdi,%rdx), %zmm8 {%k2} {z} | |
vblendps $15, %ymm5, %ymm6, %ymm5 # ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] | |
vmovupd 1008(%rsp), %zmm6 # 64-byte Reload | |
vmovups %zmm15, 96(%rsp) # 64-byte Spill | |
vpermt2pd 1904(%rsp), %zmm16, %zmm6 # 64-byte Folded Reload | |
vinsertf64x4 $0, %ymm5, %zmm1, %zmm12 | |
vmovupd 1200(%rsp), %zmm5 # 64-byte Reload | |
vmovupd 624(%rsp), %zmm1 # 64-byte Reload | |
vpermt2pd 944(%rsp), %zmm16, %zmm5 # 64-byte Folded Reload | |
vmovups (%rdi,%r15), %zmm16 {%k2} {z} | |
vmovapd %zmm25, %zmm1 {%k4} | |
vinsertf64x4 $0, %ymm13, %zmm1, %zmm14 | |
vmovups (%rdi,%r14), %zmm1 {%k2} {z} | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vbroadcastss %xmm16, %zmm25 | |
vmovapd %zmm6, %zmm5 {%k4} | |
vmovapd %zmm5, %zmm6 | |
vinsertf128 $1, 1456(%rsp), %ymm0, %ymm5 # 16-byte Folded Reload | |
vblendps $192, %ymm5, %ymm4, %ymm4 # ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] | |
vblendps $15, %ymm0, %ymm4, %ymm0 # ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] | |
.loc 1 221 20 # 03-matrix-multiplication-cpu.py:221:20 | |
vmovups (%rdi,%rcx), %zmm4 {%k2} {z} | |
vinsertf64x4 $0, %ymm0, %zmm6, %zmm21 | |
vmovupd 1264(%rsp), %zmm0 # 64-byte Reload | |
vmovups (%rdi,%r8), %zmm6 {%k2} {z} | |
vmovups %zmm1, 1264(%rsp) # 64-byte Spill | |
vmovaps %zmm4, %zmm7 | |
vmovapd %zmm27, %zmm0 {%k4} | |
vinsertf64x4 $0, %ymm2, %zmm0, %zmm18 | |
.loc 1 222 51 # 03-matrix-multiplication-cpu.py:222:51 | |
vpbroadcastd %r9d, %zmm0 | |
vmovaps %zmm6, %zmm27 | |
.loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22 | |
addl $-16, %r9d | |
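# One zmm of floats (16 lanes) of K is consumed per loop iteration. | |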
vmovdqu64 %zmm0, 1200(%rsp) # 64-byte Spill | |
.loc 1 222 51 # 03-matrix-multiplication-cpu.py:222:51 | |
vpcmpgtd .LCPI0_193(%rip), %zmm0, %k1 | |
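# k1 keeps the lanes whose (presumed) per-lane k-offset constant is | |
# still below the remaining K broadcast in zmm0; it guards the | |
# (%rsi,...) loads for line 222 just as k2 guards the line-221 loads. | |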
vmovupd 352(%rsp), %zmm13 # 64-byte Reload | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vunpcklps %xmm27, %xmm30, %xmm0 # xmm0 = xmm30[0],xmm27[0],xmm30[1],xmm27[1] | |
vmovups %zmm4, 1904(%rsp) # 64-byte Spill | |
vmovaps %zmm8, %zmm4 | |
movq -96(%rsp), %rax # 8-byte Reload | |
vmovups %zmm16, 1648(%rsp) # 64-byte Spill | |
vmovups %zmm10, 1968(%rsp) # 64-byte Spill | |
vmovups %zmm6, 1776(%rsp) # 64-byte Spill | |
vmovups %zmm30, 1840(%rsp) # 64-byte Spill | |
vmovups %zmm4, 624(%rsp) # 64-byte Spill | |
vmovups %zmm9, 688(%rsp) # 64-byte Spill | |
vmovups %zmm29, 1712(%rsp) # 64-byte Spill | |
vmovlhps %xmm10, %xmm0, %xmm0 # xmm0 = xmm0[0],xmm10[0] | |
vinsertps $48, %xmm7, %xmm0, %xmm0 # xmm0 = xmm0[0,1,2],xmm7[0] | |
vinsertf128 $1, %xmm15, %ymm0, %ymm2 | |
vinsertf128 $1, %xmm1, %ymm0, %ymm0 | |
vmovapd %zmm24, %zmm13 {%k4} | |
vinsertf64x4 $0, %ymm3, %zmm13, %zmm5 | |
vbroadcastss %xmm9, %ymm3 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vmovups (%rsi,%rax), %zmm11 {%k1} {z} | |
movq 512(%rsp), %rax # 8-byte Reload | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd213ps 416(%rsp), %zmm11, %zmm31 # 64-byte Folded Reload | |
# zmm31 = (zmm11 * zmm31) + mem | |
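# Broadcast-FMA pattern: a single scalar from the rdi-side tile is | |
# splatted across a zmm and FMA'd with a full zmm from the rsi-side | |
# tile, a rank-1 update into the spilled accumulators (line 224's dot). | |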
vblendps $32, %ymm3, %ymm0, %ymm0 # ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] | |
vshufpd $2, %ymm2, %ymm0, %ymm2 # ymm2 = ymm0[0],ymm2[1],ymm0[2],ymm2[2] | |
vbroadcastss %xmm4, %ymm0 | |
vblendps $128, %ymm0, %ymm2, %ymm0 # ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] | |
.loc 1 221 20 # 03-matrix-multiplication-cpu.py:221:20 | |
vmovups (%rdi,%rax), %zmm3 {%k2} {z} | |
movq 520(%rsp), %rax # 8-byte Reload | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vinsertf64x4 $1, %ymm16, %zmm0, %zmm13 | |
vmovaps .LCPI0_109(%rip), %zmm0 # zmm0 = [0,1,2,3,4,5,6,7,8,16,u,u,u,u,u,u] | |
vfmadd213ps %zmm5, %zmm11, %zmm25 # zmm25 = (zmm11 * zmm25) + zmm5 | |
vbroadcastss %xmm4, %zmm5 | |
vfmadd213ps %zmm18, %zmm11, %zmm23 # zmm23 = (zmm11 * zmm23) + zmm18 | |
vfmadd213ps 1072(%rsp), %zmm11, %zmm5 # 64-byte Folded Reload | |
# zmm5 = (zmm11 * zmm5) + mem | |
vmovups %zmm3, 1136(%rsp) # 64-byte Spill | |
vpermt2ps %zmm19, %zmm0, %zmm13 | |
vmovshdup %xmm13, %xmm0 # xmm0 = xmm13[1,1,3,3] | |
vbroadcastsd %xmm0, %zmm0 | |
vfmadd213ps %zmm21, %zmm11, %zmm0 # zmm0 = (zmm11 * zmm0) + zmm21 | |
vmovups %zmm0, 352(%rsp) # 64-byte Spill | |
vshufps $255, %xmm13, %xmm13, %xmm0 # xmm0 = xmm13[3,3,3,3] | |
vbroadcastsd %xmm0, %zmm20 | |
vshufps $170, %xmm13, %xmm13, %xmm0 # xmm0 = xmm13[2,2,2,2] | |
vfmadd213ps %zmm12, %zmm11, %zmm20 # zmm20 = (zmm11 * zmm20) + zmm12 | |
vbroadcastsd %xmm0, %zmm12 | |
vshufps $170, %ymm2, %ymm2, %ymm0 # ymm0 = ymm2[2,2,2,2,6,6,6,6] | |
vshuff64x2 $85, %zmm0, %zmm0, %zmm15 # zmm15 = zmm0[2,3,2,3,2,3,2,3] | |
vshufps $85, %ymm13, %ymm13, %ymm0 # ymm0 = ymm13[1,1,1,1,5,5,5,5] | |
vshuff64x2 $85, %zmm0, %zmm0, %zmm16 # zmm16 = zmm0[2,3,2,3,2,3,2,3] | |
vextractf128 $1, %ymm13, %xmm0 | |
vfmadd213ps %zmm17, %zmm11, %zmm12 # zmm12 = (zmm11 * zmm12) + zmm17 | |
vmovaps %zmm30, %zmm17 | |
vbroadcastss %xmm3, %zmm13 | |
vmovaps %zmm3, %zmm30 | |
vfmadd213ps %zmm26, %zmm11, %zmm15 # zmm15 = (zmm11 * zmm15) + zmm26 | |
vbroadcastss %xmm29, %zmm26 | |
vfmadd213ps %zmm28, %zmm11, %zmm16 # zmm16 = (zmm11 * zmm16) + zmm28 | |
.loc 1 221 20 # 03-matrix-multiplication-cpu.py:221:20 | |
vmovups (%rdi,%rax), %zmm28 {%k2} {z} | |
movq -80(%rsp), %rax # 8-byte Reload | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vbroadcastss %xmm0, %zmm8 | |
vinsertps $76, %xmm17, %xmm27, %xmm0 # xmm0 = xmm17[1],xmm27[1],zero,zero | |
vfmadd213ps 288(%rsp), %zmm11, %zmm8 # 64-byte Folded Reload | |
# zmm8 = (zmm11 * zmm8) + mem | |
vfmadd213ps %zmm22, %zmm11, %zmm13 # zmm13 = (zmm11 * zmm13) + zmm22 | |
vshufps $212, %xmm10, %xmm0, %xmm0 # xmm0 = xmm0[0,1],xmm10[1,3] | |
vmovups 1264(%rsp), %zmm10 # 64-byte Reload | |
vfmadd213ps %zmm14, %zmm11, %zmm26 # zmm26 = (zmm11 * zmm26) + zmm14 | |
vinsertps $112, %xmm7, %xmm0, %xmm0 # xmm0 = xmm0[0,1,2],xmm7[1] | |
vmovups 96(%rsp), %zmm7 # 64-byte Reload | |
.loc 1 221 20 # 03-matrix-multiplication-cpu.py:221:20 | |
vmovups (%rdi,%rax), %zmm24 {%k2} {z} | |
movq -88(%rsp), %rax # 8-byte Reload | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vbroadcastss %xmm28, %zmm1 | |
vfmadd213ps 224(%rsp), %zmm11, %zmm1 # 64-byte Folded Reload | |
# zmm1 = (zmm11 * zmm1) + mem | |
vmovups %zmm28, 1328(%rsp) # 64-byte Spill | |
vmovshdup %xmm10, %xmm2 # xmm2 = xmm10[1,1,3,3] | |
.loc 1 221 20 # 03-matrix-multiplication-cpu.py:221:20 | |
vmovups (%rdi,%rax), %zmm21 {%k2} {z} | |
movq -72(%rsp), %rax # 8-byte Reload | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vinsertf128 $1, %xmm2, %ymm0, %ymm2 | |
vblendps $240, %ymm2, %ymm0, %ymm2 # ymm2 = ymm0[0,1,2,3],ymm2[4,5,6,7] | |
vinsertf128 $1, %xmm9, %ymm0, %ymm0 | |
vbroadcastss %xmm24, %zmm14 | |
vfmadd213ps 32(%rsp), %zmm11, %zmm14 # 64-byte Folded Reload | |
# zmm14 = (zmm11 * zmm14) + mem | |
vmovups %zmm24, 288(%rsp) # 64-byte Spill | |
vblendps $34, %ymm0, %ymm2, %ymm0 # ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7] | |
vshufps $85, %xmm7, %xmm7, %xmm2 # xmm2 = xmm7[1,1,1,1] | |
.loc 1 221 20 # 03-matrix-multiplication-cpu.py:221:20 | |
vmovups (%rdi,%rax), %zmm6 {%k2} {z} | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vinsertf128 $1, %xmm2, %ymm0, %ymm2 | |
movq 528(%rsp), %rax # 8-byte Reload | |
.loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22 | |
addq $64, %rdi | |
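# rdi advances 64 bytes = 16 floats per iteration, presumably the | |
# BLOCK_SIZE_K pointer bump from source line 217. | |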
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vbroadcastss %xmm21, %zmm18 | |
vfmadd213ps 560(%rsp), %zmm11, %zmm18 # 64-byte Folded Reload | |
# zmm18 = (zmm11 * zmm18) + mem | |
vmovups %zmm21, 416(%rsp) # 64-byte Spill | |
vbroadcastss %xmm6, %zmm3 | |
vfmadd213ps 160(%rsp), %zmm11, %zmm3 # 64-byte Folded Reload | |
# zmm3 = (zmm11 * zmm3) + mem | |
vblendps $192, %ymm2, %ymm0, %ymm11 # ymm11 = ymm0[0,1,2,3,4,5],ymm2[6,7] | |
vmovdqu64 1200(%rsp), %zmm0 # 64-byte Reload | |
vmovups %zmm6, 560(%rsp) # 64-byte Spill | |
.loc 1 222 51 # 03-matrix-multiplication-cpu.py:222:51 | |
vpcmpgtd .LCPI0_108(%rip){1to16}, %zmm0, %k1 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vbroadcastsd %xmm4, %ymm0 | |
vmovups 1648(%rsp), %zmm4 # 64-byte Reload | |
vblendps $128, %ymm0, %ymm11, %ymm0 # ymm0 = ymm11[0,1,2,3,4,5,6],ymm0[7] | |
vshufps $255, %ymm0, %ymm0, %ymm2 # ymm2 = ymm0[3,3,3,3,7,7,7,7] | |
vshuff64x2 $85, %zmm2, %zmm2, %zmm27 # zmm27 = zmm2[2,3,2,3,2,3,2,3] | |
vmovaps %zmm19, %zmm2 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vmovups (%rsi,%rax), %zmm22 {%k1} {z} | |
movq -64(%rsp), %rax # 8-byte Reload | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd213ps %zmm5, %zmm22, %zmm27 # zmm27 = (zmm22 * zmm27) + zmm5 | |
vshufps $170, %ymm11, %ymm11, %ymm5 # ymm5 = ymm11[2,2,2,2,6,6,6,6] | |
vmovaps .LCPI0_110(%rip), %zmm11 # zmm11 = [0,1,2,3,4,5,6,7,17,u,u,u,u,u,u,u] | |
vshuff64x2 $85, %zmm5, %zmm5, %zmm5 # zmm5 = zmm5[2,3,2,3,2,3,2,3] | |
vfmadd213ps %zmm15, %zmm22, %zmm5 # zmm5 = (zmm22 * zmm5) + zmm15 | |
vmovaps .LCPI0_112(%rip), %zmm15 # zmm15 = [0,1,2,3,4,5,6,7,8,9,17,u,u,u,u,u] | |
vpermt2ps %zmm4, %zmm11, %zmm0 | |
vmovaps .LCPI0_111(%rip), %zmm11 # zmm11 = [0,1,2,3,4,5,6,7,8,17,u,u,u,u,u,u] | |
vpermt2ps %zmm19, %zmm11, %zmm0 | |
vmovaps %zmm0, %zmm11 | |
vpermt2ps %zmm29, %zmm15, %zmm11 | |
vextractf128 $1, %ymm11, %xmm11 | |
vbroadcastss %xmm11, %zmm11 | |
vfmadd213ps %zmm8, %zmm22, %zmm11 # zmm11 = (zmm22 * zmm11) + zmm8 | |
vshufps $85, %ymm0, %ymm0, %ymm8 # ymm8 = ymm0[1,1,1,1,5,5,5,5] | |
vshuff64x2 $85, %zmm8, %zmm8, %zmm15 # zmm15 = zmm8[2,3,2,3,2,3,2,3] | |
vshufps $170, %xmm0, %xmm0, %xmm8 # xmm8 = xmm0[2,2,2,2] | |
vbroadcastsd %xmm8, %zmm17 | |
vshufps $255, %xmm0, %xmm0, %xmm8 # xmm8 = xmm0[3,3,3,3] | |
vmovshdup %xmm0, %xmm0 # xmm0 = xmm0[1,1,3,3] | |
vbroadcastsd %xmm0, %zmm9 | |
vmovshdup %xmm6, %xmm0 # xmm0 = xmm6[1,1,3,3] | |
vbroadcastsd %xmm8, %zmm19 | |
vfmadd213ps %zmm16, %zmm22, %zmm15 # zmm15 = (zmm22 * zmm15) + zmm16 | |
vmovaps %zmm2, %zmm6 | |
vfmadd213ps 352(%rsp), %zmm22, %zmm9 # 64-byte Folded Reload | |
# zmm9 = (zmm22 * zmm9) + mem | |
vmovups %zmm6, 1584(%rsp) # 64-byte Spill | |
vbroadcastsd %xmm0, %zmm0 | |
vfmadd213ps %zmm12, %zmm22, %zmm17 # zmm17 = (zmm22 * zmm17) + zmm12 | |
vfmadd213ps %zmm20, %zmm22, %zmm19 # zmm19 = (zmm22 * zmm19) + zmm20 | |
vfmadd213ps %zmm3, %zmm22, %zmm0 # zmm0 = (zmm22 * zmm0) + zmm3 | |
vmovshdup %xmm21, %xmm3 # xmm3 = xmm21[1,1,3,3] | |
vbroadcastsd %xmm3, %zmm8 | |
vmovshdup %xmm24, %xmm3 # xmm3 = xmm24[1,1,3,3] | |
vmovups 1776(%rsp), %zmm24 # 64-byte Reload | |
vbroadcastsd %xmm3, %zmm12 | |
vmovshdup %xmm28, %xmm3 # xmm3 = xmm28[1,1,3,3] | |
vmovaps %zmm4, %zmm28 | |
vpermpd $85, %ymm10, %ymm4 # ymm4 = ymm10[1,1,1,1] | |
vfmadd213ps %zmm18, %zmm22, %zmm8 # zmm8 = (zmm22 * zmm8) + zmm18 | |
vfmadd213ps %zmm14, %zmm22, %zmm12 # zmm12 = (zmm22 * zmm12) + zmm14 | |
vbroadcastsd %xmm3, %zmm14 | |
vmovups 1840(%rsp), %zmm3 # 64-byte Reload | |
vfmadd213ps %zmm1, %zmm22, %zmm14 # zmm14 = (zmm22 * zmm14) + zmm1 | |
vmovshdup %xmm30, %xmm1 # xmm1 = xmm30[1,1,3,3] | |
vmovaps %zmm29, %zmm30 | |
vbroadcastsd %xmm1, %zmm16 | |
vmovshdup %xmm29, %xmm1 # xmm1 = xmm29[1,1,3,3] | |
vmovaps %zmm28, %zmm29 | |
vbroadcastsd %xmm1, %zmm18 | |
vmovshdup %xmm2, %xmm1 # xmm1 = xmm2[1,1,3,3] | |
vmovups 1904(%rsp), %zmm2 # 64-byte Reload | |
vfmadd213ps %zmm13, %zmm22, %zmm16 # zmm16 = (zmm22 * zmm16) + zmm13 | |
vbroadcastsd %xmm1, %zmm13 | |
vmovshdup %xmm28, %xmm1 # xmm1 = xmm28[1,1,3,3] | |
vbroadcastsd %xmm1, %zmm20 | |
vfmadd213ps %zmm26, %zmm22, %zmm18 # zmm18 = (zmm22 * zmm18) + zmm26 | |
vmovdqu64 1200(%rsp), %zmm26 # 64-byte Reload | |
vfmadd213ps %zmm23, %zmm22, %zmm13 # zmm13 = (zmm22 * zmm13) + zmm23 | |
.loc 1 222 51 # 03-matrix-multiplication-cpu.py:222:51 | |
vpcmpgtd .LCPI0_9(%rip){1to16}, %zmm26, %k1 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vmovshdup %xmm3, %xmm1 # xmm1 = xmm3[1,1,3,3] | |
vfmadd213ps %zmm25, %zmm22, %zmm20 # zmm20 = (zmm22 * zmm20) + zmm25 | |
vmovaps %zmm3, %zmm25 | |
vbroadcastss %xmm1, %zmm21 | |
vunpckhps %xmm24, %xmm3, %xmm1 # xmm1 = xmm3[2],xmm24[2],xmm3[3],xmm24[3] | |
vblendps $12, 1968(%rsp), %xmm1, %xmm1 # 16-byte Folded Reload | |
# xmm1 = xmm1[0,1],mem[2,3] | |
vpermilps $170, 688(%rsp), %xmm3 # 16-byte Folded Reload | |
# xmm3 = mem[2,2,2,2] | |
vfmadd213ps %zmm31, %zmm22, %zmm21 # zmm21 = (zmm22 * zmm21) + zmm31 | |
vmovaps %zmm6, %zmm31 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vmovups (%rsi,%rax), %zmm22 {%k1} {z} | |
.loc 1 222 51 is_stmt 0 # 03-matrix-multiplication-cpu.py:222:51 | |
vpcmpgtd .LCPI0_107(%rip){1to16}, %zmm26, %k1 | |
movq 536(%rsp), %rax # 8-byte Reload | |
.loc 1 224 35 is_stmt 1 # 03-matrix-multiplication-cpu.py:224:35 | |
vinsertps $176, %xmm2, %xmm1, %xmm1 # xmm1 = xmm1[0,1,2],xmm2[2] | |
vinsertf128 $1, %xmm3, %ymm0, %ymm3 | |
vblendps $240, %ymm4, %ymm1, %ymm4 # ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7] | |
vblendps $32, %ymm3, %ymm4, %ymm3 # ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] | |
vinsertf128 $1, %xmm7, %ymm1, %ymm4 | |
vblendps $204, %ymm4, %ymm3, %ymm1 # ymm1 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] | |
vmovsldup 624(%rsp), %xmm3 # 16-byte Folded Reload | |
# xmm3 = mem[0,0,2,2] | |
vinsertf128 $1, %xmm3, %ymm0, %ymm3 | |
vblendps $128, %ymm3, %ymm1, %ymm1 # ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] | |
vmovapd .LCPI0_113(%rip), %zmm3 # zmm3 = [0,1,2,3,9,u,u,u] | |
vmovaps %zmm1, %zmm23 | |
vshufps $255, %ymm1, %ymm1, %ymm1 # ymm1 = ymm1[3,3,3,3,7,7,7,7] | |
vpermt2pd %zmm28, %zmm3, %zmm23 | |
vmovaps .LCPI0_114(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,18,u,u,u,u,u,u] | |
vpermt2ps %zmm6, %zmm3, %zmm23 | |
vmovshdup %xmm23, %xmm3 # xmm3 = xmm23[1,1,3,3] | |
vshufps $170, %xmm23, %xmm23, %xmm7 # xmm7 = xmm23[2,2,2,2] | |
vshufps $255, %xmm23, %xmm23, %xmm6 # xmm6 = xmm23[3,3,3,3] | |
vbroadcastsd %xmm3, %zmm28 | |
vbroadcastsd %xmm7, %zmm7 | |
vbroadcastsd %xmm6, %zmm6 | |
vfmadd213ps %zmm9, %zmm22, %zmm28 # zmm28 = (zmm22 * zmm28) + zmm9 | |
vshufps $85, %ymm23, %ymm23, %ymm9 # ymm9 = ymm23[1,1,1,1,5,5,5,5] | |
vfmadd213ps %zmm17, %zmm22, %zmm7 # zmm7 = (zmm22 * zmm7) + zmm17 | |
vshuff64x2 $85, %zmm1, %zmm1, %zmm17 # zmm17 = zmm1[2,3,2,3,2,3,2,3] | |
vshufps $170, %ymm4, %ymm4, %ymm1 # ymm1 = ymm4[2,2,2,2,6,6,6,6] | |
vfmadd213ps %zmm19, %zmm22, %zmm6 # zmm6 = (zmm22 * zmm6) + zmm19 | |
vmovups 1968(%rsp), %zmm4 # 64-byte Reload | |
vshuff64x2 $85, %zmm9, %zmm9, %zmm10 # zmm10 = zmm9[2,3,2,3,2,3,2,3] | |
vextractf32x4 $1, %ymm23, %xmm9 | |
vshuff64x2 $85, %zmm1, %zmm1, %zmm19 # zmm19 = zmm1[2,3,2,3,2,3,2,3] | |
vfmadd213ps %zmm27, %zmm22, %zmm17 # zmm17 = (zmm22 * zmm17) + zmm27 | |
vmovups 1328(%rsp), %zmm27 # 64-byte Reload | |
vbroadcastss %xmm9, %zmm3 | |
vfmadd213ps %zmm15, %zmm22, %zmm10 # zmm10 = (zmm22 * zmm10) + zmm15 | |
vfmadd213ps %zmm5, %zmm22, %zmm19 # zmm19 = (zmm22 * zmm19) + zmm5 | |
vmovups 1264(%rsp), %zmm5 # 64-byte Reload | |
vfmadd213ps %zmm11, %zmm22, %zmm3 # zmm3 = (zmm22 * zmm3) + zmm11 | |
vmovups %zmm3, 32(%rsp) # 64-byte Spill | |
vmovaps %zmm25, %zmm3 | |
vshufpd $1, %xmm3, %xmm3, %xmm1 # xmm1 = xmm3[1,0] | |
vmovups 560(%rsp), %zmm25 # 64-byte Reload | |
vbroadcastss %xmm1, %zmm23 | |
vshufps $170, %xmm29, %xmm29, %xmm1 # xmm1 = xmm29[2,2,2,2] | |
vbroadcastsd %xmm1, %zmm9 | |
vshufps $170, %xmm31, %xmm31, %xmm1 # xmm1 = xmm31[2,2,2,2] | |
vmovups 416(%rsp), %zmm31 # 64-byte Reload | |
vfmadd213ps %zmm21, %zmm22, %zmm23 # zmm23 = (zmm22 * zmm23) + zmm21 | |
vmovaps %zmm29, %zmm21 | |
vmovups 1136(%rsp), %zmm29 # 64-byte Reload | |
vbroadcastsd %xmm1, %zmm11 | |
vshufps $170, %xmm30, %xmm30, %xmm1 # xmm1 = xmm30[2,2,2,2] | |
vmovups 288(%rsp), %zmm30 # 64-byte Reload | |
vfmadd213ps %zmm20, %zmm22, %zmm9 # zmm9 = (zmm22 * zmm9) + zmm20 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vmovups (%rsi,%rax), %zmm20 {%k1} {z} | |
.loc 1 222 51 is_stmt 0 # 03-matrix-multiplication-cpu.py:222:51 | |
vpcmpgtd .LCPI0_8(%rip){1to16}, %zmm26, %k1 | |
movq -56(%rsp), %rax # 8-byte Reload | |
.loc 1 224 35 is_stmt 1 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd213ps %zmm13, %zmm22, %zmm11 # zmm11 = (zmm22 * zmm11) + zmm13 | |
vbroadcastsd %xmm1, %zmm13 | |
vfmadd213ps %zmm18, %zmm22, %zmm13 # zmm13 = (zmm22 * zmm13) + zmm18 | |
vshufps $170, %xmm29, %xmm29, %xmm1 # xmm1 = xmm29[2,2,2,2] | |
vbroadcastsd %xmm1, %zmm15 | |
vshufps $170, %xmm27, %xmm27, %xmm1 # xmm1 = xmm27[2,2,2,2] | |
vfmadd213ps %zmm16, %zmm22, %zmm15 # zmm15 = (zmm22 * zmm15) + zmm16 | |
vbroadcastsd %xmm1, %zmm16 | |
vshufps $170, %xmm30, %xmm30, %xmm1 # xmm1 = xmm30[2,2,2,2] | |
vfmadd213ps %zmm14, %zmm22, %zmm16 # zmm16 = (zmm22 * zmm16) + zmm14 | |
vbroadcastsd %xmm1, %zmm14 | |
vshufps $170, %xmm31, %xmm31, %xmm1 # xmm1 = xmm31[2,2,2,2] | |
vfmadd213ps %zmm12, %zmm22, %zmm14 # zmm14 = (zmm22 * zmm14) + zmm12 | |
vbroadcastsd %xmm1, %zmm12 | |
vshufps $170, %xmm25, %xmm25, %xmm1 # xmm1 = xmm25[2,2,2,2] | |
vfmadd213ps %zmm8, %zmm22, %zmm12 # zmm12 = (zmm22 * zmm12) + zmm8 | |
vbroadcastsd %xmm1, %zmm8 | |
vshufps $255, %xmm5, %xmm5, %xmm1 # xmm1 = xmm5[3,3,3,3] | |
vfmadd213ps %zmm0, %zmm22, %zmm8 # zmm8 = (zmm22 * zmm8) + zmm0 | |
vshufps $51, %xmm3, %xmm24, %xmm0 # xmm0 = xmm24[3,0],xmm3[3,0] | |
vmovups 688(%rsp), %zmm3 # 64-byte Reload | |
vshufps $242, %xmm4, %xmm0, %xmm0 # xmm0 = xmm0[2,0],xmm4[3,3] | |
vblendps $8, %xmm2, %xmm0, %xmm0 # xmm0 = xmm0[0,1,2],xmm2[3] | |
vinsertf128 $1, %xmm1, %ymm0, %ymm1 | |
vblendps $240, %ymm1, %ymm0, %ymm1 # ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] | |
vpermpd $85, %ymm3, %ymm2 # ymm2 = ymm3[1,1,1,1] | |
vblendps $32, %ymm2, %ymm1, %ymm1 # ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] | |
vmovshdup 96(%rsp), %xmm2 # 16-byte Folded Reload | |
# xmm2 = mem[1,1,3,3] | |
vinsertf128 $1, %xmm2, %ymm0, %ymm2 | |
vblendps $192, %ymm2, %ymm1, %ymm1 # ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] | |
vshufps $170, %ymm1, %ymm1, %ymm2 # ymm2 = ymm1[2,2,2,2,6,6,6,6] | |
vshuff64x2 $85, %zmm2, %zmm2, %zmm24 # zmm24 = zmm2[2,3,2,3,2,3,2,3] | |
vmovups 624(%rsp), %zmm2 # 64-byte Reload | |
vfmadd213ps %zmm19, %zmm20, %zmm24 # zmm24 = (zmm20 * zmm24) + zmm19 | |
vinsertf128 $1, %xmm2, %ymm0, %ymm0 | |
vblendps $136, %ymm0, %ymm1, %ymm1 # ymm1 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] | |
vshufps $255, %ymm0, %ymm0, %ymm0 # ymm0 = ymm0[3,3,3,3,7,7,7,7] | |
vshuff64x2 $85, %zmm0, %zmm0, %zmm18 # zmm18 = zmm0[2,3,2,3,2,3,2,3] | |
vmovaps .LCPI0_115(%rip), %zmm0 # zmm0 = [0,1,2,3,4,5,6,7,19,u,u,u,u,u,u,u] | |
vfmadd213ps %zmm17, %zmm20, %zmm18 # zmm18 = (zmm20 * zmm18) + zmm17 | |
vmovups 1584(%rsp), %zmm17 # 64-byte Reload | |
vpermt2ps %zmm21, %zmm0, %zmm1 | |
vmovaps .LCPI0_116(%rip), %zmm0 # zmm0 = [0,1,2,3,4,5,6,7,8,19,u,u,u,u,u,u] | |
vpermt2ps %zmm17, %zmm0, %zmm1 | |
vshufps $85, %ymm1, %ymm1, %ymm0 # ymm0 = ymm1[1,1,1,1,5,5,5,5] | |
vshuff64x2 $85, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[2,3,2,3,2,3,2,3] | |
vfmadd213ps %zmm10, %zmm20, %zmm0 # zmm0 = (zmm20 * zmm0) + zmm10 | |
vmovups %zmm0, 160(%rsp) # 64-byte Spill | |
vshufps $170, %xmm1, %xmm1, %xmm0 # xmm0 = xmm1[2,2,2,2] | |
vbroadcastsd %xmm0, %zmm22 | |
vshufps $255, %xmm1, %xmm1, %xmm0 # xmm0 = xmm1[3,3,3,3] | |
vfmadd213ps %zmm7, %zmm20, %zmm22 # zmm22 = (zmm20 * zmm22) + zmm7 | |
vbroadcastsd %xmm0, %zmm7 | |
vmovshdup %xmm1, %xmm0 # xmm0 = xmm1[1,1,3,3] | |
vbroadcastsd %xmm0, %zmm10 | |
vshufps $255, %xmm25, %xmm25, %xmm0 # xmm0 = xmm25[3,3,3,3] | |
vmovaps %zmm29, %zmm25 | |
vmovups 1840(%rsp), %zmm29 # 64-byte Reload | |
vbroadcastsd %xmm0, %zmm0 | |
vfmadd213ps %zmm6, %zmm20, %zmm7 # zmm7 = (zmm20 * zmm7) + zmm6 | |
vmovups 1904(%rsp), %zmm6 # 64-byte Reload | |
vfmadd213ps %zmm28, %zmm20, %zmm10 # zmm10 = (zmm20 * zmm10) + zmm28 | |
vfmadd213ps %zmm8, %zmm20, %zmm0 # zmm0 = (zmm20 * zmm0) + zmm8 | |
vmovups %zmm0, 224(%rsp) # 64-byte Spill | |
vshufps $255, %xmm31, %xmm31, %xmm0 # xmm0 = xmm31[3,3,3,3] | |
vmovaps %zmm21, %zmm31 | |
vbroadcastsd %xmm0, %zmm19 | |
vshufps $255, %xmm30, %xmm30, %xmm0 # xmm0 = xmm30[3,3,3,3] | |
vmovaps %zmm17, %zmm30 | |
vbroadcastsd %xmm0, %zmm8 | |
vshufps $255, %xmm27, %xmm27, %xmm0 # xmm0 = xmm27[3,3,3,3] | |
vmovups 1712(%rsp), %zmm27 # 64-byte Reload | |
vfmadd213ps %zmm12, %zmm20, %zmm19 # zmm19 = (zmm20 * zmm19) + zmm12 | |
vbroadcastsd %xmm0, %zmm12 | |
vshufps $255, %xmm25, %xmm25, %xmm0 # xmm0 = xmm25[3,3,3,3] | |
vfmadd213ps %zmm14, %zmm20, %zmm8 # zmm8 = (zmm20 * zmm8) + zmm14 | |
vbroadcastsd %xmm0, %zmm14 | |
vfmadd213ps %zmm16, %zmm20, %zmm12 # zmm12 = (zmm20 * zmm12) + zmm16 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vmovups (%rsi,%rax), %zmm16 {%k1} {z} | |
movq (%rsp), %rax # 8-byte Reload | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd213ps %zmm15, %zmm20, %zmm14 # zmm14 = (zmm20 * zmm14) + zmm15 | |
vshufps $255, %xmm27, %xmm27, %xmm0 # xmm0 = xmm27[3,3,3,3] | |
vbroadcastsd %xmm0, %zmm15 | |
vshufps $255, %xmm17, %xmm17, %xmm0 # xmm0 = xmm17[3,3,3,3] | |
vfmadd213ps %zmm13, %zmm20, %zmm15 # zmm15 = (zmm20 * zmm15) + zmm13 | |
vbroadcastsd %xmm0, %zmm13 | |
vshufps $255, %xmm21, %xmm21, %xmm0 # xmm0 = xmm21[3,3,3,3] | |
vfmadd213ps %zmm11, %zmm20, %zmm13 # zmm13 = (zmm20 * zmm13) + zmm11 | |
vbroadcastsd %xmm0, %zmm11 | |
vshufps $255, %xmm29, %xmm29, %xmm0 # xmm0 = xmm29[3,3,3,3] | |
vbroadcastss %xmm0, %zmm17 | |
vmovaps .LCPI0_117(%rip), %zmm0 # zmm0 = [0,1,2,3,4,5,6,7,8,9,19,u,u,u,u,u] | |
vfmadd213ps %zmm9, %zmm20, %zmm11 # zmm11 = (zmm20 * zmm11) + zmm9 | |
vfmadd213ps %zmm23, %zmm20, %zmm17 # zmm17 = (zmm20 * zmm17) + zmm23 | |
vpermt2ps %zmm27, %zmm0, %zmm1 | |
vextractf128 $1, %ymm1, %xmm0 | |
vbroadcastss %xmm0, %zmm9 | |
vfmadd213ps 32(%rsp), %zmm20, %zmm9 # 64-byte Folded Reload | |
# zmm9 = (zmm20 * zmm9) + mem | |
vmovups 1776(%rsp), %zmm20 # 64-byte Reload | |
vunpcklps %ymm20, %ymm29, %ymm0 # ymm0 = ymm29[0],ymm20[0],ymm29[1],ymm20[1],ymm29[4],ymm20[4],ymm29[5],ymm20[5] | |
vextractf128 $1, %ymm0, %xmm1 | |
vextractf128 $1, %ymm4, %xmm0 | |
vextractf128 $1, %ymm6, %xmm4 | |
vmovaps %xmm0, 880(%rsp) # 16-byte Spill | |
vmovlhps %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[0] | |
vmovaps %xmm4, 2032(%rsp) # 16-byte Spill | |
vinsertps $48, %xmm4, %xmm0, %xmm0 # xmm0 = xmm0[0,1,2],xmm4[0] | |
vextractf128 $1, %ymm3, %xmm4 | |
vmovaps %zmm31, %zmm3 | |
vblendps $240, %ymm5, %ymm0, %ymm0 # ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] | |
vmovaps %zmm30, %zmm5 | |
vbroadcastss %xmm4, %ymm4 | |
vblendps $32, %ymm4, %ymm0, %ymm4 # ymm4 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] | |
vmovapd .LCPI0_118(%rip), %ymm0 # ymm0 = [0,1,2,6] | |
vpermt2pd 96(%rsp), %ymm0, %ymm4 # 32-byte Folded Reload | |
vextractf128 $1, %ymm2, %xmm0 | |
vbroadcastss %xmm0, %zmm0 | |
vfmadd231ps %zmm0, %zmm16, %zmm18 # zmm18 = (zmm16 * zmm0) + zmm18 | |
vblendps $128, %ymm0, %ymm4, %ymm0 # ymm0 = ymm4[0,1,2,3,4,5,6],ymm0[7] | |
vshufps $170, %ymm4, %ymm4, %ymm4 # ymm4 = ymm4[2,2,2,2,6,6,6,6] | |
vshuff64x2 $85, %zmm4, %zmm4, %zmm21 # zmm21 = zmm4[2,3,2,3,2,3,2,3] | |
vbroadcastss %xmm1, %zmm4 | |
vextractf128 $1, %ymm3, %xmm1 | |
vshuff64x2 $212, %zmm31, %zmm0, %zmm0 # zmm0 = zmm0[0,1,2,3],zmm31[2,3,6,7] | |
vmovups 288(%rsp), %zmm3 # 64-byte Reload | |
vmovups 784(%rsp), %ymm31 # 32-byte Reload | |
vfmadd213ps %zmm24, %zmm16, %zmm21 # zmm21 = (zmm16 * zmm21) + zmm24 | |
vbroadcastss %xmm1, %zmm24 | |
vextractf128 $1, %ymm5, %xmm1 | |
vfmadd213ps %zmm17, %zmm16, %zmm4 # zmm4 = (zmm16 * zmm4) + zmm17 | |
vbroadcastss %xmm1, %zmm23 | |
vextractf32x4 $1, %ymm27, %xmm1 | |
vfmadd213ps %zmm11, %zmm16, %zmm24 # zmm24 = (zmm16 * zmm24) + zmm11 | |
vbroadcastss %xmm1, %zmm2 | |
vmovaps .LCPI0_119(%rip), %zmm1 # zmm1 = [0,1,2,3,4,5,6,7,8,20,u,u,u,u,u,u] | |
vfmadd213ps %zmm13, %zmm16, %zmm23 # zmm23 = (zmm16 * zmm23) + zmm13 | |
vextractf128 $1, %ymm3, %xmm5 | |
vmovups 416(%rsp), %zmm3 # 64-byte Reload | |
vfmadd213ps %zmm15, %zmm16, %zmm2 # zmm2 = (zmm16 * zmm2) + zmm15 | |
vbroadcastss %xmm5, %zmm5 | |
vfmadd213ps %zmm8, %zmm16, %zmm5 # zmm5 = (zmm16 * zmm5) + zmm8 | |
vmovupd 2288(%rsp), %zmm8 # 64-byte Reload | |
vpermt2ps %zmm30, %zmm1, %zmm0 | |
vmovshdup %xmm0, %xmm1 # xmm1 = xmm0[1,1,3,3] | |
vbroadcastsd %xmm1, %zmm28 | |
vextractf32x4 $1, %ymm25, %xmm1 | |
vmovaps %zmm2, %zmm25 | |
vbroadcastss %xmm1, %zmm26 | |
vshufps $255, %xmm0, %xmm0, %xmm1 # xmm1 = xmm0[3,3,3,3] | |
vfmadd213ps %zmm10, %zmm16, %zmm28 # zmm28 = (zmm16 * zmm28) + zmm10 | |
vinsertps $76, %xmm24, %xmm23, %xmm10 # xmm10 = xmm24[1],xmm23[1],zero,zero | |
vbroadcastsd %xmm1, %zmm17 | |
vmovups 1328(%rsp), %zmm1 # 64-byte Reload | |
vfmadd213ps %zmm14, %zmm16, %zmm26 # zmm26 = (zmm16 * zmm26) + zmm14 | |
vfmadd213ps %zmm7, %zmm16, %zmm17 # zmm17 = (zmm16 * zmm17) + zmm7 | |
vshufps $85, %ymm0, %ymm0, %ymm7 # ymm7 = ymm0[1,1,1,1,5,5,5,5] | |
vshuff64x2 $85, %zmm7, %zmm7, %zmm27 # zmm27 = zmm7[2,3,2,3,2,3,2,3] | |
vextractf128 $1, %ymm3, %xmm7 | |
vpermpd $170, %ymm6, %ymm3 # ymm3 = ymm6[2,2,2,2] | |
vunpcklps %xmm23, %xmm24, %xmm6 # xmm6 = xmm24[0],xmm23[0],xmm24[1],xmm23[1] | |
vfmadd213ps 160(%rsp), %zmm16, %zmm27 # 64-byte Folded Reload | |
# zmm27 = (zmm16 * zmm27) + mem | |
vbroadcastss %xmm7, %zmm13 | |
vfmadd213ps %zmm19, %zmm16, %zmm13 # zmm13 = (zmm16 * zmm13) + zmm19 | |
vunpcklps %zmm23, %zmm24, %zmm19 # zmm19 = zmm24[0],zmm23[0],zmm24[1],zmm23[1],zmm24[4],zmm23[4],zmm24[5],zmm23[5],zmm24[8],zmm23[8],zmm24[9],zmm23[9],zmm24[12],zmm23[12],zmm24[13],zmm23[13] | |
vextractf128 $1, %ymm1, %xmm1 | |
vbroadcastss %xmm1, %zmm14 | |
vshufps $170, %xmm0, %xmm0, %xmm1 # xmm1 = xmm0[2,2,2,2] | |
vextractf128 $1, %ymm0, %xmm0 | |
vbroadcastsd %xmm1, %zmm1 | |
vbroadcastss %xmm0, %zmm7 | |
vmovups 560(%rsp), %zmm0 # 64-byte Reload | |
vfmadd213ps %zmm12, %zmm16, %zmm14 # zmm14 = (zmm16 * zmm14) + zmm12 | |
vmovapd %zmm8, %zmm12 | |
vfmadd213ps %zmm22, %zmm16, %zmm1 # zmm1 = (zmm16 * zmm1) + zmm22 | |
vfmadd213ps %zmm9, %zmm16, %zmm7 # zmm7 = (zmm16 * zmm7) + zmm9 | |
vinsertps $76, %xmm4, %xmm28, %xmm9 # xmm9 = xmm4[1],xmm28[1],zero,zero | |
vextractf128 $1, %ymm0, %xmm0 | |
vbroadcastss %xmm0, %zmm11 | |
vmovaps %ymm29, %ymm0 | |
vpermt2ps %ymm20, %ymm31, %ymm0 | |
vfmadd213ps 224(%rsp), %zmm16, %zmm11 # 64-byte Folded Reload | |
# zmm11 = (zmm16 * zmm11) + mem | |
vmovaps %zmm13, %zmm16 | |
vmovaps .LCPI0_18(%rip), %zmm29 # zmm29 = [1,17,2,3,5,21,6,7,9,25,10,11,13,29,14,15] | |
vmovaps .LCPI0_15(%rip), %zmm20 # zmm20 = [3,19,2,3,7,23,6,7,11,27,10,11,15,31,14,15] | |
vextractf128 $1, %ymm0, %xmm0 | |
vshufps $212, 880(%rsp), %xmm0, %xmm0 # 16-byte Folded Reload | |
# xmm0 = xmm0[0,1],mem[1,3] | |
vblendps $8, %xmm3, %xmm0, %xmm0 # xmm0 = xmm0[0,1,2],xmm3[3] | |
vunpcklps %xmm5, %xmm14, %xmm3 # xmm3 = xmm14[0],xmm5[0],xmm14[1],xmm5[1] | |
vmovups %ymm0, 224(%rsp) # 32-byte Spill | |
vunpcklps %xmm11, %xmm13, %xmm0 # xmm0 = xmm13[0],xmm11[0],xmm13[1],xmm11[1] | |
vpermt2pd %zmm0, %zmm8, %zmm3 | |
vunpcklps %xmm26, %xmm2, %xmm0 # xmm0 = xmm2[0],xmm26[0],xmm2[1],xmm26[1] | |
vpermt2pd %zmm0, %zmm8, %zmm6 | |
vmovaps %xmm18, %xmm0 | |
vunpcklps %xmm28, %xmm4, %xmm8 # xmm8 = xmm4[0],xmm28[0],xmm4[1],xmm28[1] | |
vmovapd %zmm3, %zmm6 {%k4} | |
vmovddup .LCPI0_194(%rip), %xmm3 # xmm3 = [4,0,4,0] | |
# xmm3 = mem[0,0] | |
vpermt2ps %xmm21, %xmm3, %xmm0 | |
vunpcklps %xmm27, %xmm7, %xmm3 # xmm3 = xmm7[0],xmm27[0],xmm7[1],xmm27[1] | |
vinsertf128 $1, %xmm0, %ymm0, %ymm0 | |
vinsertf128 $1, %xmm3, %ymm0, %ymm3 | |
vblendps $192, %ymm0, %ymm3, %ymm0 # ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] | |
vmovlhps %xmm1, %xmm17, %xmm3 # xmm3 = xmm17[0],xmm1[0] | |
vshufps $36, %xmm3, %xmm8, %xmm3 # xmm3 = xmm8[0,1],xmm3[2,0] | |
vinsertps $76, %xmm14, %xmm5, %xmm8 # xmm8 = xmm14[1],xmm5[1],zero,zero | |
vblendps $15, %ymm3, %ymm0, %ymm0 # ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] | |
vinsertps $76, %xmm13, %xmm11, %xmm3 # xmm3 = xmm13[1],xmm11[1],zero,zero | |
vpermt2pd %zmm3, %zmm12, %zmm8 | |
vinsertps $76, %xmm2, %xmm26, %xmm3 # xmm3 = xmm2[1],xmm26[1],zero,zero | |
vpermt2pd %zmm3, %zmm12, %zmm10 | |
vunpcklps %xmm18, %xmm21, %xmm3 # xmm3 = xmm21[0],xmm18[0],xmm21[1],xmm18[1] | |
vinsertf128 $1, %xmm3, %ymm0, %ymm3 | |
vmovapd %zmm8, %zmm10 {%k4} | |
vinsertps $76, %xmm7, %xmm27, %xmm8 # xmm8 = xmm7[1],xmm27[1],zero,zero | |
vinsertf128 $1, %xmm8, %ymm0, %ymm8 | |
vinsertf64x4 $0, %ymm0, %zmm6, %zmm0 | |
vunpckhps %xmm23, %xmm24, %xmm6 # xmm6 = xmm24[2],xmm23[2],xmm24[3],xmm23[3] | |
vblendps $192, %ymm3, %ymm8, %ymm3 # ymm3 = ymm8[0,1,2,3,4,5],ymm3[6,7] | |
vunpcklps %xmm17, %xmm1, %xmm8 # xmm8 = xmm1[0],xmm17[0],xmm1[1],xmm17[1] | |
vmovupd %zmm0, 1008(%rsp) # 64-byte Spill | |
vblendps $3, %xmm9, %xmm8, %xmm8 # xmm8 = xmm9[0,1],xmm8[2,3] | |
vunpckhps %xmm28, %xmm4, %xmm9 # xmm9 = xmm4[2],xmm28[2],xmm4[3],xmm28[3] | |
vblendps $15, %ymm8, %ymm3, %ymm0 # ymm0 = ymm8[0,1,2,3],ymm3[4,5,6,7] | |
vunpckhps %xmm11, %xmm13, %xmm3 # xmm3 = xmm13[2],xmm11[2],xmm13[3],xmm11[3] | |
vunpckhps %xmm5, %xmm14, %xmm8 # xmm8 = xmm14[2],xmm5[2],xmm14[3],xmm5[3] | |
vpermt2pd %zmm3, %zmm12, %zmm8 | |
vunpckhps %xmm26, %xmm2, %xmm3 # xmm3 = xmm2[2],xmm26[2],xmm2[3],xmm26[3] | |
vpermt2pd %zmm3, %zmm12, %zmm6 | |
vinsertps $179, %xmm18, %xmm21, %xmm3 # xmm3 = zero,zero,xmm21[2],xmm18[2] | |
vinsertf128 $1, %xmm3, %ymm0, %ymm3 | |
vmovapd %zmm8, %zmm6 {%k4} | |
vunpckhps %xmm27, %xmm7, %xmm8 # xmm8 = xmm7[2],xmm27[2],xmm7[3],xmm27[3] | |
vinsertf128 $1, %xmm8, %ymm0, %ymm8 | |
vinsertf64x4 $0, %ymm0, %zmm10, %zmm0 | |
vmovaps .LCPI0_19(%rip), %zmm10 # zmm10 = [0,1,0,16,4,5,4,20,8,9,8,24,12,13,12,28] | |
vblendps $192, %ymm3, %ymm8, %ymm3 # ymm3 = ymm8[0,1,2,3,4,5],ymm3[6,7] | |
vinsertps $179, %xmm17, %xmm1, %xmm8 # xmm8 = zero,zero,xmm1[2],xmm17[2] | |
vmovupd %zmm0, 1520(%rsp) # 64-byte Spill | |
vblendps $3, %xmm9, %xmm8, %xmm8 # xmm8 = xmm9[0,1],xmm8[2,3] | |
vblendps $15, %ymm8, %ymm3, %ymm9 # ymm9 = ymm8[0,1,2,3],ymm3[4,5,6,7] | |
vmovapd .LCPI0_20(%rip), %zmm3 # zmm3 = [0,8,0,8,4,12,4,13] | |
vunpcklps %zmm5, %zmm14, %zmm8 # zmm8 = zmm14[0],zmm5[0],zmm14[1],zmm5[1],zmm14[4],zmm5[4],zmm14[5],zmm5[5],zmm14[8],zmm5[8],zmm14[9],zmm5[9],zmm14[12],zmm5[12],zmm14[13],zmm5[13] | |
vmovups %zmm8, 2544(%rsp) # 64-byte Spill | |
vinsertf64x4 $0, %ymm9, %zmm6, %zmm6 | |
vmovaps %zmm2, %zmm9 | |
vmovupd %zmm6, 32(%rsp) # 64-byte Spill | |
vunpckhps %zmm23, %zmm24, %zmm6 # zmm6 = zmm24[2],zmm23[2],zmm24[3],zmm23[3],zmm24[6],zmm23[6],zmm24[7],zmm23[7],zmm24[10],zmm23[10],zmm24[11],zmm23[11],zmm24[14],zmm23[14],zmm24[15],zmm23[15] | |
vpermt2ps %zmm11, %zmm10, %zmm16 | |
vpermt2ps %zmm26, %zmm10, %zmm25 | |
vmovaps %zmm10, %zmm12 | |
vpermt2pd %zmm16, %zmm3, %zmm8 | |
vshufpd $32, %zmm25, %zmm19, %zmm0 # zmm0 = zmm19[0],zmm25[0],zmm19[2],zmm25[2],zmm19[4],zmm25[5],zmm19[6],zmm25[6] | |
vmovapd %zmm8, %zmm0 {%k4} | |
vmovaps %zmm21, %zmm8 | |
vpermt2ps %zmm18, %zmm10, %zmm8 | |
vunpcklps %zmm27, %zmm7, %zmm10 # zmm10 = zmm7[0],zmm27[0],zmm7[1],zmm27[1],zmm7[4],zmm27[4],zmm7[5],zmm27[5],zmm7[8],zmm27[8],zmm7[9],zmm27[9],zmm7[12],zmm27[12],zmm7[13],zmm27[13] | |
vmovups %zmm10, 816(%rsp) # 64-byte Spill | |
vshuff64x2 $170, %zmm10, %zmm10, %zmm10 # zmm10 = zmm10[4,5,4,5,4,5,4,5] | |
vmovups %zmm8, 352(%rsp) # 64-byte Spill | |
vshuff64x2 $170, %zmm8, %zmm8, %zmm8 # zmm8 = zmm8[4,5,4,5,4,5,4,5] | |
vblendpd $8, %ymm8, %ymm10, %ymm8 # ymm8 = ymm10[0,1,2],ymm8[3] | |
vmovaps %zmm1, %zmm10 | |
vpermt2ps %zmm17, %zmm12, %zmm10 | |
vunpcklps %zmm28, %zmm4, %zmm12 # zmm12 = zmm4[0],zmm28[0],zmm4[1],zmm28[1],zmm4[4],zmm28[4],zmm4[5],zmm28[5],zmm4[8],zmm28[8],zmm4[9],zmm28[9],zmm4[12],zmm28[12],zmm4[13],zmm28[13] | |
vmovups %zmm12, 3120(%rsp) # 64-byte Spill | |
vextractf32x4 $2, %zmm12, %xmm12 | |
vmovups %zmm10, 3184(%rsp) # 64-byte Spill | |
vextractf32x4 $2, %zmm10, %xmm10 | |
vblendps $3, %xmm12, %xmm10, %xmm10 # xmm10 = xmm12[0,1],xmm10[2,3] | |
vblendpd $3, %ymm10, %ymm8, %ymm8 # ymm8 = ymm10[0,1],ymm8[2,3] | |
vunpcklps %zmm17, %zmm1, %zmm10 # zmm10 = zmm1[0],zmm17[0],zmm1[1],zmm17[1],zmm1[4],zmm17[4],zmm1[5],zmm17[5],zmm1[8],zmm17[8],zmm1[9],zmm17[9],zmm1[12],zmm17[12],zmm1[13],zmm17[13] | |
vmovapd %ymm8, %ymm22 | |
vmovaps %zmm4, %zmm8 | |
vpermt2ps %zmm28, %zmm29, %zmm8 | |
vmovups %zmm10, 3056(%rsp) # 64-byte Spill | |
vextractf32x4 $2, %zmm10, %xmm10 | |
vextractf32x4 $2, %zmm8, %xmm12 | |
vmovups %zmm8, 2992(%rsp) # 64-byte Spill | |
vblendps $3, %xmm12, %xmm10, %xmm8 # xmm8 = xmm12[0,1],xmm10[2,3] | |
vmovaps %zmm13, %zmm12 | |
vunpckhps %zmm5, %zmm14, %zmm10 # zmm10 = zmm14[2],zmm5[2],zmm14[3],zmm5[3],zmm14[6],zmm5[6],zmm14[7],zmm5[7],zmm14[10],zmm5[10],zmm14[11],zmm5[11],zmm14[14],zmm5[14],zmm14[15],zmm5[15] | |
vmovups %ymm8, 1424(%rsp) # 32-byte Spill | |
vmovaps .LCPI0_17(%rip), %zmm8 # zmm8 = [0,1,2,18,4,5,6,22,8,9,10,26,12,13,14,30] | |
vmovups %zmm10, 2480(%rsp) # 64-byte Spill | |
vpermt2ps %zmm11, %zmm8, %zmm12 | |
vpermt2ps %zmm26, %zmm8, %zmm9 | |
vpermt2pd %zmm12, %zmm3, %zmm10 | |
vmovaps %zmm21, %zmm3 | |
vpermt2ps %zmm18, %zmm8, %zmm3 | |
vshufpd $32, %zmm9, %zmm6, %zmm15 # zmm15 = zmm6[0],zmm9[0],zmm6[2],zmm9[2],zmm6[4],zmm9[5],zmm6[6],zmm9[6] | |
vmovapd %zmm10, %zmm15 {%k4} | |
vmovups %zmm3, 2928(%rsp) # 64-byte Spill | |
vshuff64x2 $170, %zmm3, %zmm3, %zmm10 # zmm10 = zmm3[4,5,4,5,4,5,4,5] | |
vunpckhps %zmm27, %zmm7, %zmm3 # zmm3 = zmm7[2],zmm27[2],zmm7[3],zmm27[3],zmm7[6],zmm27[6],zmm7[7],zmm27[7],zmm7[10],zmm27[10],zmm7[11],zmm27[11],zmm7[14],zmm27[14],zmm7[15],zmm27[15] | |
vmovups %zmm3, 2864(%rsp) # 64-byte Spill | |
vshuff64x2 $170, %zmm3, %zmm3, %zmm3 # zmm3 = zmm3[4,5,4,5,4,5,4,5] | |
vinsertf64x4 $0, %ymm22, %zmm0, %zmm0 | |
vblendpd $8, %ymm10, %ymm3, %ymm3 # ymm3 = ymm3[0,1,2],ymm10[3] | |
vmovaps %zmm1, %zmm10 | |
vpermt2ps %zmm17, %zmm8, %zmm10 | |
vunpckhps %zmm28, %zmm4, %zmm8 # zmm8 = zmm4[2],zmm28[2],zmm4[3],zmm28[3],zmm4[6],zmm28[6],zmm4[7],zmm28[7],zmm4[10],zmm28[10],zmm4[11],zmm28[11],zmm4[14],zmm28[14],zmm4[15],zmm28[15] | |
vmovupd %zmm0, 944(%rsp) # 64-byte Spill | |
vunpcklps %ymm17, %ymm1, %ymm0 # ymm0 = ymm1[0],ymm17[0],ymm1[1],ymm17[1],ymm1[4],ymm17[4],ymm1[5],ymm17[5] | |
vmovups %zmm8, 2736(%rsp) # 64-byte Spill | |
vextractf32x4 $2, %zmm8, %xmm8 | |
vpermpd $170, %ymm0, %ymm0 # ymm0 = ymm0[2,2,2,2] | |
vmovups %zmm10, 2800(%rsp) # 64-byte Spill | |
vextractf32x4 $2, %zmm10, %xmm10 | |
vblendps $3, %xmm8, %xmm10, %xmm8 # xmm8 = xmm8[0,1],xmm10[2,3] | |
vblendpd $3, %ymm8, %ymm3, %ymm10 # ymm10 = ymm8[0,1],ymm3[2,3] | |
vmovaps %zmm4, %zmm8 | |
vpermt2ps %zmm28, %zmm20, %zmm8 | |
vunpckhps %zmm17, %zmm1, %zmm3 # zmm3 = zmm1[2],zmm17[2],zmm1[3],zmm17[3],zmm1[6],zmm17[6],zmm1[7],zmm17[7],zmm1[10],zmm17[10],zmm1[11],zmm17[11],zmm1[14],zmm17[14],zmm1[15],zmm17[15] | |
vmovups %zmm3, 2672(%rsp) # 64-byte Spill | |
vextractf32x4 $2, %zmm3, %xmm3 | |
vinsertf64x4 $0, %ymm10, %zmm15, %zmm22 | |
vmovaps %zmm7, %zmm10 | |
vpermt2ps %zmm27, %zmm29, %zmm10 | |
vmovaps %zmm7, %zmm15 | |
vpermt2ps %zmm27, %zmm20, %zmm15 | |
vmovaps .LCPI0_26(%rip), %ymm20 # ymm20 = [0,1,0,8,4,5,4,12] | |
vunpcklps %ymm5, %ymm14, %ymm29 # ymm29 = ymm14[0],ymm5[0],ymm14[1],ymm5[1],ymm14[4],ymm5[4],ymm14[5],ymm5[5] | |
vmovups %zmm8, 2608(%rsp) # 64-byte Spill | |
vextractf32x4 $2, %zmm8, %xmm8 | |
vblendps $3, %xmm8, %xmm3, %xmm3 # xmm3 = xmm8[0,1],xmm3[2,3] | |
vmovups %ymm3, 2448(%rsp) # 32-byte Spill | |
vunpcklps %ymm28, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm28[0],ymm4[1],ymm28[1],ymm4[4],ymm28[4],ymm4[5],ymm28[5] | |
vextractf128 $1, %ymm3, %xmm3 | |
vblendps $3, %xmm3, %xmm0, %xmm0 # xmm0 = xmm3[0,1],xmm0[2,3] | |
vunpcklps %zmm18, %zmm21, %zmm3 # zmm3 = zmm21[0],zmm18[0],zmm21[1],zmm18[1],zmm21[4],zmm18[4],zmm21[5],zmm18[5],zmm21[8],zmm18[8],zmm21[9],zmm18[9],zmm21[12],zmm18[12],zmm21[13],zmm18[13] | |
vmovups %zmm3, 3376(%rsp) # 64-byte Spill | |
vshuff64x2 $170, %zmm3, %zmm3, %zmm3 # zmm3 = zmm3[4,5,4,5,4,5,4,5] | |
vshuff64x2 $170, %zmm10, %zmm10, %zmm8 # zmm8 = zmm10[4,5,4,5,4,5,4,5] | |
vblendpd $8, %ymm3, %ymm8, %ymm3 # ymm3 = ymm8[0,1,2],ymm3[3] | |
vshuff64x2 $170, %zmm15, %zmm15, %zmm8 # zmm8 = zmm15[4,5,4,5,4,5,4,5] | |
vmovapd %ymm3, %ymm30 | |
vunpckhps %zmm18, %zmm21, %zmm3 # zmm3 = zmm21[2],zmm18[2],zmm21[3],zmm18[3],zmm21[6],zmm18[6],zmm21[7],zmm18[7],zmm21[10],zmm18[10],zmm21[11],zmm18[11],zmm21[14],zmm18[14],zmm21[15],zmm18[15] | |
vmovups %zmm3, 3312(%rsp) # 64-byte Spill | |
vshuff64x2 $170, %zmm3, %zmm3, %zmm3 # zmm3 = zmm3[4,5,4,5,4,5,4,5] | |
vblendpd $8, %ymm3, %ymm8, %ymm3 # ymm3 = ymm8[0,1,2],ymm3[3] | |
vunpcklps %ymm27, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm27[0],ymm7[1],ymm27[1],ymm7[4],ymm27[4],ymm7[5],ymm27[5] | |
vmovupd %ymm3, 2416(%rsp) # 32-byte Spill | |
vunpcklpd %ymm21, %ymm18, %ymm3 # ymm3 = ymm18[0],ymm21[0],ymm18[2],ymm21[2] | |
vshufps $36, %ymm3, %ymm8, %ymm3 # ymm3 = ymm8[0,1],ymm3[2,0],ymm8[4,5],ymm3[6,4] | |
vmovups 2160(%rsp), %zmm8 # 64-byte Reload | |
vblendps $15, %ymm0, %ymm3, %ymm0 # ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] | |
vmovaps %zmm2, %zmm3 | |
vinsertf64x4 $1, %ymm29, %zmm0, %zmm29 | |
vpermt2ps %zmm26, %zmm8, %zmm3 | |
vunpcklps %ymm23, %ymm24, %ymm8 # ymm8 = ymm24[0],ymm23[0],ymm24[1],ymm23[1],ymm24[4],ymm23[4],ymm24[5],ymm23[5] | |
vshuff64x2 $85, %zmm8, %zmm8, %zmm8 # zmm8 = zmm8[2,3,2,3,2,3,2,3] | |
vshufpd $32, %zmm3, %zmm8, %zmm3 # zmm3 = zmm8[0],zmm3[0],zmm8[2],zmm3[2],zmm8[4],zmm3[5],zmm8[6],zmm3[6] | |
vmovaps %ymm13, %ymm8 | |
vpermt2ps %ymm11, %ymm20, %ymm8 | |
vmovaps .LCPI0_24(%rip), %ymm20 # ymm20 = [0,1,2,10,4,5,6,14] | |
vinsertf64x4 $1, %ymm8, %zmm0, %zmm8 | |
vshufpd $128, %zmm8, %zmm29, %zmm3 {%k4} # zmm3 {%k4} = zmm29[0],zmm8[0],zmm29[2],zmm8[2],zmm29[4],zmm8[4],zmm29[6],zmm8[7] | |
vunpckhps %ymm27, %ymm7, %ymm8 # ymm8 = ymm7[2],ymm27[2],ymm7[3],ymm27[3],ymm7[6],ymm27[6],ymm7[7],ymm27[7] | |
vunpckhps %ymm5, %ymm14, %ymm29 # ymm29 = ymm14[2],ymm5[2],ymm14[3],ymm5[3],ymm14[6],ymm5[6],ymm14[7],ymm5[7] | |
vinsertf64x4 $0, %ymm0, %zmm3, %zmm0 | |
vunpckhps %ymm28, %ymm4, %ymm3 # ymm3 = ymm4[2],ymm28[2],ymm4[3],ymm28[3],ymm4[6],ymm28[6],ymm4[7],ymm28[7] | |
vextractf128 $1, %ymm3, %xmm3 | |
vmovupd %zmm0, 1456(%rsp) # 64-byte Spill | |
vunpckhps %ymm17, %ymm1, %ymm0 # ymm0 = ymm1[2],ymm17[2],ymm1[3],ymm17[3],ymm1[6],ymm17[6],ymm1[7],ymm17[7] | |
vpermpd $170, %ymm0, %ymm0 # ymm0 = ymm0[2,2,2,2] | |
vblendps $3, %xmm3, %xmm0, %xmm0 # xmm0 = xmm3[0,1],xmm0[2,3] | |
vunpckhpd %ymm21, %ymm18, %ymm3 # ymm3 = ymm18[1],ymm21[1],ymm18[3],ymm21[3] | |
vshufps $36, %ymm3, %ymm8, %ymm3 # ymm3 = ymm8[0,1],ymm3[2,0],ymm8[4,5],ymm3[6,4] | |
vmovups 2224(%rsp), %zmm8 # 64-byte Reload | |
vblendps $15, %ymm0, %ymm3, %ymm0 # ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] | |
vmovaps %zmm2, %zmm3 | |
vinsertf64x4 $1, %ymm29, %zmm0, %zmm29 | |
vpermt2ps %zmm26, %zmm8, %zmm3 | |
vunpckhps %ymm23, %ymm24, %ymm8 # ymm8 = ymm24[2],ymm23[2],ymm24[3],ymm23[3],ymm24[6],ymm23[6],ymm24[7],ymm23[7] | |
vshuff64x2 $85, %zmm8, %zmm8, %zmm8 # zmm8 = zmm8[2,3,2,3,2,3,2,3] | |
vshufpd $32, %zmm3, %zmm8, %zmm3 # zmm3 = zmm8[0],zmm3[0],zmm8[2],zmm3[2],zmm8[4],zmm3[5],zmm8[6],zmm3[6] | |
vmovaps %ymm13, %ymm8 | |
vpermt2ps %ymm11, %ymm20, %ymm8 | |
vinsertf64x4 $1, %ymm8, %zmm0, %zmm8 | |
vshufpd $128, %zmm8, %zmm29, %zmm3 {%k4} # zmm3 {%k4} = zmm29[0],zmm8[0],zmm29[2],zmm8[2],zmm29[4],zmm8[4],zmm29[6],zmm8[7] | |
vmovsd .LCPI0_195(%rip), %xmm8 # xmm8 = [3,7,0,0] | |
vmovaps %ymm14, %ymm29 | |
vinsertf64x4 $0, %ymm0, %zmm3, %zmm0 | |
vmovaps %xmm7, %xmm3 | |
vmovupd %zmm0, 160(%rsp) # 64-byte Spill | |
vunpckhps %xmm18, %xmm21, %xmm0 # xmm0 = xmm21[2],xmm18[2],xmm21[3],xmm18[3] | |
vinsertf128 $1, %xmm0, %ymm0, %ymm0 | |
vpermt2ps %xmm27, %xmm8, %xmm3 | |
vshufps $51, %xmm4, %xmm28, %xmm8 # xmm8 = xmm28[3,0],xmm4[3,0] | |
vinsertf128 $1, %xmm3, %ymm0, %ymm3 | |
vblendps $192, %ymm0, %ymm3, %ymm0 # ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] | |
vunpckhps %xmm17, %xmm1, %xmm3 # xmm3 = xmm1[2],xmm17[2],xmm1[3],xmm17[3] | |
vshufps $226, %xmm3, %xmm8, %xmm3 # xmm3 = xmm8[2,0],xmm3[2,3] | |
vbroadcastsd .LCPI0_30(%rip), %ymm8 # ymm8 = [5,13,5,13,5,13,5,13] | |
vblendps $15, %ymm3, %ymm0, %ymm0 # ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] | |
vmovaps .LCPI0_31(%rip), %ymm3 # ymm3 = [1,9,2,3,5,13,u,u] | |
vmovaps %ymm0, %ymm20 | |
vmovaps %ymm7, %ymm0 | |
vpermt2ps %ymm27, %ymm3, %ymm0 | |
vunpcklps %ymm18, %ymm21, %ymm3 # ymm3 = ymm21[0],ymm18[0],ymm21[1],ymm18[1],ymm21[4],ymm18[4],ymm21[5],ymm18[5] | |
vblendps $192, %ymm3, %ymm0, %ymm0 # ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] | |
vmovaps %ymm1, %ymm3 | |
vpermt2ps %ymm17, %ymm8, %ymm3 | |
vmovaps %ymm4, %ymm8 | |
vpermt2ps %ymm28, %ymm31, %ymm8 | |
vmovaps .LCPI0_27(%rip), %ymm31 # ymm31 = [1,9,2,3,5,13,6,7] | |
vextractf128 $1, %ymm8, %xmm8 | |
vblendps $3, %xmm8, %xmm3, %xmm3 # xmm3 = xmm8[0,1],xmm3[2,3] | |
vmovups 2096(%rsp), %zmm8 # 64-byte Reload | |
vblendps $15, %ymm3, %ymm0, %ymm0 # ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] | |
vmovaps %zmm24, %zmm3 | |
vpermt2ps %ymm5, %ymm31, %ymm29 | |
vmovapd .LCPI0_16(%rip), %zmm31 # zmm31 = [2,10,2,10,6,15,6,14] | |
vinsertf64x4 $1, %ymm29, %zmm0, %zmm29 | |
vpermt2ps %zmm23, %zmm8, %zmm3 | |
vunpcklps %ymm26, %ymm2, %ymm8 # ymm8 = ymm2[0],ymm26[0],ymm2[1],ymm26[1],ymm2[4],ymm26[4],ymm2[5],ymm26[5] | |
vshuff64x2 $85, %zmm8, %zmm8, %zmm8 # zmm8 = zmm8[2,3,2,3,2,3,2,3] | |
vpermt2pd %zmm25, %zmm31, %zmm19 | |
vpermt2pd %zmm9, %zmm31, %zmm6 | |
vmovaps %zmm24, %zmm9 | |
vmovaps %zmm14, %zmm25 | |
vshufpd $32, %zmm8, %zmm3, %zmm3 # zmm3 = zmm3[0],zmm8[0],zmm3[2],zmm8[2],zmm3[4],zmm8[5],zmm3[6],zmm8[6] | |
vunpcklps %ymm11, %ymm13, %ymm8 # ymm8 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[4],ymm11[4],ymm13[5],ymm11[5] | |
vinsertf64x4 $1, %ymm8, %zmm0, %zmm8 | |
vshufpd $128, %zmm8, %zmm29, %zmm3 {%k4} # zmm3 {%k4} = zmm29[0],zmm8[0],zmm29[2],zmm8[2],zmm29[4],zmm8[4],zmm29[6],zmm8[7] | |
vbroadcastsd .LCPI0_32(%rip), %zmm8 # zmm8 = [3,19,3,19,3,19,3,19,3,19,3,19,3,19,3,19] | |
vmovups 1584(%rsp), %zmm29 # 64-byte Reload | |
vinsertf64x4 $0, %ymm0, %zmm3, %zmm0 | |
vmovaps %zmm24, %zmm3 | |
vmovupd %zmm0, 1072(%rsp) # 64-byte Spill | |
vmovupd 2544(%rsp), %zmm0 # 64-byte Reload | |
vpermt2ps %zmm23, %zmm8, %zmm3 | |
vshufpd $128, %zmm16, %zmm0, %zmm19 {%k4} # zmm19 {%k4} = zmm0[0],zmm16[0],zmm0[2],zmm16[2],zmm0[4],zmm16[4],zmm0[6],zmm16[7] | |
vmovaps %zmm2, %zmm0 | |
vpermt2ps %zmm26, %zmm8, %zmm0 | |
vmovaps %zmm8, %zmm16 | |
vshufpd $32, %zmm0, %zmm3, %zmm3 # zmm3 = zmm3[0],zmm0[0],zmm3[2],zmm0[2],zmm3[4],zmm0[5],zmm3[6],zmm0[6] | |
vmovaps %zmm13, %zmm0 | |
vpermt2ps %zmm11, %zmm8, %zmm0 | |
vmovaps %zmm14, %zmm8 | |
vpermt2ps %zmm5, %zmm16, %zmm8 | |
vmovaps .LCPI0_10(%rip), %ymm16 # ymm16 = [3,11,2,3,7,15,6,7] | |
vshufpd $128, %zmm0, %zmm8, %zmm3 {%k4} # zmm3 {%k4} = zmm8[0],zmm0[0],zmm8[2],zmm0[2],zmm8[4],zmm0[4],zmm8[6],zmm0[7] | |
vmovupd 2480(%rsp), %zmm0 # 64-byte Reload | |
vmovups 2352(%rsp), %zmm8 # 64-byte Reload | |
vshufpd $128, %zmm12, %zmm0, %zmm6 {%k4} # zmm6 {%k4} = zmm0[0],zmm12[0],zmm0[2],zmm12[2],zmm0[4],zmm12[4],zmm0[6],zmm12[7] | |
vmovaps %zmm24, %zmm0 | |
vpermt2ps %zmm23, %zmm8, %zmm24 | |
vunpckhps %ymm26, %ymm2, %ymm8 # ymm8 = ymm2[2],ymm26[2],ymm2[3],ymm26[3],ymm2[6],ymm26[6],ymm2[7],ymm26[7] | |
vunpckhps %ymm11, %ymm13, %ymm12 # ymm12 = ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[6],ymm11[6],ymm13[7],ymm11[7] | |
vshuff64x2 $85, %zmm8, %zmm8, %zmm8 # zmm8 = zmm8[2,3,2,3,2,3,2,3] | |
vinsertf64x4 $1, %ymm12, %zmm0, %zmm12 | |
vshufpd $32, %zmm8, %zmm24, %zmm8 # zmm8 = zmm24[0],zmm8[0],zmm24[2],zmm8[2],zmm24[4],zmm8[5],zmm24[6],zmm8[6] | |
vmovaps %zmm14, %zmm24 | |
vpermt2ps %ymm5, %ymm16, %ymm14 | |
vmovaps .LCPI0_18(%rip), %zmm16 # zmm16 = [1,17,2,3,5,21,6,7,9,25,10,11,13,29,14,15] | |
vinsertf64x4 $1, %ymm14, %zmm0, %zmm14 | |
vshufpd $128, %zmm12, %zmm14, %zmm8 {%k4} # zmm8 {%k4} = zmm14[0],zmm12[0],zmm14[2],zmm12[2],zmm14[4],zmm12[4],zmm14[6],zmm12[7] | |
vunpcklps %zmm11, %zmm13, %zmm12 # zmm12 = zmm13[0],zmm11[0],zmm13[1],zmm11[1],zmm13[4],zmm11[4],zmm13[5],zmm11[5],zmm13[8],zmm11[8],zmm13[9],zmm11[9],zmm13[12],zmm11[12],zmm13[13],zmm11[13] | |
vunpckhps %zmm11, %zmm13, %zmm11 # zmm11 = zmm13[2],zmm11[2],zmm13[3],zmm11[3],zmm13[6],zmm11[6],zmm13[7],zmm11[7],zmm13[10],zmm11[10],zmm13[11],zmm11[11],zmm13[14],zmm11[14],zmm13[15],zmm11[15] | |
vunpcklps %zmm26, %zmm2, %zmm13 # zmm13 = zmm2[0],zmm26[0],zmm2[1],zmm26[1],zmm2[4],zmm26[4],zmm2[5],zmm26[5],zmm2[8],zmm26[8],zmm2[9],zmm26[9],zmm2[12],zmm26[12],zmm2[13],zmm26[13] | |
vunpckhps %zmm26, %zmm2, %zmm2 # zmm2 = zmm2[2],zmm26[2],zmm2[3],zmm26[3],zmm2[6],zmm26[6],zmm2[7],zmm26[7],zmm2[10],zmm26[10],zmm2[11],zmm26[11],zmm2[14],zmm26[14],zmm2[15],zmm26[15] | |
vinsertf64x4 $0, %ymm20, %zmm3, %zmm26 | |
vunpckhps %ymm18, %ymm21, %ymm3 # ymm3 = ymm21[2],ymm18[2],ymm21[3],ymm18[3],ymm21[6],ymm18[6],ymm21[7],ymm18[7] | |
vmovapd %zmm22, %zmm20 | |
vmovups 1648(%rsp), %zmm21 # 64-byte Reload | |
vpermt2ps %zmm23, %zmm16, %zmm0 | |
vpermt2ps %zmm5, %zmm16, %zmm24 | |
vmovapd .LCPI0_20(%rip), %zmm16 # zmm16 = [0,8,0,8,4,12,4,13] | |
vshufpd $32, %zmm13, %zmm0, %zmm14 # zmm14 = zmm0[0],zmm13[0],zmm0[2],zmm13[2],zmm0[4],zmm13[5],zmm0[6],zmm13[6] | |
vpermt2pd %zmm13, %zmm31, %zmm0 | |
vbroadcastsd .LCPI0_13(%rip), %ymm13 # ymm13 = [7,15,7,15,7,15,7,15] | |
vshufpd $128, %zmm12, %zmm24, %zmm0 {%k4} # zmm0 {%k4} = zmm24[0],zmm12[0],zmm24[2],zmm12[2],zmm24[4],zmm12[4],zmm24[6],zmm12[7] | |
vpermt2pd %zmm12, %zmm16, %zmm24 | |
vmovaps .LCPI0_15(%rip), %zmm12 # zmm12 = [3,19,2,3,7,23,6,7,11,27,10,11,15,31,14,15] | |
vpermt2ps %ymm17, %ymm13, %ymm1 | |
vmovups 560(%rsp), %zmm17 # 64-byte Reload | |
vmovapd %zmm24, %zmm14 {%k4} | |
vmovups 1008(%rsp), %zmm24 # 64-byte Reload | |
vpermt2ps %zmm23, %zmm12, %zmm9 | |
vpermt2ps %zmm5, %zmm12, %zmm25 | |
vmovapd %ymm30, %ymm5 | |
vblendpd $3, 1424(%rsp), %ymm5, %ymm5 # 32-byte Folded Reload | |
# ymm5 = mem[0,1],ymm5[2,3] | |
vmovups 1712(%rsp), %zmm23 # 64-byte Reload | |
vmovups 944(%rsp), %zmm30 # 64-byte Reload | |
vshufpd $32, %zmm2, %zmm9, %zmm12 # zmm12 = zmm9[0],zmm2[0],zmm9[2],zmm2[2],zmm9[4],zmm2[5],zmm9[6],zmm2[6] | |
vpermt2pd %zmm2, %zmm31, %zmm9 | |
vmovupd 2416(%rsp), %ymm2 # 32-byte Reload | |
vmovups 32(%rsp), %zmm31 # 64-byte Reload | |
vblendpd $3, 2448(%rsp), %ymm2, %ymm2 # 32-byte Folded Reload | |
# ymm2 = mem[0,1],ymm2[2,3] | |
vshufpd $128, %zmm11, %zmm25, %zmm9 {%k4} # zmm9 {%k4} = zmm25[0],zmm11[0],zmm25[2],zmm11[2],zmm25[4],zmm11[4],zmm25[6],zmm11[7] | |
vpermt2pd %zmm11, %zmm16, %zmm25 | |
vmovaps .LCPI0_14(%rip), %ymm11 # ymm11 = [3,11,2,3,7,15,u,u] | |
vmovapd %zmm25, %zmm12 {%k4} | |
vinsertf64x4 $0, %ymm2, %zmm12, %zmm18 | |
vextractf64x4 $1, %zmm10, %ymm2 | |
vmovups 688(%rsp), %zmm10 # 64-byte Reload | |
vmovups 416(%rsp), %zmm12 # 64-byte Reload | |
vmovups 1456(%rsp), %zmm25 # 64-byte Reload | |
vpermt2ps %ymm27, %ymm11, %ymm7 | |
vmovups 752(%rsp), %ymm11 # 32-byte Reload | |
vinsertf64x4 $0, %ymm5, %zmm14, %zmm27 | |
vblendps $192, %ymm3, %ymm7, %ymm3 # ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7] | |
vmovups 1136(%rsp), %zmm7 # 64-byte Reload | |
vpermt2ps %ymm28, %ymm11, %ymm4 | |
vmovups 1328(%rsp), %zmm11 # 64-byte Reload | |
vmovups 288(%rsp), %zmm28 # 64-byte Reload | |
vextractf128 $1, %ymm4, %xmm4 | |
vblendps $3, %xmm4, %xmm1, %xmm1 # xmm1 = xmm4[0,1],xmm1[2,3] | |
vmovupd 3120(%rsp), %zmm4 # 64-byte Reload | |
vblendps $15, %ymm1, %ymm3, %ymm1 # ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] | |
vmovupd 816(%rsp), %zmm3 # 64-byte Reload | |
vinsertf64x4 $0, %ymm1, %zmm8, %zmm13 | |
vmovupd 352(%rsp), %zmm1 # 64-byte Reload | |
vmovups 96(%rsp), %zmm8 # 64-byte Reload | |
vextractf32x4 $3, %zmm4, %xmm4 | |
vextractf64x4 $1, %zmm3, %ymm3 | |
vextractf64x4 $1, %zmm1, %ymm1 | |
vblendpd $8, %ymm1, %ymm3, %ymm1 # ymm1 = ymm3[0,1,2],ymm1[3] | |
vmovupd 3184(%rsp), %zmm3 # 64-byte Reload | |
vextractf32x4 $3, %zmm3, %xmm3 | |
vblendpd $1, %xmm4, %xmm3, %xmm3 # xmm3 = xmm4[0],xmm3[1] | |
vmovupd 2736(%rsp), %zmm4 # 64-byte Reload | |
vblendpd $3, %ymm3, %ymm1, %ymm1 # ymm1 = ymm3[0,1],ymm1[2,3] | |
vmovupd 2992(%rsp), %zmm3 # 64-byte Reload | |
vinsertf64x4 $0, %ymm1, %zmm19, %zmm14 | |
vmovupd 3376(%rsp), %zmm1 # 64-byte Reload | |
vextractf32x4 $3, %zmm4, %xmm4 | |
vextractf32x4 $3, %zmm3, %xmm3 | |
vextractf64x4 $1, %zmm1, %ymm1 | |
vblendpd $8, %ymm1, %ymm2, %ymm1 # ymm1 = ymm2[0,1,2],ymm1[3] | |
vmovupd 3056(%rsp), %zmm2 # 64-byte Reload | |
vextractf32x4 $3, %zmm2, %xmm2 | |
vblendpd $1, %xmm3, %xmm2, %xmm2 # xmm2 = xmm3[0],xmm2[1] | |
vmovupd 2864(%rsp), %zmm3 # 64-byte Reload | |
vblendpd $3, %ymm2, %ymm1, %ymm1 # ymm1 = ymm2[0,1],ymm1[2,3] | |
vmovupd 2928(%rsp), %zmm2 # 64-byte Reload | |
vinsertf64x4 $0, %ymm1, %zmm0, %zmm19 | |
vmovaps .LCPI0_120(%rip), %ymm0 # ymm0 = [0,1,2,3,13,u,u,u] | |
vmovups 224(%rsp), %ymm1 # 32-byte Reload | |
vextractf64x4 $1, %zmm3, %ymm3 | |
vextractf64x4 $1, %zmm2, %ymm2 | |
vblendpd $8, %ymm2, %ymm3, %ymm2 # ymm2 = ymm3[0,1,2],ymm2[3] | |
vmovupd 2800(%rsp), %zmm3 # 64-byte Reload | |
vextractf32x4 $3, %zmm3, %xmm3 | |
vblendpd $1, %xmm4, %xmm3, %xmm3 # xmm3 = xmm4[0],xmm3[1] | |
vmovupd 2608(%rsp), %zmm4 # 64-byte Reload | |
vblendpd $3, %ymm3, %ymm2, %ymm2 # ymm2 = ymm3[0,1],ymm2[2,3] | |
vextractf64x4 $1, %zmm15, %ymm3 | |
vmovups 1264(%rsp), %zmm15 # 64-byte Reload | |
vinsertf64x4 $0, %ymm2, %zmm6, %zmm22 | |
vmovupd 3312(%rsp), %zmm2 # 64-byte Reload | |
vmovaps .LCPI0_122(%rip), %ymm6 # ymm6 = [0,1,2,3,4,5,6,13] | |
vextractf32x4 $3, %zmm4, %xmm4 | |
vpermt2ps %ymm15, %ymm0, %ymm1 | |
vextractf64x4 $1, %zmm2, %ymm2 | |
vblendpd $8, %ymm2, %ymm3, %ymm2 # ymm2 = ymm3[0,1,2],ymm2[3] | |
vmovupd 2672(%rsp), %zmm3 # 64-byte Reload | |
vblendps $32, %ymm10, %ymm1, %ymm0 # ymm0 = ymm1[0,1,2,3,4],ymm10[5],ymm1[6,7] | |
vmovaps .LCPI0_121(%rip), %ymm1 # ymm1 = [0,1,2,3,4,5,13,u] | |
vextractf32x4 $3, %zmm3, %xmm3 | |
vpermt2ps %ymm8, %ymm1, %ymm0 | |
vmovups 624(%rsp), %zmm1 # 64-byte Reload | |
vblendpd $1, %xmm4, %xmm3, %xmm3 # xmm3 = xmm4[0],xmm3[1] | |
vmovaps .LCPI0_123(%rip), %zmm4 # zmm4 = [0,1,2,3,4,5,6,7,21,u,u,u,u,u,u,u] | |
vblendpd $3, %ymm3, %ymm2, %ymm2 # ymm2 = ymm3[0,1],ymm2[2,3] | |
vmovaps .LCPI0_124(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,21,u,u,u,u,u,u] | |
vinsertf64x4 $0, %ymm2, %zmm9, %zmm16 | |
vmovdqu64 1200(%rsp), %zmm9 # 64-byte Reload | |
.loc 1 222 51 # 03-matrix-multiplication-cpu.py:222:51 | |
vpcmpgtd .LCPI0_106(%rip){1to16}, %zmm9, %k1 | |
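# NOTE (reader annotation, not compiler output): vpcmpgtd with the {1to16}
# embedded broadcast compares every dword lane of the index vector in zmm9
# against a single 32-bit bound and writes the 16-lane predicate into k1;
# per the .loc above, this is the mask expression of the tl.load on line 222
# of 03-matrix-multiplication-cpu.py (likely the bounds check along K).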
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vpermt2ps %ymm1, %ymm6, %ymm0 | |
vpermt2ps %zmm21, %zmm4, %zmm0 | |
vmovaps .LCPI0_125(%rip), %zmm4 # zmm4 = [0,1,2,3,4,5,6,7,8,9,21,u,u,u,u,u] | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vmovups (%rsi,%rax), %zmm2 {%k1} {z} | |
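# NOTE (reader annotation, not compiler output): a write-masked load --
# {%k1} keeps only the in-bounds lanes and {z} zeroes the rest, which is
# consistent with tl.load(..., mask=..., other=0.0): out-of-range elements
# simply read as 0.0 and contribute nothing to the FMAs below.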
.loc 1 222 51 is_stmt 0 # 03-matrix-multiplication-cpu.py:222:51 | |
vpcmpgtd .LCPI0_7(%rip){1to16}, %zmm9, %k1 | |
vmovups 1840(%rsp), %zmm9 # 64-byte Reload | |
movq -48(%rsp), %rax # 8-byte Reload | |
.loc 1 224 35 is_stmt 1 # 03-matrix-multiplication-cpu.py:224:35 | |
vpermt2ps %zmm29, %zmm3, %zmm0 | |
vmovaps .LCPI0_126(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,9,10,21,u,u,u,u] | |
vpermt2ps %zmm23, %zmm4, %zmm0 | |
vmovaps .LCPI0_127(%rip), %zmm4 # zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,21,u,u,u] | |
vpermt2ps %zmm7, %zmm3, %zmm0 | |
vmovaps .LCPI0_128(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,21,u,u] | |
vpermt2ps %zmm11, %zmm4, %zmm0 | |
vmovaps .LCPI0_129(%rip), %zmm4 # zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,21,u] | |
vpermt2ps %zmm28, %zmm3, %zmm0 | |
vmovaps .LCPI0_130(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,21] | |
vpermt2ps %zmm12, %zmm4, %zmm0 | |
vpermt2ps %zmm17, %zmm3, %zmm0 | |
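# NOTE (reader annotation, not compiler output): every index table in the
# permute chain above (.LCPI0_120 through .LCPI0_130) inserts exactly one new
# lane, and each 512-bit step picks index 21 = 16 + 5, i.e. element 5 of its
# second source; zmm0 therefore appears to gather element 5 from 16 different
# vectors -- one column of the tile being reorganized.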
vshufps $255, %zmm2, %zmm2, %zmm3 # zmm3 = zmm2[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15] | |
vshuff64x2 $170, %zmm3, %zmm3, %zmm4 # zmm4 = zmm3[4,5,4,5,4,5,4,5] | |
vshuff64x2 $255, %zmm3, %zmm3, %zmm3 # zmm3 = zmm3[6,7,6,7,6,7,6,7] | |
vfmadd231ps %zmm4, %zmm0, %zmm18 # zmm18 = (zmm0 * zmm4) + zmm18 | |
vshufps $170, %zmm2, %zmm2, %zmm4 # zmm4 = zmm2[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14] | |
vfmadd231ps %zmm3, %zmm0, %zmm16 # zmm16 = (zmm0 * zmm3) + zmm16 | |
vshuff64x2 $170, %zmm4, %zmm4, %zmm5 # zmm5 = zmm4[4,5,4,5,4,5,4,5] | |
vshuff64x2 $255, %zmm4, %zmm4, %zmm3 # zmm3 = zmm4[6,7,6,7,6,7,6,7] | |
vshufps $255, %xmm2, %xmm2, %xmm4 # xmm4 = xmm2[3,3,3,3] | |
vfmadd231ps %zmm5, %zmm0, %zmm20 # zmm20 = (zmm0 * zmm5) + zmm20 | |
vshufps $85, %zmm2, %zmm2, %zmm5 # zmm5 = zmm2[1,1,1,1,5,5,5,5,9,9,9,9,13,13,13,13] | |
vfmadd231ps %zmm3, %zmm0, %zmm22 # zmm22 = (zmm0 * zmm3) + zmm22 | |
vshuff64x2 $170, %zmm5, %zmm5, %zmm6 # zmm6 = zmm5[4,5,4,5,4,5,4,5] | |
vshuff64x2 $255, %zmm5, %zmm5, %zmm3 # zmm3 = zmm5[6,7,6,7,6,7,6,7] | |
vmovups %zmm16, 944(%rsp) # 64-byte Spill | |
vmovaps 2032(%rsp), %xmm16 # 16-byte Reload | |
vmovaps .LCPI0_132(%rip), %ymm5 # ymm5 = [0,1,2,3,4,14,u,u] | |
vfmadd231ps %zmm6, %zmm0, %zmm27 # zmm27 = (zmm0 * zmm6) + zmm27 | |
vextractf32x4 $2, %zmm2, %xmm6 | |
vfmadd231ps %zmm3, %zmm0, %zmm19 # zmm19 = (zmm0 * zmm3) + zmm19 | |
vextractf32x4 $3, %zmm2, %xmm3 | |
vmovups %zmm20, 816(%rsp) # 64-byte Spill | |
vmovups 1520(%rsp), %zmm20 # 64-byte Reload | |
vbroadcastss %xmm6, %zmm6 | |
vbroadcastss %xmm3, %zmm3 | |
vfmadd231ps %zmm6, %zmm0, %zmm30 # zmm30 = (zmm0 * zmm6) + zmm30 | |
vshufps $170, %xmm2, %xmm2, %xmm6 # xmm6 = xmm2[2,2,2,2] | |
vfmadd231ps %zmm3, %zmm0, %zmm14 # zmm14 = (zmm0 * zmm3) + zmm14 | |
vshufps $255, %ymm2, %ymm2, %ymm3 # ymm3 = ymm2[3,3,3,3,7,7,7,7] | |
vbroadcastsd %xmm6, %zmm6 | |
vshuff64x2 $85, %zmm3, %zmm3, %zmm3 # zmm3 = zmm3[2,3,2,3,2,3,2,3] | |
vfmadd231ps %zmm6, %zmm0, %zmm31 # zmm31 = (zmm0 * zmm6) + zmm31 | |
vmovshdup %xmm2, %xmm6 # xmm6 = xmm2[1,1,3,3] | |
vfmadd231ps %zmm3, %zmm0, %zmm13 # zmm13 = (zmm0 * zmm3) + zmm13 | |
vshufps $85, %ymm2, %ymm2, %ymm3 # ymm3 = ymm2[1,1,1,1,5,5,5,5] | |
vbroadcastsd %xmm6, %zmm6 | |
vshuff64x2 $85, %zmm3, %zmm3, %zmm3 # zmm3 = zmm3[2,3,2,3,2,3,2,3] | |
vmovups %zmm14, 352(%rsp) # 64-byte Spill | |
vmovaps %zmm11, %zmm14 | |
vfmadd231ps %zmm6, %zmm0, %zmm20 # zmm20 = (zmm0 * zmm6) + zmm20 | |
vbroadcastss %xmm2, %zmm6 | |
vmovups %zmm13, 32(%rsp) # 64-byte Spill | |
vmovups 1776(%rsp), %zmm13 # 64-byte Reload | |
vfmadd231ps %zmm6, %zmm0, %zmm24 # zmm24 = (zmm0 * zmm6) + zmm24 | |
vmovups 1072(%rsp), %zmm6 # 64-byte Reload | |
vfmadd231ps %zmm3, %zmm0, %zmm6 # zmm6 = (zmm0 * zmm3) + zmm6 | |
vbroadcastsd %xmm4, %zmm3 | |
vfmadd231ps %zmm3, %zmm0, %zmm26 # zmm26 = (zmm0 * zmm3) + zmm26 | |
vshufps $170, %ymm2, %ymm2, %ymm3 # ymm3 = ymm2[2,2,2,2,6,6,6,6] | |
vextractf128 $1, %ymm2, %xmm2 | |
vshuff64x2 $85, %zmm3, %zmm3, %zmm3 # zmm3 = zmm3[2,3,2,3,2,3,2,3] | |
vbroadcastss %xmm2, %zmm2 | |
vmovups %zmm26, 224(%rsp) # 64-byte Spill | |
vmovups 160(%rsp), %zmm26 # 64-byte Reload | |
vfmadd231ps %zmm2, %zmm0, %zmm25 # zmm25 = (zmm0 * zmm2) + zmm25 | |
vmovapd .LCPI0_131(%rip), %ymm2 # ymm2 = [0,1,7,u] | |
vfmadd231ps %zmm3, %zmm0, %zmm26 # zmm26 = (zmm0 * zmm3) + zmm26 | |
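# NOTE (reader annotation, not compiler output): this block is one inner step
# of the dot product. Each of the 16 f32 elements of the loaded vector zmm2 is
# broadcast to a full zmm (vbroadcastss, or vshufps followed by vshuff64x2)
# and multiplied into the shared row in zmm0 with vfmadd231ps, each product
# landing in its own accumulator; per lane this is acc_i[j] += a[i] * row[j],
# i.e. a rank-1 (outer-product) update of the 16x16 accumulator tile.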
vunpckhps %ymm13, %ymm9, %ymm0 # ymm0 = ymm9[2],ymm13[2],ymm9[3],ymm13[3],ymm9[6],ymm13[6],ymm9[7],ymm13[7] | |
vextractf128 $1, %ymm0, %xmm0 | |
vblendps $12, 880(%rsp), %xmm0, %xmm0 # 16-byte Folded Reload | |
# xmm0 = xmm0[0,1],mem[2,3] | |
vinsertps $176, %xmm16, %xmm0, %xmm0 # xmm0 = xmm0[0,1,2],xmm16[2] | |
vpermt2pd %ymm15, %ymm2, %ymm0 | |
vmovaps %zmm23, %zmm15 | |
vpermt2ps %ymm10, %ymm5, %ymm0 | |
vblendps $192, %ymm8, %ymm0, %ymm10 # ymm10 = ymm0[0,1,2,3,4,5],ymm8[6,7] | |
vmovaps .LCPI0_133(%rip), %ymm0 # ymm0 = [0,1,2,3,4,5,6,14] | |
vmovaps %zmm12, %zmm8 | |
vpermt2ps %ymm1, %ymm0, %ymm10 | |
vmovapd .LCPI0_134(%rip), %zmm0 # zmm0 = [0,1,2,3,11,u,u,u] | |
vpermt2pd %zmm21, %zmm0, %zmm10 | |
vmovaps .LCPI0_135(%rip), %zmm0 # zmm0 = [0,1,2,3,4,5,6,7,8,22,u,u,u,u,u,u] | |
vmovups 816(%rsp), %zmm21 # 64-byte Reload | |
vpermt2ps %zmm29, %zmm0, %zmm10 | |
vmovapd .LCPI0_136(%rip), %zmm0 # zmm0 = [0,1,2,3,4,11,u,u] | |
vpermt2pd %zmm23, %zmm0, %zmm10 | |
vmovaps .LCPI0_137(%rip), %zmm0 # zmm0 = [0,1,2,3,4,5,6,7,8,9,10,22,u,u,u,u] | |
vmovaps %zmm9, %zmm23 | |
vpermt2ps %zmm7, %zmm0, %zmm10 | |
vmovapd .LCPI0_138(%rip), %zmm0 # zmm0 = [0,1,2,3,4,5,11,u] | |
vpermt2pd %zmm11, %zmm0, %zmm10 | |
vmovaps .LCPI0_139(%rip), %zmm0 # zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,22,u,u] | |
vmovups 352(%rsp), %zmm11 # 64-byte Reload | |
vpermt2ps %zmm28, %zmm0, %zmm10 | |
vmovapd .LCPI0_140(%rip), %zmm0 # zmm0 = [0,1,2,3,4,5,6,11] | |
vmovaps %zmm17, %zmm28 | |
vpermt2pd %zmm12, %zmm0, %zmm10 | |
vmovaps .LCPI0_141(%rip), %zmm0 # zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,22] | |
vpermt2ps %zmm17, %zmm0, %zmm10 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vmovups (%rsi,%rax), %zmm0 {%k1} {z} | |
vmovaps %zmm27, %zmm17 | |
vmovups 32(%rsp), %zmm27 # 64-byte Reload | |
movq 8(%rsp), %rax # 8-byte Reload | |
vbroadcastss %xmm0, %zmm2 | |
vmovshdup %xmm0, %xmm1 # xmm1 = xmm0[1,1,3,3] | |
vshufps $170, %xmm0, %xmm0, %xmm3 # xmm3 = xmm0[2,2,2,2] | |
vextractf128 $1, %ymm0, %xmm7 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm2, %zmm10, %zmm24 # zmm24 = (zmm10 * zmm2) + zmm24 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vbroadcastsd %xmm1, %zmm2 | |
vextractf32x4 $2, %zmm0, %xmm1 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm2, %zmm10, %zmm20 # zmm20 = (zmm10 * zmm2) + zmm20 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vbroadcastsd %xmm3, %zmm2 | |
vmovups %zmm24, 1008(%rsp) # 64-byte Spill | |
vmovaps %zmm31, %zmm24 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm2, %zmm10, %zmm24 # zmm24 = (zmm10 * zmm2) + zmm24 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vbroadcastss %xmm1, %zmm2 | |
vshufps $255, %xmm0, %xmm0, %xmm1 # xmm1 = xmm0[3,3,3,3] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm2, %zmm10, %zmm30 # zmm30 = (zmm10 * zmm2) + zmm30 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $85, %zmm0, %zmm0, %zmm2 # zmm2 = zmm0[1,1,1,1,5,5,5,5,9,9,9,9,13,13,13,13] | |
vmovups %zmm20, 1520(%rsp) # 64-byte Spill | |
vmovups 224(%rsp), %zmm20 # 64-byte Reload | |
vshuff64x2 $170, %zmm2, %zmm2, %zmm3 # zmm3 = zmm2[4,5,4,5,4,5,4,5] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm3, %zmm10, %zmm17 # zmm17 = (zmm10 * zmm3) + zmm17 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $170, %zmm0, %zmm0, %zmm3 # zmm3 = zmm0[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14] | |
vshuff64x2 $170, %zmm3, %zmm3, %zmm4 # zmm4 = zmm3[4,5,4,5,4,5,4,5] | |
vmovaps %zmm30, %zmm31 | |
vmovaps %zmm6, %zmm30 | |
vmovaps %zmm13, %zmm6 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm4, %zmm10, %zmm21 # zmm21 = (zmm10 * zmm4) + zmm21 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $255, %zmm0, %zmm0, %zmm4 # zmm4 = zmm0[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15] | |
vshuff64x2 $170, %zmm4, %zmm4, %zmm5 # zmm5 = zmm4[4,5,4,5,4,5,4,5] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm5, %zmm10, %zmm18 # zmm18 = (zmm10 * zmm5) + zmm18 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vbroadcastss %xmm7, %zmm5 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vunpcklps %zmm13, %zmm9, %zmm7 # zmm7 = zmm9[0],zmm13[0],zmm9[1],zmm13[1],zmm9[4],zmm13[4],zmm9[5],zmm13[5],zmm9[8],zmm13[8],zmm9[9],zmm13[9],zmm9[12],zmm13[12],zmm9[13],zmm13[13] | |
vfmadd231ps %zmm5, %zmm10, %zmm25 # zmm25 = (zmm10 * zmm5) + zmm25 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $170, %ymm0, %ymm0, %ymm5 # ymm5 = ymm0[2,2,2,2,6,6,6,6] | |
vshuff64x2 $85, %zmm5, %zmm5, %zmm5 # zmm5 = zmm5[2,3,2,3,2,3,2,3] | |
vmovaps %zmm18, %zmm29 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vunpckhps %zmm13, %zmm9, %zmm18 # zmm18 = zmm9[2],zmm13[2],zmm9[3],zmm13[3],zmm9[6],zmm13[6],zmm9[7],zmm13[7],zmm9[10],zmm13[10],zmm9[11],zmm13[11],zmm9[14],zmm13[14],zmm9[15],zmm13[15] | |
vfmadd231ps %zmm5, %zmm10, %zmm26 # zmm26 = (zmm10 * zmm5) + zmm26 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vbroadcastsd %xmm1, %zmm5 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vmovaps %zmm9, %zmm1 | |
vmovups %zmm25, 1456(%rsp) # 64-byte Spill | |
vmovups 1584(%rsp), %zmm25 # 64-byte Reload | |
vfmadd231ps %zmm5, %zmm10, %zmm20 # zmm20 = (zmm10 * zmm5) + zmm20 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $85, %ymm0, %ymm0, %ymm5 # ymm5 = ymm0[1,1,1,1,5,5,5,5] | |
vshuff64x2 $85, %zmm5, %zmm5, %zmm5 # zmm5 = zmm5[2,3,2,3,2,3,2,3] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm5, %zmm10, %zmm30 # zmm30 = (zmm10 * zmm5) + zmm30 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $255, %ymm0, %ymm0, %ymm5 # ymm5 = ymm0[3,3,3,3,7,7,7,7] | |
vextractf32x4 $3, %zmm0, %xmm0 | |
vshuff64x2 $85, %zmm5, %zmm5, %zmm5 # zmm5 = zmm5[2,3,2,3,2,3,2,3] | |
vbroadcastss %xmm0, %zmm0 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm5, %zmm10, %zmm27 # zmm27 = (zmm10 * zmm5) + zmm27 | |
vfmadd231ps %zmm0, %zmm10, %zmm11 # zmm11 = (zmm10 * zmm0) + zmm11 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshuff64x2 $255, %zmm2, %zmm2, %zmm0 # zmm0 = zmm2[6,7,6,7,6,7,6,7] | |
vmovups 752(%rsp), %ymm2 # 32-byte Reload | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm0, %zmm10, %zmm19 # zmm19 = (zmm10 * zmm0) + zmm19 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshuff64x2 $255, %zmm3, %zmm3, %zmm0 # zmm0 = zmm3[6,7,6,7,6,7,6,7] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vmovaps .LCPI0_144(%rip), %ymm3 # ymm3 = [0,1,2,3,4,5,15,u] | |
vfmadd231ps %zmm0, %zmm10, %zmm22 # zmm22 = (zmm10 * zmm0) + zmm22 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshuff64x2 $255, %zmm4, %zmm4, %zmm0 # zmm0 = zmm4[6,7,6,7,6,7,6,7] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd213ps 944(%rsp), %zmm0, %zmm10 # 64-byte Folded Reload | |
# zmm10 = (zmm0 * zmm10) + mem | |
vmovaps .LCPI0_143(%rip), %ymm4 # ymm4 = [0,1,2,3,4,15,u,u] | |
vpermt2ps %ymm13, %ymm2, %ymm9 | |
vmovaps %xmm16, %xmm13 | |
vmovaps .LCPI0_142(%rip), %ymm2 # ymm2 = [0,1,2,3,15,u,u,u] | |
vmovups 1648(%rsp), %zmm16 # 64-byte Reload | |
vextractf128 $1, %ymm9, %xmm0 | |
vshufps $244, 880(%rsp), %xmm0, %xmm0 # 16-byte Folded Reload | |
# xmm0 = xmm0[0,1],mem[3,3] | |
vmovdqu64 1200(%rsp), %zmm9 # 64-byte Reload | |
.loc 1 222 51 # 03-matrix-multiplication-cpu.py:222:51 | |
vpcmpgtd .LCPI0_105(%rip){1to16}, %zmm9, %k1 | |
vpcmpgtd .LCPI0_2(%rip){1to16}, %zmm9, %k2 | |
vpcmpgtd .LCPI0_3(%rip){1to16}, %zmm9, %k5 | |
vpcmpgtd .LCPI0_4(%rip){1to16}, %zmm9, %k6 | |
vpcmpgtd .LCPI0_5(%rip){1to16}, %zmm9, %k7 | |
vpcmpgtd .LCPI0_6(%rip){1to16}, %zmm9, %k3 | |
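# NOTE (reader annotation, not compiler output): six predicates (k1, k2, k5,
# k6, k7, k3) are generated in one burst from the same index vector in zmm9,
# each against a different bound; they are consumed by the masked tile loads
# further down (e.g. the {%k3}, {%k7} and {%k6} loads), so the compares do
# not have to be redone in front of every load.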
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vblendps $8, %xmm13, %xmm0, %xmm0 # xmm0 = xmm0[0,1,2],xmm13[3] | |
vmovups 1264(%rsp), %zmm13 # 64-byte Reload | |
vpermt2ps %ymm13, %ymm2, %ymm0 | |
vpermt2ps 688(%rsp), %ymm4, %ymm0 # 32-byte Folded Reload | |
vmovaps .LCPI0_146(%rip), %zmm2 # zmm2 = [0,1,2,3,4,5,6,7,8,23,u,u,u,u,u,u] | |
vpermt2ps 96(%rsp), %ymm3, %ymm0 # 32-byte Folded Reload | |
vmovaps .LCPI0_145(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,23,u,u,u,u,u,u,u] | |
vblendps $128, 624(%rsp), %ymm0, %ymm12 # 32-byte Folded Reload | |
# ymm12 = ymm0[0,1,2,3,4,5,6],mem[7] | |
vmovaps .LCPI0_149(%rip), %zmm0 # zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,23,u,u,u] | |
vpermt2ps %zmm16, %zmm3, %zmm12 | |
vmovaps .LCPI0_147(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,9,23,u,u,u,u,u] | |
vpermt2ps %zmm25, %zmm2, %zmm12 | |
vmovaps .LCPI0_148(%rip), %zmm2 # zmm2 = [0,1,2,3,4,5,6,7,8,9,10,23,u,u,u,u] | |
vpermt2ps %zmm15, %zmm3, %zmm12 | |
vmovaps .LCPI0_150(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,23,u,u] | |
vmovups 1520(%rsp), %zmm15 # 64-byte Reload | |
vpermt2ps 1136(%rsp), %zmm2, %zmm12 # 64-byte Folded Reload | |
vmovaps .LCPI0_152(%rip), %zmm2 # zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,23] | |
vpermt2ps %zmm14, %zmm0, %zmm12 | |
vpermt2ps 288(%rsp), %zmm3, %zmm12 # 64-byte Folded Reload | |
vmovaps .LCPI0_151(%rip), %zmm0 # zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,23,u] | |
vmovups 1008(%rsp), %zmm14 # 64-byte Reload | |
vpermt2ps %zmm8, %zmm0, %zmm12 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vmovups (%rsi,%rax), %zmm0 {%k1} {z} | |
.loc 1 222 51 is_stmt 0 # 03-matrix-multiplication-cpu.py:222:51 | |
vpcmpgtd .LCPI0_103(%rip){1to16}, %zmm9, %k1 | |
vmovaps %zmm31, %zmm8 | |
vmovups 1136(%rsp), %zmm31 # 64-byte Reload | |
movq -40(%rsp), %rax # 8-byte Reload | |
.loc 1 224 35 is_stmt 1 # 03-matrix-multiplication-cpu.py:224:35 | |
vpermt2ps %zmm28, %zmm2, %zmm12 | |
vmovupd 1328(%rsp), %zmm28 # 64-byte Reload | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $255, %zmm0, %zmm0, %zmm2 # zmm2 = zmm0[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15] | |
kmovw %k1, 1008(%rsp) # 2-byte Spill | |
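# NOTE (reader annotation, not compiler output): even the 16-bit mask
# registers are spilled here -- kmovw stores k1 to a 2-byte stack slot, and
# matching kmovw reloads appear before the loads that use it, suggesting all
# usable k-registers (k1-k7; k0 cannot serve as a write mask) are live at
# once in this region.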
.loc 1 222 51 is_stmt 0 # 03-matrix-multiplication-cpu.py:222:51 | |
vpcmpgtd .LCPI0_1(%rip){1to16}, %zmm9, %k1 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshuff64x2 $170, %zmm2, %zmm2, %zmm3 # zmm3 = zmm2[4,5,4,5,4,5,4,5] | |
vshuff64x2 $255, %zmm2, %zmm2, %zmm2 # zmm2 = zmm2[6,7,6,7,6,7,6,7] | |
.loc 1 224 35 is_stmt 1 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm3, %zmm12, %zmm29 # zmm29 = (zmm12 * zmm3) + zmm29 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $170, %zmm0, %zmm0, %zmm3 # zmm3 = zmm0[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm2, %zmm12, %zmm10 # zmm10 = (zmm12 * zmm2) + zmm10 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshuff64x2 $170, %zmm3, %zmm3, %zmm4 # zmm4 = zmm3[4,5,4,5,4,5,4,5] | |
vshuff64x2 $255, %zmm3, %zmm3, %zmm2 # zmm2 = zmm3[6,7,6,7,6,7,6,7] | |
vextractf32x4 $3, %zmm0, %xmm3 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm4, %zmm12, %zmm21 # zmm21 = (zmm12 * zmm4) + zmm21 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $85, %zmm0, %zmm0, %zmm4 # zmm4 = zmm0[1,1,1,1,5,5,5,5,9,9,9,9,13,13,13,13] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm2, %zmm12, %zmm22 # zmm22 = (zmm12 * zmm2) + zmm22 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshuff64x2 $255, %zmm4, %zmm4, %zmm2 # zmm2 = zmm4[6,7,6,7,6,7,6,7] | |
vshuff64x2 $170, %zmm4, %zmm4, %zmm5 # zmm5 = zmm4[4,5,4,5,4,5,4,5] | |
vshufps $255, %xmm0, %xmm0, %xmm4 # xmm4 = xmm0[3,3,3,3] | |
kmovw %k1, 880(%rsp) # 2-byte Spill | |
.loc 1 222 51 is_stmt 0 # 03-matrix-multiplication-cpu.py:222:51 | |
vpcmpgtd .LCPI0_104(%rip){1to16}, %zmm9, %k1 | |
.loc 1 224 35 is_stmt 1 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm2, %zmm12, %zmm19 # zmm19 = (zmm12 * zmm2) + zmm19 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vbroadcastss %xmm3, %zmm2 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vmovaps .LCPI0_18(%rip), %zmm3 # zmm3 = [1,17,2,3,5,21,6,7,9,25,10,11,13,29,14,15] | |
vfmadd231ps %zmm5, %zmm12, %zmm17 # zmm17 = (zmm12 * zmm5) + zmm17 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vextractf32x4 $2, %zmm0, %xmm5 | |
vmovups %zmm21, 816(%rsp) # 64-byte Spill | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm2, %zmm12, %zmm11 # zmm11 = (zmm12 * zmm2) + zmm11 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $255, %ymm0, %ymm0, %ymm2 # ymm2 = ymm0[3,3,3,3,7,7,7,7] | |
vbroadcastss %xmm5, %zmm5 | |
vshuff64x2 $85, %zmm2, %zmm2, %zmm2 # zmm2 = zmm2[2,3,2,3,2,3,2,3] | |
vmovaps %zmm17, %zmm21 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vmovaps %zmm7, %zmm17 | |
vfmadd231ps %zmm5, %zmm12, %zmm8 # zmm8 = (zmm12 * zmm5) + zmm8 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $170, %xmm0, %xmm0, %xmm5 # xmm5 = xmm0[2,2,2,2] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm2, %zmm12, %zmm27 # zmm27 = (zmm12 * zmm2) + zmm27 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $85, %ymm0, %ymm0, %ymm2 # ymm2 = ymm0[1,1,1,1,5,5,5,5] | |
vbroadcastsd %xmm5, %zmm5 | |
vshuff64x2 $85, %zmm2, %zmm2, %zmm2 # zmm2 = zmm2[2,3,2,3,2,3,2,3] | |
vmovups %zmm11, 352(%rsp) # 64-byte Spill | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm5, %zmm12, %zmm24 # zmm24 = (zmm12 * zmm5) + zmm24 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vmovshdup %xmm0, %xmm5 # xmm5 = xmm0[1,1,3,3] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm2, %zmm12, %zmm30 # zmm30 = (zmm12 * zmm2) + zmm30 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vbroadcastsd %xmm4, %zmm2 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vpermt2ps %zmm6, %zmm3, %zmm1 | |
vmovups 1968(%rsp), %zmm3 # 64-byte Reload | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vbroadcastsd %xmm5, %zmm5 | |
vmovups %zmm27, 32(%rsp) # 64-byte Spill | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm2, %zmm12, %zmm20 # zmm20 = (zmm12 * zmm2) + zmm20 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $170, %ymm0, %ymm0, %ymm2 # ymm2 = ymm0[2,2,2,2,6,6,6,6] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm5, %zmm12, %zmm15 # zmm15 = (zmm12 * zmm5) + zmm15 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vbroadcastss %xmm0, %zmm5 | |
vextractf128 $1, %ymm0, %xmm0 | |
vshuff64x2 $85, %zmm2, %zmm2, %zmm2 # zmm2 = zmm2[2,3,2,3,2,3,2,3] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm5, %zmm12, %zmm14 # zmm14 = (zmm12 * zmm5) + zmm14 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vbroadcastss %xmm0, %zmm0 | |
vmovups 1904(%rsp), %zmm5 # 64-byte Reload | |
vmovups %zmm30, 1072(%rsp) # 64-byte Spill | |
vmovupd 1712(%rsp), %zmm30 # 64-byte Reload | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm2, %zmm12, %zmm26 # zmm26 = (zmm12 * zmm2) + zmm26 | |
vmovaps .LCPI0_15(%rip), %zmm2 # zmm2 = [3,19,2,3,7,23,6,7,11,27,10,11,15,31,14,15] | |
vmovaps %zmm1, %zmm11 | |
vfmadd213ps 1456(%rsp), %zmm0, %zmm12 # 64-byte Folded Reload | |
# zmm12 = (zmm0 * zmm12) + mem | |
vmovaps .LCPI0_34(%rip), %xmm0 # xmm0 = [8,9,25,u] | |
vmovaps %zmm26, %zmm27 | |
vmovups 288(%rsp), %zmm26 # 64-byte Reload | |
vpermt2ps %zmm6, %zmm2, %zmm23 | |
vmovapd .LCPI0_58(%rip), %xmm2 # xmm2 = [6,14] | |
vmovaps %zmm18, %zmm6 | |
vpermt2ps %zmm3, %zmm0, %zmm11 | |
vmovapd .LCPI0_41(%rip), %xmm0 # xmm0 = [4,13] | |
vmovaps %zmm23, %zmm9 | |
vpermt2pd %zmm3, %zmm2, %zmm17 | |
vmovaps .LCPI0_67(%rip), %xmm2 # xmm2 = [12,13,29,u] | |
vpermt2pd %zmm3, %zmm0, %zmm6 | |
vmovaps .LCPI0_49(%rip), %xmm0 # xmm0 = [8,9,27,u] | |
vpermt2ps %zmm3, %zmm2, %zmm1 | |
vmovapd .LCPI0_78(%rip), %xmm2 # xmm2 = [6,15] | |
vpermt2ps %zmm3, %zmm0, %zmm9 | |
vmovaps .LCPI0_73(%rip), %zmm0 # zmm0 = [0,1,2,3,4,5,6,7,29,u,u,u,u,u,u,u] | |
vpermt2pd %zmm3, %zmm2, %zmm18 | |
vmovaps .LCPI0_90(%rip), %xmm2 # xmm2 = [12,13,31,u] | |
vpermt2ps %zmm3, %zmm2, %zmm23 | |
vmovapd .LCPI0_153(%rip), %xmm2 # xmm2 = [4,12] | |
vpermt2pd %zmm3, %zmm2, %zmm7 | |
vmovaps .LCPI0_35(%rip), %xmm2 # xmm2 = [0,1,2,25] | |
vmovups 688(%rsp), %zmm3 # 64-byte Reload | |
vpermt2ps %zmm5, %zmm2, %zmm11 | |
vmovaps .LCPI0_42(%rip), %xmm2 # xmm2 = [0,1,2,26] | |
vpermt2ps %zmm5, %zmm2, %zmm6 | |
vmovaps .LCPI0_59(%rip), %xmm2 # xmm2 = [0,1,2,28] | |
vpermt2ps %zmm5, %zmm2, %zmm17 | |
vmovaps .LCPI0_68(%rip), %xmm2 # xmm2 = [0,1,2,29] | |
vpermt2ps %zmm5, %zmm2, %zmm1 | |
vmovaps .LCPI0_79(%rip), %xmm2 # xmm2 = [0,1,2,30] | |
vpermt2ps %zmm5, %zmm2, %zmm18 | |
vmovaps .LCPI0_91(%rip), %xmm2 # xmm2 = [0,1,2,31] | |
vpermt2ps %zmm5, %zmm2, %zmm23 | |
vmovaps .LCPI0_50(%rip), %xmm2 # xmm2 = [0,1,2,27] | |
vpermt2ps %zmm5, %zmm2, %zmm9 | |
vmovaps .LCPI0_154(%rip), %xmm2 # xmm2 = [0,1,2,24] | |
vpermt2ps %zmm5, %zmm2, %zmm7 | |
vmovaps .LCPI0_36(%rip), %ymm2 # ymm2 = [0,1,2,3,25,u,u,u] | |
vmovups 624(%rsp), %zmm5 # 64-byte Reload | |
vpermt2ps %zmm13, %zmm2, %zmm11 | |
vmovapd .LCPI0_43(%rip), %ymm2 # ymm2 = [0,1,13,u] | |
vpermt2pd %zmm13, %zmm2, %zmm6 | |
vmovapd .LCPI0_60(%rip), %ymm2 # ymm2 = [0,1,14,u] | |
vpermt2pd %zmm13, %zmm2, %zmm17 | |
vmovaps .LCPI0_69(%rip), %ymm2 # ymm2 = [0,1,2,3,29,u,u,u] | |
vpermt2ps %zmm13, %zmm2, %zmm1 | |
vmovaps .LCPI0_51(%rip), %ymm2 # ymm2 = [0,1,2,3,27,u,u,u] | |
vpermt2ps %zmm13, %zmm2, %zmm9 | |
vmovapd .LCPI0_80(%rip), %ymm2 # ymm2 = [0,1,15,u] | |
vpermt2pd %zmm13, %zmm2, %zmm18 | |
vmovaps .LCPI0_92(%rip), %ymm2 # ymm2 = [0,1,2,3,31,u,u,u] | |
vpermt2ps %zmm13, %zmm2, %zmm23 | |
vmovapd .LCPI0_155(%rip), %ymm2 # ymm2 = [0,1,12,u] | |
vpermt2pd %zmm13, %zmm2, %zmm7 | |
vmovaps .LCPI0_37(%rip), %ymm2 # ymm2 = [0,1,2,3,4,25,u,u] | |
vmovups 96(%rsp), %zmm13 # 64-byte Reload | |
vpermt2ps %zmm3, %zmm2, %zmm11 | |
vmovaps .LCPI0_44(%rip), %ymm2 # ymm2 = [0,1,2,3,4,26,u,u] | |
vpermt2ps %zmm3, %zmm2, %zmm6 | |
vmovaps .LCPI0_52(%rip), %ymm2 # ymm2 = [0,1,2,3,4,27,u,u] | |
vpermt2ps %zmm3, %zmm2, %zmm9 | |
vmovaps .LCPI0_61(%rip), %ymm2 # ymm2 = [0,1,2,3,4,28,u,u] | |
vpermt2ps %zmm3, %zmm2, %zmm17 | |
vmovaps .LCPI0_70(%rip), %ymm2 # ymm2 = [0,1,2,3,4,29,u,u] | |
vpermt2ps %zmm3, %zmm2, %zmm1 | |
vmovaps .LCPI0_81(%rip), %ymm2 # ymm2 = [0,1,2,3,4,30,u,u] | |
vpermt2ps %zmm3, %zmm2, %zmm18 | |
vmovaps .LCPI0_93(%rip), %ymm2 # ymm2 = [0,1,2,3,4,31,u,u] | |
vpermt2ps %zmm3, %zmm2, %zmm23 | |
vmovaps .LCPI0_156(%rip), %ymm2 # ymm2 = [0,1,2,3,4,24,u,u] | |
vpermt2ps %zmm3, %zmm2, %zmm7 | |
vmovaps .LCPI0_38(%rip), %ymm2 # ymm2 = [0,1,2,3,4,5,25,u] | |
vmovapd .LCPI0_47(%rip), %zmm3 # zmm3 = [0,1,2,3,13,u,u,u] | |
vpermt2ps %zmm13, %zmm2, %zmm11 | |
vmovapd .LCPI0_45(%rip), %ymm2 # ymm2 = [0,1,2,13] | |
vpermt2pd %zmm13, %zmm2, %zmm6 | |
vmovapd .LCPI0_62(%rip), %ymm2 # ymm2 = [0,1,2,14] | |
vpermt2pd %zmm13, %zmm2, %zmm17 | |
vmovaps .LCPI0_71(%rip), %ymm2 # ymm2 = [0,1,2,3,4,5,29,u] | |
vpermt2ps %zmm13, %zmm2, %zmm1 | |
vmovapd .LCPI0_82(%rip), %ymm2 # ymm2 = [0,1,2,15] | |
vpermt2pd %zmm13, %zmm2, %zmm18 | |
vmovaps .LCPI0_94(%rip), %ymm2 # ymm2 = [0,1,2,3,4,5,31,u] | |
vpermt2ps %zmm13, %zmm2, %zmm23 | |
vmovaps .LCPI0_53(%rip), %ymm2 # ymm2 = [0,1,2,3,4,5,27,u] | |
vpermt2ps %zmm13, %zmm2, %zmm9 | |
vmovapd .LCPI0_157(%rip), %ymm2 # ymm2 = [0,1,2,12] | |
vpermt2pd %zmm13, %zmm2, %zmm7 | |
vmovaps .LCPI0_39(%rip), %ymm2 # ymm2 = [0,1,2,3,4,5,6,25] | |
vpermt2ps %zmm5, %zmm2, %zmm11 | |
vmovaps .LCPI0_46(%rip), %ymm2 # ymm2 = [0,1,2,3,4,5,6,26] | |
vpermt2ps %zmm5, %zmm2, %zmm6 | |
vmovaps .LCPI0_63(%rip), %ymm2 # ymm2 = [0,1,2,3,4,5,6,28] | |
vpermt2pd %zmm16, %zmm3, %zmm6 | |
vmovaps .LCPI0_48(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,26,u,u,u,u,u,u] | |
vpermt2ps %zmm5, %zmm2, %zmm17 | |
vmovaps .LCPI0_72(%rip), %ymm2 # ymm2 = [0,1,2,3,4,5,6,29] | |
vpermt2ps %zmm25, %zmm3, %zmm6 | |
vmovaps .LCPI0_74(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,29,u,u,u,u,u,u] | |
vpermt2ps %zmm5, %zmm2, %zmm1 | |
vmovaps .LCPI0_54(%rip), %ymm2 # ymm2 = [0,1,2,3,4,5,6,27] | |
vpermt2ps %zmm16, %zmm0, %zmm1 | |
vmovapd .LCPI0_84(%rip), %zmm0 # zmm0 = [0,1,2,3,15,u,u,u] | |
vpermt2ps %zmm5, %zmm2, %zmm9 | |
vmovaps .LCPI0_83(%rip), %ymm2 # ymm2 = [0,1,2,3,4,5,6,30] | |
vpermt2ps %zmm25, %zmm3, %zmm1 | |
vmovaps .LCPI0_56(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,27,u,u,u,u,u,u] | |
vpermt2ps %zmm5, %zmm2, %zmm18 | |
vmovaps .LCPI0_95(%rip), %ymm2 # ymm2 = [0,1,2,3,4,5,6,31] | |
vpermt2pd %zmm16, %zmm0, %zmm18 | |
vmovaps .LCPI0_96(%rip), %zmm0 # zmm0 = [0,1,2,3,4,5,6,7,31,u,u,u,u,u,u,u] | |
vpermt2ps %zmm5, %zmm2, %zmm23 | |
vmovaps .LCPI0_158(%rip), %ymm2 # ymm2 = [0,1,2,3,4,5,6,24] | |
vpermt2ps %zmm16, %zmm0, %zmm23 | |
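# NOTE (reader annotation, not compiler output): the block above interleaves
# eight independent splice chains (into zmm11, zmm6, zmm9, zmm17, zmm1,
# zmm18, zmm23 and zmm7), each step consuming a fresh .LCPI0_* index table
# that inserts exactly one new element; interleaving them presumably lets the
# serial vpermt2ps/vpermt2pd dependence chains overlap in the pipeline.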
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vmovups (%rsi,%rax), %zmm0 {%k3} {z} | |
kmovw -114(%rsp), %k3 # 2-byte Reload | |
movq 16(%rsp), %rax # 8-byte Reload | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vpermt2ps %zmm5, %zmm2, %zmm7 | |
vmovaps .LCPI0_40(%rip), %zmm5 # zmm5 = [0,1,2,3,4,5,6,7,25,u,u,u,u,u,u,u] | |
vshuff64x2 $228, %zmm16, %zmm7, %zmm13 # zmm13 = zmm7[0,1,2,3],zmm16[4,5,6,7] | |
vmovaps .LCPI0_64(%rip), %zmm7 # zmm7 = [0,1,2,3,4,5,6,7,8,28,u,u,u,u,u,u] | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vbroadcastss %xmm0, %zmm2 | |
vshufps $170, %xmm0, %xmm0, %xmm4 # xmm4 = xmm0[2,2,2,2] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vpermt2ps %zmm16, %zmm5, %zmm11 | |
vmovaps .LCPI0_55(%rip), %zmm5 # zmm5 = [0,1,2,3,4,5,6,7,27,u,u,u,u,u,u,u] | |
.loc 1 221 20 # 03-matrix-multiplication-cpu.py:221:20 | |
vmovaps %zmm25, %zmm11 {%k3} | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vpermt2ps %zmm16, %zmm5, %zmm9 | |
vshuff64x2 $244, %zmm16, %zmm17, %zmm5 # zmm5 = zmm17[0,1,2,3],zmm16[6,7,6,7] | |
vmovaps .LCPI0_161(%rip), %zmm16 # zmm16 = [0,1,2,3,4,5,6,7,8,9,10,24,u,u,u,u] | |
vmovups 560(%rsp), %zmm17 # 64-byte Reload | |
vpermt2ps %zmm25, %zmm7, %zmm5 | |
vmovaps .LCPI0_85(%rip), %zmm7 # zmm7 = [0,1,2,3,4,5,6,7,8,30,u,u,u,u,u,u] | |
vpermt2ps %zmm25, %zmm3, %zmm9 | |
vmovaps .LCPI0_159(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,24,u,u,u,u,u,u] | |
vpermt2ps %zmm25, %zmm7, %zmm18 | |
vmovaps .LCPI0_97(%rip), %zmm7 # zmm7 = [0,1,2,3,4,5,6,7,8,31,u,u,u,u,u,u] | |
vpermt2ps %zmm25, %zmm3, %zmm13 | |
vmovapd .LCPI0_160(%rip), %zmm3 # zmm3 = [0,1,2,3,4,12,u,u] | |
vpermt2ps %zmm25, %zmm7, %zmm23 | |
vmovapd .LCPI0_162(%rip), %zmm7 # zmm7 = [0,1,2,3,4,5,12,u] | |
vmovupd 416(%rsp), %zmm25 # 64-byte Reload | |
vpermt2pd %zmm30, %zmm3, %zmm13 | |
vmovaps .LCPI0_165(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,24] | |
vpermt2ps %zmm31, %zmm16, %zmm13 | |
vmovaps .LCPI0_163(%rip), %zmm16 # zmm16 = [0,1,2,3,4,5,6,7,8,9,10,11,12,24,u,u] | |
vpermt2pd %zmm28, %zmm7, %zmm13 | |
vmovapd .LCPI0_164(%rip), %zmm7 # zmm7 = [0,1,2,3,4,5,6,12] | |
vpermt2ps %zmm26, %zmm16, %zmm13 | |
vmovaps %zmm21, %zmm16 | |
vmovups 816(%rsp), %zmm21 # 64-byte Reload | |
vpermt2pd %zmm25, %zmm7, %zmm13 | |
vpermt2ps %zmm17, %zmm3, %zmm13 | |
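# NOTE (reader annotation, not compiler output): zmm13 is now a fully
# assembled multiplicand row, built from spilled intermediates by the
# vpermt2ps/vpermt2pd splices above; the vfmadd231ps block that follows
# broadcasts successive elements of the freshly loaded vector against it,
# repeating the accumulate pattern seen earlier.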
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vmovshdup %xmm0, %xmm3 # xmm3 = xmm0[1,1,3,3] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm2, %zmm13, %zmm14 # zmm14 = (zmm13 * zmm2) + zmm14 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vbroadcastsd %xmm3, %zmm2 | |
vextractf32x4 $2, %zmm0, %xmm3 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm2, %zmm13, %zmm15 # zmm15 = (zmm13 * zmm2) + zmm15 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vbroadcastsd %xmm4, %zmm2 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm2, %zmm13, %zmm24 # zmm24 = (zmm13 * zmm2) + zmm24 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vbroadcastss %xmm3, %zmm2 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm2, %zmm13, %zmm8 # zmm8 = (zmm13 * zmm2) + zmm8 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $85, %zmm0, %zmm0, %zmm2 # zmm2 = zmm0[1,1,1,1,5,5,5,5,9,9,9,9,13,13,13,13] | |
vshuff64x2 $170, %zmm2, %zmm2, %zmm3 # zmm3 = zmm2[4,5,4,5,4,5,4,5] | |
vshuff64x2 $255, %zmm2, %zmm2, %zmm2 # zmm2 = zmm2[6,7,6,7,6,7,6,7] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm3, %zmm13, %zmm16 # zmm16 = (zmm13 * zmm3) + zmm16 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $170, %zmm0, %zmm0, %zmm3 # zmm3 = zmm0[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm2, %zmm13, %zmm19 # zmm19 = (zmm13 * zmm2) + zmm19 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshuff64x2 $170, %zmm3, %zmm3, %zmm4 # zmm4 = zmm3[4,5,4,5,4,5,4,5] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm4, %zmm13, %zmm21 # zmm21 = (zmm13 * zmm4) + zmm21 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $255, %zmm0, %zmm0, %zmm4 # zmm4 = zmm0[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15] | |
vshuff64x2 $170, %zmm4, %zmm4, %zmm7 # zmm7 = zmm4[4,5,4,5,4,5,4,5] | |
vshuff64x2 $255, %zmm4, %zmm4, %zmm4 # zmm4 = zmm4[6,7,6,7,6,7,6,7] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm7, %zmm13, %zmm29 # zmm29 = (zmm13 * zmm7) + zmm29 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vextractf128 $1, %ymm0, %xmm7 | |
vbroadcastss %xmm7, %zmm7 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm7, %zmm13, %zmm12 # zmm12 = (zmm13 * zmm7) + zmm12 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $170, %ymm0, %ymm0, %ymm7 # ymm7 = ymm0[2,2,2,2,6,6,6,6] | |
vshuff64x2 $85, %zmm7, %zmm7, %zmm7 # zmm7 = zmm7[2,3,2,3,2,3,2,3] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm7, %zmm13, %zmm27 # zmm27 = (zmm13 * zmm7) + zmm27 | |
vmovups %zmm27, 160(%rsp) # 64-byte Spill | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $255, %xmm0, %xmm0, %xmm27 # xmm27 = xmm0[3,3,3,3] | |
vbroadcastsd %xmm27, %zmm7 | |
vmovups 32(%rsp), %zmm27 # 64-byte Reload | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm7, %zmm13, %zmm20 # zmm20 = (zmm13 * zmm7) + zmm20 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $85, %ymm0, %ymm0, %ymm7 # ymm7 = ymm0[1,1,1,1,5,5,5,5] | |
vshuff64x2 $85, %zmm7, %zmm7, %zmm7 # zmm7 = zmm7[2,3,2,3,2,3,2,3] | |
vmovups %zmm20, 224(%rsp) # 64-byte Spill | |
vmovups 1072(%rsp), %zmm20 # 64-byte Reload | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm7, %zmm13, %zmm20 # zmm20 = (zmm13 * zmm7) + zmm20 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $255, %ymm0, %ymm0, %ymm7 # ymm7 = ymm0[3,3,3,3,7,7,7,7] | |
vextractf32x4 $3, %zmm0, %xmm0 | |
vshuff64x2 $85, %zmm7, %zmm7, %zmm7 # zmm7 = zmm7[2,3,2,3,2,3,2,3] | |
vbroadcastss %xmm0, %zmm0 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm7, %zmm13, %zmm27 # zmm27 = (zmm13 * zmm7) + zmm27 | |
vmovapd .LCPI0_65(%rip), %zmm7 # zmm7 = [0,1,2,3,4,14,u,u] | |
vmovups %zmm27, 32(%rsp) # 64-byte Spill | |
vmovups 352(%rsp), %zmm27 # 64-byte Reload | |
vpermt2pd %zmm30, %zmm7, %zmm5 | |
vmovapd .LCPI0_86(%rip), %zmm7 # zmm7 = [0,1,2,3,4,15,u,u] | |
vfmadd231ps %zmm0, %zmm13, %zmm27 # zmm27 = (zmm13 * zmm0) + zmm27 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshuff64x2 $255, %zmm3, %zmm3, %zmm0 # zmm0 = zmm3[6,7,6,7,6,7,6,7] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vmovaps .LCPI0_75(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,9,29,u,u,u,u,u] | |
vfmadd231ps %zmm0, %zmm13, %zmm22 # zmm22 = (zmm13 * zmm0) + zmm22 | |
vfmadd213ps %zmm10, %zmm4, %zmm13 # zmm13 = (zmm4 * zmm13) + zmm10 | |
vmovaps .LCPI0_166(%rip), %zmm10 # zmm10 = [0,1,2,3,4,5,6,7,8,9,25,u,u,u,u,u] | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vmovups (%rsi,%rax), %zmm0 {%k1} {z} | |
kmovw -116(%rsp), %k1 # 2-byte Reload | |
movq -24(%rsp), %rax # 8-byte Reload | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vpermt2pd %zmm30, %zmm7, %zmm18 | |
vmovaps .LCPI0_167(%rip), %zmm7 # zmm7 = [0,1,2,3,4,5,6,7,8,9,10,25,u,u,u,u] | |
.loc 1 221 20 # 03-matrix-multiplication-cpu.py:221:20 | |
vmovapd %zmm30, %zmm6 {%k1} | |
kmovw -118(%rsp), %k1 # 2-byte Reload | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vpermt2ps %zmm30, %zmm3, %zmm1 | |
vmovaps .LCPI0_98(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,9,31,u,u,u,u,u] | |
vpermt2ps %zmm30, %zmm10, %zmm11 | |
vmovaps .LCPI0_169(%rip), %zmm10 # zmm10 = [0,1,2,3,4,5,6,7,8,9,10,11,12,25,u,u] | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $170, %zmm0, %zmm0, %zmm2 # zmm2 = zmm0[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vpermt2ps %zmm31, %zmm7, %zmm11 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshuff64x2 $170, %zmm2, %zmm2, %zmm7 # zmm7 = zmm2[4,5,4,5,4,5,4,5] | |
vshuff64x2 $255, %zmm2, %zmm2, %zmm2 # zmm2 = zmm2[6,7,6,7,6,7,6,7] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vpermt2ps %zmm30, %zmm3, %zmm23 | |
vmovaps .LCPI0_57(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,9,27,u,u,u,u,u] | |
vpermt2ps %zmm30, %zmm3, %zmm9 | |
vmovaps .LCPI0_168(%rip), %zmm30 # zmm30 = [0,1,2,3,4,5,6,7,8,9,10,11,25,u,u,u] | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $255, %zmm0, %zmm0, %zmm3 # zmm3 = zmm0[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15] | |
vshuff64x2 $170, %zmm3, %zmm3, %zmm4 # zmm4 = zmm3[4,5,4,5,4,5,4,5] | |
vshuff64x2 $255, %zmm3, %zmm3, %zmm3 # zmm3 = zmm3[6,7,6,7,6,7,6,7] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vpermt2ps %zmm28, %zmm30, %zmm11 | |
vpermt2ps %zmm26, %zmm10, %zmm11 | |
vmovaps .LCPI0_170(%rip), %zmm26 # zmm26 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,25,u] | |
vmovaps .LCPI0_171(%rip), %zmm10 # zmm10 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,25] | |
vpermt2ps %zmm25, %zmm26, %zmm11 | |
vpermt2ps %zmm17, %zmm10, %zmm11 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vmovups (%rsi,%rax), %zmm10 {%k7} {z} | |
movq 544(%rsp), %rax # 8-byte Reload | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm3, %zmm11, %zmm13 # zmm13 = (zmm11 * zmm3) + zmm13 | |
vmovaps .LCPI0_66(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,9,10,28,u,u,u,u] | |
vfmadd231ps %zmm4, %zmm11, %zmm29 # zmm29 = (zmm11 * zmm4) + zmm29 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $85, %zmm0, %zmm0, %zmm4 # zmm4 = zmm0[1,1,1,1,5,5,5,5,9,9,9,9,13,13,13,13] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm7, %zmm11, %zmm21 # zmm21 = (zmm11 * zmm7) + zmm21 | |
vfmadd231ps %zmm2, %zmm11, %zmm22 # zmm22 = (zmm11 * zmm2) + zmm22 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshuff64x2 $170, %zmm4, %zmm4, %zmm7 # zmm7 = zmm4[4,5,4,5,4,5,4,5] | |
vshuff64x2 $255, %zmm4, %zmm4, %zmm2 # zmm2 = zmm4[6,7,6,7,6,7,6,7] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm7, %zmm11, %zmm16 # zmm16 = (zmm11 * zmm7) + zmm16 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vextractf32x4 $2, %zmm0, %xmm7 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm2, %zmm11, %zmm19 # zmm19 = (zmm11 * zmm2) + zmm19 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vextractf32x4 $3, %zmm0, %xmm2 | |
vbroadcastss %xmm10, %zmm4 | |
vmovups %zmm29, 96(%rsp) # 64-byte Spill | |
vbroadcastss %xmm7, %zmm7 | |
vbroadcastss %xmm2, %zmm2 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm7, %zmm11, %zmm8 # zmm8 = (zmm11 * zmm7) + zmm8 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $170, %xmm0, %xmm0, %xmm7 # xmm7 = xmm0[2,2,2,2] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm2, %zmm11, %zmm27 # zmm27 = (zmm11 * zmm2) + zmm27 | |
vmovaps %zmm16, %zmm29 | |
vmovups 32(%rsp), %zmm16 # 64-byte Reload | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $255, %ymm0, %ymm0, %ymm2 # ymm2 = ymm0[3,3,3,3,7,7,7,7] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vpermt2ps %zmm31, %zmm3, %zmm5 | |
vmovaps .LCPI0_87(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,9,10,30,u,u,u,u] | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vbroadcastsd %xmm7, %zmm7 | |
vshuff64x2 $85, %zmm2, %zmm2, %zmm2 # zmm2 = zmm2[2,3,2,3,2,3,2,3] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm7, %zmm11, %zmm24 # zmm24 = (zmm11 * zmm7) + zmm24 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vmovshdup %xmm0, %xmm7 # xmm7 = xmm0[1,1,3,3] | |
vbroadcastsd %xmm7, %zmm7 | |
vmovups %zmm27, 352(%rsp) # 64-byte Spill | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vmovaps .LCPI0_172(%rip), %zmm27 # zmm27 = [0,1,2,3,4,5,6,7,8,9,10,26,u,u,u,u] | |
vmovups %zmm8, 944(%rsp) # 64-byte Spill | |
vmovapd %zmm28, %zmm8 | |
vfmadd231ps %zmm7, %zmm11, %zmm15 # zmm15 = (zmm11 * zmm7) + zmm15 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vbroadcastss %xmm0, %zmm7 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm7, %zmm11, %zmm14 # zmm14 = (zmm11 * zmm7) + zmm14 | |
vmovaps .LCPI0_76(%rip), %zmm7 # zmm7 = [0,1,2,3,4,5,6,7,8,9,10,29,u,u,u,u] | |
vfmadd231ps %zmm2, %zmm11, %zmm16 # zmm16 = (zmm11 * zmm2) + zmm16 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $85, %ymm0, %ymm0, %ymm2 # ymm2 = ymm0[1,1,1,1,5,5,5,5] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vpermt2ps %zmm31, %zmm3, %zmm18 | |
vmovaps .LCPI0_99(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,9,10,31,u,u,u,u] | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshuff64x2 $85, %zmm2, %zmm2, %zmm2 # zmm2 = zmm2[2,3,2,3,2,3,2,3] | |
vmovaps %zmm15, %zmm26 | |
vmovups (%rsi,%rax), %zmm15 {%k6} {z} | |
movq -16(%rsp), %rax # 8-byte Reload | |
vmovaps %zmm14, %zmm30 | |
vmovups 160(%rsp), %zmm14 # 64-byte Reload | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vpermt2ps %zmm31, %zmm7, %zmm1 | |
vmovaps %zmm31, %zmm7 | |
vpermt2ps %zmm7, %zmm27, %zmm6 | |
.loc 1 221 20 # 03-matrix-multiplication-cpu.py:221:20 | |
vmovaps %zmm7, %zmm9 {%k1} | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vmovapd .LCPI0_173(%rip), %zmm7 # zmm7 = [0,1,2,3,4,5,13,u] | |
vmovapd .LCPI0_175(%rip), %zmm27 # zmm27 = [0,1,2,3,4,5,6,13] | |
vpermt2ps %zmm31, %zmm3, %zmm23 | |
vmovaps .LCPI0_100(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,31,u,u,u] | |
vmovaps %zmm20, %zmm31 | |
vmovups 224(%rsp), %zmm20 # 64-byte Reload | |
vfmadd231ps %zmm2, %zmm11, %zmm31 # zmm31 = (zmm11 * zmm2) + zmm31 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $255, %xmm0, %xmm0, %xmm2 # xmm2 = xmm0[3,3,3,3] | |
vbroadcastsd %xmm2, %zmm2 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vpermt2pd %zmm8, %zmm7, %zmm6 | |
vmovaps .LCPI0_174(%rip), %zmm7 # zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,26,u,u] | |
vpermt2ps %zmm28, %zmm3, %zmm23 | |
vmovups 288(%rsp), %zmm28 # 64-byte Reload | |
vmovaps .LCPI0_101(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,31,u,u] | |
vfmadd231ps %zmm2, %zmm11, %zmm20 # zmm20 = (zmm11 * zmm2) + zmm20 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vextractf128 $1, %ymm0, %xmm2 | |
vshufps $170, %ymm0, %ymm0, %ymm0 # ymm0 = ymm0[2,2,2,2,6,6,6,6] | |
vshuff64x2 $85, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[2,3,2,3,2,3,2,3] | |
vbroadcastss %xmm2, %zmm2 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm0, %zmm11, %zmm14 # zmm14 = (zmm11 * zmm0) + zmm14 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $255, %zmm10, %zmm10, %zmm0 # zmm0 = zmm10[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd213ps %zmm12, %zmm2, %zmm11 # zmm11 = (zmm2 * zmm11) + zmm12 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $170, %xmm10, %xmm10, %xmm12 # xmm12 = xmm10[2,2,2,2] | |
vshufps $170, %zmm10, %zmm10, %zmm2 # zmm2 = zmm10[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vpermt2ps %zmm28, %zmm7, %zmm6 | |
vmovaps .LCPI0_102(%rip), %zmm7 # zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,31,u] | |
vpermt2ps %zmm28, %zmm3, %zmm23 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $85, %zmm10, %zmm10, %zmm3 # zmm3 = zmm10[1,1,1,1,5,5,5,5,9,9,9,9,13,13,13,13] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vpermt2pd %zmm25, %zmm27, %zmm6 | |
vmovaps %zmm17, %zmm27 | |
vpermt2ps %zmm25, %zmm7, %zmm23 | |
vmovaps .LCPI0_176(%rip), %zmm7 # zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,26] | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshuff64x2 $170, %zmm3, %zmm3, %zmm25 # zmm25 = zmm3[4,5,4,5,4,5,4,5] | |
vshuff64x2 $255, %zmm3, %zmm3, %zmm3 # zmm3 = zmm3[6,7,6,7,6,7,6,7] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vpermt2ps %zmm17, %zmm7, %zmm6 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vmovshdup %xmm10, %xmm7 # xmm7 = xmm10[1,1,3,3] | |
vmovups 944(%rsp), %zmm17 # 64-byte Reload | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm4, %zmm6, %zmm30 # zmm30 = (zmm6 * zmm4) + zmm30 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vbroadcastsd %xmm7, %zmm4 | |
vextractf32x4 $2, %zmm10, %xmm7 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm25, %zmm6, %zmm29 # zmm29 = (zmm6 * zmm25) + zmm29 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshuff64x2 $170, %zmm0, %zmm0, %zmm25 # zmm25 = zmm0[4,5,4,5,4,5,4,5] | |
vshuff64x2 $255, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[6,7,6,7,6,7,6,7] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm3, %zmm6, %zmm19 # zmm19 = (zmm6 * zmm3) + zmm19 | |
vmovapd .LCPI0_88(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,15,u] | |
vfmadd231ps %zmm4, %zmm6, %zmm26 # zmm26 = (zmm6 * zmm4) + zmm26 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vbroadcastsd %xmm12, %zmm4 | |
vshuff64x2 $170, %zmm2, %zmm2, %zmm12 # zmm12 = zmm2[4,5,4,5,4,5,4,5] | |
vshuff64x2 $255, %zmm2, %zmm2, %zmm2 # zmm2 = zmm2[6,7,6,7,6,7,6,7] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm4, %zmm6, %zmm24 # zmm24 = (zmm6 * zmm4) + zmm24 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vbroadcastss %xmm7, %zmm4 | |
vshufps $255, %xmm10, %xmm10, %xmm7 # xmm7 = xmm10[3,3,3,3] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm12, %zmm6, %zmm21 # zmm21 = (zmm6 * zmm12) + zmm21 | |
vmovups 96(%rsp), %zmm12 # 64-byte Reload | |
vfmadd231ps %zmm2, %zmm6, %zmm22 # zmm22 = (zmm6 * zmm2) + zmm22 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $85, %zmm15, %zmm15, %zmm2 # zmm2 = zmm15[1,1,1,1,5,5,5,5,9,9,9,9,13,13,13,13] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm4, %zmm6, %zmm17 # zmm17 = (zmm6 * zmm4) + zmm17 | |
vmovups %zmm24, 32(%rsp) # 64-byte Spill | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vextractf32x4 $1, %ymm10, %xmm24 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vpermt2pd %zmm8, %zmm3, %zmm18 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $255, %zmm15, %zmm15, %zmm3 # zmm3 = zmm15[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15] | |
vbroadcastss %xmm24, %zmm4 | |
vmovups 416(%rsp), %zmm24 # 64-byte Reload | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm4, %zmm6, %zmm11 # zmm11 = (zmm6 * zmm4) + zmm11 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $170, %ymm10, %ymm10, %ymm4 # ymm4 = ymm10[2,2,2,2,6,6,6,6] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm25, %zmm6, %zmm12 # zmm12 = (zmm6 * zmm25) + zmm12 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshuff64x2 $85, %zmm4, %zmm4, %zmm4 # zmm4 = zmm4[2,3,2,3,2,3,2,3] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm4, %zmm6, %zmm14 # zmm14 = (zmm6 * zmm4) + zmm14 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vbroadcastsd %xmm7, %zmm4 | |
vmovups (%rsi,%rax), %zmm7 {%k5} {z} | |
movw $-32768, %ax # imm = 0x8000 | |
kmovd %eax, %k1 | |
movq 552(%rsp), %rax # 8-byte Reload | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm4, %zmm6, %zmm20 # zmm20 = (zmm6 * zmm4) + zmm20 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $85, %ymm10, %ymm10, %ymm4 # ymm4 = ymm10[1,1,1,1,5,5,5,5] | |
.loc 1 221 20 # 03-matrix-multiplication-cpu.py:221:20 | |
vmovaps %zmm27, %zmm23 {%k1} | |
kmovw 880(%rsp), %k1 # 2-byte Reload | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshuff64x2 $85, %zmm4, %zmm4, %zmm4 # zmm4 = zmm4[2,3,2,3,2,3,2,3] | |
vmovups %zmm14, 160(%rsp) # 64-byte Spill | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm4, %zmm6, %zmm31 # zmm31 = (zmm6 * zmm4) + zmm31 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $255, %ymm10, %ymm10, %ymm4 # ymm4 = ymm10[3,3,3,3,7,7,7,7] | |
vshuff64x2 $85, %zmm4, %zmm4, %zmm4 # zmm4 = zmm4[2,3,2,3,2,3,2,3] | |
vmovups (%rsi,%rax), %zmm14 {%k2} {z} | |
movq -8(%rsp), %rax # 8-byte Reload | |
vmovaps %zmm20, %zmm25 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vmovaps .LCPI0_177(%rip), %zmm20 # zmm20 = [0,1,2,3,4,5,6,7,8,9,10,11,27,u,u,u] | |
vfmadd231ps %zmm4, %zmm6, %zmm16 # zmm16 = (zmm6 * zmm4) + zmm16 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vextractf32x4 $3, %zmm10, %xmm4 | |
vmovups %zmm31, 1072(%rsp) # 64-byte Spill | |
vmovups 352(%rsp), %zmm31 # 64-byte Reload | |
vbroadcastss %xmm4, %zmm4 | |
vmovups (%rsi,%rax), %zmm10 {%k1} {z} | |
kmovw -120(%rsp), %k1 # 2-byte Reload | |
movq 24(%rsp), %rax # 8-byte Reload | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vpermt2ps %zmm8, %zmm20, %zmm9 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vbroadcastss %xmm15, %zmm20 | |
.loc 1 221 20 # 03-matrix-multiplication-cpu.py:221:20 | |
vmovapd %zmm8, %zmm5 {%k1} | |
kmovw 1008(%rsp), %k1 # 2-byte Reload | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm4, %zmm6, %zmm31 # zmm31 = (zmm6 * zmm4) + zmm31 | |
vfmadd213ps %zmm13, %zmm0, %zmm6 # zmm6 = (zmm0 * zmm6) + zmm13 | |
vmovaps .LCPI0_77(%rip), %zmm13 # zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,29,u,u,u] | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshuff64x2 $170, %zmm3, %zmm3, %zmm4 # zmm4 = zmm3[4,5,4,5,4,5,4,5] | |
vshufps $170, %zmm15, %zmm15, %zmm0 # zmm0 = zmm15[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14] | |
vshuff64x2 $255, %zmm3, %zmm3, %zmm3 # zmm3 = zmm3[6,7,6,7,6,7,6,7] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vpermt2ps %zmm8, %zmm13, %zmm1 | |
vmovaps .LCPI0_178(%rip), %zmm8 # zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,27,u,u] | |
vmovaps %zmm30, %zmm13 | |
vmovups 1072(%rsp), %zmm30 # 64-byte Reload | |
vpermt2ps %zmm28, %zmm8, %zmm9 | |
vmovaps .LCPI0_179(%rip), %zmm8 # zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,27,u] | |
vpermt2ps %zmm24, %zmm8, %zmm9 | |
vmovaps .LCPI0_180(%rip), %zmm8 # zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,27] | |
vpermt2ps %zmm27, %zmm8, %zmm9 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vmovshdup %xmm15, %xmm8 # xmm8 = xmm15[1,1,3,3] | |
vmovaps %zmm19, %zmm27 | |
vmovaps %zmm24, %zmm19 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm4, %zmm9, %zmm12 # zmm12 = (zmm9 * zmm4) + zmm12 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshuff64x2 $170, %zmm2, %zmm2, %zmm4 # zmm4 = zmm2[4,5,4,5,4,5,4,5] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm20, %zmm9, %zmm13 # zmm13 = (zmm9 * zmm20) + zmm13 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vmovups (%rsi,%rax), %zmm20 {%k1} {z} | |
kmovw -122(%rsp), %k1 # 2-byte Reload | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm3, %zmm9, %zmm6 # zmm6 = (zmm9 * zmm3) + zmm6 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $85, %zmm7, %zmm7, %zmm3 # zmm3 = zmm7[1,1,1,1,5,5,5,5,9,9,9,9,13,13,13,13] | |
.loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22 | |
addq 504(%rsp), %rsi # 8-byte Folded Reload | |
decl %ebx | |
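# Annotation: K-loop induction (.loc 217:22). %rsi advances by a spilled byte
# stride from 504(%rsp) (presumably BLOCK_SIZE_K times the element stride, in
# bytes) and %ebx counts the remaining iterations; the matching `jne .LBB0_3`
# back-edge sits at the end of this unrolled body.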
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm4, %zmm9, %zmm29 # zmm29 = (zmm9 * zmm4) + zmm29 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vextractf32x4 $2, %zmm15, %xmm4 | |
vmovups %zmm12, 96(%rsp) # 64-byte Spill | |
vshuff64x2 $170, %zmm0, %zmm0, %zmm12 # zmm12 = zmm0[4,5,4,5,4,5,4,5] | |
vbroadcastss %xmm4, %zmm4 | |
vshuff64x2 $255, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[6,7,6,7,6,7,6,7] | |
.loc 1 221 20 # 03-matrix-multiplication-cpu.py:221:20 | |
vmovaps %zmm28, %zmm1 {%k1} | |
kmovw -124(%rsp), %k1 # 2-byte Reload | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm12, %zmm9, %zmm21 # zmm21 = (zmm9 * zmm12) + zmm21 | |
vfmadd231ps %zmm4, %zmm9, %zmm17 # zmm17 = (zmm9 * zmm4) + zmm17 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $170, %xmm15, %xmm15, %xmm4 # xmm4 = xmm15[2,2,2,2] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm0, %zmm9, %zmm22 # zmm22 = (zmm9 * zmm0) + zmm22 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshuff64x2 $255, %zmm2, %zmm2, %zmm0 # zmm0 = zmm2[6,7,6,7,6,7,6,7] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vmovaps .LCPI0_89(%rip), %zmm2 # zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,30,u,u] | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vbroadcastsd %xmm4, %zmm4 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm0, %zmm9, %zmm27 # zmm27 = (zmm9 * zmm0) + zmm27 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vextractf32x4 $3, %zmm15, %xmm0 | |
vmovaps %zmm21, %zmm12 | |
vmovups 32(%rsp), %zmm21 # 64-byte Reload | |
vbroadcastss %xmm0, %zmm0 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm0, %zmm9, %zmm31 # zmm31 = (zmm9 * zmm0) + zmm31 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $255, %ymm15, %ymm15, %ymm0 # ymm0 = ymm15[3,3,3,3,7,7,7,7] | |
vshuff64x2 $85, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[2,3,2,3,2,3,2,3] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vpermt2ps %zmm28, %zmm2, %zmm18 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $170, %ymm15, %ymm15, %ymm2 # ymm2 = ymm15[2,2,2,2,6,6,6,6] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm0, %zmm9, %zmm16 # zmm16 = (zmm9 * zmm0) + zmm16 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $85, %ymm15, %ymm15, %ymm0 # ymm0 = ymm15[1,1,1,1,5,5,5,5] | |
vshuff64x2 $85, %zmm2, %zmm2, %zmm2 # zmm2 = zmm2[2,3,2,3,2,3,2,3] | |
vshuff64x2 $85, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[2,3,2,3,2,3,2,3] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm4, %zmm9, %zmm21 # zmm21 = (zmm9 * zmm4) + zmm21 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vbroadcastsd %xmm8, %zmm4 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vmovaps .LCPI0_181(%rip), %zmm8 # zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,28,u,u] | |
vfmadd231ps %zmm0, %zmm9, %zmm30 # zmm30 = (zmm9 * zmm0) + zmm30 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $255, %xmm15, %xmm15, %xmm0 # xmm0 = xmm15[3,3,3,3] | |
vbroadcastsd %xmm0, %zmm0 | |
.loc 1 221 20 # 03-matrix-multiplication-cpu.py:221:20 | |
vmovapd %zmm19, %zmm18 {%k1} | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm4, %zmm9, %zmm26 # zmm26 = (zmm9 * zmm4) + zmm26 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vbroadcastss %xmm7, %zmm4 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm0, %zmm9, %zmm25 # zmm25 = (zmm9 * zmm0) + zmm25 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vextractf128 $1, %ymm15, %xmm0 | |
vmovups 160(%rsp), %zmm15 # 64-byte Reload | |
vbroadcastss %xmm0, %zmm0 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vpermt2ps %zmm28, %zmm8, %zmm5 | |
vmovapd .LCPI0_182(%rip), %zmm8 # zmm8 = [0,1,2,3,4,5,6,14] | |
vmovups 560(%rsp), %zmm28 # 64-byte Reload | |
vfmadd231ps %zmm2, %zmm9, %zmm15 # zmm15 = (zmm9 * zmm2) + zmm15 | |
vfmadd213ps %zmm11, %zmm0, %zmm9 # zmm9 = (zmm0 * zmm9) + zmm11 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vmovshdup %xmm7, %xmm11 # xmm11 = xmm7[1,1,3,3] | |
vshufps $170, %zmm7, %zmm7, %zmm0 # zmm0 = zmm7[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14] | |
vshufps $255, %zmm7, %zmm7, %zmm2 # zmm2 = zmm7[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vpermt2pd %zmm24, %zmm8, %zmm5 | |
vmovaps .LCPI0_183(%rip), %zmm8 # zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,28] | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vextractf32x4 $1, %ymm7, %xmm24 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vpermt2ps %zmm28, %zmm8, %zmm5 | |
vmovaps %zmm30, %zmm8 | |
vfmadd231ps %zmm4, %zmm5, %zmm13 # zmm13 = (zmm5 * zmm4) + zmm13 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vbroadcastsd %xmm11, %zmm4 | |
vextractf32x4 $2, %zmm7, %xmm11 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm4, %zmm5, %zmm26 # zmm26 = (zmm5 * zmm4) + zmm26 | |
vmovaps %zmm13, %zmm30 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $170, %xmm7, %xmm7, %xmm13 # xmm13 = xmm7[2,2,2,2] | |
vbroadcastsd %xmm13, %zmm4 | |
vmovaps %zmm21, %zmm13 | |
vshuff64x2 $170, %zmm3, %zmm3, %zmm21 # zmm21 = zmm3[4,5,4,5,4,5,4,5] | |
vshuff64x2 $255, %zmm3, %zmm3, %zmm3 # zmm3 = zmm3[6,7,6,7,6,7,6,7] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm4, %zmm5, %zmm13 # zmm13 = (zmm5 * zmm4) + zmm13 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vbroadcastss %xmm11, %zmm4 | |
vmovaps %zmm17, %zmm11 | |
vshuff64x2 $170, %zmm0, %zmm0, %zmm17 # zmm17 = zmm0[4,5,4,5,4,5,4,5] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm21, %zmm5, %zmm29 # zmm29 = (zmm5 * zmm21) + zmm29 | |
vmovaps %zmm12, %zmm21 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $255, %xmm7, %xmm7, %xmm12 # xmm12 = xmm7[3,3,3,3] | |
vshuff64x2 $255, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[6,7,6,7,6,7,6,7] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm17, %zmm5, %zmm21 # zmm21 = (zmm5 * zmm17) + zmm21 | |
vmovups 96(%rsp), %zmm17 # 64-byte Reload | |
vfmadd231ps %zmm4, %zmm5, %zmm11 # zmm11 = (zmm5 * zmm4) + zmm11 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshuff64x2 $170, %zmm2, %zmm2, %zmm4 # zmm4 = zmm2[4,5,4,5,4,5,4,5] | |
vshuff64x2 $255, %zmm2, %zmm2, %zmm2 # zmm2 = zmm2[6,7,6,7,6,7,6,7] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm0, %zmm5, %zmm22 # zmm22 = (zmm5 * zmm0) + zmm22 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $85, %zmm14, %zmm14, %zmm0 # zmm0 = zmm14[1,1,1,1,5,5,5,5,9,9,9,9,13,13,13,13] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm4, %zmm5, %zmm17 # zmm17 = (zmm5 * zmm4) + zmm17 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vbroadcastss %xmm24, %zmm4 | |
vmovaps %zmm31, %zmm24 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm4, %zmm5, %zmm9 # zmm9 = (zmm5 * zmm4) + zmm9 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $170, %ymm7, %ymm7, %ymm4 # ymm4 = ymm7[2,2,2,2,6,6,6,6] | |
vshuff64x2 $85, %zmm4, %zmm4, %zmm4 # zmm4 = zmm4[2,3,2,3,2,3,2,3] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm4, %zmm5, %zmm15 # zmm15 = (zmm5 * zmm4) + zmm15 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vbroadcastsd %xmm12, %zmm4 | |
vmovaps %zmm27, %zmm12 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm3, %zmm5, %zmm12 # zmm12 = (zmm5 * zmm3) + zmm12 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshuff64x2 $170, %zmm0, %zmm0, %zmm27 # zmm27 = zmm0[4,5,4,5,4,5,4,5] | |
vshuff64x2 $255, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[6,7,6,7,6,7,6,7] | |
vshufps $255, %zmm14, %zmm14, %zmm3 # zmm3 = zmm14[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm4, %zmm5, %zmm25 # zmm25 = (zmm5 * zmm4) + zmm25 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $85, %ymm7, %ymm7, %ymm4 # ymm4 = ymm7[1,1,1,1,5,5,5,5] | |
vshuff64x2 $85, %zmm4, %zmm4, %zmm4 # zmm4 = zmm4[2,3,2,3,2,3,2,3] | |
vmovups %zmm15, 160(%rsp) # 64-byte Spill | |
vmovaps %zmm28, %zmm15 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm4, %zmm5, %zmm8 # zmm8 = (zmm5 * zmm4) + zmm8 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $255, %ymm7, %ymm7, %ymm4 # ymm4 = ymm7[3,3,3,3,7,7,7,7] | |
vextractf32x4 $3, %zmm7, %xmm7 | |
vshuff64x2 $85, %zmm4, %zmm4, %zmm4 # zmm4 = zmm4[2,3,2,3,2,3,2,3] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm4, %zmm5, %zmm16 # zmm16 = (zmm5 * zmm4) + zmm16 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vbroadcastss %xmm7, %zmm4 | |
vmovaps %zmm13, %zmm7 | |
vextractf32x4 $2, %zmm14, %xmm13 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm4, %zmm5, %zmm24 # zmm24 = (zmm5 * zmm4) + zmm24 | |
vfmadd213ps %zmm6, %zmm2, %zmm5 # zmm5 = (zmm2 * zmm5) + zmm6 | |
vmovaps .LCPI0_184(%rip), %zmm6 # zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,29,u] | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshuff64x2 $170, %zmm3, %zmm3, %zmm4 # zmm4 = zmm3[4,5,4,5,4,5,4,5] | |
vshufps $170, %zmm14, %zmm14, %zmm2 # zmm2 = zmm14[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14] | |
vshuff64x2 $255, %zmm3, %zmm3, %zmm3 # zmm3 = zmm3[6,7,6,7,6,7,6,7] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vpermt2ps %zmm19, %zmm6, %zmm1 | |
vmovaps .LCPI0_185(%rip), %zmm6 # zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,29] | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $170, %xmm14, %xmm14, %xmm19 # xmm19 = xmm14[2,2,2,2] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vpermt2ps %zmm28, %zmm6, %zmm1 | |
vmovups 160(%rsp), %zmm28 # 64-byte Reload | |
vmovaps %zmm26, %zmm6 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshuff64x2 $170, %zmm2, %zmm2, %zmm26 # zmm26 = zmm2[4,5,4,5,4,5,4,5] | |
vshuff64x2 $255, %zmm2, %zmm2, %zmm2 # zmm2 = zmm2[6,7,6,7,6,7,6,7] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm0, %zmm1, %zmm12 # zmm12 = (zmm1 * zmm0) + zmm12 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vextractf32x4 $3, %zmm14, %xmm0 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm4, %zmm1, %zmm17 # zmm17 = (zmm1 * zmm4) + zmm17 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vbroadcastss %xmm13, %zmm4 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm26, %zmm1, %zmm21 # zmm21 = (zmm1 * zmm26) + zmm21 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vbroadcastss %xmm14, %zmm26 | |
vmovshdup %xmm14, %xmm13 # xmm13 = xmm14[1,1,3,3] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm2, %zmm1, %zmm22 # zmm22 = (zmm1 * zmm2) + zmm22 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $170, %ymm14, %ymm14, %ymm2 # ymm2 = ymm14[2,2,2,2,6,6,6,6] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm3, %zmm1, %zmm5 # zmm5 = (zmm1 * zmm3) + zmm5 | |
vfmadd231ps %zmm27, %zmm1, %zmm29 # zmm29 = (zmm1 * zmm27) + zmm29 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $85, %zmm10, %zmm10, %zmm3 # zmm3 = zmm10[1,1,1,1,5,5,5,5,9,9,9,9,13,13,13,13] | |
vshuff64x2 $85, %zmm2, %zmm2, %zmm2 # zmm2 = zmm2[2,3,2,3,2,3,2,3] | |
vbroadcastss %xmm0, %zmm0 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm4, %zmm1, %zmm11 # zmm11 = (zmm1 * zmm4) + zmm11 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vbroadcastsd %xmm19, %zmm4 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vmovaps .LCPI0_186(%rip), %zmm19 # zmm19 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,30] | |
vfmadd231ps %zmm26, %zmm1, %zmm30 # zmm30 = (zmm1 * zmm26) + zmm30 | |
vfmadd231ps %zmm2, %zmm1, %zmm28 # zmm28 = (zmm1 * zmm2) + zmm28 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $255, %zmm10, %zmm10, %zmm2 # zmm2 = zmm10[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm0, %zmm1, %zmm24 # zmm24 = (zmm1 * zmm0) + zmm24 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $255, %ymm14, %ymm14, %ymm0 # ymm0 = ymm14[3,3,3,3,7,7,7,7] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm4, %zmm1, %zmm7 # zmm7 = (zmm1 * zmm4) + zmm7 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vbroadcastsd %xmm13, %zmm4 | |
vshufps $170, %xmm10, %xmm10, %xmm13 # xmm13 = xmm10[2,2,2,2] | |
vshuff64x2 $85, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[2,3,2,3,2,3,2,3] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm4, %zmm1, %zmm6 # zmm6 = (zmm1 * zmm4) + zmm6 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vbroadcastss %xmm10, %zmm4 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm0, %zmm1, %zmm16 # zmm16 = (zmm1 * zmm0) + zmm16 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $85, %ymm14, %ymm14, %ymm0 # ymm0 = ymm14[1,1,1,1,5,5,5,5] | |
vshuff64x2 $85, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[2,3,2,3,2,3,2,3] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm0, %zmm1, %zmm8 # zmm8 = (zmm1 * zmm0) + zmm8 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $255, %xmm14, %xmm14, %xmm0 # xmm0 = xmm14[3,3,3,3] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vpermt2ps %zmm15, %zmm19, %zmm18 | |
vmovaps %zmm12, %zmm19 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vbroadcastsd %xmm0, %zmm0 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm0, %zmm1, %zmm25 # zmm25 = (zmm1 * zmm0) + zmm25 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vextractf128 $1, %ymm14, %xmm0 | |
vshuff64x2 $170, %zmm3, %zmm3, %zmm14 # zmm14 = zmm3[4,5,4,5,4,5,4,5] | |
vshuff64x2 $255, %zmm3, %zmm3, %zmm3 # zmm3 = zmm3[6,7,6,7,6,7,6,7] | |
vbroadcastss %xmm0, %zmm0 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm4, %zmm18, %zmm30 # zmm30 = (zmm18 * zmm4) + zmm30 | |
vfmadd231ps %zmm14, %zmm18, %zmm29 # zmm29 = (zmm18 * zmm14) + zmm29 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshuff64x2 $170, %zmm2, %zmm2, %zmm14 # zmm14 = zmm2[4,5,4,5,4,5,4,5] | |
vshuff64x2 $255, %zmm2, %zmm2, %zmm2 # zmm2 = zmm2[6,7,6,7,6,7,6,7] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm3, %zmm18, %zmm19 # zmm19 = (zmm18 * zmm3) + zmm19 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $85, %zmm20, %zmm20, %zmm3 # zmm3 = zmm20[1,1,1,1,5,5,5,5,9,9,9,9,13,13,13,13] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd213ps %zmm9, %zmm0, %zmm1 # zmm1 = (zmm0 * zmm1) + zmm9 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vmovshdup %xmm10, %xmm9 # xmm9 = xmm10[1,1,3,3] | |
vshufps $170, %zmm10, %zmm10, %zmm0 # zmm0 = zmm10[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm14, %zmm18, %zmm17 # zmm17 = (zmm18 * zmm14) + zmm17 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vbroadcastsd %xmm9, %zmm4 | |
vextractf32x4 $2, %zmm10, %xmm9 | |
vshuff64x2 $170, %zmm0, %zmm0, %zmm15 # zmm15 = zmm0[4,5,4,5,4,5,4,5] | |
vshuff64x2 $255, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[6,7,6,7,6,7,6,7] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm4, %zmm18, %zmm6 # zmm6 = (zmm18 * zmm4) + zmm6 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vbroadcastsd %xmm13, %zmm4 | |
vextractf128 $1, %ymm10, %xmm13 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm0, %zmm18, %zmm22 # zmm22 = (zmm18 * zmm0) + zmm22 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $170, %zmm20, %zmm20, %zmm0 # zmm0 = zmm20[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm15, %zmm18, %zmm21 # zmm21 = (zmm18 * zmm15) + zmm21 | |
vmovaps %zmm17, %zmm15 | |
vfmadd231ps %zmm4, %zmm18, %zmm7 # zmm7 = (zmm18 * zmm4) + zmm7 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vbroadcastss %xmm9, %zmm4 | |
vmovaps %zmm11, %zmm9 | |
vshufps $255, %xmm10, %xmm10, %xmm11 # xmm11 = xmm10[3,3,3,3] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm4, %zmm18, %zmm9 # zmm9 = (zmm18 * zmm4) + zmm9 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vbroadcastss %xmm13, %zmm4 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm4, %zmm18, %zmm1 # zmm1 = (zmm18 * zmm4) + zmm1 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $170, %ymm10, %ymm10, %ymm4 # ymm4 = ymm10[2,2,2,2,6,6,6,6] | |
vshuff64x2 $85, %zmm4, %zmm4, %zmm4 # zmm4 = zmm4[2,3,2,3,2,3,2,3] | |
vmovaps %zmm9, %zmm12 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm4, %zmm18, %zmm28 # zmm28 = (zmm18 * zmm4) + zmm28 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vbroadcastsd %xmm11, %zmm4 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm4, %zmm18, %zmm25 # zmm25 = (zmm18 * zmm4) + zmm25 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $85, %ymm10, %ymm10, %ymm4 # ymm4 = ymm10[1,1,1,1,5,5,5,5] | |
vshuff64x2 $85, %zmm4, %zmm4, %zmm4 # zmm4 = zmm4[2,3,2,3,2,3,2,3] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm4, %zmm18, %zmm8 # zmm8 = (zmm18 * zmm4) + zmm8 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $255, %ymm10, %ymm10, %ymm4 # ymm4 = ymm10[3,3,3,3,7,7,7,7] | |
vextractf32x4 $3, %zmm10, %xmm10 | |
vshuff64x2 $85, %zmm4, %zmm4, %zmm4 # zmm4 = zmm4[2,3,2,3,2,3,2,3] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm4, %zmm18, %zmm16 # zmm16 = (zmm18 * zmm4) + zmm16 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vbroadcastss %xmm10, %zmm4 | |
vshufps $255, %zmm20, %zmm20, %zmm10 # zmm10 = zmm20[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm4, %zmm18, %zmm24 # zmm24 = (zmm18 * zmm4) + zmm24 | |
vfmadd213ps %zmm5, %zmm2, %zmm18 # zmm18 = (zmm2 * zmm18) + zmm5 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshuff64x2 $170, %zmm0, %zmm0, %zmm5 # zmm5 = zmm0[4,5,4,5,4,5,4,5] | |
vshuff64x2 $170, %zmm10, %zmm10, %zmm4 # zmm4 = zmm10[4,5,4,5,4,5,4,5] | |
vshuff64x2 $255, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[6,7,6,7,6,7,6,7] | |
vshuff64x2 $255, %zmm10, %zmm10, %zmm2 # zmm2 = zmm10[6,7,6,7,6,7,6,7] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm4, %zmm23, %zmm15 # zmm15 = (zmm23 * zmm4) + zmm15 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshuff64x2 $170, %zmm3, %zmm3, %zmm4 # zmm4 = zmm3[4,5,4,5,4,5,4,5] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm0, %zmm23, %zmm22 # zmm22 = (zmm23 * zmm0) + zmm22 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshuff64x2 $255, %zmm3, %zmm3, %zmm0 # zmm0 = zmm3[6,7,6,7,6,7,6,7] | |
vshufps $255, %xmm20, %xmm20, %xmm3 # xmm3 = xmm20[3,3,3,3] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm5, %zmm23, %zmm21 # zmm21 = (zmm23 * zmm5) + zmm21 | |
vfmadd231ps %zmm4, %zmm23, %zmm29 # zmm29 = (zmm23 * zmm4) + zmm29 | |
vfmadd231ps %zmm0, %zmm23, %zmm19 # zmm19 = (zmm23 * zmm0) + zmm19 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vextractf32x4 $3, %zmm20, %xmm0 | |
vextractf32x4 $2, %zmm20, %xmm4 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm2, %zmm23, %zmm18 # zmm18 = (zmm23 * zmm2) + zmm18 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vextractf32x4 $1, %ymm20, %xmm2 | |
vbroadcastss %xmm0, %zmm0 | |
vbroadcastss %xmm4, %zmm4 | |
vmovaps %zmm29, %zmm9 | |
vmovaps %zmm24, %zmm29 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm0, %zmm23, %zmm29 # zmm29 = (zmm23 * zmm0) + zmm29 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $255, %ymm20, %ymm20, %ymm0 # ymm0 = ymm20[3,3,3,3,7,7,7,7] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm4, %zmm23, %zmm12 # zmm12 = (zmm23 * zmm4) + zmm12 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $170, %xmm20, %xmm20, %xmm4 # xmm4 = xmm20[2,2,2,2] | |
vshuff64x2 $85, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[2,3,2,3,2,3,2,3] | |
vbroadcastsd %xmm4, %zmm4 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm0, %zmm23, %zmm16 # zmm16 = (zmm23 * zmm0) + zmm16 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $85, %ymm20, %ymm20, %ymm0 # ymm0 = ymm20[1,1,1,1,5,5,5,5] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm4, %zmm23, %zmm7 # zmm7 = (zmm23 * zmm4) + zmm7 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vmovshdup %xmm20, %xmm4 # xmm4 = xmm20[1,1,3,3] | |
vshuff64x2 $85, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[2,3,2,3,2,3,2,3] | |
vbroadcastsd %xmm4, %zmm4 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm0, %zmm23, %zmm8 # zmm8 = (zmm23 * zmm0) + zmm8 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vbroadcastsd %xmm3, %zmm0 | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm4, %zmm23, %zmm6 # zmm6 = (zmm23 * zmm4) + zmm6 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vbroadcastss %xmm20, %zmm4 | |
vmovups %zmm7, 32(%rsp) # 64-byte Spill | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm0, %zmm23, %zmm25 # zmm25 = (zmm23 * zmm0) + zmm25 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshufps $170, %ymm20, %ymm20, %ymm0 # ymm0 = ymm20[2,2,2,2,6,6,6,6] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm4, %zmm23, %zmm30 # zmm30 = (zmm23 * zmm4) + zmm30 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vshuff64x2 $85, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[2,3,2,3,2,3,2,3] | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm0, %zmm23, %zmm28 # zmm28 = (zmm23 * zmm0) + zmm28 | |
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20 | |
vbroadcastss %xmm2, %zmm0 | |
vmovups %zmm25, 224(%rsp) # 64-byte Spill | |
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35 | |
vfmadd231ps %zmm0, %zmm23, %zmm1 # zmm1 = (zmm23 * zmm0) + zmm1 | |
.loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22 | |
jne .LBB0_3 | |
jmp .LBB0_4 | |
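# Annotation: `jne .LBB0_3` above is the K-loop back-edge and `jmp .LBB0_4`
# falls through to the store epilogue. .LBB0_1 below is the zero-trip path:
# it only zeroes the accumulator registers (the kernel's initial
# `accumulator = tl.zeros(...)` value) before joining .LBB0_4. A plausible
# mapping of the .loc line numbers to the tutorial source (reconstructed from
# the Triton matmul tutorial, not part of this gist):
#   217:  for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
#   221:      a = tl.load(a_ptrs, mask=..., other=0.0)
#   222:      b = tl.load(b_ptrs, mask=..., other=0.0)
#   224:      accumulator += tl.dot(a, b)
#   236:  c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
#   238:  c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
#   239:  tl.store(c_ptrs, c, mask=c_mask)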
.LBB0_1: | |
.loc 1 0 22 is_stmt 0 # 03-matrix-multiplication-cpu.py:0:22 | |
vpxor %xmm0, %xmm0, %xmm0 | |
vxorpd %xmm30, %xmm30, %xmm30 | |
vxorps %xmm6, %xmm6, %xmm6 | |
vpxor %xmm1, %xmm1, %xmm1 | |
vxorps %xmm8, %xmm8, %xmm8 | |
vpxord %xmm28, %xmm28, %xmm28 | |
vxorps %xmm16, %xmm16, %xmm16 | |
vxorps %xmm12, %xmm12, %xmm12 | |
vpxor %xmm9, %xmm9, %xmm9 | |
vpxord %xmm21, %xmm21, %xmm21 | |
vpxor %xmm15, %xmm15, %xmm15 | |
vpxord %xmm29, %xmm29, %xmm29 | |
vxorps %xmm19, %xmm19, %xmm19 | |
vpxord %xmm22, %xmm22, %xmm22 | |
vxorps %xmm18, %xmm18, %xmm18 | |
vmovdqu64 %zmm0, 32(%rsp) # 64-byte Spill | |
vmovdqu64 %zmm0, 224(%rsp) # 64-byte Spill | |
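# Annotation: .LBB0_4 is the store epilogue. The 16x16 accumulator tile lives
# in sixteen zmm registers in a shuffled layout, so the code below first
# rebuilds contiguous C rows (vpunpck{l,h}dq, vunpck{l,h}ps, vpermt2ps, and
# vshufpd under merge masking), then computes per-row addresses and the store
# mask, and finally issues one masked vmovups per row.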
.LBB0_4: # %._crit_edge | |
.loc 1 239 21 is_stmt 1 # 03-matrix-multiplication-cpu.py:239:21 | |
vbroadcasti32x4 .LCPI0_21(%rip), %zmm5 # zmm5 = [0,8,0,8,0,8,0,8] | |
# zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] | |
vmovdqa64 %zmm9, %zmm4 | |
vpunpckldq %xmm4, %xmm12, %xmm24 # xmm24 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] | |
vmovdqa64 %zmm21, %zmm9 | |
vmovdqa64 %zmm15, %zmm20 | |
vpunpckldq %xmm20, %xmm9, %xmm0 # xmm0 = xmm9[0],xmm20[0],xmm9[1],xmm20[1] | |
vmovdqa64 %zmm29, %zmm26 | |
vpunpckldq %xmm19, %xmm26, %xmm13 # xmm13 = xmm26[0],xmm19[0],xmm26[1],xmm19[1] | |
vinsertps $76, %xmm12, %xmm4, %xmm14 # xmm14 = xmm12[1],xmm4[1],zero,zero | |
vinsertps $76, %xmm26, %xmm19, %xmm2 # xmm2 = xmm26[1],xmm19[1],zero,zero | |
vmovdqa64 %zmm21, %zmm3 | |
movb $-64, %al | |
vmovaps %zmm12, %zmm23 | |
vmovsd .LCPI0_195(%rip), %xmm17 # xmm17 = [3,7,0,0] | |
vinsertps $76, %xmm30, %xmm6, %xmm11 # xmm11 = xmm30[1],xmm6[1],zero,zero | |
vunpckhps %xmm6, %xmm30, %xmm27 # xmm27 = xmm30[2],xmm6[2],xmm30[3],xmm6[3] | |
vinsertps $76, %xmm1, %xmm8, %xmm7 # xmm7 = xmm1[1],xmm8[1],zero,zero | |
vmovdqa64 %zmm22, %zmm25 | |
movq 480(%rsp), %r14 # 8-byte Reload | |
movl -108(%rsp), %r12d # 4-byte Reload | |
movq $-1, %r15 | |
movq 496(%rsp), %r13 # 8-byte Reload | |
kmovd %eax, %k1 | |
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33 | |
movl 3520(%rsp), %eax | |
vpbroadcastd %eax, %xmm10 | |
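# Annotation: 3520(%rsp) is an incoming kernel argument read off the stack,
# presumably one of the C strides, broadcast here so that row indices can be
# scaled with vpmulld for the c_ptrs arithmetic (.loc 236:33).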
movl -112(%rsp), %eax # 4-byte Reload | |
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21 | |
vpermt2pd %zmm0, %zmm5, %zmm24 | |
vpunpckldq %xmm18, %xmm22, %xmm0 # xmm0 = xmm22[0],xmm18[0],xmm22[1],xmm18[1] | |
vpermi2ps %xmm8, %xmm1, %xmm17 | |
vpermt2pd %zmm0, %zmm5, %zmm13 | |
vinsertps $76, %xmm9, %xmm20, %xmm0 # xmm0 = xmm9[1],xmm20[1],zero,zero | |
vpermt2pd %zmm0, %zmm5, %zmm14 | |
vinsertps $76, %xmm22, %xmm18, %xmm0 # xmm0 = xmm22[1],xmm18[1],zero,zero | |
vpermt2pd %zmm0, %zmm5, %zmm2 | |
vunpckhps %xmm20, %xmm9, %xmm0 # xmm0 = xmm9[2],xmm20[2],xmm9[3],xmm20[3] | |
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33 | |
vpbroadcastd %eax, %zmm31 | |
vmovupd %zmm2, 96(%rsp) # 64-byte Spill | |
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21 | |
vunpckhps %xmm4, %xmm12, %xmm2 # xmm2 = xmm12[2],xmm4[2],xmm12[3],xmm4[3] | |
vpermt2pd %zmm0, %zmm5, %zmm2 | |
vunpckhps %xmm18, %xmm22, %xmm0 # xmm0 = xmm22[2],xmm18[2],xmm22[3],xmm18[3] | |
vmovupd %zmm2, 160(%rsp) # 64-byte Spill | |
vunpckhps %xmm19, %xmm26, %xmm2 # xmm2 = xmm26[2],xmm19[2],xmm26[3],xmm19[3] | |
vpermt2pd %zmm0, %zmm5, %zmm2 | |
vbroadcastsd .LCPI0_32(%rip), %zmm0 # zmm0 = [3,19,3,19,3,19,3,19,3,19,3,19,3,19,3,19] | |
vshufps $51, %xmm30, %xmm6, %xmm5 # xmm5 = xmm6[3,0],xmm30[3,0] | |
vmovaps %xmm5, 2096(%rsp) # 16-byte Spill | |
vunpcklps %ymm6, %ymm30, %ymm5 # ymm5 = ymm30[0],ymm6[0],ymm30[1],ymm6[1],ymm30[4],ymm6[4],ymm30[5],ymm6[5] | |
vmovups %ymm5, 2224(%rsp) # 32-byte Spill | |
vunpckhps %ymm6, %ymm30, %ymm5 # ymm5 = ymm30[2],ymm6[2],ymm30[3],ymm6[3],ymm30[6],ymm6[6],ymm30[7],ymm6[7] | |
vmovups %ymm5, 1456(%rsp) # 32-byte Spill | |
vunpcklps %zmm6, %zmm30, %zmm5 # zmm5 = zmm30[0],zmm6[0],zmm30[1],zmm6[1],zmm30[4],zmm6[4],zmm30[5],zmm6[5],zmm30[8],zmm6[8],zmm30[9],zmm6[9],zmm30[12],zmm6[12],zmm30[13],zmm6[13] | |
vmovups %zmm5, 880(%rsp) # 64-byte Spill | |
vunpckhps %zmm6, %zmm30, %zmm5 # zmm5 = zmm30[2],zmm6[2],zmm30[3],zmm6[3],zmm30[6],zmm6[6],zmm30[7],zmm6[7],zmm30[10],zmm6[10],zmm30[11],zmm6[11],zmm30[14],zmm6[14],zmm30[15],zmm6[15] | |
vmovupd %zmm2, 288(%rsp) # 64-byte Spill | |
vmovaps %zmm12, %zmm2 | |
vmovups %zmm5, 1200(%rsp) # 64-byte Spill | |
vbroadcastsd .LCPI0_30(%rip), %ymm5 # ymm5 = [5,13,5,13,5,13,5,13] | |
vpermt2ps %zmm4, %zmm0, %zmm2 | |
vpermt2ps %zmm15, %zmm0, %zmm3 | |
vshufpd $32, %zmm3, %zmm2, %zmm3 # zmm3 = zmm2[0],zmm3[0],zmm2[2],zmm3[2],zmm2[4],zmm3[5],zmm2[6],zmm3[6] | |
vmovdqa64 %zmm29, %zmm2 | |
vpermt2ps %zmm19, %zmm0, %zmm2 | |
vpermi2ps %zmm18, %zmm22, %zmm0 | |
vpermi2ps %ymm8, %ymm1, %ymm5 | |
vmovups %ymm5, 2160(%rsp) # 32-byte Spill | |
vbroadcastsd .LCPI0_13(%rip), %ymm5 # ymm5 = [7,15,7,15,7,15,7,15] | |
vshufpd $128, %zmm0, %zmm2, %zmm3 {%k1} # zmm3 {%k1} = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[7] | |
vbroadcasti32x4 .LCPI0_25(%rip), %zmm0 # zmm0 = [4,5,4,20,4,5,4,20,4,5,4,20,4,5,4,20] | |
# zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] | |
vunpcklps %ymm4, %ymm12, %ymm2 # ymm2 = ymm12[0],ymm4[0],ymm12[1],ymm4[1],ymm12[4],ymm4[4],ymm12[5],ymm4[5] | |
vmovupd %zmm3, 416(%rsp) # 64-byte Spill | |
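# Annotation (speculative): the recurring row-stitch pattern here combines a
# vshufpd $32 that interleaves qword lanes of two partial rows with a
# vshufpd $128 executed under merge mask %k1 (0xC0 from `movb $-64, %al`,
# i.e. qword lanes 6-7), assembling each finished 16-float C row from
# fragments spread across the accumulator registers; completed rows are
# spilled to the stack until the stores.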
vpermi2ps %ymm8, %ymm1, %ymm5 | |
vpermi2ps %zmm15, %zmm21, %zmm0 | |
vmovups %ymm5, 1520(%rsp) # 32-byte Spill | |
vunpcklps %xmm8, %xmm1, %xmm5 # xmm5 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] | |
vshuff64x2 $85, %zmm2, %zmm2, %zmm2 # zmm2 = zmm2[2,3,2,3,2,3,2,3] | |
vshufpd $32, %zmm0, %zmm2, %zmm3 # zmm3 = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[5],zmm2[6],zmm0[6] | |
vbroadcastf128 .LCPI0_196(%rip), %ymm0 # ymm0 = [0,1,4,12,0,1,4,12] | |
# ymm0 = mem[0,1,0,1] | |
vunpcklps %ymm19, %ymm26, %ymm2 # ymm2 = ymm26[0],ymm19[0],ymm26[1],ymm19[1],ymm26[4],ymm19[4],ymm26[5],ymm19[5] | |
vpermi2ps %ymm18, %ymm22, %ymm0 | |
vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 | |
vinsertf64x4 $1, %ymm2, %zmm0, %zmm2 | |
vshufpd $128, %zmm0, %zmm2, %zmm3 {%k1} # zmm3 {%k1} = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[7] | |
vmovaps .LCPI0_188(%rip), %ymm0 # ymm0 = [1,9,u,u,5,13,u,u] | |
vbroadcasti32x4 .LCPI0_28(%rip), %zmm2 # zmm2 = [5,21,6,7,5,21,6,7,5,21,6,7,5,21,6,7] | |
# zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] | |
vmovupd %zmm3, 560(%rsp) # 64-byte Spill | |
vunpcklps %ymm20, %ymm9, %ymm3 # ymm3 = ymm9[0],ymm20[0],ymm9[1],ymm20[1],ymm9[4],ymm20[4],ymm9[5],ymm20[5] | |
vpermi2ps %ymm19, %ymm26, %ymm0 | |
vpermi2ps %zmm4, %zmm12, %zmm2 | |
vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 | |
vshuff64x2 $85, %zmm3, %zmm3, %zmm3 # zmm3 = zmm3[2,3,2,3,2,3,2,3] | |
vshufpd $32, %zmm3, %zmm2, %zmm3 # zmm3 = zmm2[0],zmm3[0],zmm2[2],zmm3[2],zmm2[4],zmm3[5],zmm2[6],zmm3[6] | |
vunpcklps %ymm18, %ymm22, %ymm2 # ymm2 = ymm22[0],ymm18[0],ymm22[1],ymm18[1],ymm22[4],ymm18[4],ymm22[5],ymm18[5] | |
vinsertf64x4 $1, %ymm2, %zmm0, %zmm2 | |
vshufpd $128, %zmm2, %zmm0, %zmm3 {%k1} # zmm3 {%k1} = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[7] | |
vbroadcasti32x4 .LCPI0_23(%rip), %zmm0 # zmm0 = [4,5,6,22,4,5,6,22,4,5,6,22,4,5,6,22] | |
# zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] | |
vunpckhps %ymm4, %ymm12, %ymm2 # ymm2 = ymm12[2],ymm4[2],ymm12[3],ymm4[3],ymm12[6],ymm4[6],ymm12[7],ymm4[7] | |
vmovupd %zmm3, 688(%rsp) # 64-byte Spill | |
vpermi2ps %zmm15, %zmm21, %zmm0 | |
vmovaps .LCPI0_19(%rip), %zmm15 # zmm15 = [0,1,0,16,4,5,4,20,8,9,8,24,12,13,12,28] | |
vshuff64x2 $85, %zmm2, %zmm2, %zmm2 # zmm2 = zmm2[2,3,2,3,2,3,2,3] | |
vshufpd $32, %zmm0, %zmm2, %zmm3 # zmm3 = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[5],zmm2[6],zmm0[6] | |
vbroadcastf128 .LCPI0_197(%rip), %ymm0 # ymm0 = [0,1,6,14,0,1,6,14] | |
# ymm0 = mem[0,1,0,1] | |
vunpckhps %ymm19, %ymm26, %ymm2 # ymm2 = ymm26[2],ymm19[2],ymm26[3],ymm19[3],ymm26[6],ymm19[6],ymm26[7],ymm19[7] | |
vpermi2ps %ymm18, %ymm22, %ymm0 | |
vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 | |
vinsertf64x4 $1, %ymm2, %zmm0, %zmm2 | |
vshufpd $128, %zmm0, %zmm2, %zmm3 {%k1} # zmm3 {%k1} = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[7] | |
vmovaps .LCPI0_191(%rip), %ymm0 # ymm0 = [3,11,u,u,7,15,u,u] | |
vbroadcasti32x4 .LCPI0_11(%rip), %zmm2 # zmm2 = [7,23,6,7,7,23,6,7,7,23,6,7,7,23,6,7] | |
# zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] | |
vmovupd %zmm3, 624(%rsp) # 64-byte Spill | |
vunpckhps %ymm20, %ymm9, %ymm3 # ymm3 = ymm9[2],ymm20[2],ymm9[3],ymm20[3],ymm9[6],ymm20[6],ymm9[7],ymm20[7] | |
vpermi2ps %ymm19, %ymm26, %ymm0 | |
vpermi2ps %zmm4, %zmm12, %zmm2 | |
vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 | |
vshuff64x2 $85, %zmm3, %zmm3, %zmm3 # zmm3 = zmm3[2,3,2,3,2,3,2,3] | |
vshufpd $32, %zmm3, %zmm2, %zmm3 # zmm3 = zmm2[0],zmm3[0],zmm2[2],zmm3[2],zmm2[4],zmm3[5],zmm2[6],zmm3[6] | |
vunpckhps %ymm18, %ymm22, %ymm2 # ymm2 = ymm22[2],ymm18[2],ymm22[3],ymm18[3],ymm22[6],ymm18[6],ymm22[7],ymm18[7] | |
vinsertf64x4 $1, %ymm2, %zmm0, %zmm2 | |
vshufpd $128, %zmm2, %zmm0, %zmm3 {%k1} # zmm3 {%k1} = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[7] | |
vmovaps .LCPI0_18(%rip), %zmm0 # zmm0 = [1,17,2,3,5,21,6,7,9,25,10,11,13,29,14,15] | |
vmovdqa64 %zmm29, %zmm2 | |
vmovupd %zmm3, 1072(%rsp) # 64-byte Spill | |
vunpcklps %zmm4, %zmm12, %zmm3 # zmm3 = zmm12[0],zmm4[0],zmm12[1],zmm4[1],zmm12[4],zmm4[4],zmm12[5],zmm4[5],zmm12[8],zmm4[8],zmm12[9],zmm4[9],zmm12[12],zmm4[12],zmm12[13],zmm4[13] | |
vpermt2ps %zmm19, %zmm0, %zmm2 | |
vmovups %zmm2, 352(%rsp) # 64-byte Spill | |
vmovaps %zmm0, %zmm2 | |
vpunpckldq %zmm19, %zmm29, %zmm0 # zmm0 = zmm29[0],zmm19[0],zmm29[1],zmm19[1],zmm29[4],zmm19[4],zmm29[5],zmm19[5],zmm29[8],zmm19[8],zmm29[9],zmm19[9],zmm29[12],zmm19[12],zmm29[13],zmm19[13] | |
vpermt2ps %zmm4, %zmm2, %zmm23 | |
vmovdqu64 %zmm0, 1264(%rsp) # 64-byte Spill | |
vpunpckhdq %zmm19, %zmm29, %zmm0 # zmm0 = zmm29[2],zmm19[2],zmm29[3],zmm19[3],zmm29[6],zmm19[6],zmm29[7],zmm19[7],zmm29[10],zmm19[10],zmm29[11],zmm19[11],zmm29[14],zmm19[14],zmm29[15],zmm19[15] | |
vunpckhps %zmm4, %zmm12, %zmm29 # zmm29 = zmm12[2],zmm4[2],zmm12[3],zmm4[3],zmm12[6],zmm4[6],zmm12[7],zmm4[7],zmm12[10],zmm4[10],zmm12[11],zmm4[11],zmm12[14],zmm4[14],zmm12[15],zmm4[15] | |
vmovdqu64 %zmm0, 1328(%rsp) # 64-byte Spill | |
vmovaps .LCPI0_15(%rip), %zmm0 # zmm0 = [3,19,2,3,7,23,6,7,11,27,10,11,15,31,14,15] | |
vpermt2ps %zmm4, %zmm0, %zmm12 | |
vbroadcasti128 .LCPI0_29(%rip), %ymm4 # ymm4 = [5,13,6,7,5,13,6,7] | |
# ymm4 = mem[0,1,0,1] | |
vpermt2ps %zmm19, %zmm0, %zmm26 | |
vpermi2ps %ymm6, %ymm30, %ymm4 | |
vmovups %ymm4, 2288(%rsp) # 32-byte Spill | |
vbroadcasti128 .LCPI0_12(%rip), %ymm4 # ymm4 = [7,15,6,7,7,15,6,7] | |
# ymm4 = mem[0,1,0,1] | |
vpermi2ps %ymm6, %ymm30, %ymm4 | |
vmovups %ymm4, 1712(%rsp) # 32-byte Spill | |
vmovaps %zmm30, %zmm4 | |
vpermt2ps %zmm6, %zmm2, %zmm4 | |
vpermi2ps %zmm8, %zmm1, %zmm2 | |
vmovups %zmm2, 1904(%rsp) # 64-byte Spill | |
vunpckhps %xmm8, %xmm1, %xmm2 # xmm2 = xmm1[2],xmm8[2],xmm1[3],xmm8[3] | |
vmovups %zmm4, 1968(%rsp) # 64-byte Spill | |
vunpcklps %xmm6, %xmm30, %xmm4 # xmm4 = xmm30[0],xmm6[0],xmm30[1],xmm6[1] | |
vpermt2ps %zmm6, %zmm0, %zmm30 | |
vmovaps %zmm9, %zmm6 | |
vmovaps %xmm2, 2032(%rsp) # 16-byte Spill | |
vunpcklps %ymm8, %ymm1, %ymm2 # ymm2 = ymm1[0],ymm8[0],ymm1[1],ymm8[1],ymm1[4],ymm8[4],ymm1[5],ymm8[5] | |
vmovups %ymm2, 784(%rsp) # 32-byte Spill | |
vunpckhps %ymm8, %ymm1, %ymm2 # ymm2 = ymm1[2],ymm8[2],ymm1[3],ymm8[3],ymm1[6],ymm8[6],ymm1[7],ymm8[7] | |
vmovups %ymm2, 752(%rsp) # 32-byte Spill | |
vunpcklps %zmm8, %zmm1, %zmm2 # zmm2 = zmm1[0],zmm8[0],zmm1[1],zmm8[1],zmm1[4],zmm8[4],zmm1[5],zmm8[5],zmm1[8],zmm8[8],zmm1[9],zmm8[9],zmm1[12],zmm8[12],zmm1[13],zmm8[13] | |
vmovups %zmm2, 1840(%rsp) # 64-byte Spill | |
vunpckhps %zmm8, %zmm1, %zmm2 # zmm2 = zmm1[2],zmm8[2],zmm1[3],zmm8[3],zmm1[6],zmm8[6],zmm1[7],zmm8[7],zmm1[10],zmm8[10],zmm1[11],zmm8[11],zmm1[14],zmm8[14],zmm1[15],zmm8[15] | |
vpermt2ps %zmm8, %zmm0, %zmm1 | |
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33 | |
vpmulld %xmm10, %xmm31, %xmm0 | |
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21 | |
vunpcklps %zmm20, %zmm9, %zmm8 # zmm8 = zmm9[0],zmm20[0],zmm9[1],zmm20[1],zmm9[4],zmm20[4],zmm9[5],zmm20[5],zmm9[8],zmm20[8],zmm9[9],zmm20[9],zmm9[12],zmm20[12],zmm9[13],zmm20[13] | |
vmovups %zmm2, 1136(%rsp) # 64-byte Spill | |
vmovaps %zmm21, %zmm2 | |
vpermt2ps %zmm20, %zmm15, %zmm2 | |
vmovaps .LCPI0_17(%rip), %zmm21 # zmm21 = [0,1,2,18,4,5,6,22,8,9,10,26,12,13,14,30] | |
vmovups %zmm30, 1008(%rsp) # 64-byte Spill | |
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33 | |
vpextrd $3, %xmm0, %eax | |
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21 | |
vunpckhps %zmm20, %zmm9, %zmm0 # zmm0 = zmm9[2],zmm20[2],zmm9[3],zmm20[3],zmm9[6],zmm20[6],zmm9[7],zmm20[7],zmm9[10],zmm20[10],zmm9[11],zmm20[11],zmm9[14],zmm20[14],zmm9[15],zmm20[15] | |
vmovaps %zmm3, %zmm9 | |
.loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21 | |
cltq | |
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21 | |
vshufpd $32, %zmm2, %zmm3, %zmm30 # zmm30 = zmm3[0],zmm2[0],zmm3[2],zmm2[2],zmm3[4],zmm2[5],zmm3[6],zmm2[6] | |
vmovapd .LCPI0_16(%rip), %zmm3 # zmm3 = [2,10,2,10,6,15,6,14] | |
vpermt2ps %zmm20, %zmm21, %zmm6 | |
vpermt2pd %zmm2, %zmm3, %zmm9 | |
vshufpd $32, %zmm8, %zmm23, %zmm2 # zmm2 = zmm23[0],zmm8[0],zmm23[2],zmm8[2],zmm23[4],zmm8[5],zmm23[6],zmm8[6] | |
vpermt2pd %zmm8, %zmm3, %zmm23 | |
vmovupd %zmm2, 1584(%rsp) # 64-byte Spill | |
vshufpd $32, %zmm6, %zmm29, %zmm2 # zmm2 = zmm29[0],zmm6[0],zmm29[2],zmm6[2],zmm29[4],zmm6[5],zmm29[6],zmm6[6] | |
vpermt2pd %zmm6, %zmm3, %zmm29 | |
vmovupd %zmm2, 1648(%rsp) # 64-byte Spill | |
vshufpd $32, %zmm0, %zmm12, %zmm2 # zmm2 = zmm12[0],zmm0[0],zmm12[2],zmm0[2],zmm12[4],zmm0[5],zmm12[6],zmm0[6] | |
vpermt2pd %zmm0, %zmm3, %zmm12 | |
vmovdqu64 3248(%rsp), %zmm3 # 64-byte Reload | |
vmovupd %zmm2, 1776(%rsp) # 64-byte Spill | |
vmovapd %zmm24, %zmm2 | |
vmovups 224(%rsp), %zmm24 # 64-byte Reload | |
vmovapd %zmm13, %zmm2 {%k1} | |
vmovupd %zmm9, 2352(%rsp) # 64-byte Spill | |
vmovupd %zmm29, 816(%rsp) # 64-byte Spill | |
vmovupd %zmm12, 944(%rsp) # 64-byte Spill | |
vmovups 32(%rsp), %zmm12 # 64-byte Reload | |
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33 | |
vpshufd $85, %xmm3, %xmm0 # xmm0 = xmm3[1,1,1,1] | |
vextracti128 $1, %ymm3, %xmm6 | |
vpshufd $85, %zmm3, %zmm20 # zmm20 = zmm3[1,1,1,1,5,5,5,5,9,9,9,9,13,13,13,13] | |
vpbroadcastq %xmm0, %zmm9 | |
vpbroadcastd %xmm6, %zmm22 | |
vpmulld %xmm10, %xmm6, %xmm6 | |
vpmulld %xmm10, %xmm9, %xmm0 | |
vpextrd $3, %xmm6, %edi | |
vpextrd $3, %xmm0, %ebp | |
vpshufd $250, %xmm3, %xmm0 # xmm0 = xmm3[2,2,3,3] | |
.loc 1 236 21 is_stmt 0 # 03-matrix-multiplication-cpu.py:236:21 | |
movslq %edi, %rdi | |
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33 | |
vpbroadcastq %xmm0, %zmm8 | |
.loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21 | |
leaq (%r13,%rdi,4), %rdi | |
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33 | |
vpmulld %xmm10, %xmm8, %xmm0 | |
vpextrd $3, %xmm0, %ebx | |
vpshufd $255, %xmm3, %xmm0 # xmm0 = xmm3[3,3,3,3] | |
vpbroadcastq %xmm0, %zmm29 | |
.loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21 | |
movslq %ebx, %rbx | |
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33 | |
vpmulld %xmm10, %xmm29, %xmm0 | |
.loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21 | |
leaq (%r13,%rbx,4), %rbx | |
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33 | |
vpextrd $3, %xmm0, %r11d | |
vpmulld %xmm10, %xmm22, %xmm0 | |
.loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21 | |
movslq %r11d, %r11 | |
leaq (%r13,%r11,4), %r11 | |
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33 | |
vpextrd $3, %xmm0, %r8d | |
.loc 1 239 21 is_stmt 1 # 03-matrix-multiplication-cpu.py:239:21 | |
vmovddup .LCPI0_194(%rip), %xmm0 # xmm0 = [4,0,4,0] | |
# xmm0 = mem[0,0] | |
.loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21 | |
movslq %r8d, %r8 | |
leaq (%r13,%r8,4), %r8 | |
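# Annotation: the scalar address recipe repeated above (vpmulld scales an
# index vector by the broadcast stride, vpextrd $3 extracts one scaled
# index, movslq sign-extends it, and leaq (%r13,%reg,4) applies the 4-byte
# f32 scale against the C base pointer in %r13) produces one row pointer per
# upcoming masked store.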
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21 | |
vpermi2ps %xmm28, %xmm16, %xmm0 | |
vinsertf128 $1, %xmm0, %ymm0, %ymm0 | |
vinsertf128 $1, %xmm5, %ymm0, %ymm5 | |
vblendps $192, %ymm0, %ymm5, %ymm0 # ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
vmovlhps %xmm12, %xmm24, %xmm5 # xmm5 = xmm24[0],xmm12[0]
vshufps $36, %xmm5, %xmm4, %xmm5 # xmm5 = xmm4[0,1],xmm5[2,0]
vblendps $15, %ymm5, %ymm0, %ymm0 # ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
vinsertf64x4 $0, %ymm0, %zmm2, %zmm13
vunpcklps %xmm24, %xmm12, %xmm0 # xmm0 = xmm12[0],xmm24[0],xmm12[1],xmm24[1]
vinsertf128 $1, %xmm7, %ymm0, %ymm5
vunpcklps %xmm16, %xmm28, %xmm7 # xmm7 = xmm28[0],xmm16[0],xmm28[1],xmm16[1]
vblendps $3, %xmm11, %xmm0, %xmm2 # xmm2 = xmm11[0,1],xmm0[2,3]
vinsertf128 $1, %xmm7, %ymm0, %ymm7
vblendps $192, %ymm7, %ymm5, %ymm0 # ymm0 = ymm5[0,1,2,3,4,5],ymm7[6,7]
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33
vpshufd $85, %ymm3, %ymm5 # ymm5 = ymm3[1,1,1,1,5,5,5,5]
vextracti128 $1, %ymm5, %xmm7
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21
vblendps $15, %ymm2, %ymm0, %ymm0 # ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
vinsertps $179, %xmm24, %xmm12, %xmm2 # xmm2 = zero,zero,xmm12[2],xmm24[2]
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33
vpmulld %xmm10, %xmm7, %xmm7
vpextrd $3, %xmm7, %r10d
vpshufd $170, %ymm3, %ymm7 # ymm7 = ymm3[2,2,2,2,6,6,6,6]
vextracti128 $1, %ymm7, %xmm11
vpmulld %xmm10, %xmm11, %xmm11
vpextrd $3, %xmm11, %r9d
vmovdqa 1392(%rsp), %xmm11 # 16-byte Reload
.loc 1 236 21 is_stmt 0 # 03-matrix-multiplication-cpu.py:236:21
movslq %r9d, %r9
leaq (%r13,%r9,4), %r9
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33
vpbroadcastd %xmm11, %zmm19
vpmulld %xmm10, %xmm19, %xmm6
vpextrd $3, %xmm6, %esi
vextracti32x4 $2, %zmm20, %xmm6
vpmulld %xmm10, %xmm6, %xmm6
.loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21
movslq %esi, %rsi
leaq (%r13,%rsi,4), %rsi
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33
vpextrd $3, %xmm6, %edx
vpshufd $170, %zmm3, %zmm6 # zmm6 = zmm3[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14]
vextracti32x4 $2, %zmm6, %xmm4
.loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21
movslq %edx, %rdx
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33
vpmulld %xmm10, %xmm4, %xmm4
.loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21
leaq (%r13,%rdx,4), %rdx
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33
vpextrd $3, %xmm4, %ecx
vmovupd 96(%rsp), %zmm4 # 64-byte Reload
.loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21
movslq %ecx, %rcx
leaq (%r13,%rcx,4), %rcx
.loc 1 239 21 is_stmt 1 # 03-matrix-multiplication-cpu.py:239:21
vmovapd %zmm4, %zmm14 {%k1}
vmovaps %xmm27, %xmm4
vinsertf64x4 $0, %ymm0, %zmm14, %zmm0
vblendps $3, %xmm4, %xmm2, %xmm2 # xmm2 = xmm4[0,1],xmm2[2,3]
vinsertf128 $1, 2032(%rsp), %ymm0, %ymm4 # 16-byte Folded Reload
vinsertps $179, %xmm16, %xmm28, %xmm27 # xmm27 = zero,zero,xmm28[2],xmm16[2]
vinsertf32x4 $1, %xmm27, %ymm0, %ymm14
vmovupd 1840(%rsp), %zmm27 # 64-byte Reload
vblendpd $8, %ymm14, %ymm4, %ymm14 # ymm14 = ymm4[0,1,2],ymm14[3]
.loc 1 238 33 # 03-matrix-multiplication-cpu.py:238:33
vpbroadcastd %r14d, %zmm4
xorl %r14d, %r14d
.loc 1 238 58 is_stmt 0 # 03-matrix-multiplication-cpu.py:238:58
cmpl 488(%rsp), %r12d # 4-byte Folded Reload
.loc 1 238 33 # 03-matrix-multiplication-cpu.py:238:33
vpcmpgtd %zmm9, %zmm4, %k3
vpcmpgtd %zmm31, %zmm4, %k2
vpcmpgtd %zmm29, %zmm4, %k5
vpcmpgtd %zmm8, %zmm4, %k4
vmovupd 288(%rsp), %zmm9 # 64-byte Reload
.loc 1 239 21 is_stmt 1 # 03-matrix-multiplication-cpu.py:239:21
vblendpd $3, %ymm2, %ymm14, %ymm2 # ymm2 = ymm2[0,1],ymm14[2,3]
vmovaps 2096(%rsp), %xmm14 # 16-byte Reload
vmovupd 2352(%rsp), %zmm29 # 64-byte Reload
.loc 1 238 33 # 03-matrix-multiplication-cpu.py:238:33
kunpckwd %k2, %k3, %k2
kunpckwd %k4, %k5, %k3
kunpckdq %k2, %k3, %k2
.loc 1 238 39 is_stmt 0 # 03-matrix-multiplication-cpu.py:238:39
kshiftrq $15, %k2, %k2
cmovgeq %r14, %r15
.loc 1 236 21 is_stmt 1 # 03-matrix-multiplication-cpu.py:236:21
leaq (%r13,%rax,4), %r14
.loc 1 236 52 is_stmt 0 # 03-matrix-multiplication-cpu.py:236:52
movslq %r12d, %rax
.loc 1 238 39 is_stmt 1 # 03-matrix-multiplication-cpu.py:238:39
kmovq %r15, %k0
kandq %k0, %k2, %k2
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21
vpmovm2d %k2, %zmm8
vpbroadcastd %xmm8, %zmm8
kshiftrq $32, %k2, %k4
vpmovd2m %zmm8, %k3
vmovupd 160(%rsp), %zmm8 # 64-byte Reload
vmovups %zmm13, (%r14,%rax,4) {%k3}
.loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21
movslq %ebp, %r14
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21
kshiftrq $16, %k2, %k3
kshiftrq $48, %k2, %k2
.loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21
leaq (%r13,%r14,4), %r14
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21
vmovapd %zmm9, %zmm8 {%k1}
vunpckhps %xmm16, %xmm28, %xmm9 # xmm9 = xmm28[2],xmm16[2],xmm28[3],xmm16[3]
vinsertf64x4 $0, %ymm2, %zmm8, %zmm8
vinsertf32x4 $1, %xmm17, %ymm0, %ymm2
vpmovm2d %k3, %zmm17
vpbroadcastd %xmm17, %zmm17
vinsertf128 $1, %xmm9, %ymm0, %ymm9
vpmovd2m %zmm17, %k3
vmovups 1968(%rsp), %zmm17 # 64-byte Reload
vmovups %zmm0, (%r14,%rax,4) {%k3}
vpmovm2d %k4, %zmm0
vpbroadcastd %xmm0, %zmm0
.loc 1 238 33 # 03-matrix-multiplication-cpu.py:238:33
vpcmpgtd %zmm22, %zmm4, %k3
vmovups 880(%rsp), %zmm22 # 64-byte Reload
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21
vblendpd $8, %ymm9, %ymm2, %ymm2 # ymm2 = ymm2[0,1,2],ymm9[3]
vunpckhps %xmm24, %xmm12, %xmm9 # xmm9 = xmm12[2],xmm24[2],xmm12[3],xmm24[3]
vpmovd2m %zmm0, %k4
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33
vshufi64x2 $85, %zmm5, %zmm5, %zmm0 # zmm0 = zmm5[2,3,2,3,2,3,2,3]
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21
vunpckhps %ymm24, %ymm12, %ymm5 # ymm5 = ymm12[2],ymm24[2],ymm12[3],ymm24[3],ymm12[6],ymm24[6],ymm12[7],ymm24[7]
vshufps $226, %xmm9, %xmm14, %xmm9 # xmm9 = xmm14[2,0],xmm9[2,3]
vmovups 784(%rsp), %ymm14 # 32-byte Reload
vmovups %zmm8, (%rbx,%rax,4) {%k4}
vpermpd $170, %ymm5, %ymm5 # ymm5 = ymm5[2,2,2,2]
vblendpd $3, %ymm9, %ymm2, %ymm2 # ymm2 = ymm9[0,1],ymm2[2,3]
vmovupd 416(%rsp), %zmm9 # 64-byte Reload
.loc 1 238 33 # 03-matrix-multiplication-cpu.py:238:33
vpcmpgtd %zmm0, %zmm4, %k4
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21
vpmovm2d %k2, %zmm0
vpbroadcastd %xmm0, %zmm0
vpmovd2m %zmm0, %k2
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33
vshufi64x2 $85, %zmm7, %zmm7, %zmm0 # zmm0 = zmm7[2,3,2,3,2,3,2,3]
.loc 1 238 33 # 03-matrix-multiplication-cpu.py:238:33
kunpckwd %k3, %k4, %k3
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21
vmovaps %zmm12, %zmm7
vpermt2ps %zmm24, %zmm15, %zmm7
vinsertf64x4 $0, %ymm2, %zmm9, %zmm13
vmovups 2224(%rsp), %ymm2 # 32-byte Reload
vunpcklps %ymm24, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm24[0],ymm12[1],ymm24[1],ymm12[4],ymm24[4],ymm12[5],ymm24[5]
vpermpd $170, %ymm9, %ymm9 # ymm9 = ymm9[2,2,2,2]
vmovups %zmm13, (%r11,%rax,4) {%k2}
.loc 1 238 33 # 03-matrix-multiplication-cpu.py:238:33
vpcmpgtd %zmm0, %zmm4, %k2
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33
vpshufd $255, %ymm3, %ymm0 # ymm0 = ymm3[3,3,3,3,7,7,7,7]
vshufi64x2 $85, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[2,3,2,3,2,3,2,3]
.loc 1 238 33 # 03-matrix-multiplication-cpu.py:238:33
vpcmpgtd %zmm0, %zmm4, %k5
kunpckwd %k2, %k5, %k2
kunpckdq %k3, %k2, %k2
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21
vextractf128 $1, %ymm2, %xmm2
.loc 1 238 39 # 03-matrix-multiplication-cpu.py:238:39
kshiftrq $15, %k2, %k2
kandq %k0, %k2, %k5
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21
vblendps $3, %xmm2, %xmm9, %xmm2 # xmm2 = xmm2[0,1],xmm9[2,3]
vunpcklpd %ymm28, %ymm16, %ymm9 # ymm9 = ymm16[0],ymm28[0],ymm16[2],ymm28[2]
vpmovm2d %k5, %zmm0
vpbroadcastd %xmm0, %zmm0
kshiftrq $32, %k5, %k6
vshufps $36, %ymm9, %ymm14, %ymm9 # ymm9 = ymm14[0,1],ymm9[2,0],ymm14[4,5],ymm9[6,4]
vpmovd2m %zmm0, %k2
vblendps $15, %ymm2, %ymm9, %ymm2 # ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
vmovups 560(%rsp), %zmm9 # 64-byte Reload
vinsertf64x4 $0, %ymm2, %zmm9, %zmm14
vmovddup .LCPI0_30(%rip), %xmm9 # xmm9 = [5,13,5,13]
# xmm9 = mem[0,0]
vmovups 2288(%rsp), %ymm2 # 32-byte Reload
vmovups %zmm14, (%r8,%rax,4) {%k2}
.loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21
movslq %r10d, %r8
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21
kshiftrq $16, %k5, %k2
kshiftrq $48, %k5, %k5
vmovups 1200(%rsp), %zmm14 # 64-byte Reload
.loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21
leaq (%r13,%r8,4), %r8
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21
vpermi2ps %ymm24, %ymm12, %ymm9
vextractf128 $1, %ymm2, %xmm2
vblendps $3, %xmm2, %xmm9, %xmm2 # xmm2 = xmm2[0,1],xmm9[2,3]
vunpcklps %ymm16, %ymm28, %ymm9 # ymm9 = ymm28[0],ymm16[0],ymm28[1],ymm16[1],ymm28[4],ymm16[4],ymm28[5],ymm16[5]
vblendps $63, 2160(%rsp), %ymm9, %ymm9 # 32-byte Folded Reload
# ymm9 = mem[0,1,2,3,4,5],ymm9[6,7]
vblendps $15, %ymm2, %ymm9, %ymm0 # ymm0 = ymm2[0,1,2,3],ymm9[4,5,6,7]
vmovups 688(%rsp), %zmm2 # 64-byte Reload
vunpcklps %zmm16, %zmm28, %zmm9 # zmm9 = zmm28[0],zmm16[0],zmm28[1],zmm16[1],zmm28[4],zmm16[4],zmm28[5],zmm16[5],zmm28[8],zmm16[8],zmm28[9],zmm16[9],zmm28[12],zmm16[12],zmm28[13],zmm16[13]
vinsertf64x4 $0, %ymm0, %zmm2, %zmm0
vpmovm2d %k2, %zmm2
vpbroadcastd %xmm2, %zmm2
vpmovd2m %zmm2, %k2
vmovups %zmm0, (%r8,%rax,4) {%k2}
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33
vpmulld %xmm10, %xmm11, %xmm0
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21
vmovapd .LCPI0_20(%rip), %zmm11 # zmm11 = [0,8,0,8,4,12,4,13]
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33
vpextrd $3, %xmm0, %r8d
vshufi64x2 $255, %zmm20, %zmm20, %zmm0 # zmm0 = zmm20[6,7,6,7,6,7,6,7]
.loc 1 236 21 is_stmt 0 # 03-matrix-multiplication-cpu.py:236:21
movslq %r8d, %r8
.loc 1 238 33 is_stmt 1 # 03-matrix-multiplication-cpu.py:238:33
vpcmpgtd %zmm0, %zmm4, %k2
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33
vshufi64x2 $255, %zmm6, %zmm6, %zmm0 # zmm0 = zmm6[6,7,6,7,6,7,6,7]
.loc 1 236 21 is_stmt 0 # 03-matrix-multiplication-cpu.py:236:21
leaq (%r13,%r8,4), %r8
.loc 1 238 33 is_stmt 1 # 03-matrix-multiplication-cpu.py:238:33
vpcmpgtd %zmm0, %zmm4, %k3
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33
vpshufd $255, %zmm3, %zmm0 # zmm0 = zmm3[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
vmovups 752(%rsp), %ymm3 # 32-byte Reload
vshufi64x2 $255, %zmm0, %zmm0, %zmm2 # zmm2 = zmm0[6,7,6,7,6,7,6,7]
vshufi64x2 $170, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[4,5,4,5,4,5,4,5]
.loc 1 238 33 # 03-matrix-multiplication-cpu.py:238:33
vpcmpgtd %zmm2, %zmm4, %k4
vmovups 1456(%rsp), %ymm2 # 32-byte Reload
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21
vextractf128 $1, %ymm2, %xmm2
vblendps $3, %xmm2, %xmm5, %xmm2 # xmm2 = xmm2[0,1],xmm5[2,3]
vunpckhpd %ymm28, %ymm16, %ymm5 # ymm5 = ymm16[1],ymm28[1],ymm16[3],ymm28[3]
vshufps $36, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0,1],ymm5[2,0],ymm3[4,5],ymm5[6,4]
vmovups 624(%rsp), %zmm3 # 64-byte Reload
vblendps $15, %ymm2, %ymm5, %ymm2 # ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
vpmovm2d %k6, %zmm5
vpbroadcastd %xmm5, %zmm5
vpmovd2m %zmm5, %k6
vmovddup .LCPI0_13(%rip), %xmm5 # xmm5 = [7,15,7,15]
# xmm5 = mem[0,0]
vinsertf64x4 $0, %ymm2, %zmm3, %zmm2
vmovups 1072(%rsp), %zmm3 # 64-byte Reload
vmovups %zmm2, (%r9,%rax,4) {%k6}
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33
vshufi64x2 $170, %zmm20, %zmm20, %zmm2 # zmm2 = zmm20[4,5,4,5,4,5,4,5]
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21
vpermi2ps %ymm24, %ymm12, %ymm5
.loc 1 238 33 # 03-matrix-multiplication-cpu.py:238:33
vpcmpgtd %zmm19, %zmm4, %k6
vmovupd 1904(%rsp), %zmm19 # 64-byte Reload
vpcmpgtd %zmm2, %zmm4, %k7
vmovups 1712(%rsp), %ymm2 # 32-byte Reload
kunpckwd %k6, %k7, %k6
vpcmpgtd %zmm0, %zmm4, %k7
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21
vmovdqa64 %zmm25, %zmm0
vpermt2ps %zmm18, %zmm15, %zmm0
vpermi2ps %zmm16, %zmm28, %zmm15
vextractf128 $1, %ymm2, %xmm2
vblendps $3, %xmm2, %xmm5, %xmm2 # xmm2 = xmm2[0,1],xmm5[2,3]
vunpckhps %ymm16, %ymm28, %ymm5 # ymm5 = ymm28[2],ymm16[2],ymm28[3],ymm16[3],ymm28[6],ymm16[6],ymm28[7],ymm16[7]
vblendps $63, 1520(%rsp), %ymm5, %ymm5 # 32-byte Folded Reload
# ymm5 = mem[0,1,2,3,4,5],ymm5[6,7]
vblendps $15, %ymm2, %ymm5, %ymm2 # ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
vpmovm2d %k5, %zmm5
vpbroadcastd %xmm5, %zmm5
vinsertf64x4 $0, %ymm2, %zmm3, %zmm2
vpmovd2m %zmm5, %k5
vmovupd 1584(%rsp), %zmm3 # 64-byte Reload
vmovups %zmm2, (%rdi,%rax,4) {%k5}
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33
vshufi64x2 $170, %zmm6, %zmm6, %zmm2 # zmm2 = zmm6[4,5,4,5,4,5,4,5]
.loc 1 238 33 # 03-matrix-multiplication-cpu.py:238:33
vpcmpgtd %zmm2, %zmm4, %k5
vmovupd 1264(%rsp), %zmm2 # 64-byte Reload
kunpckwd %k5, %k7, %k5
kunpckdq %k6, %k5, %k5
.loc 1 238 39 is_stmt 0 # 03-matrix-multiplication-cpu.py:238:39
kshiftrq $15, %k5, %k5
kandq %k0, %k5, %k5
.loc 1 239 21 is_stmt 1 # 03-matrix-multiplication-cpu.py:239:21
vshufpd $128, %zmm0, %zmm2, %zmm29 {%k1} # zmm29 {%k1} = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[7]
vpermt2pd %zmm0, %zmm11, %zmm2
vextractf32x4 $2, %zmm7, %xmm0
vmovapd %zmm2, %zmm30 {%k1}
vextractf32x4 $2, %zmm22, %xmm2
vblendps $3, %xmm2, %xmm0, %xmm0 # xmm0 = xmm2[0,1],xmm0[2,3]
vshuff64x2 $170, %zmm15, %zmm15, %zmm2 # zmm2 = zmm15[4,5,4,5,4,5,4,5]
vshuff64x2 $170, %zmm27, %zmm27, %zmm5 # zmm5 = zmm27[4,5,4,5,4,5,4,5]
vblendpd $8, %ymm2, %ymm5, %ymm2 # ymm2 = ymm5[0,1,2],ymm2[3]
vblendpd $3, %ymm0, %ymm2, %ymm0 # ymm0 = ymm0[0,1],ymm2[2,3]
vpmovm2d %k5, %zmm2
vpbroadcastd %xmm2, %zmm2
vpmovd2m %zmm2, %k6
vmovupd 352(%rsp), %zmm2 # 64-byte Reload
vinsertf64x4 $0, %ymm0, %zmm30, %zmm0
vmovupd 816(%rsp), %zmm30 # 64-byte Reload
vmovups %zmm0, (%rsi,%rax,4) {%k6}
vpunpckldq %zmm18, %zmm25, %zmm0 # zmm0 = zmm25[0],zmm18[0],zmm25[1],zmm18[1],zmm25[4],zmm18[4],zmm25[5],zmm18[5],zmm25[8],zmm18[8],zmm25[9],zmm18[9],zmm25[12],zmm18[12],zmm25[13],zmm18[13]
kshiftrq $16, %k5, %k6
vshufpd $128, %zmm0, %zmm2, %zmm23 {%k1} # zmm23 {%k1} = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[7]
vpermt2pd %zmm0, %zmm11, %zmm2
vunpcklps %zmm24, %zmm12, %zmm0 # zmm0 = zmm12[0],zmm24[0],zmm12[1],zmm24[1],zmm12[4],zmm24[4],zmm12[5],zmm24[5],zmm12[8],zmm24[8],zmm12[9],zmm24[9],zmm12[12],zmm24[12],zmm12[13],zmm24[13]
vextractf32x4 $2, %zmm0, %xmm5
vshuff64x2 $170, %zmm9, %zmm9, %zmm8 # zmm8 = zmm9[4,5,4,5,4,5,4,5]
vmovapd %zmm2, %zmm3 {%k1}
vextractf32x4 $2, %zmm17, %xmm2
vblendps $3, %xmm2, %xmm5, %xmm2 # xmm2 = xmm2[0,1],xmm5[2,3]
vshuff64x2 $170, %zmm19, %zmm19, %zmm5 # zmm5 = zmm19[4,5,4,5,4,5,4,5]
vblendpd $8, %ymm8, %ymm5, %ymm5 # ymm5 = ymm5[0,1,2],ymm8[3]
vextractf32x4 $2, %zmm14, %xmm8
vblendpd $3, %ymm2, %ymm5, %ymm2 # ymm2 = ymm2[0,1],ymm5[2,3]
vpmovm2d %k6, %zmm5
vpbroadcastd %xmm5, %zmm5
vpmovd2m %zmm5, %k6
vmovupd 1328(%rsp), %zmm5 # 64-byte Reload
vinsertf64x4 $0, %ymm2, %zmm3, %zmm2
vmovupd 1648(%rsp), %zmm3 # 64-byte Reload
vmovups %zmm2, (%rdx,%rax,4) {%k6}
vmovdqa64 %zmm25, %zmm2
vpermt2ps %zmm18, %zmm21, %zmm2
kshiftrq $32, %k5, %k6
vshufpd $128, %zmm2, %zmm5, %zmm30 {%k1} # zmm30 {%k1} = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[7]
vpermt2pd %zmm2, %zmm11, %zmm5
vpunpckhdq %zmm18, %zmm25, %zmm2 # zmm2 = zmm25[2],zmm18[2],zmm25[3],zmm18[3],zmm25[6],zmm18[6],zmm25[7],zmm18[7],zmm25[10],zmm18[10],zmm25[11],zmm18[11],zmm25[14],zmm18[14],zmm25[15],zmm18[15]
vmovupd 944(%rsp), %zmm25 # 64-byte Reload
vmovupd 1136(%rsp), %zmm18 # 64-byte Reload
vpermi2pd %zmm2, %zmm26, %zmm11
vmovapd %zmm5, %zmm3 {%k1}
vmovaps %zmm12, %zmm5
vpermt2ps %zmm24, %zmm21, %zmm5
vpermi2ps %zmm16, %zmm28, %zmm21
vshufpd $128, %zmm2, %zmm26, %zmm25 {%k1} # zmm25 {%k1} = zmm26[0],zmm2[0],zmm26[2],zmm2[2],zmm26[4],zmm2[4],zmm26[6],zmm2[7]
vshuff64x2 $170, %zmm18, %zmm18, %zmm13 # zmm13 = zmm18[4,5,4,5,4,5,4,5]
vextractf32x4 $2, %zmm5, %xmm2
vblendps $3, %xmm8, %xmm2, %xmm2 # xmm2 = xmm8[0,1],xmm2[2,3]
vshuff64x2 $170, %zmm21, %zmm21, %zmm8 # zmm8 = zmm21[4,5,4,5,4,5,4,5]
vblendpd $8, %ymm8, %ymm13, %ymm8 # ymm8 = ymm13[0,1,2],ymm8[3]
vmovupd 1776(%rsp), %zmm13 # 64-byte Reload
vblendpd $3, %ymm2, %ymm8, %ymm2 # ymm2 = ymm2[0,1],ymm8[2,3]
vpmovm2d %k6, %zmm8
vpbroadcastd %xmm8, %zmm8
vinsertf64x4 $0, %ymm2, %zmm3, %zmm2
vmovdqa 1408(%rsp), %xmm3 # 16-byte Reload
vpmovd2m %zmm8, %k6
vmovups %zmm2, (%rcx,%rax,4) {%k6}
vmovapd %zmm11, %zmm13 {%k1}
vunpckhps %zmm16, %zmm28, %zmm11 # zmm11 = zmm28[2],zmm16[2],zmm28[3],zmm16[3],zmm28[6],zmm16[6],zmm28[7],zmm16[7],zmm28[10],zmm16[10],zmm28[11],zmm16[11],zmm28[14],zmm16[14],zmm28[15],zmm16[15]
kshiftrq $48, %k5, %k1
vshuff64x2 $170, %zmm11, %zmm11, %zmm8 # zmm8 = zmm11[4,5,4,5,4,5,4,5]
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33
vpbroadcastd %xmm3, %zmm2
vpmulld %xmm10, %xmm3, %xmm3
.loc 1 238 33 # 03-matrix-multiplication-cpu.py:238:33
vpcmpgtd %zmm2, %zmm4, %k6
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33
vpmulld %xmm10, %xmm2, %xmm2
vpextrd $3, %xmm3, %ecx
vpextrd $3, %xmm2, %edi
vextracti32x4 $3, %zmm20, %xmm2
vmovups 1008(%rsp), %zmm20 # 64-byte Reload
.loc 1 236 21 is_stmt 0 # 03-matrix-multiplication-cpu.py:236:21
movslq %ecx, %rcx
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33
vpmulld %xmm10, %xmm2, %xmm2
.loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21
movslq %edi, %rdi
leaq (%r13,%rcx,4), %rcx
leaq (%r13,%rdi,4), %rdi
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33
vpextrd $3, %xmm2, %esi
vextracti32x4 $3, %zmm6, %xmm2
.loc 1 239 21 is_stmt 1 # 03-matrix-multiplication-cpu.py:239:21
vshuff64x2 $170, %zmm1, %zmm1, %zmm6 # zmm6 = zmm1[4,5,4,5,4,5,4,5]
vextractf32x4 $3, %zmm0, %xmm0
vextractf64x4 $1, %zmm1, %ymm1
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33
vpmulld %xmm10, %xmm2, %xmm2
.loc 1 236 21 is_stmt 0 # 03-matrix-multiplication-cpu.py:236:21
movslq %esi, %rsi
.loc 1 239 21 is_stmt 1 # 03-matrix-multiplication-cpu.py:239:21
vblendpd $8, %ymm8, %ymm6, %ymm6 # ymm6 = ymm6[0,1,2],ymm8[3]
.loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21
leaq (%r13,%rsi,4), %rsi
.loc 1 236 33 is_stmt 0 # 03-matrix-multiplication-cpu.py:236:33
vpextrd $3, %xmm2, %edx
.loc 1 239 21 is_stmt 1 # 03-matrix-multiplication-cpu.py:239:21
vunpckhps %zmm24, %zmm12, %zmm2 # zmm2 = zmm12[2],zmm24[2],zmm12[3],zmm24[3],zmm12[6],zmm24[6],zmm12[7],zmm24[7],zmm12[10],zmm24[10],zmm12[11],zmm24[11],zmm12[14],zmm24[14],zmm12[15],zmm24[15]
vextractf32x4 $2, %zmm20, %xmm3
vextractf32x4 $2, %zmm2, %xmm4
.loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21
movslq %edx, %rdx
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21
vextractf32x4 $3, %zmm2, %xmm2
.loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21
leaq (%r13,%rdx,4), %rdx
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21
vblendps $3, %xmm3, %xmm4, %xmm4 # xmm4 = xmm3[0,1],xmm4[2,3]
vblendpd $3, %ymm4, %ymm6, %ymm4 # ymm4 = ymm4[0,1],ymm6[2,3]
vpmovm2d %k1, %zmm6
vpbroadcastd %xmm6, %zmm6
vinsertf64x4 $0, %ymm4, %zmm13, %zmm4
vpmovd2m %zmm6, %k1
vextractf32x4 $3, %zmm22, %xmm6
vmovups %zmm4, (%r8,%rax,4) {%k1}
vextractf32x4 $3, %zmm7, %xmm4
.loc 1 238 33 # 03-matrix-multiplication-cpu.py:238:33
kunpckwd %k6, %k2, %k1
kunpckwd %k3, %k4, %k2
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21
vextractf64x4 $1, %zmm27, %ymm7
.loc 1 238 33 # 03-matrix-multiplication-cpu.py:238:33
kunpckdq %k1, %k2, %k1
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21
vblendps $3, %xmm6, %xmm4, %xmm4 # xmm4 = xmm6[0,1],xmm4[2,3]
vextractf64x4 $1, %zmm15, %ymm6
.loc 1 238 39 # 03-matrix-multiplication-cpu.py:238:39
kshiftrq $15, %k1, %k1
kandq %k0, %k1, %k0
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21
vblendpd $8, %ymm6, %ymm7, %ymm6 # ymm6 = ymm7[0,1,2],ymm6[3]
vblendpd $3, %ymm4, %ymm6, %ymm4 # ymm4 = ymm4[0,1],ymm6[2,3]
vpmovm2d %k0, %zmm6
vpbroadcastd %xmm6, %zmm6
vinsertf64x4 $0, %ymm4, %zmm29, %zmm4
vpmovd2m %zmm6, %k1
vextractf64x4 $1, %zmm9, %ymm6
vmovups %zmm4, (%rdi,%rax,4) {%k1}
vextractf32x4 $3, %zmm17, %xmm4
kshiftrq $16, %k0, %k1
vblendps $3, %xmm4, %xmm0, %xmm0 # xmm0 = xmm4[0,1],xmm0[2,3]
vextractf64x4 $1, %zmm19, %ymm4
vblendpd $8, %ymm6, %ymm4, %ymm4 # ymm4 = ymm4[0,1,2],ymm6[3]
vblendpd $3, %ymm0, %ymm4, %ymm0 # ymm0 = ymm0[0,1],ymm4[2,3]
vpmovm2d %k1, %zmm4
vpbroadcastd %xmm4, %zmm4
vinsertf64x4 $0, %ymm0, %zmm23, %zmm0
vpmovd2m %zmm4, %k1
vextractf32x4 $3, %zmm14, %xmm4
vmovups %zmm0, (%rsi,%rax,4) {%k1}
vextractf32x4 $3, %zmm5, %xmm0
vextractf64x4 $1, %zmm18, %ymm5
kshiftrq $32, %k0, %k1
kshiftrq $48, %k0, %k0
vblendps $3, %xmm4, %xmm0, %xmm0 # xmm0 = xmm4[0,1],xmm0[2,3]
vextractf64x4 $1, %zmm21, %ymm4
vblendpd $8, %ymm4, %ymm5, %ymm4 # ymm4 = ymm5[0,1,2],ymm4[3]
vblendpd $3, %ymm0, %ymm4, %ymm0 # ymm0 = ymm0[0,1],ymm4[2,3]
vpmovm2d %k1, %zmm4
vpbroadcastd %xmm4, %zmm4
vinsertf64x4 $0, %ymm0, %zmm30, %zmm0
vpmovd2m %zmm4, %k1
vmovups %zmm0, (%rdx,%rax,4) {%k1}
vextractf32x4 $3, %zmm20, %xmm0
vblendps $3, %xmm0, %xmm2, %xmm0 # xmm0 = xmm0[0,1],xmm2[2,3]
vextractf64x4 $1, %zmm11, %ymm2
vblendps $192, %ymm2, %ymm1, %ymm1 # ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
vblendps $15, %ymm0, %ymm1, %ymm0 # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
vpmovm2d %k0, %zmm1
vpbroadcastd %xmm1, %zmm1
vinsertf64x4 $0, %ymm0, %zmm25, %zmm0
vpmovd2m %zmm1, %k1
vmovups %zmm0, (%rcx,%rax,4) {%k1}
.loc 1 239 4 epilogue_begin is_stmt 0 # 03-matrix-multiplication-cpu.py:239:4
addq $3448, %rsp # imm = 0xD78
.cfi_def_cfa_offset 56
popq %rbx
.cfi_def_cfa_offset 48
popq %r12
.cfi_def_cfa_offset 40
popq %r13
.cfi_def_cfa_offset 32
popq %r14
.cfi_def_cfa_offset 24
popq %r15
.cfi_def_cfa_offset 16
popq %rbp
.cfi_def_cfa_offset 8
vzeroupper
retq
.Ltmp12:
.Lfunc_end0:
.size matmul_kernel, .Lfunc_end0-matmul_kernel
.cfi_endproc
# -- End function
.section .debug_abbrev,"",@progbits
.byte 1 # Abbreviation Code
.byte 17 # DW_TAG_compile_unit
.byte 1 # DW_CHILDREN_yes
.byte 37 # DW_AT_producer
.byte 14 # DW_FORM_strp
.byte 19 # DW_AT_language
.byte 5 # DW_FORM_data2
.byte 3 # DW_AT_name
.byte 14 # DW_FORM_strp
.byte 16 # DW_AT_stmt_list
.byte 23 # DW_FORM_sec_offset
.byte 27 # DW_AT_comp_dir
.byte 14 # DW_FORM_strp
.byte 17 # DW_AT_low_pc
.byte 1 # DW_FORM_addr
.byte 18 # DW_AT_high_pc
.byte 6 # DW_FORM_data4
.byte 0 # EOM(1)
.byte 0 # EOM(2)
.byte 2 # Abbreviation Code
.byte 46 # DW_TAG_subprogram
.byte 0 # DW_CHILDREN_no
.byte 3 # DW_AT_name
.byte 14 # DW_FORM_strp
.byte 32 # DW_AT_inline
.byte 11 # DW_FORM_data1
.byte 0 # EOM(1)
.byte 0 # EOM(2)
.byte 3 # Abbreviation Code
.byte 46 # DW_TAG_subprogram
.byte 1 # DW_CHILDREN_yes
.byte 17 # DW_AT_low_pc
.byte 1 # DW_FORM_addr
.byte 18 # DW_AT_high_pc
.byte 6 # DW_FORM_data4
.byte 49 # DW_AT_abstract_origin
.byte 19 # DW_FORM_ref4
.byte 0 # EOM(1)
.byte 0 # EOM(2)
.byte 4 # Abbreviation Code
.byte 29 # DW_TAG_inlined_subroutine
.byte 0 # DW_CHILDREN_no
.byte 49 # DW_AT_abstract_origin
.byte 19 # DW_FORM_ref4
.byte 85 # DW_AT_ranges
.byte 23 # DW_FORM_sec_offset
.byte 88 # DW_AT_call_file
.byte 11 # DW_FORM_data1
.byte 89 # DW_AT_call_line
.byte 11 # DW_FORM_data1
.byte 87 # DW_AT_call_column
.byte 11 # DW_FORM_data1
.byte 0 # EOM(1)
.byte 0 # EOM(2)
.byte 0 # EOM(3)
.section .debug_info,"",@progbits
.Lcu_begin0:
.long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit
.Ldebug_info_start0:
.short 4 # DWARF version number
.long .debug_abbrev # Offset Into Abbrev. Section
.byte 8 # Address Size (in bytes)
.byte 1 # Abbrev [1] 0xb:0x5c DW_TAG_compile_unit
.long .Linfo_string0 # DW_AT_producer
.short 2 # DW_AT_language
.long .Linfo_string1 # DW_AT_name
.long .Lline_table_start0 # DW_AT_stmt_list
.long .Linfo_string2 # DW_AT_comp_dir
.quad .Lfunc_begin0 # DW_AT_low_pc
.long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc
.byte 2 # Abbrev [2] 0x2a:0x6 DW_TAG_subprogram
.long .Linfo_string3 # DW_AT_name
.byte 1 # DW_AT_inline
.byte 3 # Abbrev [3] 0x30:0x36 DW_TAG_subprogram
.quad .Lfunc_begin0 # DW_AT_low_pc
.long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc
.long 42 # DW_AT_abstract_origin
.byte 4 # Abbrev [4] 0x41:0xc DW_TAG_inlined_subroutine
.long 42 # DW_AT_abstract_origin
.long .Ldebug_ranges0 # DW_AT_ranges
.byte 1 # DW_AT_call_file
.byte 189 # DW_AT_call_line
.byte 27 # DW_AT_call_column
.byte 4 # Abbrev [4] 0x4d:0xc DW_TAG_inlined_subroutine
.long 42 # DW_AT_abstract_origin
.long .Ldebug_ranges1 # DW_AT_ranges
.byte 1 # DW_AT_call_file
.byte 190 # DW_AT_call_line
.byte 27 # DW_AT_call_column
.byte 4 # Abbrev [4] 0x59:0xc DW_TAG_inlined_subroutine
.long 42 # DW_AT_abstract_origin
.long .Ldebug_ranges2 # DW_AT_ranges
.byte 1 # DW_AT_call_file
.byte 217 # DW_AT_call_line
.byte 33 # DW_AT_call_column
.byte 0 # End Of Children Mark
.byte 0 # End Of Children Mark
.Ldebug_info_end0:
.section .debug_ranges,"",@progbits
.Ldebug_ranges0:
.quad .Ltmp0-.Lfunc_begin0
.quad .Ltmp1-.Lfunc_begin0
.quad .Ltmp2-.Lfunc_begin0
.quad .Ltmp3-.Lfunc_begin0
.quad .Ltmp4-.Lfunc_begin0
.quad .Ltmp5-.Lfunc_begin0
.quad 0
.quad 0
.Ldebug_ranges1:
.quad .Ltmp1-.Lfunc_begin0
.quad .Ltmp2-.Lfunc_begin0
.quad .Ltmp3-.Lfunc_begin0
.quad .Ltmp4-.Lfunc_begin0
.quad .Ltmp5-.Lfunc_begin0
.quad .Ltmp6-.Lfunc_begin0
.quad 0
.quad 0
.Ldebug_ranges2:
.quad .Ltmp7-.Lfunc_begin0
.quad .Ltmp8-.Lfunc_begin0
.quad .Ltmp9-.Lfunc_begin0
.quad .Ltmp10-.Lfunc_begin0
.quad 0
.quad 0
.section .debug_str,"MS",@progbits,1
.Linfo_string0:
.asciz "triton" # string offset=0
.Linfo_string1:
.asciz "03-matrix-multiplication-cpu.py" # string offset=7
.Linfo_string2:
.asciz "/data/users/minjang/triton-oss/triton-cpu/python/tutorials" # string offset=39
.Linfo_string3:
.asciz "matmul_kernel" # string offset=98
.section ".note.GNU-stack","",@progbits
.section .debug_line,"",@progbits
.Lline_table_start0: