# x86-64 (AVX-512) assembly for matmul_kernel (03-matrix-multiplication-cpu.py),
# generated by Triton-CPU (TTMIR).
# Origin: GitHub Gist minjang/b01096455fc71f50715af39d02c4b190
# (last active November 20, 2024).
# NOTE: the original paste carried raw GitHub page chrome here ("Skip to
# content", save-gist links, etc.), which is not valid assembler input; it
# has been converted to this comment header so the file assembles.
.text
.file "LLVMDialectModule"
.section .rodata,"a",@progbits
.p2align 6, 0x0 # -- Begin function matmul_kernel
.LCPI0_0:
.zero 4
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 13 # 0xd
.long 14 # 0xe
.long 15 # 0xf
.LCPI0_15:
.long 3 # 0x3
.long 19 # 0x13
.long 2 # 0x2
.long 3 # 0x3
.long 7 # 0x7
.long 23 # 0x17
.long 6 # 0x6
.long 7 # 0x7
.long 11 # 0xb
.long 27 # 0x1b
.long 10 # 0xa
.long 11 # 0xb
.long 15 # 0xf
.long 31 # 0x1f
.long 14 # 0xe
.long 15 # 0xf
.LCPI0_16:
.quad 2 # 0x2
.quad 10 # 0xa
.quad 2 # 0x2
.quad 10 # 0xa
.quad 6 # 0x6
.quad 15 # 0xf
.quad 6 # 0x6
.quad 14 # 0xe
.LCPI0_17:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 18 # 0x12
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 22 # 0x16
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 26 # 0x1a
.long 12 # 0xc
.long 13 # 0xd
.long 14 # 0xe
.long 30 # 0x1e
.LCPI0_18:
.long 1 # 0x1
.long 17 # 0x11
.long 2 # 0x2
.long 3 # 0x3
.long 5 # 0x5
.long 21 # 0x15
.long 6 # 0x6
.long 7 # 0x7
.long 9 # 0x9
.long 25 # 0x19
.long 10 # 0xa
.long 11 # 0xb
.long 13 # 0xd
.long 29 # 0x1d
.long 14 # 0xe
.long 15 # 0xf
.LCPI0_19:
.long 0 # 0x0
.long 1 # 0x1
.long 0 # 0x0
.long 16 # 0x10
.long 4 # 0x4
.long 5 # 0x5
.long 4 # 0x4
.long 20 # 0x14
.long 8 # 0x8
.long 9 # 0x9
.long 8 # 0x8
.long 24 # 0x18
.long 12 # 0xc
.long 13 # 0xd
.long 12 # 0xc
.long 28 # 0x1c
.LCPI0_20:
.quad 0 # 0x0
.quad 8 # 0x8
.quad 0 # 0x0
.quad 8 # 0x8
.quad 4 # 0x4
.quad 12 # 0xc
.quad 4 # 0x4
.quad 13 # 0xd
.LCPI0_40:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 25 # 0x19
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_47:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
.quad 13 # 0xd
.zero 8
.zero 8
.zero 8
.LCPI0_48:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 26 # 0x1a
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_55:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 27 # 0x1b
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_56:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 27 # 0x1b
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_57:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 27 # 0x1b
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_64:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 28 # 0x1c
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_65:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
.quad 4 # 0x4
.quad 14 # 0xe
.zero 8
.zero 8
.LCPI0_66:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 28 # 0x1c
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_73:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 29 # 0x1d
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_74:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 29 # 0x1d
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_75:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 29 # 0x1d
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_76:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 29 # 0x1d
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_77:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 29 # 0x1d
.zero 4
.zero 4
.zero 4
.LCPI0_84:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
.quad 15 # 0xf
.zero 8
.zero 8
.zero 8
.LCPI0_85:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 30 # 0x1e
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_86:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
.quad 4 # 0x4
.quad 15 # 0xf
.zero 8
.zero 8
.LCPI0_87:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 30 # 0x1e
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_88:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
.quad 4 # 0x4
.quad 5 # 0x5
.quad 15 # 0xf
.zero 8
.LCPI0_89:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 30 # 0x1e
.zero 4
.zero 4
.LCPI0_96:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 31 # 0x1f
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_97:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 31 # 0x1f
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_98:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 31 # 0x1f
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_99:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 31 # 0x1f
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_100:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 31 # 0x1f
.zero 4
.zero 4
.zero 4
.LCPI0_101:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 31 # 0x1f
.zero 4
.zero 4
.LCPI0_102:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 13 # 0xd
.long 31 # 0x1f
.zero 4
.LCPI0_109:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 16 # 0x10
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_110:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 17 # 0x11
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_111:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 17 # 0x11
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_112:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 17 # 0x11
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_113:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
.quad 9 # 0x9
.zero 8
.zero 8
.zero 8
.LCPI0_114:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 18 # 0x12
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_115:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 19 # 0x13
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_116:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 19 # 0x13
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_117:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 19 # 0x13
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_119:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 20 # 0x14
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_123:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 21 # 0x15
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_124:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 21 # 0x15
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_125:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 21 # 0x15
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_126:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 21 # 0x15
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_127:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 21 # 0x15
.zero 4
.zero 4
.zero 4
.LCPI0_128:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 21 # 0x15
.zero 4
.zero 4
.LCPI0_129:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 13 # 0xd
.long 21 # 0x15
.zero 4
.LCPI0_130:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 13 # 0xd
.long 14 # 0xe
.long 21 # 0x15
.LCPI0_134:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
.quad 11 # 0xb
.zero 8
.zero 8
.zero 8
.LCPI0_135:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 22 # 0x16
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_136:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
.quad 4 # 0x4
.quad 11 # 0xb
.zero 8
.zero 8
.LCPI0_137:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 22 # 0x16
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_138:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
.quad 4 # 0x4
.quad 5 # 0x5
.quad 11 # 0xb
.zero 8
.LCPI0_139:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 22 # 0x16
.zero 4
.zero 4
.LCPI0_140:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
.quad 4 # 0x4
.quad 5 # 0x5
.quad 6 # 0x6
.quad 11 # 0xb
.LCPI0_141:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 13 # 0xd
.long 14 # 0xe
.long 22 # 0x16
.LCPI0_145:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 23 # 0x17
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_146:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 23 # 0x17
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_147:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 23 # 0x17
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_148:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 23 # 0x17
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_149:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 23 # 0x17
.zero 4
.zero 4
.zero 4
.LCPI0_150:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 23 # 0x17
.zero 4
.zero 4
.LCPI0_151:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 13 # 0xd
.long 23 # 0x17
.zero 4
.LCPI0_152:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 13 # 0xd
.long 14 # 0xe
.long 23 # 0x17
.LCPI0_159:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 24 # 0x18
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_160:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
.quad 4 # 0x4
.quad 12 # 0xc
.zero 8
.zero 8
.LCPI0_161:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 24 # 0x18
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_162:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
.quad 4 # 0x4
.quad 5 # 0x5
.quad 12 # 0xc
.zero 8
.LCPI0_163:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 24 # 0x18
.zero 4
.zero 4
.LCPI0_164:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
.quad 4 # 0x4
.quad 5 # 0x5
.quad 6 # 0x6
.quad 12 # 0xc
.LCPI0_165:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 13 # 0xd
.long 14 # 0xe
.long 24 # 0x18
.LCPI0_166:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 25 # 0x19
.zero 4
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_167:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 25 # 0x19
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_168:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 25 # 0x19
.zero 4
.zero 4
.zero 4
.LCPI0_169:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 25 # 0x19
.zero 4
.zero 4
.LCPI0_170:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 13 # 0xd
.long 25 # 0x19
.zero 4
.LCPI0_171:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 13 # 0xd
.long 14 # 0xe
.long 25 # 0x19
.LCPI0_172:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 26 # 0x1a
.zero 4
.zero 4
.zero 4
.zero 4
.LCPI0_173:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
.quad 4 # 0x4
.quad 5 # 0x5
.quad 13 # 0xd
.zero 8
.LCPI0_174:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 26 # 0x1a
.zero 4
.zero 4
.LCPI0_175:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
.quad 4 # 0x4
.quad 5 # 0x5
.quad 6 # 0x6
.quad 13 # 0xd
.LCPI0_176:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 13 # 0xd
.long 14 # 0xe
.long 26 # 0x1a
.LCPI0_177:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 27 # 0x1b
.zero 4
.zero 4
.zero 4
.LCPI0_178:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 27 # 0x1b
.zero 4
.zero 4
.LCPI0_179:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 13 # 0xd
.long 27 # 0x1b
.zero 4
.LCPI0_180:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 13 # 0xd
.long 14 # 0xe
.long 27 # 0x1b
.LCPI0_181:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 28 # 0x1c
.zero 4
.zero 4
.LCPI0_182:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 3 # 0x3
.quad 4 # 0x4
.quad 5 # 0x5
.quad 6 # 0x6
.quad 14 # 0xe
.LCPI0_183:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 13 # 0xd
.long 14 # 0xe
.long 28 # 0x1c
.LCPI0_184:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 13 # 0xd
.long 29 # 0x1d
.zero 4
.LCPI0_185:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 13 # 0xd
.long 14 # 0xe
.long 29 # 0x1d
.LCPI0_186:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 7 # 0x7
.long 8 # 0x8
.long 9 # 0x9
.long 10 # 0xa
.long 11 # 0xb
.long 12 # 0xc
.long 13 # 0xd
.long 14 # 0xe
.long 30 # 0x1e
.LCPI0_193:
.zero 64
.section .rodata.cst4,"aM",@progbits,4
.p2align 2, 0x0
.LCPI0_1:
.long 14 # 0xe
.LCPI0_2:
.long 13 # 0xd
.LCPI0_3:
.long 12 # 0xc
.LCPI0_4:
.long 11 # 0xb
.LCPI0_5:
.long 10 # 0xa
.LCPI0_6:
.long 8 # 0x8
.LCPI0_7:
.long 6 # 0x6
.LCPI0_8:
.long 4 # 0x4
.LCPI0_9:
.long 2 # 0x2
.LCPI0_103:
.long 15 # 0xf
.LCPI0_104:
.long 9 # 0x9
.LCPI0_105:
.long 7 # 0x7
.LCPI0_106:
.long 5 # 0x5
.LCPI0_107:
.long 3 # 0x3
.LCPI0_108:
.long 1 # 0x1
.section .rodata.cst32,"aM",@progbits,32
.p2align 5, 0x0
.LCPI0_10:
.long 3 # 0x3
.long 11 # 0xb
.long 2 # 0x2
.long 3 # 0x3
.long 7 # 0x7
.long 15 # 0xf
.long 6 # 0x6
.long 7 # 0x7
.LCPI0_14:
.long 3 # 0x3
.long 11 # 0xb
.long 2 # 0x2
.long 3 # 0x3
.long 7 # 0x7
.long 15 # 0xf
.zero 4
.zero 4
.LCPI0_24:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 10 # 0xa
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 14 # 0xe
.LCPI0_26:
.long 0 # 0x0
.long 1 # 0x1
.long 0 # 0x0
.long 8 # 0x8
.long 4 # 0x4
.long 5 # 0x5
.long 4 # 0x4
.long 12 # 0xc
.LCPI0_27:
.long 1 # 0x1
.long 9 # 0x9
.long 2 # 0x2
.long 3 # 0x3
.long 5 # 0x5
.long 13 # 0xd
.long 6 # 0x6
.long 7 # 0x7
.LCPI0_31:
.long 1 # 0x1
.long 9 # 0x9
.long 2 # 0x2
.long 3 # 0x3
.long 5 # 0x5
.long 13 # 0xd
.zero 4
.zero 4
.LCPI0_36:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 25 # 0x19
.zero 4
.zero 4
.zero 4
.LCPI0_37:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 25 # 0x19
.zero 4
.zero 4
.LCPI0_38:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 25 # 0x19
.zero 4
.LCPI0_39:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 25 # 0x19
.LCPI0_43:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 13 # 0xd
.zero 8
.LCPI0_44:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 26 # 0x1a
.zero 4
.zero 4
.LCPI0_45:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 13 # 0xd
.LCPI0_46:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 26 # 0x1a
.LCPI0_51:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 27 # 0x1b
.zero 4
.zero 4
.zero 4
.LCPI0_52:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 27 # 0x1b
.zero 4
.zero 4
.LCPI0_53:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 27 # 0x1b
.zero 4
.LCPI0_54:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 27 # 0x1b
.LCPI0_60:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 14 # 0xe
.zero 8
.LCPI0_61:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 28 # 0x1c
.zero 4
.zero 4
.LCPI0_62:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 14 # 0xe
.LCPI0_63:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 28 # 0x1c
.LCPI0_69:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 29 # 0x1d
.zero 4
.zero 4
.zero 4
.LCPI0_70:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 29 # 0x1d
.zero 4
.zero 4
.LCPI0_71:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 29 # 0x1d
.zero 4
.LCPI0_72:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 29 # 0x1d
.LCPI0_80:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 15 # 0xf
.zero 8
.LCPI0_81:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 30 # 0x1e
.zero 4
.zero 4
.LCPI0_82:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 15 # 0xf
.LCPI0_83:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 30 # 0x1e
.LCPI0_92:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 31 # 0x1f
.zero 4
.zero 4
.zero 4
.LCPI0_93:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 31 # 0x1f
.zero 4
.zero 4
.LCPI0_94:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 31 # 0x1f
.zero 4
.LCPI0_95:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 31 # 0x1f
.LCPI0_118:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 6 # 0x6
.LCPI0_120:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 13 # 0xd
.zero 4
.zero 4
.zero 4
.LCPI0_121:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 13 # 0xd
.zero 4
.LCPI0_122:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 13 # 0xd
.LCPI0_131:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 7 # 0x7
.zero 8
.LCPI0_132:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 14 # 0xe
.zero 4
.zero 4
.LCPI0_133:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 14 # 0xe
.LCPI0_142:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 15 # 0xf
.zero 4
.zero 4
.zero 4
.LCPI0_143:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 15 # 0xf
.zero 4
.zero 4
.LCPI0_144:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 15 # 0xf
.zero 4
.LCPI0_155:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 12 # 0xc
.zero 8
.LCPI0_156:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 24 # 0x18
.zero 4
.zero 4
.LCPI0_157:
.quad 0 # 0x0
.quad 1 # 0x1
.quad 2 # 0x2
.quad 12 # 0xc
.LCPI0_158:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 3 # 0x3
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 24 # 0x18
.LCPI0_187:
.long 0 # 0x0
.long 1 # 0x1
.zero 4
.zero 4
.zero 4
.zero 4
.long 4 # 0x4
.long 12 # 0xc
.LCPI0_188:
.long 1 # 0x1
.long 9 # 0x9
.zero 4
.zero 4
.long 5 # 0x5
.long 13 # 0xd
.zero 4
.zero 4
.LCPI0_189:
.zero 4
.zero 4
.zero 4
.zero 4
.long 5 # 0x5
.long 13 # 0xd
.zero 4
.zero 4
.LCPI0_190:
.long 0 # 0x0
.long 1 # 0x1
.zero 4
.zero 4
.zero 4
.zero 4
.long 6 # 0x6
.long 14 # 0xe
.LCPI0_191:
.long 3 # 0x3
.long 11 # 0xb
.zero 4
.zero 4
.long 7 # 0x7
.long 15 # 0xf
.zero 4
.zero 4
.LCPI0_192:
.zero 4
.zero 4
.zero 4
.zero 4
.long 7 # 0x7
.long 15 # 0xf
.zero 4
.zero 4
.section .rodata.cst16,"aM",@progbits,16
.p2align 4, 0x0
.LCPI0_11:
.long 7 # 0x7
.long 23 # 0x17
.long 6 # 0x6
.long 7 # 0x7
.LCPI0_12:
.long 7 # 0x7
.long 15 # 0xf
.long 6 # 0x6
.long 7 # 0x7
.LCPI0_21:
.quad 0 # 0x0
.quad 8 # 0x8
.LCPI0_22:
.zero 4
.zero 4
.long 4 # 0x4
.long 0 # 0x0
.LCPI0_23:
.long 4 # 0x4
.long 5 # 0x5
.long 6 # 0x6
.long 22 # 0x16
.LCPI0_25:
.long 4 # 0x4
.long 5 # 0x5
.long 4 # 0x4
.long 20 # 0x14
.LCPI0_28:
.long 5 # 0x5
.long 21 # 0x15
.long 6 # 0x6
.long 7 # 0x7
.LCPI0_29:
.long 5 # 0x5
.long 13 # 0xd
.long 6 # 0x6
.long 7 # 0x7
.LCPI0_33:
.long 3 # 0x3
.long 7 # 0x7
.zero 4
.zero 4
.LCPI0_34:
.long 8 # 0x8
.long 9 # 0x9
.long 25 # 0x19
.zero 4
.LCPI0_35:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 25 # 0x19
.LCPI0_41:
.quad 4 # 0x4
.quad 13 # 0xd
.LCPI0_42:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 26 # 0x1a
.LCPI0_49:
.long 8 # 0x8
.long 9 # 0x9
.long 27 # 0x1b
.zero 4
.LCPI0_50:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 27 # 0x1b
.LCPI0_58:
.quad 6 # 0x6
.quad 14 # 0xe
.LCPI0_59:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 28 # 0x1c
.LCPI0_67:
.long 12 # 0xc
.long 13 # 0xd
.long 29 # 0x1d
.zero 4
.LCPI0_68:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 29 # 0x1d
.LCPI0_78:
.quad 6 # 0x6
.quad 15 # 0xf
.LCPI0_79:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 30 # 0x1e
.LCPI0_90:
.long 12 # 0xc
.long 13 # 0xd
.long 31 # 0x1f
.zero 4
.LCPI0_91:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 31 # 0x1f
.LCPI0_153:
.quad 4 # 0x4
.quad 12 # 0xc
.LCPI0_154:
.long 0 # 0x0
.long 1 # 0x1
.long 2 # 0x2
.long 24 # 0x18
.LCPI0_195:
.long 3 # 0x3
.long 7 # 0x7
.long 0 # 0x0
.long 0 # 0x0
.LCPI0_196:
.long 0 # 0x0
.long 1 # 0x1
.long 4 # 0x4
.long 12 # 0xc
.LCPI0_197:
.long 0 # 0x0
.long 1 # 0x1
.long 6 # 0x6
.long 14 # 0xe
.section .rodata.cst8,"aM",@progbits,8
.p2align 3, 0x0
.LCPI0_13:
.long 7 # 0x7
.long 15 # 0xf
.LCPI0_30:
.long 5 # 0x5
.long 13 # 0xd
.LCPI0_32:
.long 3 # 0x3
.long 19 # 0x13
.LCPI0_194:
.long 4 # 0x4
.long 0 # 0x0
.text
.globl matmul_kernel
.p2align 4, 0x90
.type matmul_kernel,@function
matmul_kernel: # @matmul_kernel
.Lfunc_begin0:
.file 1 "/data/users/minjang/triton-oss/triton-cpu/python/tutorials" "03-matrix-multiplication-cpu.py"
.loc 1 166 0 # 03-matrix-multiplication-cpu.py:166:0
.cfi_sections .debug_frame
.cfi_startproc
# %bb.0:
pushq %rbp
.cfi_def_cfa_offset 16
pushq %r15
.cfi_def_cfa_offset 24
pushq %r14
.cfi_def_cfa_offset 32
pushq %r13
.cfi_def_cfa_offset 40
pushq %r12
.cfi_def_cfa_offset 48
pushq %rbx
.cfi_def_cfa_offset 56
subq $3448, %rsp # imm = 0xD78
.cfi_def_cfa_offset 3504
.cfi_offset %rbx, -56
.cfi_offset %r12, -48
.cfi_offset %r13, -40
.cfi_offset %r14, -32
.cfi_offset %r15, -24
.cfi_offset %rbp, -16
# kill: def $ecx killed $ecx def $rcx
.Ltmp0:
.file 2 "/data/users/minjang/triton-oss/triton-cpu/python/triton/language" "standard.py"
.loc 2 40 22 prologue_end # standard.py:40:22
leal 15(%rcx), %eax
movl 3528(%rsp), %ebp
.loc 2 40 28 is_stmt 0 # standard.py:40:28
leal 30(%rcx), %r11d
.Ltmp1:
# kill: def $r8d killed $r8d def $r8
.loc 2 40 28 # standard.py:40:28
leal 30(%r8), %r10d
movq %rdx, 496(%rsp) # 8-byte Spill
movl $8, %ebx
.Ltmp2:
# kill: def $r9d killed $r9d def $r9
.loc 2 40 28 # standard.py:40:28
testl %eax, %eax
cmovnsl %eax, %r11d
.Ltmp3:
.loc 2 40 22 # standard.py:40:22
leal 15(%r8), %eax
.Ltmp4:
.loc 2 40 28 # standard.py:40:28
sarl $4, %r11d
.Ltmp5:
.loc 2 40 28 # standard.py:40:28
testl %eax, %eax
cmovnsl %eax, %r10d
sarl $4, %r10d
.Ltmp6:
.loc 1 192 22 is_stmt 1 # 03-matrix-multiplication-cpu.py:192:22
movl %ebp, %eax
cltd
.loc 1 191 38 # 03-matrix-multiplication-cpu.py:191:38
shll $3, %r10d
.loc 1 192 22 # 03-matrix-multiplication-cpu.py:192:22
idivl %r10d
movl %eax, %r14d
.loc 1 193 29 # 03-matrix-multiplication-cpu.py:193:29
leal (,%r14,8), %eax
.loc 1 194 35 # 03-matrix-multiplication-cpu.py:194:35
subl %eax, %r11d
.loc 1 195 33 # 03-matrix-multiplication-cpu.py:195:33
movl %ebp, %eax
cltd
.loc 1 194 48 # 03-matrix-multiplication-cpu.py:194:48
cmpl $8, %r11d
cmovll %r11d, %ebx
.loc 1 196 19 # 03-matrix-multiplication-cpu.py:196:19
imull %r14d, %r10d
.loc 1 195 33 # 03-matrix-multiplication-cpu.py:195:33
idivl %ebx
# kill: def $edx killed $edx def $rdx
.loc 1 195 27 is_stmt 0 # 03-matrix-multiplication-cpu.py:195:27
leal (%rdx,%r14,8), %r15d
.loc 1 196 19 is_stmt 1 # 03-matrix-multiplication-cpu.py:196:19
subl %r10d, %ebp
.loc 1 205 38 # 03-matrix-multiplication-cpu.py:205:38
vpbroadcastd %r15d, %zmm0
.loc 1 205 23 is_stmt 0 # 03-matrix-multiplication-cpu.py:205:23
shll $4, %r15d
.loc 1 196 40 is_stmt 1 # 03-matrix-multiplication-cpu.py:196:40
movl %ebp, %eax
cltd
.loc 1 205 38 # 03-matrix-multiplication-cpu.py:205:38
vpslld $4, %zmm0, %zmm0
vpord .LCPI0_0(%rip), %zmm0, %zmm2
.loc 1 196 40 # 03-matrix-multiplication-cpu.py:196:40
idivl %ebx
.Ltmp7:
.loc 2 40 22 # standard.py:40:22
leal 15(%r9), %edx
.loc 2 40 28 is_stmt 0 # standard.py:40:28
leal 30(%r9), %ebx
movq %r8, 488(%rsp) # 8-byte Spill
movq %rcx, 480(%rsp) # 8-byte Spill
movl %r15d, -112(%rsp) # 4-byte Spill
vextracti32x4 $3, %zmm2, %xmm1
vextracti32x4 $2, %zmm2, %xmm3
vmovdqu64 %zmm2, 3248(%rsp) # 64-byte Spill
vmovdqa %xmm1, 1408(%rsp) # 16-byte Spill
vmovdqa %xmm3, 1392(%rsp) # 16-byte Spill
.Ltmp8:
.loc 1 206 23 is_stmt 1 # 03-matrix-multiplication-cpu.py:206:23
shll $4, %eax
.Ltmp9:
.loc 2 40 28 # standard.py:40:28
testl %edx, %edx
cmovnsl %edx, %ebx
movl %eax, -108(%rsp) # 4-byte Spill
.Ltmp10:
.loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22
cmpl $16, %edx
jl .LBB0_1
# %bb.2: # %.lr.ph
.loc 1 206 68 # 03-matrix-multiplication-cpu.py:206:68
cltd
.loc 1 0 0 is_stmt 0 # 03-matrix-multiplication-cpu.py:0:0
sarl $4, %ebx
vxorpd %xmm30, %xmm30, %xmm30
vpxord %xmm28, %xmm28, %xmm28
vpxord %xmm29, %xmm29, %xmm29
vpxord %xmm22, %xmm22, %xmm22
.loc 1 206 68 # 03-matrix-multiplication-cpu.py:206:68
idivl %r8d
.loc 1 205 68 is_stmt 1 # 03-matrix-multiplication-cpu.py:205:68
vpextrd $3, %xmm1, %eax
movl %edx, 96(%rsp) # 4-byte Spill
cltd
idivl %ecx
vpextrd $2, %xmm1, %eax
vpinsrd $3, %edx, %xmm0, %xmm0
cltd
idivl %ecx
vpextrd $1, %xmm1, %eax
movl %edx, 288(%rsp) # 4-byte Spill
cltd
idivl %ecx
vmovd %xmm1, %eax
vextracti128 $1, %ymm2, %xmm1
movl %edx, 32(%rsp) # 4-byte Spill
cltd
idivl %ecx
vpextrd $3, %xmm3, %eax
movl %edx, 224(%rsp) # 4-byte Spill
cltd
idivl %ecx
vpextrd $2, %xmm3, %eax
movl %edx, 416(%rsp) # 4-byte Spill
cltd
idivl %ecx
vpextrd $1, %xmm3, %eax
movl %edx, 688(%rsp) # 4-byte Spill
cltd
idivl %ecx
vmovd %xmm3, %eax
movl %edx, 624(%rsp) # 4-byte Spill
cltd
idivl %ecx
vpextrd $3, %xmm1, %eax
movl %edx, 560(%rsp) # 4-byte Spill
cltd
idivl %ecx
vpextrd $2, %xmm1, %eax
movl %edx, %ebp
cltd
idivl %ecx
vpextrd $1, %xmm1, %eax
movl %edx, %r11d
cltd
idivl %ecx
vmovd %xmm1, %eax
movl %edx, %r10d
cltd
idivl %ecx
vpextrd $3, %xmm2, %eax
movl %edx, %r14d
cltd
idivl %ecx
vpextrd $2, %xmm2, %eax
movl %edx, %r8d
cltd
idivl %ecx
vpextrd $1, %xmm2, %eax
movl %edx, %r12d
cltd
.loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41
vpbroadcastd %r12d, %xmm3
.loc 1 205 68 # 03-matrix-multiplication-cpu.py:205:68
idivl %ecx
movl %r15d, %eax
movl %edx, %r13d
cltd
.loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41
vpbroadcastd %r13d, %xmm2
movl 3504(%rsp), %r13d
.loc 1 205 68 # 03-matrix-multiplication-cpu.py:205:68
idivl %ecx
movl 3512(%rsp), %eax
.loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41
vpbroadcastd %edx, %xmm1
.loc 1 209 40 # 03-matrix-multiplication-cpu.py:209:40
vpbroadcastd %eax, %zmm5
vpmulld .LCPI0_1(%rip){1to16}, %zmm5, %zmm9
vpmulld .LCPI0_2(%rip){1to16}, %zmm5, %zmm11
vpmulld .LCPI0_3(%rip){1to16}, %zmm5, %zmm12
vpmulld .LCPI0_4(%rip){1to16}, %zmm5, %zmm13
.loc 1 227 33 # 03-matrix-multiplication-cpu.py:227:33
shll $4, %eax
.loc 1 209 40 # 03-matrix-multiplication-cpu.py:209:40
vpmulld .LCPI0_5(%rip){1to16}, %zmm5, %zmm14
vpmulld .LCPI0_6(%rip){1to16}, %zmm5, %zmm15
vpmulld .LCPI0_8(%rip){1to16}, %zmm5, %zmm17
vpmulld .LCPI0_7(%rip){1to16}, %zmm5, %zmm16
vpmulld .LCPI0_9(%rip){1to16}, %zmm5, %zmm10
vpslld $4, %zmm5, %zmm4
movl %eax, 160(%rsp) # 4-byte Spill
movl 96(%rsp), %eax # 4-byte Reload
vpslld $3, %zmm5, %zmm6
vpslld $2, %zmm5, %zmm8
vpsubd %zmm5, %zmm4, %zmm4
vpaddd %zmm6, %zmm5, %zmm7
vpsubd %zmm5, %zmm6, %zmm6
vpaddd %zmm8, %zmm5, %zmm8
.loc 1 209 52 is_stmt 0 # 03-matrix-multiplication-cpu.py:209:52
valignd $15, %zmm4, %zmm4, %zmm4 # zmm4 = zmm4[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
valignd $15, %zmm6, %zmm6, %zmm19 # zmm19 = zmm6[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
valignd $15, %zmm7, %zmm7, %zmm20 # zmm20 = zmm7[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
valignd $15, %zmm8, %zmm8, %zmm18 # zmm18 = zmm8[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
valignd $15, %zmm9, %zmm9, %zmm6 # zmm6 = zmm9[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
valignd $15, %zmm11, %zmm11, %zmm7 # zmm7 = zmm11[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
valignd $15, %zmm12, %zmm12, %zmm8 # zmm8 = zmm12[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
.loc 1 206 68 is_stmt 1 # 03-matrix-multiplication-cpu.py:206:68
vmovd %eax, %xmm12
.loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52
valignd $15, %zmm13, %zmm13, %zmm13 # zmm13 = zmm13[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
valignd $15, %zmm14, %zmm14, %zmm14 # zmm14 = zmm14[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
valignd $15, %zmm15, %zmm15, %zmm15 # zmm15 = zmm15[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
valignd $15, %zmm17, %zmm17, %zmm21 # zmm21 = zmm17[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
valignd $15, %zmm16, %zmm16, %zmm17 # zmm17 = zmm16[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
valignd $15, %zmm10, %zmm10, %zmm10 # zmm10 = zmm10[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
.loc 1 209 22 is_stmt 0 # 03-matrix-multiplication-cpu.py:209:22
cltq
.loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52
vpaddd %xmm4, %xmm12, %xmm9
movq %rax, -96(%rsp) # 8-byte Spill
.loc 1 208 41 is_stmt 1 # 03-matrix-multiplication-cpu.py:208:41
vpbroadcastd %r8d, %xmm4
.loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22
shlq $2, -96(%rsp) # 8-byte Folded Spill
.loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52
vmovd %xmm9, %r15d
.loc 1 209 22 is_stmt 0 # 03-matrix-multiplication-cpu.py:209:22
movslq %r15d, %rcx
.loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52
vpaddd %xmm6, %xmm12, %xmm9
.loc 1 208 41 is_stmt 1 # 03-matrix-multiplication-cpu.py:208:41
vpbroadcastd %r14d, %xmm6
movq %rcx, 24(%rsp) # 8-byte Spill
.loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52
vpaddd %xmm13, %xmm12, %xmm11
vpaddd %xmm14, %xmm12, %xmm13
vpaddd %xmm10, %xmm12, %xmm10
.loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22
shlq $2, 24(%rsp) # 8-byte Folded Spill
.loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52
vmovd %xmm9, %r14d
vpaddd %xmm7, %xmm12, %xmm9
.loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41
vpbroadcastd %r10d, %xmm7
.loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52
vmovd %xmm9, %r10d
.loc 1 209 22 is_stmt 0 # 03-matrix-multiplication-cpu.py:209:22
movslq %r14d, %rcx
.loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52
vpaddd %xmm8, %xmm12, %xmm9
.loc 1 208 41 is_stmt 1 # 03-matrix-multiplication-cpu.py:208:41
vpbroadcastd %r11d, %xmm8
movq %rcx, -8(%rsp) # 8-byte Spill
.loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52
vmovd %xmm9, %ecx
.loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41
vpbroadcastd %ebp, %xmm9
.loc 1 209 22 # 03-matrix-multiplication-cpu.py:209:22
movslq %r10d, %r10
.loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22
shlq $2, -8(%rsp) # 8-byte Folded Spill
.loc 1 209 22 # 03-matrix-multiplication-cpu.py:209:22
movslq %ecx, %rcx
.loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22
shlq $2, %r10
movq %rcx, -16(%rsp) # 8-byte Spill
.loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52
vmovd %xmm11, %ecx
movq %r10, 552(%rsp) # 8-byte Spill
movl %ebx, %r10d
.loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22
shlq $2, -16(%rsp) # 8-byte Folded Spill
.loc 1 209 22 # 03-matrix-multiplication-cpu.py:209:22
movslq %ecx, %r11
movl 560(%rsp), %ecx # 4-byte Reload
.loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22
shlq $2, %r11
movq %r11, 544(%rsp) # 8-byte Spill
.loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41
vpbroadcastd %ecx, %xmm11
.loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52
vmovd %xmm13, %ecx
vpaddd %xmm20, %xmm12, %xmm13
.loc 1 209 22 is_stmt 0 # 03-matrix-multiplication-cpu.py:209:22
movslq %ecx, %rcx
movq %rcx, -24(%rsp) # 8-byte Spill
movl 624(%rsp), %ecx # 4-byte Reload
.loc 1 217 22 is_stmt 1 # 03-matrix-multiplication-cpu.py:217:22
shlq $2, -24(%rsp) # 8-byte Folded Spill
.loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41
vpbroadcastd %ecx, %xmm14
.loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52
vmovd %xmm13, %ecx
vpaddd %xmm15, %xmm12, %xmm13
.loc 1 209 22 is_stmt 0 # 03-matrix-multiplication-cpu.py:209:22
movslq %ecx, %rcx
movq %rcx, 16(%rsp) # 8-byte Spill
movl 688(%rsp), %ecx # 4-byte Reload
.loc 1 217 22 is_stmt 1 # 03-matrix-multiplication-cpu.py:217:22
shlq $2, 16(%rsp) # 8-byte Folded Spill
.loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41
vpbroadcastd %ecx, %xmm15
.loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52
vmovd %xmm13, %ecx
vpaddd %xmm19, %xmm12, %xmm13
.loc 1 209 22 is_stmt 0 # 03-matrix-multiplication-cpu.py:209:22
movslq %ecx, %rcx
movq %rcx, -40(%rsp) # 8-byte Spill
movl 416(%rsp), %ecx # 4-byte Reload
.loc 1 217 22 is_stmt 1 # 03-matrix-multiplication-cpu.py:217:22
shlq $2, -40(%rsp) # 8-byte Folded Spill
.loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41
vpbroadcastd %ecx, %xmm16
.loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52
vmovd %xmm13, %ecx
vpaddd %xmm17, %xmm12, %xmm13
.loc 1 209 22 is_stmt 0 # 03-matrix-multiplication-cpu.py:209:22
movslq %ecx, %rcx
movq %rcx, 8(%rsp) # 8-byte Spill
movl 224(%rsp), %ecx # 4-byte Reload
.loc 1 217 22 is_stmt 1 # 03-matrix-multiplication-cpu.py:217:22
shlq $2, 8(%rsp) # 8-byte Folded Spill
.loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41
vpbroadcastd %ecx, %xmm17
.loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52
vmovd %xmm13, %ecx
vpaddd %xmm18, %xmm12, %xmm13
.loc 1 209 22 is_stmt 0 # 03-matrix-multiplication-cpu.py:209:22
movslq %ecx, %rcx
movq %rcx, -48(%rsp) # 8-byte Spill
movl 32(%rsp), %ecx # 4-byte Reload
.loc 1 217 22 is_stmt 1 # 03-matrix-multiplication-cpu.py:217:22
shlq $2, -48(%rsp) # 8-byte Folded Spill
.loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41
vpbroadcastd %ecx, %xmm18
.loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52
vmovd %xmm13, %ecx
vpaddd %xmm21, %xmm12, %xmm13
vpxord %xmm21, %xmm21, %xmm21
.loc 1 209 22 is_stmt 0 # 03-matrix-multiplication-cpu.py:209:22
movslq %ecx, %rcx
movq %rcx, (%rsp) # 8-byte Spill
movl 288(%rsp), %ecx # 4-byte Reload
.loc 1 217 22 is_stmt 1 # 03-matrix-multiplication-cpu.py:217:22
shlq $2, (%rsp) # 8-byte Folded Spill
.loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41
vpbroadcastd %ecx, %xmm19
.loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52
vmovd %xmm13, %ecx
vpaddd %xmm5, %xmm5, %xmm13
.loc 1 209 22 is_stmt 0 # 03-matrix-multiplication-cpu.py:209:22
movslq %ecx, %rcx
.loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52
vpaddd %xmm5, %xmm13, %xmm13
vpaddd %xmm5, %xmm12, %xmm5
movq %rcx, -56(%rsp) # 8-byte Spill
movb $-64, %cl
vpaddd %xmm13, %xmm12, %xmm20
.loc 1 208 41 is_stmt 1 # 03-matrix-multiplication-cpu.py:208:41
vpbroadcastd %r13d, %xmm13
.loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22
shlq $2, -56(%rsp) # 8-byte Folded Spill
vpxor %xmm12, %xmm12, %xmm12
kmovd %ecx, %k4
movw $512, %cx # imm = 0x200
.loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41
vpmulld %xmm13, %xmm0, %xmm0
vpmulld %xmm13, %xmm3, %xmm3
vpmulld %xmm13, %xmm2, %xmm2
kmovd %ecx, %k1
movb $32, %cl
kmovw %k1, -114(%rsp) # 2-byte Spill
kmovd %ecx, %k1
movw $2048, %cx # imm = 0x800
kmovw %k1, -116(%rsp) # 2-byte Spill
kmovd %ecx, %k1
movb $64, %cl
vpextrd $3, %xmm3, %ebp
vbroadcasti32x4 .LCPI0_11(%rip), %zmm3 # zmm3 = [7,23,6,7,7,23,6,7,7,23,6,7,7,23,6,7]
# zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
kmovw %k1, -118(%rsp) # 2-byte Spill
kmovd %ecx, %k1
movw $8192, %cx # imm = 0x2000
kmovw %k1, -120(%rsp) # 2-byte Spill
kmovd %ecx, %k1
movb $-128, %cl
kmovw %k1, -122(%rsp) # 2-byte Spill
kmovd %ecx, %k1
.loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52
vmovd %xmm20, %ecx
.loc 1 209 22 is_stmt 0 # 03-matrix-multiplication-cpu.py:209:22
movslq %ecx, %r13
.loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52
vmovd %xmm10, %ecx
.loc 1 208 41 is_stmt 1 # 03-matrix-multiplication-cpu.py:208:41
vpmulld %xmm13, %xmm19, %xmm10
vpxord %xmm19, %xmm19, %xmm19
kmovw %k1, -124(%rsp) # 2-byte Spill
.loc 1 209 22 # 03-matrix-multiplication-cpu.py:209:22
movslq %ecx, %rcx
.loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22
shlq $2, %r13
movq %rcx, -64(%rsp) # 8-byte Spill
.loc 1 209 52 # 03-matrix-multiplication-cpu.py:209:52
vmovd %xmm5, %ecx
.loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41
vpmulld %xmm13, %xmm18, %xmm5
movq %r13, 536(%rsp) # 8-byte Spill
.loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22
shlq $2, -64(%rsp) # 8-byte Folded Spill
vpxord %xmm18, %xmm18, %xmm18
.loc 1 209 22 # 03-matrix-multiplication-cpu.py:209:22
movslq %ecx, %rdx
.loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41
vpextrd $3, %xmm0, %ecx
vbroadcasti32x4 .LCPI0_28(%rip), %zmm0 # zmm0 = [5,21,6,7,5,21,6,7,5,21,6,7,5,21,6,7]
# zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
.loc 1 208 22 is_stmt 0 # 03-matrix-multiplication-cpu.py:208:22
movslq %ecx, %rax
.loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41
vpextrd $3, %xmm10, %ecx
vpmulld %xmm13, %xmm17, %xmm10
vmovdqu64 %zmm3, 2352(%rsp) # 64-byte Spill
vbroadcasti32x4 .LCPI0_23(%rip), %zmm3 # zmm3 = [4,5,6,22,4,5,6,22,4,5,6,22,4,5,6,22]
# zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
.loc 1 217 22 is_stmt 1 # 03-matrix-multiplication-cpu.py:217:22
shlq $2, %rdx
movq %rax, -72(%rsp) # 8-byte Spill
.loc 1 208 22 # 03-matrix-multiplication-cpu.py:208:22
movslq %ecx, %rax
.loc 1 208 41 is_stmt 0 # 03-matrix-multiplication-cpu.py:208:41
vpextrd $3, %xmm5, %ecx
vpmulld %xmm13, %xmm16, %xmm5
movq %rdx, 528(%rsp) # 8-byte Spill
.loc 1 217 22 is_stmt 1 # 03-matrix-multiplication-cpu.py:217:22
shlq $2, -72(%rsp) # 8-byte Folded Spill
vpxord %xmm16, %xmm16, %xmm16
movq %rax, -88(%rsp) # 8-byte Spill
.loc 1 208 22 # 03-matrix-multiplication-cpu.py:208:22
movslq %ecx, %rax
.loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22
shlq $2, -88(%rsp) # 8-byte Folded Spill
.loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41
vpextrd $3, %xmm10, %ecx
vpmulld %xmm13, %xmm15, %xmm10
movq %rax, -80(%rsp) # 8-byte Spill
.loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22
shlq $2, -80(%rsp) # 8-byte Folded Spill
vpxor %xmm15, %xmm15, %xmm15
.loc 1 208 22 # 03-matrix-multiplication-cpu.py:208:22
movslq %ecx, %r15
.loc 1 208 41 is_stmt 0 # 03-matrix-multiplication-cpu.py:208:41
vpextrd $3, %xmm5, %ecx
vpmulld %xmm13, %xmm14, %xmm5
.loc 1 208 22 # 03-matrix-multiplication-cpu.py:208:22
movslq %ecx, %r12
.loc 1 217 22 is_stmt 1 # 03-matrix-multiplication-cpu.py:217:22
shlq $2, %r15
.loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41
vpextrd $3, %xmm10, %ecx
vpmulld %xmm13, %xmm11, %xmm10
movq %r15, 520(%rsp) # 8-byte Spill
.loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22
shlq $2, %r12
.loc 1 208 22 # 03-matrix-multiplication-cpu.py:208:22
movslq %ecx, %rax
.loc 1 208 41 is_stmt 0 # 03-matrix-multiplication-cpu.py:208:41
vpextrd $3, %xmm5, %ecx
vpmulld %xmm13, %xmm9, %xmm5
vmovdqu64 %zmm0, 2096(%rsp) # 64-byte Spill
vbroadcasti128 .LCPI0_29(%rip), %ymm0 # ymm0 = [5,13,6,7,5,13,6,7]
# ymm0 = mem[0,1,0,1]
movq %r12, 512(%rsp) # 8-byte Spill
vpxor %xmm9, %xmm9, %xmm9
movl $65535, %r12d # imm = 0xFFFF
movq %rax, -32(%rsp) # 8-byte Spill
.loc 1 208 22 # 03-matrix-multiplication-cpu.py:208:22
movslq %ecx, %rax
.loc 1 217 22 is_stmt 1 # 03-matrix-multiplication-cpu.py:217:22
shlq $2, -32(%rsp) # 8-byte Folded Spill
vmovdqu64 %zmm3, 2224(%rsp) # 64-byte Spill
.loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41
vpextrd $3, %xmm10, %ecx
movq %rax, -104(%rsp) # 8-byte Spill
vpmulld %xmm13, %xmm8, %xmm10
.loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22
shlq $2, -104(%rsp) # 8-byte Folded Spill
vpxor %xmm8, %xmm8, %xmm8
.loc 1 208 22 # 03-matrix-multiplication-cpu.py:208:22
movslq %ecx, %rax
.loc 1 208 41 is_stmt 0 # 03-matrix-multiplication-cpu.py:208:41
vpextrd $3, %xmm5, %ecx
vpmulld %xmm13, %xmm7, %xmm5
vpmulld %xmm13, %xmm6, %xmm7
vpxor %xmm6, %xmm6, %xmm6
movq %rax, 96(%rsp) # 8-byte Spill
.loc 1 208 22 # 03-matrix-multiplication-cpu.py:208:22
movslq %ecx, %rax
.loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41
vpextrd $3, %xmm10, %ecx
movq %rax, 32(%rsp) # 8-byte Spill
movq 96(%rsp), %rbx # 8-byte Reload
vpextrd $3, %xmm5, %r8d
vpmulld %xmm13, %xmm4, %xmm5
.loc 1 208 22 # 03-matrix-multiplication-cpu.py:208:22
movslq %ecx, %rax
vbroadcasti128 .LCPI0_12(%rip), %ymm4 # ymm4 = [7,15,6,7,7,15,6,7]
# ymm4 = mem[0,1,0,1]
movq %rax, 224(%rsp) # 8-byte Spill
movslq %r8d, %rax
.loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41
vpextrd $3, %xmm7, %r8d
.loc 1 208 22 # 03-matrix-multiplication-cpu.py:208:22
movslq %r8d, %r14
movq %rax, 416(%rsp) # 8-byte Spill
movq 224(%rsp), %r13 # 8-byte Reload
vmovdqu %ymm0, 784(%rsp) # 32-byte Spill
vpxor %xmm0, %xmm0, %xmm0
.loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41
vpextrd $3, %xmm5, %r8d
vpmulld %xmm13, %xmm1, %xmm5
movq 416(%rsp), %r11 # 8-byte Reload
.loc 1 217 22 is_stmt 1 # 03-matrix-multiplication-cpu.py:217:22
shlq $2, %r14
vpxor %xmm1, %xmm1, %xmm1
vmovdqu64 %zmm0, 224(%rsp) # 64-byte Spill
.loc 1 208 22 # 03-matrix-multiplication-cpu.py:208:22
movslq %r8d, %rax
.loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22
shlq $2, %rbx
movslq 160(%rsp), %r8 # 4-byte Folded Reload
movq %rax, 288(%rsp) # 8-byte Spill
.loc 1 208 22 # 03-matrix-multiplication-cpu.py:208:22
movslq %ebp, %rax
.loc 1 208 41 is_stmt 0 # 03-matrix-multiplication-cpu.py:208:41
vpextrd $3, %xmm2, %ebp
vbroadcasti32x4 .LCPI0_21(%rip), %zmm2 # zmm2 = [0,8,0,8,0,8,0,8]
# zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
movq %rbx, %r15
movl %r10d, %ebx
movq %rax, 624(%rsp) # 8-byte Spill
.loc 1 208 22 # 03-matrix-multiplication-cpu.py:208:22
movslq %ebp, %rax
.loc 1 208 41 # 03-matrix-multiplication-cpu.py:208:41
vpextrd $3, %xmm5, %ebp
.loc 1 208 22 # 03-matrix-multiplication-cpu.py:208:22
movslq %ebp, %rcx
movq 624(%rsp), %rbp # 8-byte Reload
.loc 1 217 22 is_stmt 1 # 03-matrix-multiplication-cpu.py:217:22
shlq $2, %r13
shlq $2, %rax
movq %rcx, 688(%rsp) # 8-byte Spill
movq 32(%rsp), %rcx # 8-byte Reload
shlq $2, %r11
vmovdqu %ymm4, 752(%rsp) # 32-byte Spill
vmovdqu64 %zmm0, 32(%rsp) # 64-byte Spill
movq 688(%rsp), %r10 # 8-byte Reload
shlq $2, %r8
movq %r8, 504(%rsp) # 8-byte Spill
movq %rax, %r8
shlq $2, %rbp
vmovdqu64 %zmm2, 2288(%rsp) # 64-byte Spill
vbroadcasti32x4 .LCPI0_25(%rip), %zmm2 # zmm2 = [4,5,4,20,4,5,4,20,4,5,4,20,4,5,4,20]
# zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
shlq $2, %rcx
movq %rcx, %rdx
movq 288(%rsp), %rcx # 8-byte Reload
shlq $2, %r10
shlq $2, %rcx
vmovdqu64 %zmm2, 2160(%rsp) # 64-byte Spill
.loc 1 0 22 is_stmt 0 # :0:22
.Ltmp11:
.p2align 4, 0x90
.LBB0_3: # =>This Inner Loop Header: Depth=1
vmovdqa64 %zmm29, %zmm26
vpunpckldq %ymm19, %ymm26, %ymm0 # ymm0 = ymm26[0],ymm19[0],ymm26[1],ymm19[1],ymm26[4],ymm19[4],ymm26[5],ymm19[5]
vmovups 2352(%rsp), %zmm7 # 64-byte Reload
vmovdqa64 %ymm26, %ymm2
vmovdqa64 %zmm9, %zmm27
vmovapd %ymm30, %ymm23
vmovdqa64 %zmm6, %zmm17
vmovdqa64 %zmm21, %zmm24
vmovdqa64 %zmm15, %zmm31
vpunpckhdq %ymm31, %ymm24, %ymm15 # ymm15 = ymm24[2],ymm31[2],ymm24[3],ymm31[3],ymm24[6],ymm31[6],ymm24[7],ymm31[7]
vpunpckhdq %ymm18, %ymm22, %ymm20 # ymm20 = ymm22[2],ymm18[2],ymm22[3],ymm18[3],ymm22[6],ymm18[6],ymm22[7],ymm18[7]
vpunpckhdq %ymm19, %ymm26, %ymm3 # ymm3 = ymm26[2],ymm19[2],ymm26[3],ymm19[3],ymm26[6],ymm19[6],ymm26[7],ymm19[7]
vmovdqa64 %zmm29, %zmm4
vmovapd %zmm30, %zmm29
vpunpckldq %ymm27, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm27[0],ymm12[1],ymm27[1],ymm12[4],ymm27[4],ymm12[5],ymm27[5]
vmovdqa64 %zmm12, %zmm25
vmovdqa64 %ymm26, %ymm5
vmovdqa64 %zmm12, %zmm11
vmovdqu64 %zmm1, 160(%rsp) # 64-byte Spill
vmovdqa64 %zmm12, %zmm13
.loc 1 221 51 is_stmt 1 # 03-matrix-multiplication-cpu.py:221:51
testl %r9d, %r9d
.loc 1 221 20 is_stmt 0 # 03-matrix-multiplication-cpu.py:221:20
movl $0, %eax
vmovdqu %ymm0, 96(%rsp) # 32-byte Spill
vmovaps .LCPI0_10(%rip), %ymm0 # ymm0 = [3,11,2,3,7,15,6,7]
cmovgl %r12d, %eax
kmovd %eax, %k2
movq -104(%rsp), %rax # 8-byte Reload
vpermt2ps %ymm19, %ymm0, %ymm2
vpunpckhdq %zmm9, %zmm12, %zmm0 # zmm0 = zmm12[2],zmm9[2],zmm12[3],zmm9[3],zmm12[6],zmm9[6],zmm12[7],zmm9[7],zmm12[10],zmm9[10],zmm12[11],zmm9[11],zmm12[14],zmm9[14],zmm12[15],zmm9[15]
vpunpckhdq %ymm27, %ymm12, %ymm9 # ymm9 = ymm12[2],ymm27[2],ymm12[3],ymm27[3],ymm12[6],ymm27[6],ymm12[7],ymm27[7]
vshuff64x2 $85, %zmm15, %zmm15, %zmm15 # zmm15 = zmm15[2,3,2,3,2,3,2,3]
vinserti64x4 $1, %ymm20, %zmm0, %zmm20
vmovups %zmm0, 352(%rsp) # 64-byte Spill
vinsertf64x4 $1, %ymm2, %zmm0, %zmm14
vmovdqa64 %zmm12, %zmm2
vpermt2ps %zmm27, %zmm7, %zmm2
vmovups 752(%rsp), %ymm7 # 32-byte Reload
vmovapd %zmm30, %zmm0
vshuff64x2 $85, %zmm9, %zmm9, %zmm9 # zmm9 = zmm9[2,3,2,3,2,3,2,3]
vmovups 224(%rsp), %zmm30 # 64-byte Reload
vshuff64x2 $85, %zmm10, %zmm10, %zmm10 # zmm10 = zmm10[2,3,2,3,2,3,2,3]
vshufpd $32, %zmm15, %zmm2, %zmm2 # zmm2 = zmm2[0],zmm15[0],zmm2[2],zmm15[2],zmm2[4],zmm15[5],zmm2[6],zmm15[6]
vmovups 2224(%rsp), %zmm15 # 64-byte Reload
vshufpd $128, %zmm20, %zmm14, %zmm2 {%k4} # zmm2 {%k4} = zmm14[0],zmm20[0],zmm14[2],zmm20[2],zmm14[4],zmm20[4],zmm14[6],zmm20[7]
vmovdqa %ymm1, %ymm14
vpermt2ps %ymm6, %ymm7, %ymm23
vmovaps .LCPI0_14(%rip), %ymm6 # ymm6 = [3,11,2,3,7,15,u,u]
vextractf32x4 $1, %ymm23, %xmm7
vpermt2ps %ymm8, %ymm6, %ymm14
vpunpckhdq %ymm16, %ymm28, %ymm6 # ymm6 = ymm28[2],ymm16[2],ymm28[3],ymm16[3],ymm28[6],ymm16[6],ymm28[7],ymm16[7]
vblendps $192, %ymm6, %ymm14, %ymm6 # ymm6 = ymm14[0,1,2,3,4,5],ymm6[6,7]
vmovdqa64 %zmm21, %zmm14
vmovaps %zmm8, %zmm21
vmovaps .LCPI0_24(%rip), %ymm8 # ymm8 = [0,1,2,10,4,5,6,14]
vpermt2ps %zmm31, %zmm15, %zmm14
vmovdqa64 %zmm26, %zmm15
vpunpckldq %xmm27, %xmm12, %xmm26 # xmm26 = xmm12[0],xmm27[0],xmm12[1],xmm27[1]
vshufpd $32, %zmm14, %zmm9, %zmm20 # zmm20 = zmm9[0],zmm14[0],zmm9[2],zmm14[2],zmm9[4],zmm14[5],zmm9[6],zmm14[6]
vmovdqa64 %ymm22, %ymm9
vinsertf64x4 $1, %ymm3, %zmm0, %zmm14
vbroadcastsd .LCPI0_13(%rip), %ymm3 # ymm3 = [7,15,7,15,7,15,7,15]
vpermt2ps %ymm18, %ymm8, %ymm9
vmovaps .LCPI0_15(%rip), %zmm8 # zmm8 = [3,19,2,3,7,23,6,7,11,27,10,11,15,31,14,15]
vinsertf64x4 $1, %ymm9, %zmm0, %zmm9
vshufpd $128, %zmm9, %zmm14, %zmm20 {%k4} # zmm20 {%k4} = zmm14[0],zmm9[0],zmm14[2],zmm9[2],zmm14[4],zmm9[4],zmm14[6],zmm9[7]
vmovups 32(%rsp), %zmm14 # 64-byte Reload
vmovupd %zmm20, 288(%rsp) # 64-byte Spill
vpermt2ps %zmm27, %zmm8, %zmm25
vmovaps %ymm14, %ymm9
vpermt2ps %ymm30, %ymm3, %ymm9
vmovups 2160(%rsp), %zmm3 # 64-byte Reload
vunpckhps %zmm30, %zmm14, %zmm23 # zmm23 = zmm14[2],zmm30[2],zmm14[3],zmm30[3],zmm14[6],zmm30[6],zmm14[7],zmm30[7],zmm14[10],zmm30[10],zmm14[11],zmm30[11],zmm14[14],zmm30[14],zmm14[15],zmm30[15]
vmovups %zmm23, 1584(%rsp) # 64-byte Spill
vblendps $3, %xmm7, %xmm9, %xmm7 # xmm7 = xmm7[0,1],xmm9[2,3]
vmovdqa64 %zmm24, %zmm9
vblendps $15, %ymm7, %ymm6, %ymm6 # ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
vpunpckldq %ymm31, %ymm24, %ymm7 # ymm7 = ymm24[0],ymm31[0],ymm24[1],ymm31[1],ymm24[4],ymm31[4],ymm24[5],ymm31[5]
vinsertf64x4 $0, %ymm6, %zmm2, %zmm2
vpermt2ps %zmm31, %zmm3, %zmm9
vmovaps .LCPI0_26(%rip), %ymm3 # ymm3 = [0,1,0,8,4,5,4,12]
vmovupd %zmm2, 1072(%rsp) # 64-byte Spill
vmovdqa64 %zmm24, %zmm2
vshufpd $32, %zmm9, %zmm10, %zmm20 # zmm20 = zmm10[0],zmm9[0],zmm10[2],zmm9[2],zmm10[4],zmm9[5],zmm10[6],zmm9[6]
vinsertf64x4 $1, 96(%rsp), %zmm0, %zmm10 # 32-byte Folded Reload
vmovdqa64 %ymm22, %ymm9
vpermt2ps %ymm18, %ymm3, %ymm9
vinsertf64x4 $1, %ymm9, %zmm0, %zmm9
vpermt2ps %zmm17, %zmm8, %zmm0
vshufpd $128, %zmm9, %zmm10, %zmm20 {%k4} # zmm20 {%k4} = zmm10[0],zmm9[0],zmm10[2],zmm9[2],zmm10[4],zmm9[4],zmm10[6],zmm9[7]
vpunpckhdq %zmm31, %zmm24, %zmm9 # zmm9 = zmm24[2],zmm31[2],zmm24[3],zmm31[3],zmm24[6],zmm31[6],zmm24[7],zmm31[7],zmm24[10],zmm31[10],zmm24[11],zmm31[11],zmm24[14],zmm31[14],zmm24[15],zmm31[15]
vshufpd $32, %zmm9, %zmm25, %zmm3 # zmm3 = zmm25[0],zmm9[0],zmm25[2],zmm9[2],zmm25[4],zmm9[5],zmm25[6],zmm9[6]
vmovupd %zmm20, 560(%rsp) # 64-byte Spill
vmovdqa64 %zmm15, %zmm20
vmovupd %zmm3, 96(%rsp) # 64-byte Spill
vmovapd .LCPI0_16(%rip), %zmm3 # zmm3 = [2,10,2,10,6,15,6,14]
vshuff64x2 $85, %zmm7, %zmm7, %zmm7 # zmm7 = zmm7[2,3,2,3,2,3,2,3]
vmovups %zmm0, 1776(%rsp) # 64-byte Spill
vpermt2pd %zmm9, %zmm3, %zmm25
vmovapd %zmm3, %zmm10
vmovaps .LCPI0_27(%rip), %ymm3 # ymm3 = [1,9,2,3,5,13,6,7]
vextractf32x4 $3, %zmm0, %xmm9
vmovupd %zmm25, 416(%rsp) # 64-byte Spill
vmovaps %zmm8, %zmm25
vpermt2ps %zmm19, %zmm25, %zmm20
vunpcklps %zmm17, %zmm29, %zmm25 # zmm25 = zmm29[0],zmm17[0],zmm29[1],zmm17[1],zmm29[4],zmm17[4],zmm29[5],zmm17[5],zmm29[8],zmm17[8],zmm29[9],zmm17[9],zmm29[12],zmm17[12],zmm29[13],zmm17[13]
vpermt2ps %ymm19, %ymm3, %ymm5
vmovups 2096(%rsp), %zmm3 # 64-byte Reload
vinsertf64x4 $1, %ymm5, %zmm0, %zmm5
vpermt2ps %zmm27, %zmm3, %zmm11
vshufpd $32, %zmm7, %zmm11, %zmm3 # zmm3 = zmm11[0],zmm7[0],zmm11[2],zmm7[2],zmm11[4],zmm7[5],zmm11[6],zmm7[6]
vpunpckldq %ymm18, %ymm22, %ymm7 # ymm7 = ymm22[0],ymm18[0],ymm22[1],ymm18[1],ymm22[4],ymm18[4],ymm22[5],ymm18[5]
vpunpckldq %xmm19, %xmm15, %xmm11 # xmm11 = xmm15[0],xmm19[0],xmm15[1],xmm19[1]
vinsertf64x4 $1, %ymm7, %zmm0, %zmm7
vmovups 160(%rsp), %zmm0 # 64-byte Reload
vshufpd $128, %zmm7, %zmm5, %zmm3 {%k4} # zmm3 {%k4} = zmm5[0],zmm7[0],zmm5[2],zmm7[2],zmm5[4],zmm7[4],zmm5[6],zmm7[7]
vmovdqa64 %zmm24, %zmm7
vextractf32x4 $3, %zmm23, %xmm5
vmovdqa64 %zmm12, %zmm23
vmovupd %zmm3, 1328(%rsp) # 64-byte Spill
vbroadcastsd .LCPI0_32(%rip), %zmm3 # zmm3 = [3,19,3,19,3,19,3,19,3,19,3,19,3,19,3,19]
vpermt2ps %zmm21, %zmm8, %zmm0
vblendps $3, %xmm9, %xmm5, %xmm8 # xmm8 = xmm9[0,1],xmm5[2,3]
vpermt2ps %zmm27, %zmm3, %zmm13
vpermt2ps %zmm31, %zmm3, %zmm7
vpermt2ps %zmm19, %zmm3, %zmm4
vmovups %zmm0, 160(%rsp) # 64-byte Spill
vextractf64x4 $1, %zmm0, %ymm9
vpunpckhdq %zmm16, %zmm28, %zmm0 # zmm0 = zmm28[2],zmm16[2],zmm28[3],zmm16[3],zmm28[6],zmm16[6],zmm28[7],zmm16[7],zmm28[10],zmm16[10],zmm28[11],zmm16[11],zmm28[14],zmm16[14],zmm28[15],zmm16[15]
vmovdqu64 %zmm0, 3184(%rsp) # 64-byte Spill
vshufpd $32, %zmm7, %zmm13, %zmm5 # zmm5 = zmm13[0],zmm7[0],zmm13[2],zmm7[2],zmm13[4],zmm7[5],zmm13[6],zmm7[6]
vmovdqa64 %zmm22, %zmm7
vpermt2ps %zmm18, %zmm3, %zmm7
vmovaps .LCPI0_17(%rip), %zmm3 # zmm3 = [0,1,2,18,4,5,6,22,8,9,10,26,12,13,14,30]
vshufpd $128, %zmm7, %zmm4, %zmm5 {%k4} # zmm5 {%k4} = zmm4[0],zmm7[0],zmm4[2],zmm7[2],zmm4[4],zmm7[4],zmm4[6],zmm7[7]
vextracti64x4 $1, %zmm0, %ymm4
vmovupd 352(%rsp), %zmm0 # 64-byte Reload
vmovapd %zmm10, %zmm7
vmovupd %zmm5, 688(%rsp) # 64-byte Spill
vblendps $192, %ymm4, %ymm9, %ymm4 # ymm4 = ymm9[0,1,2,3,4,5],ymm4[6,7]
vpunpckldq %zmm27, %zmm12, %zmm9 # zmm9 = zmm12[0],zmm27[0],zmm12[1],zmm27[1],zmm12[4],zmm27[4],zmm12[5],zmm27[5],zmm12[8],zmm27[8],zmm12[9],zmm27[9],zmm12[12],zmm27[12],zmm12[13],zmm27[13]
vpermt2ps %zmm31, %zmm3, %zmm2
vmovaps %zmm3, %zmm6
vshufpd $32, %zmm2, %zmm0, %zmm5 # zmm5 = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[5],zmm0[6],zmm2[6]
vpermt2pd %zmm2, %zmm10, %zmm0
vunpckhps %zmm17, %zmm29, %zmm2 # zmm2 = zmm29[2],zmm17[2],zmm29[3],zmm17[3],zmm29[6],zmm17[6],zmm29[7],zmm17[7],zmm29[10],zmm17[10],zmm29[11],zmm17[11],zmm29[14],zmm17[14],zmm29[15],zmm17[15]
vmovupd %zmm5, 624(%rsp) # 64-byte Spill
vmovaps %zmm14, %zmm5
vpermt2ps %zmm30, %zmm3, %zmm5
vmovupd %zmm2, 3056(%rsp) # 64-byte Spill
vextractf32x4 $3, %zmm2, %xmm2
vmovaps .LCPI0_18(%rip), %zmm3 # zmm3 = [1,17,2,3,5,21,6,7,9,25,10,11,13,29,14,15]
vmovupd %zmm0, 1840(%rsp) # 64-byte Spill
vextractf32x4 $3, %zmm5, %xmm0
vblendps $3, %xmm2, %xmm0, %xmm0 # xmm0 = xmm2[0,1],xmm0[2,3]
vmovups %ymm0, 3120(%rsp) # 32-byte Spill
vpunpckhdq %xmm27, %xmm12, %xmm0 # xmm0 = xmm12[2],xmm27[2],xmm12[3],xmm27[3]
vpermt2ps %zmm27, %zmm3, %zmm23
vmovdqu64 %zmm0, 1136(%rsp) # 64-byte Spill
vinsertps $76, %xmm12, %xmm27, %xmm0 # xmm0 = xmm12[1],xmm27[1],zero,zero
vmovdqa64 %zmm15, %zmm27
vpermt2ps %zmm19, %zmm3, %zmm27
vmovdqa64 %zmm9, %zmm12
vmovups %zmm0, 1200(%rsp) # 64-byte Spill
vblendps $15, %ymm8, %ymm4, %ymm0 # ymm0 = ymm8[0,1,2,3],ymm4[4,5,6,7]
vpunpckhdq %zmm21, %zmm1, %zmm4 # zmm4 = zmm1[2],zmm21[2],zmm1[3],zmm21[3],zmm1[6],zmm21[6],zmm1[7],zmm21[7],zmm1[10],zmm21[10],zmm1[11],zmm21[11],zmm1[14],zmm21[14],zmm1[15],zmm21[15]
vmovaps %zmm6, %zmm8
vmovups %ymm0, 880(%rsp) # 32-byte Spill
vmovdqa64 %zmm28, %zmm0
vpermt2ps %zmm16, %zmm6, %zmm0
vmovdqu64 %zmm4, 2736(%rsp) # 64-byte Spill
vextracti64x4 $1, %zmm4, %ymm4
vextractf64x4 $1, %zmm0, %ymm2
vpblendd $192, %ymm2, %ymm4, %ymm2 # ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
vmovdqu %ymm2, 2800(%rsp) # 32-byte Spill
vpunpckldq %zmm31, %zmm24, %zmm2 # zmm2 = zmm24[0],zmm31[0],zmm24[1],zmm31[1],zmm24[4],zmm31[4],zmm24[5],zmm31[5],zmm24[8],zmm31[8],zmm24[9],zmm31[9],zmm24[12],zmm31[12],zmm24[13],zmm31[13]
vshufpd $32, %zmm2, %zmm23, %zmm4 # zmm4 = zmm23[0],zmm2[0],zmm23[2],zmm2[2],zmm23[4],zmm2[5],zmm23[6],zmm2[6]
vpermt2pd %zmm2, %zmm10, %zmm23
vpunpckhdq %zmm19, %zmm15, %zmm2 # zmm2 = zmm15[2],zmm19[2],zmm15[3],zmm19[3],zmm15[6],zmm19[6],zmm15[7],zmm19[7],zmm15[10],zmm19[10],zmm15[11],zmm19[11],zmm15[14],zmm19[14],zmm15[15],zmm19[15]
vunpcklps %zmm30, %zmm14, %zmm10 # zmm10 = zmm14[0],zmm30[0],zmm14[1],zmm30[1],zmm14[4],zmm30[4],zmm14[5],zmm30[5],zmm14[8],zmm30[8],zmm14[9],zmm30[9],zmm14[12],zmm30[12],zmm14[13],zmm30[13]
vmovdqu64 %zmm2, 2928(%rsp) # 64-byte Spill
vpunpckldq %zmm19, %zmm15, %zmm2 # zmm2 = zmm15[0],zmm19[0],zmm15[1],zmm19[1],zmm15[4],zmm19[4],zmm15[5],zmm19[5],zmm15[8],zmm19[8],zmm15[9],zmm19[9],zmm15[12],zmm19[12],zmm15[13],zmm19[13]
vmovupd %zmm4, 1264(%rsp) # 64-byte Spill
vmovdqu64 %zmm2, 816(%rsp) # 64-byte Spill
vpunpckhdq %xmm19, %xmm15, %xmm2 # xmm2 = xmm15[2],xmm19[2],xmm15[3],xmm19[3]
vmovdqu64 %zmm2, 1968(%rsp) # 64-byte Spill
vinsertps $76, %xmm15, %xmm19, %xmm2 # xmm2 = xmm15[1],xmm19[1],zero,zero
vmovapd %zmm29, %zmm19
vpermt2ps %zmm17, %zmm3, %zmm19
vmovups %zmm2, 1008(%rsp) # 64-byte Spill
vextractf32x4 $3, %zmm10, %xmm2
vextractf32x4 $2, %zmm10, %xmm10
vextractf32x4 $3, %zmm19, %xmm4
vblendps $3, %xmm4, %xmm2, %xmm4 # xmm4 = xmm4[0,1],xmm2[2,3]
vmovdqa64 %zmm1, %zmm2
vpermt2ps %zmm21, %zmm3, %zmm2
vpunpckldq %zmm16, %zmm28, %zmm3 # zmm3 = zmm28[0],zmm16[0],zmm28[1],zmm16[1],zmm28[4],zmm16[4],zmm28[5],zmm16[5],zmm28[8],zmm16[8],zmm28[9],zmm16[9],zmm28[12],zmm16[12],zmm28[13],zmm16[13]
vmovdqu64 %zmm3, 2544(%rsp) # 64-byte Spill
vextracti64x4 $1, %zmm3, %ymm3
vextractf64x4 $1, %zmm2, %ymm15
vblendps $192, %ymm3, %ymm15, %ymm3 # ymm3 = ymm15[0,1,2,3,4,5],ymm3[6,7]
vextractf32x4 $3, %zmm25, %xmm15
vblendps $15, %ymm4, %ymm3, %ymm3 # ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
vpunpckldq %xmm31, %xmm24, %xmm4 # xmm4 = xmm24[0],xmm31[0],xmm24[1],xmm31[1]
vmovdqu64 %zmm4, 2608(%rsp) # 64-byte Spill
vpunpckhdq %xmm31, %xmm24, %xmm4 # xmm4 = xmm24[2],xmm31[2],xmm24[3],xmm31[3]
vmovups %ymm3, 1712(%rsp) # 32-byte Spill
vmovdqa64 %zmm24, %zmm3
vmovdqu64 %zmm4, 1648(%rsp) # 64-byte Spill
vinsertps $76, %xmm24, %xmm31, %xmm4 # xmm4 = xmm24[1],xmm31[1],zero,zero
vmovaps %zmm14, %zmm24
vmovups %zmm4, 944(%rsp) # 64-byte Spill
vmovaps .LCPI0_19(%rip), %zmm4 # zmm4 = [0,1,0,16,4,5,4,20,8,9,8,24,12,13,12,28]
vpermt2ps %zmm31, %zmm4, %zmm3
vpermt2ps %zmm30, %zmm4, %zmm24
vmovaps %zmm14, %zmm31
vpunpcklqdq %ymm28, %ymm16, %ymm14 # ymm14 = ymm16[0],ymm28[0],ymm16[2],ymm28[2]
vmovdqu %ymm14, 2672(%rsp) # 32-byte Spill
vpunpckldq %ymm16, %ymm28, %ymm14 # ymm14 = ymm28[0],ymm16[0],ymm28[1],ymm16[1],ymm28[4],ymm16[4],ymm28[5],ymm16[5]
vmovdqu %ymm14, 2864(%rsp) # 32-byte Spill
vinsertps $179, %xmm16, %xmm28, %xmm14 # xmm14 = zero,zero,xmm28[2],xmm16[2]
vmovaps %xmm14, 2992(%rsp) # 16-byte Spill
vunpckhps %xmm16, %xmm28, %xmm14 # xmm14 = xmm28[2],xmm16[2],xmm28[3],xmm16[3]
vmovaps %xmm14, 1520(%rsp) # 16-byte Spill
vunpcklps %xmm16, %xmm28, %xmm14 # xmm14 = xmm28[0],xmm16[0],xmm28[1],xmm16[1]
vshufpd $32, %zmm3, %zmm9, %zmm6 # zmm6 = zmm9[0],zmm3[0],zmm9[2],zmm3[2],zmm9[4],zmm3[5],zmm9[6],zmm3[6]
vpermt2pd %zmm3, %zmm7, %zmm12
vmovdqa64 %zmm22, %zmm7
vpunpckldq %zmm18, %zmm22, %zmm9 # zmm9 = zmm22[0],zmm18[0],zmm22[1],zmm18[1],zmm22[4],zmm18[4],zmm22[5],zmm18[5],zmm22[8],zmm18[8],zmm22[9],zmm18[9],zmm22[12],zmm18[12],zmm22[13],zmm18[13]
vpunpckhdq %zmm18, %zmm22, %zmm3 # zmm3 = zmm22[2],zmm18[2],zmm22[3],zmm18[3],zmm22[6],zmm18[6],zmm22[7],zmm18[7],zmm22[10],zmm18[10],zmm22[11],zmm18[11],zmm22[14],zmm18[14],zmm22[15],zmm18[15]
vpermt2ps %zmm18, %zmm4, %zmm7
vextractf32x4 $3, %zmm24, %xmm13
vmovaps %xmm14, 1456(%rsp) # 16-byte Spill
vmovupd %zmm6, 352(%rsp) # 64-byte Spill
vpunpckldq %xmm18, %xmm22, %xmm6 # xmm6 = xmm22[0],xmm18[0],xmm22[1],xmm18[1]
vshufpd $128, %zmm9, %zmm27, %zmm23 {%k4} # zmm23 {%k4} = zmm27[0],zmm9[0],zmm27[2],zmm9[2],zmm27[4],zmm9[4],zmm27[6],zmm9[7]
vmovdqu64 %zmm6, 2480(%rsp) # 64-byte Spill
vpunpckhdq %xmm18, %xmm22, %xmm6 # xmm6 = xmm22[2],xmm18[2],xmm22[3],xmm18[3]
vblendps $3, %xmm15, %xmm13, %xmm13 # xmm13 = xmm15[0,1],xmm13[2,3]
vmovdqu64 %zmm6, 2032(%rsp) # 64-byte Spill
vinsertps $76, %xmm22, %xmm18, %xmm6 # xmm6 = xmm22[1],xmm18[1],zero,zero
vpermt2ps %zmm18, %zmm8, %zmm22
vmovddup .LCPI0_194(%rip), %xmm8 # xmm8 = [4,0,4,0]
# xmm8 = mem[0,0]
vunpckhpd %ymm28, %ymm16, %ymm18 # ymm18 = ymm16[1],ymm28[1],ymm16[3],ymm28[3]
vmovups %zmm6, 1904(%rsp) # 64-byte Spill
vmovaps %zmm4, %zmm6
vmovaps %xmm16, %xmm4
vpermt2ps %xmm28, %xmm8, %xmm4
vmovaps %zmm28, %zmm8
vpermt2ps %zmm16, %zmm6, %zmm8
vpunpckldq %zmm21, %zmm1, %zmm28 # zmm28 = zmm1[0],zmm21[0],zmm1[1],zmm21[1],zmm1[4],zmm21[4],zmm1[5],zmm21[5],zmm1[8],zmm21[8],zmm1[9],zmm21[9],zmm1[12],zmm21[12],zmm1[13],zmm21[13]
vmovupd 2288(%rsp), %zmm16 # 64-byte Reload
vextracti64x4 $1, %zmm28, %ymm14
vextractf64x4 $1, %zmm8, %ymm15
vpblendd $192, %ymm15, %ymm14, %ymm14 # ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
vpblendd $15, %ymm13, %ymm14, %ymm6 # ymm6 = ymm13[0,1,2,3],ymm14[4,5,6,7]
vpermt2pd 2608(%rsp), %zmm16, %zmm26 # 64-byte Folded Reload
vpermt2pd 2480(%rsp), %zmm16, %zmm11 # 64-byte Folded Reload
vmovdqu %ymm6, 1424(%rsp) # 32-byte Spill
vmovupd 1584(%rsp), %zmm6 # 64-byte Reload
vextractf32x4 $2, %zmm6, %xmm13
vmovupd 1776(%rsp), %zmm6 # 64-byte Reload
vmovapd %zmm11, %zmm26 {%k4}
vunpckhps %xmm30, %xmm31, %xmm11 # xmm11 = xmm31[2],xmm30[2],xmm31[3],xmm30[3]
vextractf32x4 $2, %zmm6, %xmm14
vmovupd 160(%rsp), %zmm6 # 64-byte Reload
vblendpd $1, %xmm14, %xmm13, %xmm13 # xmm13 = xmm14[0],xmm13[1]
vshuff64x2 $170, %zmm6, %zmm6, %zmm14 # zmm14 = zmm6[4,5,4,5,4,5,4,5]
vmovupd 3184(%rsp), %zmm6 # 64-byte Reload
vshuff64x2 $170, %zmm6, %zmm6, %zmm15 # zmm15 = zmm6[4,5,4,5,4,5,4,5]
vshuff64x2 $170, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[4,5,4,5,4,5,4,5]
vextractf32x4 $2, %zmm24, %xmm6
vmovupd 816(%rsp), %zmm24 # 64-byte Reload
vblendpd $8, %ymm15, %ymm14, %ymm14 # ymm14 = ymm14[0,1,2],ymm15[3]
vblendpd $3, %ymm13, %ymm14, %ymm15 # ymm15 = ymm13[0,1],ymm14[2,3]
vextractf32x4 $2, %zmm5, %xmm13
vmovups 3056(%rsp), %zmm5 # 64-byte Reload
vshufpd $128, %zmm7, %zmm24, %zmm12 {%k4} # zmm12 {%k4} = zmm24[0],zmm7[0],zmm24[2],zmm7[2],zmm24[4],zmm7[4],zmm24[6],zmm7[7]
vextractf32x4 $2, %zmm5, %xmm14
vmovupd 2736(%rsp), %zmm5 # 64-byte Reload
vblendps $3, %xmm14, %xmm13, %xmm13 # xmm13 = xmm14[0,1],xmm13[2,3]
vshuff64x2 $170, %zmm5, %zmm5, %zmm14 # zmm14 = zmm5[4,5,4,5,4,5,4,5]
vunpcklps %xmm17, %xmm29, %xmm5 # xmm5 = xmm29[0],xmm17[0],xmm29[1],xmm17[1]
vblendpd $8, %ymm0, %ymm14, %ymm14 # ymm14 = ymm14[0,1,2],ymm0[3]
vmovupd 2800(%rsp), %ymm0 # 32-byte Reload
vblendpd $3, %ymm13, %ymm14, %ymm13 # ymm13 = ymm13[0,1],ymm14[2,3]
vextractf32x4 $2, %zmm19, %xmm14
vmovupd 416(%rsp), %zmm19 # 64-byte Reload
vblendpd $3, 3120(%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
# ymm0 = mem[0,1],ymm0[2,3]
vblendps $3, %xmm14, %xmm10, %xmm10 # xmm10 = xmm14[0,1],xmm10[2,3]
vmovapd .LCPI0_20(%rip), %zmm14 # zmm14 = [0,8,0,8,4,12,4,13]
vshufpd $128, %zmm3, %zmm20, %zmm19 {%k4} # zmm19 {%k4} = zmm20[0],zmm3[0],zmm20[2],zmm3[2],zmm20[4],zmm3[4],zmm20[6],zmm3[7]
vshuff64x2 $170, %zmm2, %zmm2, %zmm2 # zmm2 = zmm2[4,5,4,5,4,5,4,5]
vpermt2pd %zmm3, %zmm14, %zmm20
vmovupd 2544(%rsp), %zmm3 # 64-byte Reload
vpermt2pd %zmm9, %zmm14, %zmm27
vpermt2pd %zmm7, %zmm14, %zmm24
vmovups 784(%rsp), %ymm7 # 32-byte Reload
vmovaps %ymm31, %ymm9
vshuff64x2 $170, %zmm3, %zmm3, %zmm3 # zmm3 = zmm3[4,5,4,5,4,5,4,5]
vblendpd $8, %ymm3, %ymm2, %ymm2 # ymm2 = ymm2[0,1,2],ymm3[3]
vextractf32x4 $2, %zmm25, %xmm3
vmovupd 2928(%rsp), %zmm25 # 64-byte Reload
vblendpd $3, %ymm10, %ymm2, %ymm2 # ymm2 = ymm10[0,1],ymm2[2,3]
vmovupd 1840(%rsp), %zmm10 # 64-byte Reload
vblendps $3, %xmm3, %xmm6, %xmm3 # xmm3 = xmm3[0,1],xmm6[2,3]
vshuff64x2 $170, %zmm8, %zmm8, %zmm6 # zmm6 = zmm8[4,5,4,5,4,5,4,5]
vshuff64x2 $170, %zmm28, %zmm28, %zmm8 # zmm8 = zmm28[4,5,4,5,4,5,4,5]
vblendpd $8, %ymm6, %ymm8, %ymm6 # ymm6 = ymm8[0,1,2],ymm6[3]
vinsertf128 $1, %xmm4, %ymm0, %ymm8
vblendpd $3, %ymm3, %ymm6, %ymm3 # ymm3 = ymm3[0,1],ymm6[2,3]
vmovlhps %xmm31, %xmm30, %xmm6 # xmm6 = xmm30[0],xmm31[0]
vshufps $36, %xmm6, %xmm5, %xmm5 # xmm5 = xmm5[0,1],xmm6[2,0]
vpunpckldq %xmm21, %xmm1, %xmm6 # xmm6 = xmm1[0],xmm21[0],xmm1[1],xmm21[1]
vinsertf128 $1, %xmm6, %ymm0, %ymm6
vblendpd $8, %ymm8, %ymm6, %ymm6 # ymm6 = ymm6[0,1,2],ymm8[3]
vshufpd $128, %zmm22, %zmm25, %zmm10 {%k4} # zmm10 {%k4} = zmm25[0],zmm22[0],zmm25[2],zmm22[2],zmm25[4],zmm22[4],zmm25[6],zmm22[7]
vpermt2pd %zmm22, %zmm14, %zmm25
vbroadcastsd .LCPI0_30(%rip), %ymm14 # ymm14 = [5,13,5,13,5,13,5,13]
vshufps $51, %xmm29, %xmm17, %xmm8 # xmm8 = xmm17[3,0],xmm29[3,0]
vblendpd $3, %ymm5, %ymm6, %ymm5 # ymm5 = ymm5[0,1],ymm6[2,3]
vunpckhps %ymm30, %ymm31, %ymm6 # ymm6 = ymm31[2],ymm30[2],ymm31[3],ymm30[3],ymm31[6],ymm30[6],ymm31[7],ymm30[7]
vinsertf64x4 $0, %ymm0, %zmm10, %zmm0
vinsertps $179, %xmm30, %xmm31, %xmm10 # xmm10 = zero,zero,xmm31[2],xmm30[2]
vinsertf64x4 $0, %ymm5, %zmm26, %zmm4
vunpckhps %ymm17, %ymm29, %ymm5 # ymm5 = ymm29[2],ymm17[2],ymm29[3],ymm17[3],ymm29[6],ymm17[6],ymm29[7],ymm17[7]
vpermpd $170, %ymm6, %ymm6 # ymm6 = ymm6[2,2,2,2]
vextractf128 $1, %ymm5, %xmm5
vmovupd %zmm4, 416(%rsp) # 64-byte Spill
vblendps $3, %xmm5, %xmm6, %xmm5 # xmm5 = xmm5[0,1],xmm6[2,3]
vpunpckhdq %ymm21, %ymm1, %ymm6 # ymm6 = ymm1[2],ymm21[2],ymm1[3],ymm21[3],ymm1[6],ymm21[6],ymm1[7],ymm21[7]
vshufps $36, %ymm18, %ymm6, %ymm6 # ymm6 = ymm6[0,1],ymm18[2,0],ymm6[4,5],ymm18[6,4]
vblendps $15, %ymm5, %ymm6, %ymm4 # ymm4 = ymm5[0,1,2,3],ymm6[4,5,6,7]
vmovups 288(%rsp), %zmm5 # 64-byte Reload
vunpckhps %xmm17, %xmm29, %xmm6 # xmm6 = xmm29[2],xmm17[2],xmm29[3],xmm17[3]
vpermt2ps %ymm30, %ymm14, %ymm9
vmovaps .LCPI0_31(%rip), %ymm14 # ymm14 = [1,9,2,3,5,13,u,u]
vinsertf64x4 $0, %ymm4, %zmm5, %zmm26
vunpcklps %ymm17, %ymm29, %ymm4 # ymm4 = ymm29[0],ymm17[0],ymm29[1],ymm17[1],ymm29[4],ymm17[4],ymm29[5],ymm17[5]
vunpcklps %ymm30, %ymm31, %ymm5 # ymm5 = ymm31[0],ymm30[0],ymm31[1],ymm30[1],ymm31[4],ymm30[4],ymm31[5],ymm30[5]
vextractf128 $1, %ymm4, %xmm4
vpermpd $170, %ymm5, %ymm5 # ymm5 = ymm5[2,2,2,2]
vblendps $3, %xmm4, %xmm5, %xmm4 # xmm4 = xmm4[0,1],xmm5[2,3]
vpunpckldq %ymm21, %ymm1, %ymm5 # ymm5 = ymm1[0],ymm21[0],ymm1[1],ymm21[1],ymm1[4],ymm21[4],ymm1[5],ymm21[5]
vshufps $36, 2672(%rsp), %ymm5, %ymm5 # 32-byte Folded Reload
# ymm5 = ymm5[0,1],mem[2,0],ymm5[4,5],mem[6,4]
vblendps $15, %ymm4, %ymm5, %ymm4 # ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
vmovups 560(%rsp), %zmm5 # 64-byte Reload
vmovupd %zmm0, 560(%rsp) # 64-byte Spill
vunpcklps %xmm30, %xmm31, %xmm0 # xmm0 = xmm31[0],xmm30[0],xmm31[1],xmm30[1]
vmovups (%rdi,%r10), %zmm30 {%k2} {z}
vinsertf64x4 $0, %ymm4, %zmm5, %zmm4
vmovaps %ymm29, %ymm5
vpermt2ps %ymm17, %ymm7, %ymm5
vinsertf64x4 $0, 1712(%rsp), %zmm23, %zmm7 # 32-byte Folded Reload
.loc 1 224 35 is_stmt 1 # 03-matrix-multiplication-cpu.py:224:35
vbroadcastss %xmm30, %zmm31
vmovups %zmm4, 288(%rsp) # 64-byte Spill
vinsertf64x4 $0, 880(%rsp), %zmm19, %zmm4 # 32-byte Folded Reload
.loc 1 221 20 # 03-matrix-multiplication-cpu.py:221:20
vmovups (%rdi,%rax), %zmm19 {%k2} {z}
movq -32(%rsp), %rax # 8-byte Reload
vextractf128 $1, %ymm5, %xmm5
vblendps $3, %xmm5, %xmm9, %xmm5 # xmm5 = xmm5[0,1],xmm9[2,3]
vmovdqa %ymm1, %ymm9
vpermt2ps %ymm21, %ymm14, %ymm9
vblendps $192, 2864(%rsp), %ymm9, %ymm9 # 32-byte Folded Reload
# ymm9 = ymm9[0,1,2,3,4,5],mem[6,7]
vmovupd %zmm7, 32(%rsp) # 64-byte Spill
vmovups 1328(%rsp), %zmm7 # 64-byte Reload
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vbroadcastss %xmm19, %zmm23
vmovupd %zmm4, 160(%rsp) # 64-byte Spill
vinsertps $76, %xmm29, %xmm17, %xmm4 # xmm4 = xmm29[1],xmm17[1],zero,zero
.loc 1 221 20 # 03-matrix-multiplication-cpu.py:221:20
vmovups (%rdi,%rax), %zmm29 {%k2} {z}
vblendps $15, %ymm5, %ymm9, %ymm5 # ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7]
vinsertf128 $1, 2992(%rsp), %ymm0, %ymm9 # 16-byte Folded Reload
vinsertf64x4 $0, %ymm5, %zmm7, %zmm28
vmovupd 1136(%rsp), %zmm5 # 64-byte Reload
vmovupd 1968(%rsp), %zmm7 # 64-byte Reload
vpermt2pd 1648(%rsp), %zmm16, %zmm5 # 64-byte Folded Reload
vpermt2pd 2032(%rsp), %zmm16, %zmm7 # 64-byte Folded Reload
vmovapd %zmm7, %zmm5 {%k4}
vmovapd %zmm5, %zmm7
vblendps $3, %xmm6, %xmm10, %xmm5 # xmm5 = xmm6[0,1],xmm10[2,3]
vinsertf64x4 $0, 1424(%rsp), %zmm12, %zmm6 # 32-byte Folded Reload
vmovups (%rdi,%rbp), %zmm10 {%k2} {z}
vmovupd %zmm6, 224(%rsp) # 64-byte Spill
vpunpckhdq %xmm21, %xmm1, %xmm6 # xmm6 = xmm1[2],xmm21[2],xmm1[3],xmm21[3]
vinsertf128 $1, %xmm6, %ymm0, %ymm6
vblendps $192, %ymm9, %ymm6, %ymm6 # ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7]
vmovupd 96(%rsp), %zmm9 # 64-byte Reload
vblendps $15, %ymm5, %ymm6, %ymm5 # ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
vmovdqa %xmm1, %xmm6
vinsertf64x4 $0, %ymm5, %zmm7, %zmm17
vinsertps $76, %xmm1, %xmm21, %xmm7 # xmm7 = xmm1[1],xmm21[1],zero,zero
vmovsd .LCPI0_195(%rip), %xmm1 # xmm1 = [3,7,0,0]
vshufps $226, %xmm11, %xmm8, %xmm5 # xmm5 = xmm8[2,0],xmm11[2,3]
vinsertf128 $1, 1520(%rsp), %ymm0, %ymm8 # 16-byte Folded Reload
vmovapd %zmm20, %zmm9 {%k4}
vinsertf64x4 $0, %ymm15, %zmm9, %zmm22
vmovups (%rdi,%r13), %zmm15 {%k2} {z}
vmovups (%rdi,%r11), %zmm9 {%k2} {z}
vpermt2ps %xmm21, %xmm1, %xmm6
vmovups 688(%rsp), %zmm1 # 64-byte Reload
vinsertf128 $1, %xmm6, %ymm0, %ymm6
vblendps $3, %xmm4, %xmm0, %xmm0 # xmm0 = xmm4[0,1],xmm0[2,3]
vinsertf128 $1, %xmm7, %ymm0, %ymm4
vblendps $192, %ymm8, %ymm6, %ymm6 # ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7]
vmovups (%rdi,%rdx), %zmm8 {%k2} {z}
vblendps $15, %ymm5, %ymm6, %ymm5 # ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
vmovupd 1008(%rsp), %zmm6 # 64-byte Reload
vmovups %zmm15, 96(%rsp) # 64-byte Spill
vpermt2pd 1904(%rsp), %zmm16, %zmm6 # 64-byte Folded Reload
vinsertf64x4 $0, %ymm5, %zmm1, %zmm12
vmovupd 1200(%rsp), %zmm5 # 64-byte Reload
vmovupd 624(%rsp), %zmm1 # 64-byte Reload
vpermt2pd 944(%rsp), %zmm16, %zmm5 # 64-byte Folded Reload
vmovups (%rdi,%r15), %zmm16 {%k2} {z}
vmovapd %zmm25, %zmm1 {%k4}
vinsertf64x4 $0, %ymm13, %zmm1, %zmm14
vmovups (%rdi,%r14), %zmm1 {%k2} {z}
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vbroadcastss %xmm16, %zmm25
vmovapd %zmm6, %zmm5 {%k4}
vmovapd %zmm5, %zmm6
vinsertf128 $1, 1456(%rsp), %ymm0, %ymm5 # 16-byte Folded Reload
vblendps $192, %ymm5, %ymm4, %ymm4 # ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
vblendps $15, %ymm0, %ymm4, %ymm0 # ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
.loc 1 221 20 # 03-matrix-multiplication-cpu.py:221:20
vmovups (%rdi,%rcx), %zmm4 {%k2} {z}
vinsertf64x4 $0, %ymm0, %zmm6, %zmm21
vmovupd 1264(%rsp), %zmm0 # 64-byte Reload
vmovups (%rdi,%r8), %zmm6 {%k2} {z}
vmovups %zmm1, 1264(%rsp) # 64-byte Spill
vmovaps %zmm4, %zmm7
vmovapd %zmm27, %zmm0 {%k4}
vinsertf64x4 $0, %ymm2, %zmm0, %zmm18
.loc 1 222 51 # 03-matrix-multiplication-cpu.py:222:51
vpbroadcastd %r9d, %zmm0
vmovaps %zmm6, %zmm27
.loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22
addl $-16, %r9d
vmovdqu64 %zmm0, 1200(%rsp) # 64-byte Spill
.loc 1 222 51 # 03-matrix-multiplication-cpu.py:222:51
vpcmpgtd .LCPI0_193(%rip), %zmm0, %k1
vmovupd 352(%rsp), %zmm13 # 64-byte Reload
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vunpcklps %xmm27, %xmm30, %xmm0 # xmm0 = xmm30[0],xmm27[0],xmm30[1],xmm27[1]
vmovups %zmm4, 1904(%rsp) # 64-byte Spill
vmovaps %zmm8, %zmm4
movq -96(%rsp), %rax # 8-byte Reload
vmovups %zmm16, 1648(%rsp) # 64-byte Spill
vmovups %zmm10, 1968(%rsp) # 64-byte Spill
vmovups %zmm6, 1776(%rsp) # 64-byte Spill
vmovups %zmm30, 1840(%rsp) # 64-byte Spill
vmovups %zmm4, 624(%rsp) # 64-byte Spill
vmovups %zmm9, 688(%rsp) # 64-byte Spill
vmovups %zmm29, 1712(%rsp) # 64-byte Spill
vmovlhps %xmm10, %xmm0, %xmm0 # xmm0 = xmm0[0],xmm10[0]
vinsertps $48, %xmm7, %xmm0, %xmm0 # xmm0 = xmm0[0,1,2],xmm7[0]
vinsertf128 $1, %xmm15, %ymm0, %ymm2
vinsertf128 $1, %xmm1, %ymm0, %ymm0
vmovapd %zmm24, %zmm13 {%k4}
vinsertf64x4 $0, %ymm3, %zmm13, %zmm5
vbroadcastss %xmm9, %ymm3
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vmovups (%rsi,%rax), %zmm11 {%k1} {z}
movq 512(%rsp), %rax # 8-byte Reload
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd213ps 416(%rsp), %zmm11, %zmm31 # 64-byte Folded Reload
# zmm31 = (zmm11 * zmm31) + mem
vblendps $32, %ymm3, %ymm0, %ymm0 # ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7]
vshufpd $2, %ymm2, %ymm0, %ymm2 # ymm2 = ymm0[0],ymm2[1],ymm0[2],ymm2[2]
vbroadcastss %xmm4, %ymm0
vblendps $128, %ymm0, %ymm2, %ymm0 # ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
.loc 1 221 20 # 03-matrix-multiplication-cpu.py:221:20
vmovups (%rdi,%rax), %zmm3 {%k2} {z}
movq 520(%rsp), %rax # 8-byte Reload
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vinsertf64x4 $1, %ymm16, %zmm0, %zmm13
vmovaps .LCPI0_109(%rip), %zmm0 # zmm0 = [0,1,2,3,4,5,6,7,8,16,u,u,u,u,u,u]
vfmadd213ps %zmm5, %zmm11, %zmm25 # zmm25 = (zmm11 * zmm25) + zmm5
vbroadcastss %xmm4, %zmm5
vfmadd213ps %zmm18, %zmm11, %zmm23 # zmm23 = (zmm11 * zmm23) + zmm18
vfmadd213ps 1072(%rsp), %zmm11, %zmm5 # 64-byte Folded Reload
# zmm5 = (zmm11 * zmm5) + mem
vmovups %zmm3, 1136(%rsp) # 64-byte Spill
vpermt2ps %zmm19, %zmm0, %zmm13
vmovshdup %xmm13, %xmm0 # xmm0 = xmm13[1,1,3,3]
vbroadcastsd %xmm0, %zmm0
vfmadd213ps %zmm21, %zmm11, %zmm0 # zmm0 = (zmm11 * zmm0) + zmm21
vmovups %zmm0, 352(%rsp) # 64-byte Spill
vshufps $255, %xmm13, %xmm13, %xmm0 # xmm0 = xmm13[3,3,3,3]
vbroadcastsd %xmm0, %zmm20
vshufps $170, %xmm13, %xmm13, %xmm0 # xmm0 = xmm13[2,2,2,2]
vfmadd213ps %zmm12, %zmm11, %zmm20 # zmm20 = (zmm11 * zmm20) + zmm12
vbroadcastsd %xmm0, %zmm12
vshufps $170, %ymm2, %ymm2, %ymm0 # ymm0 = ymm2[2,2,2,2,6,6,6,6]
vshuff64x2 $85, %zmm0, %zmm0, %zmm15 # zmm15 = zmm0[2,3,2,3,2,3,2,3]
vshufps $85, %ymm13, %ymm13, %ymm0 # ymm0 = ymm13[1,1,1,1,5,5,5,5]
vshuff64x2 $85, %zmm0, %zmm0, %zmm16 # zmm16 = zmm0[2,3,2,3,2,3,2,3]
vextractf128 $1, %ymm13, %xmm0
vfmadd213ps %zmm17, %zmm11, %zmm12 # zmm12 = (zmm11 * zmm12) + zmm17
vmovaps %zmm30, %zmm17
vbroadcastss %xmm3, %zmm13
vmovaps %zmm3, %zmm30
vfmadd213ps %zmm26, %zmm11, %zmm15 # zmm15 = (zmm11 * zmm15) + zmm26
vbroadcastss %xmm29, %zmm26
vfmadd213ps %zmm28, %zmm11, %zmm16 # zmm16 = (zmm11 * zmm16) + zmm28
.loc 1 221 20 # 03-matrix-multiplication-cpu.py:221:20
vmovups (%rdi,%rax), %zmm28 {%k2} {z}
movq -80(%rsp), %rax # 8-byte Reload
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vbroadcastss %xmm0, %zmm8
vinsertps $76, %xmm17, %xmm27, %xmm0 # xmm0 = xmm17[1],xmm27[1],zero,zero
vfmadd213ps 288(%rsp), %zmm11, %zmm8 # 64-byte Folded Reload
# zmm8 = (zmm11 * zmm8) + mem
vfmadd213ps %zmm22, %zmm11, %zmm13 # zmm13 = (zmm11 * zmm13) + zmm22
vshufps $212, %xmm10, %xmm0, %xmm0 # xmm0 = xmm0[0,1],xmm10[1,3]
vmovups 1264(%rsp), %zmm10 # 64-byte Reload
vfmadd213ps %zmm14, %zmm11, %zmm26 # zmm26 = (zmm11 * zmm26) + zmm14
vinsertps $112, %xmm7, %xmm0, %xmm0 # xmm0 = xmm0[0,1,2],xmm7[1]
vmovups 96(%rsp), %zmm7 # 64-byte Reload
.loc 1 221 20 # 03-matrix-multiplication-cpu.py:221:20
vmovups (%rdi,%rax), %zmm24 {%k2} {z}
movq -88(%rsp), %rax # 8-byte Reload
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vbroadcastss %xmm28, %zmm1
vfmadd213ps 224(%rsp), %zmm11, %zmm1 # 64-byte Folded Reload
# zmm1 = (zmm11 * zmm1) + mem
vmovups %zmm28, 1328(%rsp) # 64-byte Spill
vmovshdup %xmm10, %xmm2 # xmm2 = xmm10[1,1,3,3]
.loc 1 221 20 # 03-matrix-multiplication-cpu.py:221:20
vmovups (%rdi,%rax), %zmm21 {%k2} {z}
movq -72(%rsp), %rax # 8-byte Reload
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vinsertf128 $1, %xmm2, %ymm0, %ymm2
vblendps $240, %ymm2, %ymm0, %ymm2 # ymm2 = ymm0[0,1,2,3],ymm2[4,5,6,7]
vinsertf128 $1, %xmm9, %ymm0, %ymm0
vbroadcastss %xmm24, %zmm14
vfmadd213ps 32(%rsp), %zmm11, %zmm14 # 64-byte Folded Reload
# zmm14 = (zmm11 * zmm14) + mem
vmovups %zmm24, 288(%rsp) # 64-byte Spill
vblendps $34, %ymm0, %ymm2, %ymm0 # ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7]
vshufps $85, %xmm7, %xmm7, %xmm2 # xmm2 = xmm7[1,1,1,1]
.loc 1 221 20 # 03-matrix-multiplication-cpu.py:221:20
vmovups (%rdi,%rax), %zmm6 {%k2} {z}
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vinsertf128 $1, %xmm2, %ymm0, %ymm2
movq 528(%rsp), %rax # 8-byte Reload
.loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22
addq $64, %rdi
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vbroadcastss %xmm21, %zmm18
vfmadd213ps 560(%rsp), %zmm11, %zmm18 # 64-byte Folded Reload
# zmm18 = (zmm11 * zmm18) + mem
vmovups %zmm21, 416(%rsp) # 64-byte Spill
vbroadcastss %xmm6, %zmm3
vfmadd213ps 160(%rsp), %zmm11, %zmm3 # 64-byte Folded Reload
# zmm3 = (zmm11 * zmm3) + mem
vblendps $192, %ymm2, %ymm0, %ymm11 # ymm11 = ymm0[0,1,2,3,4,5],ymm2[6,7]
vmovdqu64 1200(%rsp), %zmm0 # 64-byte Reload
vmovups %zmm6, 560(%rsp) # 64-byte Spill
.loc 1 222 51 # 03-matrix-multiplication-cpu.py:222:51
vpcmpgtd .LCPI0_108(%rip){1to16}, %zmm0, %k1
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vbroadcastsd %xmm4, %ymm0
vmovups 1648(%rsp), %zmm4 # 64-byte Reload
vblendps $128, %ymm0, %ymm11, %ymm0 # ymm0 = ymm11[0,1,2,3,4,5,6],ymm0[7]
vshufps $255, %ymm0, %ymm0, %ymm2 # ymm2 = ymm0[3,3,3,3,7,7,7,7]
vshuff64x2 $85, %zmm2, %zmm2, %zmm27 # zmm27 = zmm2[2,3,2,3,2,3,2,3]
vmovaps %zmm19, %zmm2
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vmovups (%rsi,%rax), %zmm22 {%k1} {z}
movq -64(%rsp), %rax # 8-byte Reload
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd213ps %zmm5, %zmm22, %zmm27 # zmm27 = (zmm22 * zmm27) + zmm5
vshufps $170, %ymm11, %ymm11, %ymm5 # ymm5 = ymm11[2,2,2,2,6,6,6,6]
vmovaps .LCPI0_110(%rip), %zmm11 # zmm11 = [0,1,2,3,4,5,6,7,17,u,u,u,u,u,u,u]
vshuff64x2 $85, %zmm5, %zmm5, %zmm5 # zmm5 = zmm5[2,3,2,3,2,3,2,3]
vfmadd213ps %zmm15, %zmm22, %zmm5 # zmm5 = (zmm22 * zmm5) + zmm15
vmovaps .LCPI0_112(%rip), %zmm15 # zmm15 = [0,1,2,3,4,5,6,7,8,9,17,u,u,u,u,u]
vpermt2ps %zmm4, %zmm11, %zmm0
vmovaps .LCPI0_111(%rip), %zmm11 # zmm11 = [0,1,2,3,4,5,6,7,8,17,u,u,u,u,u,u]
vpermt2ps %zmm19, %zmm11, %zmm0
vmovaps %zmm0, %zmm11
vpermt2ps %zmm29, %zmm15, %zmm11
vextractf128 $1, %ymm11, %xmm11
vbroadcastss %xmm11, %zmm11
vfmadd213ps %zmm8, %zmm22, %zmm11 # zmm11 = (zmm22 * zmm11) + zmm8
vshufps $85, %ymm0, %ymm0, %ymm8 # ymm8 = ymm0[1,1,1,1,5,5,5,5]
vshuff64x2 $85, %zmm8, %zmm8, %zmm15 # zmm15 = zmm8[2,3,2,3,2,3,2,3]
vshufps $170, %xmm0, %xmm0, %xmm8 # xmm8 = xmm0[2,2,2,2]
vbroadcastsd %xmm8, %zmm17
vshufps $255, %xmm0, %xmm0, %xmm8 # xmm8 = xmm0[3,3,3,3]
vmovshdup %xmm0, %xmm0 # xmm0 = xmm0[1,1,3,3]
vbroadcastsd %xmm0, %zmm9
vmovshdup %xmm6, %xmm0 # xmm0 = xmm6[1,1,3,3]
vbroadcastsd %xmm8, %zmm19
vfmadd213ps %zmm16, %zmm22, %zmm15 # zmm15 = (zmm22 * zmm15) + zmm16
vmovaps %zmm2, %zmm6
vfmadd213ps 352(%rsp), %zmm22, %zmm9 # 64-byte Folded Reload
# zmm9 = (zmm22 * zmm9) + mem
vmovups %zmm6, 1584(%rsp) # 64-byte Spill
vbroadcastsd %xmm0, %zmm0
vfmadd213ps %zmm12, %zmm22, %zmm17 # zmm17 = (zmm22 * zmm17) + zmm12
vfmadd213ps %zmm20, %zmm22, %zmm19 # zmm19 = (zmm22 * zmm19) + zmm20
vfmadd213ps %zmm3, %zmm22, %zmm0 # zmm0 = (zmm22 * zmm0) + zmm3
vmovshdup %xmm21, %xmm3 # xmm3 = xmm21[1,1,3,3]
vbroadcastsd %xmm3, %zmm8
vmovshdup %xmm24, %xmm3 # xmm3 = xmm24[1,1,3,3]
vmovups 1776(%rsp), %zmm24 # 64-byte Reload
vbroadcastsd %xmm3, %zmm12
vmovshdup %xmm28, %xmm3 # xmm3 = xmm28[1,1,3,3]
vmovaps %zmm4, %zmm28
vpermpd $85, %ymm10, %ymm4 # ymm4 = ymm10[1,1,1,1]
vfmadd213ps %zmm18, %zmm22, %zmm8 # zmm8 = (zmm22 * zmm8) + zmm18
vfmadd213ps %zmm14, %zmm22, %zmm12 # zmm12 = (zmm22 * zmm12) + zmm14
vbroadcastsd %xmm3, %zmm14
vmovups 1840(%rsp), %zmm3 # 64-byte Reload
vfmadd213ps %zmm1, %zmm22, %zmm14 # zmm14 = (zmm22 * zmm14) + zmm1
vmovshdup %xmm30, %xmm1 # xmm1 = xmm30[1,1,3,3]
vmovaps %zmm29, %zmm30
vbroadcastsd %xmm1, %zmm16
vmovshdup %xmm29, %xmm1 # xmm1 = xmm29[1,1,3,3]
vmovaps %zmm28, %zmm29
vbroadcastsd %xmm1, %zmm18
vmovshdup %xmm2, %xmm1 # xmm1 = xmm2[1,1,3,3]
vmovups 1904(%rsp), %zmm2 # 64-byte Reload
vfmadd213ps %zmm13, %zmm22, %zmm16 # zmm16 = (zmm22 * zmm16) + zmm13
vbroadcastsd %xmm1, %zmm13
vmovshdup %xmm28, %xmm1 # xmm1 = xmm28[1,1,3,3]
vbroadcastsd %xmm1, %zmm20
vfmadd213ps %zmm26, %zmm22, %zmm18 # zmm18 = (zmm22 * zmm18) + zmm26
vmovdqu64 1200(%rsp), %zmm26 # 64-byte Reload
vfmadd213ps %zmm23, %zmm22, %zmm13 # zmm13 = (zmm22 * zmm13) + zmm23
.loc 1 222 51 # 03-matrix-multiplication-cpu.py:222:51
vpcmpgtd .LCPI0_9(%rip){1to16}, %zmm26, %k1
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vmovshdup %xmm3, %xmm1 # xmm1 = xmm3[1,1,3,3]
vfmadd213ps %zmm25, %zmm22, %zmm20 # zmm20 = (zmm22 * zmm20) + zmm25
vmovaps %zmm3, %zmm25
vbroadcastss %xmm1, %zmm21
vunpckhps %xmm24, %xmm3, %xmm1 # xmm1 = xmm3[2],xmm24[2],xmm3[3],xmm24[3]
vblendps $12, 1968(%rsp), %xmm1, %xmm1 # 16-byte Folded Reload
# xmm1 = xmm1[0,1],mem[2,3]
vpermilps $170, 688(%rsp), %xmm3 # 16-byte Folded Reload
# xmm3 = mem[2,2,2,2]
vfmadd213ps %zmm31, %zmm22, %zmm21 # zmm21 = (zmm22 * zmm21) + zmm31
vmovaps %zmm6, %zmm31
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vmovups (%rsi,%rax), %zmm22 {%k1} {z}
.loc 1 222 51 is_stmt 0 # 03-matrix-multiplication-cpu.py:222:51
vpcmpgtd .LCPI0_107(%rip){1to16}, %zmm26, %k1
movq 536(%rsp), %rax # 8-byte Reload
.loc 1 224 35 is_stmt 1 # 03-matrix-multiplication-cpu.py:224:35
vinsertps $176, %xmm2, %xmm1, %xmm1 # xmm1 = xmm1[0,1,2],xmm2[2]
vinsertf128 $1, %xmm3, %ymm0, %ymm3
vblendps $240, %ymm4, %ymm1, %ymm4 # ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7]
vblendps $32, %ymm3, %ymm4, %ymm3 # ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
vinsertf128 $1, %xmm7, %ymm1, %ymm4
vblendps $204, %ymm4, %ymm3, %ymm1 # ymm1 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7]
vmovsldup 624(%rsp), %xmm3 # 16-byte Folded Reload
# xmm3 = mem[0,0,2,2]
vinsertf128 $1, %xmm3, %ymm0, %ymm3
vblendps $128, %ymm3, %ymm1, %ymm1 # ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
vmovapd .LCPI0_113(%rip), %zmm3 # zmm3 = [0,1,2,3,9,u,u,u]
vmovaps %zmm1, %zmm23
vshufps $255, %ymm1, %ymm1, %ymm1 # ymm1 = ymm1[3,3,3,3,7,7,7,7]
vpermt2pd %zmm28, %zmm3, %zmm23
vmovaps .LCPI0_114(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,18,u,u,u,u,u,u]
vpermt2ps %zmm6, %zmm3, %zmm23
vmovshdup %xmm23, %xmm3 # xmm3 = xmm23[1,1,3,3]
vshufps $170, %xmm23, %xmm23, %xmm7 # xmm7 = xmm23[2,2,2,2]
vshufps $255, %xmm23, %xmm23, %xmm6 # xmm6 = xmm23[3,3,3,3]
vbroadcastsd %xmm3, %zmm28
vbroadcastsd %xmm7, %zmm7
vbroadcastsd %xmm6, %zmm6
vfmadd213ps %zmm9, %zmm22, %zmm28 # zmm28 = (zmm22 * zmm28) + zmm9
vshufps $85, %ymm23, %ymm23, %ymm9 # ymm9 = ymm23[1,1,1,1,5,5,5,5]
vfmadd213ps %zmm17, %zmm22, %zmm7 # zmm7 = (zmm22 * zmm7) + zmm17
vshuff64x2 $85, %zmm1, %zmm1, %zmm17 # zmm17 = zmm1[2,3,2,3,2,3,2,3]
vshufps $170, %ymm4, %ymm4, %ymm1 # ymm1 = ymm4[2,2,2,2,6,6,6,6]
vfmadd213ps %zmm19, %zmm22, %zmm6 # zmm6 = (zmm22 * zmm6) + zmm19
vmovups 1968(%rsp), %zmm4 # 64-byte Reload
vshuff64x2 $85, %zmm9, %zmm9, %zmm10 # zmm10 = zmm9[2,3,2,3,2,3,2,3]
vextractf32x4 $1, %ymm23, %xmm9
vshuff64x2 $85, %zmm1, %zmm1, %zmm19 # zmm19 = zmm1[2,3,2,3,2,3,2,3]
vfmadd213ps %zmm27, %zmm22, %zmm17 # zmm17 = (zmm22 * zmm17) + zmm27
vmovups 1328(%rsp), %zmm27 # 64-byte Reload
vbroadcastss %xmm9, %zmm3
vfmadd213ps %zmm15, %zmm22, %zmm10 # zmm10 = (zmm22 * zmm10) + zmm15
vfmadd213ps %zmm5, %zmm22, %zmm19 # zmm19 = (zmm22 * zmm19) + zmm5
vmovups 1264(%rsp), %zmm5 # 64-byte Reload
vfmadd213ps %zmm11, %zmm22, %zmm3 # zmm3 = (zmm22 * zmm3) + zmm11
vmovups %zmm3, 32(%rsp) # 64-byte Spill
vmovaps %zmm25, %zmm3
vshufpd $1, %xmm3, %xmm3, %xmm1 # xmm1 = xmm3[1,0]
vmovups 560(%rsp), %zmm25 # 64-byte Reload
vbroadcastss %xmm1, %zmm23
vshufps $170, %xmm29, %xmm29, %xmm1 # xmm1 = xmm29[2,2,2,2]
vbroadcastsd %xmm1, %zmm9
vshufps $170, %xmm31, %xmm31, %xmm1 # xmm1 = xmm31[2,2,2,2]
vmovups 416(%rsp), %zmm31 # 64-byte Reload
vfmadd213ps %zmm21, %zmm22, %zmm23 # zmm23 = (zmm22 * zmm23) + zmm21
vmovaps %zmm29, %zmm21
vmovups 1136(%rsp), %zmm29 # 64-byte Reload
vbroadcastsd %xmm1, %zmm11
vshufps $170, %xmm30, %xmm30, %xmm1 # xmm1 = xmm30[2,2,2,2]
vmovups 288(%rsp), %zmm30 # 64-byte Reload
vfmadd213ps %zmm20, %zmm22, %zmm9 # zmm9 = (zmm22 * zmm9) + zmm20
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vmovups (%rsi,%rax), %zmm20 {%k1} {z}
.loc 1 222 51 is_stmt 0 # 03-matrix-multiplication-cpu.py:222:51
vpcmpgtd .LCPI0_8(%rip){1to16}, %zmm26, %k1
movq -56(%rsp), %rax # 8-byte Reload
.loc 1 224 35 is_stmt 1 # 03-matrix-multiplication-cpu.py:224:35
vfmadd213ps %zmm13, %zmm22, %zmm11 # zmm11 = (zmm22 * zmm11) + zmm13
vbroadcastsd %xmm1, %zmm13
vfmadd213ps %zmm18, %zmm22, %zmm13 # zmm13 = (zmm22 * zmm13) + zmm18
vshufps $170, %xmm29, %xmm29, %xmm1 # xmm1 = xmm29[2,2,2,2]
vbroadcastsd %xmm1, %zmm15
vshufps $170, %xmm27, %xmm27, %xmm1 # xmm1 = xmm27[2,2,2,2]
vfmadd213ps %zmm16, %zmm22, %zmm15 # zmm15 = (zmm22 * zmm15) + zmm16
vbroadcastsd %xmm1, %zmm16
vshufps $170, %xmm30, %xmm30, %xmm1 # xmm1 = xmm30[2,2,2,2]
vfmadd213ps %zmm14, %zmm22, %zmm16 # zmm16 = (zmm22 * zmm16) + zmm14
vbroadcastsd %xmm1, %zmm14
vshufps $170, %xmm31, %xmm31, %xmm1 # xmm1 = xmm31[2,2,2,2]
vfmadd213ps %zmm12, %zmm22, %zmm14 # zmm14 = (zmm22 * zmm14) + zmm12
vbroadcastsd %xmm1, %zmm12
vshufps $170, %xmm25, %xmm25, %xmm1 # xmm1 = xmm25[2,2,2,2]
vfmadd213ps %zmm8, %zmm22, %zmm12 # zmm12 = (zmm22 * zmm12) + zmm8
vbroadcastsd %xmm1, %zmm8
vshufps $255, %xmm5, %xmm5, %xmm1 # xmm1 = xmm5[3,3,3,3]
vfmadd213ps %zmm0, %zmm22, %zmm8 # zmm8 = (zmm22 * zmm8) + zmm0
vshufps $51, %xmm3, %xmm24, %xmm0 # xmm0 = xmm24[3,0],xmm3[3,0]
vmovups 688(%rsp), %zmm3 # 64-byte Reload
vshufps $242, %xmm4, %xmm0, %xmm0 # xmm0 = xmm0[2,0],xmm4[3,3]
vblendps $8, %xmm2, %xmm0, %xmm0 # xmm0 = xmm0[0,1,2],xmm2[3]
vinsertf128 $1, %xmm1, %ymm0, %ymm1
vblendps $240, %ymm1, %ymm0, %ymm1 # ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
vpermpd $85, %ymm3, %ymm2 # ymm2 = ymm3[1,1,1,1]
vblendps $32, %ymm2, %ymm1, %ymm1 # ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7]
vmovshdup 96(%rsp), %xmm2 # 16-byte Folded Reload
# xmm2 = mem[1,1,3,3]
vinsertf128 $1, %xmm2, %ymm0, %ymm2
vblendps $192, %ymm2, %ymm1, %ymm1 # ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
vshufps $170, %ymm1, %ymm1, %ymm2 # ymm2 = ymm1[2,2,2,2,6,6,6,6]
vshuff64x2 $85, %zmm2, %zmm2, %zmm24 # zmm24 = zmm2[2,3,2,3,2,3,2,3]
vmovups 624(%rsp), %zmm2 # 64-byte Reload
vfmadd213ps %zmm19, %zmm20, %zmm24 # zmm24 = (zmm20 * zmm24) + zmm19
vinsertf128 $1, %xmm2, %ymm0, %ymm0
vblendps $136, %ymm0, %ymm1, %ymm1 # ymm1 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7]
vshufps $255, %ymm0, %ymm0, %ymm0 # ymm0 = ymm0[3,3,3,3,7,7,7,7]
vshuff64x2 $85, %zmm0, %zmm0, %zmm18 # zmm18 = zmm0[2,3,2,3,2,3,2,3]
vmovaps .LCPI0_115(%rip), %zmm0 # zmm0 = [0,1,2,3,4,5,6,7,19,u,u,u,u,u,u,u]
vfmadd213ps %zmm17, %zmm20, %zmm18 # zmm18 = (zmm20 * zmm18) + zmm17
vmovups 1584(%rsp), %zmm17 # 64-byte Reload
vpermt2ps %zmm21, %zmm0, %zmm1
vmovaps .LCPI0_116(%rip), %zmm0 # zmm0 = [0,1,2,3,4,5,6,7,8,19,u,u,u,u,u,u]
vpermt2ps %zmm17, %zmm0, %zmm1
vshufps $85, %ymm1, %ymm1, %ymm0 # ymm0 = ymm1[1,1,1,1,5,5,5,5]
vshuff64x2 $85, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[2,3,2,3,2,3,2,3]
vfmadd213ps %zmm10, %zmm20, %zmm0 # zmm0 = (zmm20 * zmm0) + zmm10
vmovups %zmm0, 160(%rsp) # 64-byte Spill
vshufps $170, %xmm1, %xmm1, %xmm0 # xmm0 = xmm1[2,2,2,2]
vbroadcastsd %xmm0, %zmm22
vshufps $255, %xmm1, %xmm1, %xmm0 # xmm0 = xmm1[3,3,3,3]
vfmadd213ps %zmm7, %zmm20, %zmm22 # zmm22 = (zmm20 * zmm22) + zmm7
vbroadcastsd %xmm0, %zmm7
vmovshdup %xmm1, %xmm0 # xmm0 = xmm1[1,1,3,3]
vbroadcastsd %xmm0, %zmm10
vshufps $255, %xmm25, %xmm25, %xmm0 # xmm0 = xmm25[3,3,3,3]
vmovaps %zmm29, %zmm25
vmovups 1840(%rsp), %zmm29 # 64-byte Reload
vbroadcastsd %xmm0, %zmm0
vfmadd213ps %zmm6, %zmm20, %zmm7 # zmm7 = (zmm20 * zmm7) + zmm6
vmovups 1904(%rsp), %zmm6 # 64-byte Reload
vfmadd213ps %zmm28, %zmm20, %zmm10 # zmm10 = (zmm20 * zmm10) + zmm28
vfmadd213ps %zmm8, %zmm20, %zmm0 # zmm0 = (zmm20 * zmm0) + zmm8
vmovups %zmm0, 224(%rsp) # 64-byte Spill
vshufps $255, %xmm31, %xmm31, %xmm0 # xmm0 = xmm31[3,3,3,3]
vmovaps %zmm21, %zmm31
vbroadcastsd %xmm0, %zmm19
vshufps $255, %xmm30, %xmm30, %xmm0 # xmm0 = xmm30[3,3,3,3]
vmovaps %zmm17, %zmm30
vbroadcastsd %xmm0, %zmm8
vshufps $255, %xmm27, %xmm27, %xmm0 # xmm0 = xmm27[3,3,3,3]
vmovups 1712(%rsp), %zmm27 # 64-byte Reload
vfmadd213ps %zmm12, %zmm20, %zmm19 # zmm19 = (zmm20 * zmm19) + zmm12
vbroadcastsd %xmm0, %zmm12
vshufps $255, %xmm25, %xmm25, %xmm0 # xmm0 = xmm25[3,3,3,3]
vfmadd213ps %zmm14, %zmm20, %zmm8 # zmm8 = (zmm20 * zmm8) + zmm14
vbroadcastsd %xmm0, %zmm14
vfmadd213ps %zmm16, %zmm20, %zmm12 # zmm12 = (zmm20 * zmm12) + zmm16
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vmovups (%rsi,%rax), %zmm16 {%k1} {z}
movq (%rsp), %rax # 8-byte Reload
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd213ps %zmm15, %zmm20, %zmm14 # zmm14 = (zmm20 * zmm14) + zmm15
vshufps $255, %xmm27, %xmm27, %xmm0 # xmm0 = xmm27[3,3,3,3]
vbroadcastsd %xmm0, %zmm15
vshufps $255, %xmm17, %xmm17, %xmm0 # xmm0 = xmm17[3,3,3,3]
vfmadd213ps %zmm13, %zmm20, %zmm15 # zmm15 = (zmm20 * zmm15) + zmm13
vbroadcastsd %xmm0, %zmm13
vshufps $255, %xmm21, %xmm21, %xmm0 # xmm0 = xmm21[3,3,3,3]
vfmadd213ps %zmm11, %zmm20, %zmm13 # zmm13 = (zmm20 * zmm13) + zmm11
vbroadcastsd %xmm0, %zmm11
vshufps $255, %xmm29, %xmm29, %xmm0 # xmm0 = xmm29[3,3,3,3]
vbroadcastss %xmm0, %zmm17
vmovaps .LCPI0_117(%rip), %zmm0 # zmm0 = [0,1,2,3,4,5,6,7,8,9,19,u,u,u,u,u]
vfmadd213ps %zmm9, %zmm20, %zmm11 # zmm11 = (zmm20 * zmm11) + zmm9
vfmadd213ps %zmm23, %zmm20, %zmm17 # zmm17 = (zmm20 * zmm17) + zmm23
vpermt2ps %zmm27, %zmm0, %zmm1
vextractf128 $1, %ymm1, %xmm0
vbroadcastss %xmm0, %zmm9
vfmadd213ps 32(%rsp), %zmm20, %zmm9 # 64-byte Folded Reload
# zmm9 = (zmm20 * zmm9) + mem
vmovups 1776(%rsp), %zmm20 # 64-byte Reload
vunpcklps %ymm20, %ymm29, %ymm0 # ymm0 = ymm29[0],ymm20[0],ymm29[1],ymm20[1],ymm29[4],ymm20[4],ymm29[5],ymm20[5]
vextractf128 $1, %ymm0, %xmm1
vextractf128 $1, %ymm4, %xmm0
vextractf128 $1, %ymm6, %xmm4
vmovaps %xmm0, 880(%rsp) # 16-byte Spill
vmovlhps %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[0]
vmovaps %xmm4, 2032(%rsp) # 16-byte Spill
vinsertps $48, %xmm4, %xmm0, %xmm0 # xmm0 = xmm0[0,1,2],xmm4[0]
vextractf128 $1, %ymm3, %xmm4
vmovaps %zmm31, %zmm3
vblendps $240, %ymm5, %ymm0, %ymm0 # ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
vmovaps %zmm30, %zmm5
vbroadcastss %xmm4, %ymm4
vblendps $32, %ymm4, %ymm0, %ymm4 # ymm4 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7]
vmovapd .LCPI0_118(%rip), %ymm0 # ymm0 = [0,1,2,6]
vpermt2pd 96(%rsp), %ymm0, %ymm4 # 32-byte Folded Reload
vextractf128 $1, %ymm2, %xmm0
vbroadcastss %xmm0, %zmm0
vfmadd231ps %zmm0, %zmm16, %zmm18 # zmm18 = (zmm16 * zmm0) + zmm18
vblendps $128, %ymm0, %ymm4, %ymm0 # ymm0 = ymm4[0,1,2,3,4,5,6],ymm0[7]
vshufps $170, %ymm4, %ymm4, %ymm4 # ymm4 = ymm4[2,2,2,2,6,6,6,6]
vshuff64x2 $85, %zmm4, %zmm4, %zmm21 # zmm21 = zmm4[2,3,2,3,2,3,2,3]
vbroadcastss %xmm1, %zmm4
vextractf128 $1, %ymm3, %xmm1
vshuff64x2 $212, %zmm31, %zmm0, %zmm0 # zmm0 = zmm0[0,1,2,3],zmm31[2,3,6,7]
vmovups 288(%rsp), %zmm3 # 64-byte Reload
vmovups 784(%rsp), %ymm31 # 32-byte Reload
vfmadd213ps %zmm24, %zmm16, %zmm21 # zmm21 = (zmm16 * zmm21) + zmm24
vbroadcastss %xmm1, %zmm24
vextractf128 $1, %ymm5, %xmm1
vfmadd213ps %zmm17, %zmm16, %zmm4 # zmm4 = (zmm16 * zmm4) + zmm17
vbroadcastss %xmm1, %zmm23
vextractf32x4 $1, %ymm27, %xmm1
vfmadd213ps %zmm11, %zmm16, %zmm24 # zmm24 = (zmm16 * zmm24) + zmm11
vbroadcastss %xmm1, %zmm2
vmovaps .LCPI0_119(%rip), %zmm1 # zmm1 = [0,1,2,3,4,5,6,7,8,20,u,u,u,u,u,u]
vfmadd213ps %zmm13, %zmm16, %zmm23 # zmm23 = (zmm16 * zmm23) + zmm13
vextractf128 $1, %ymm3, %xmm5
vmovups 416(%rsp), %zmm3 # 64-byte Reload
vfmadd213ps %zmm15, %zmm16, %zmm2 # zmm2 = (zmm16 * zmm2) + zmm15
vbroadcastss %xmm5, %zmm5
vfmadd213ps %zmm8, %zmm16, %zmm5 # zmm5 = (zmm16 * zmm5) + zmm8
vmovupd 2288(%rsp), %zmm8 # 64-byte Reload
vpermt2ps %zmm30, %zmm1, %zmm0
vmovshdup %xmm0, %xmm1 # xmm1 = xmm0[1,1,3,3]
vbroadcastsd %xmm1, %zmm28
vextractf32x4 $1, %ymm25, %xmm1
vmovaps %zmm2, %zmm25
vbroadcastss %xmm1, %zmm26
vshufps $255, %xmm0, %xmm0, %xmm1 # xmm1 = xmm0[3,3,3,3]
vfmadd213ps %zmm10, %zmm16, %zmm28 # zmm28 = (zmm16 * zmm28) + zmm10
vinsertps $76, %xmm24, %xmm23, %xmm10 # xmm10 = xmm24[1],xmm23[1],zero,zero
vbroadcastsd %xmm1, %zmm17
vmovups 1328(%rsp), %zmm1 # 64-byte Reload
vfmadd213ps %zmm14, %zmm16, %zmm26 # zmm26 = (zmm16 * zmm26) + zmm14
vfmadd213ps %zmm7, %zmm16, %zmm17 # zmm17 = (zmm16 * zmm17) + zmm7
vshufps $85, %ymm0, %ymm0, %ymm7 # ymm7 = ymm0[1,1,1,1,5,5,5,5]
vshuff64x2 $85, %zmm7, %zmm7, %zmm27 # zmm27 = zmm7[2,3,2,3,2,3,2,3]
vextractf128 $1, %ymm3, %xmm7
vpermpd $170, %ymm6, %ymm3 # ymm3 = ymm6[2,2,2,2]
vunpcklps %xmm23, %xmm24, %xmm6 # xmm6 = xmm24[0],xmm23[0],xmm24[1],xmm23[1]
vfmadd213ps 160(%rsp), %zmm16, %zmm27 # 64-byte Folded Reload
# zmm27 = (zmm16 * zmm27) + mem
vbroadcastss %xmm7, %zmm13
vfmadd213ps %zmm19, %zmm16, %zmm13 # zmm13 = (zmm16 * zmm13) + zmm19
vunpcklps %zmm23, %zmm24, %zmm19 # zmm19 = zmm24[0],zmm23[0],zmm24[1],zmm23[1],zmm24[4],zmm23[4],zmm24[5],zmm23[5],zmm24[8],zmm23[8],zmm24[9],zmm23[9],zmm24[12],zmm23[12],zmm24[13],zmm23[13]
vextractf128 $1, %ymm1, %xmm1
vbroadcastss %xmm1, %zmm14
vshufps $170, %xmm0, %xmm0, %xmm1 # xmm1 = xmm0[2,2,2,2]
vextractf128 $1, %ymm0, %xmm0
vbroadcastsd %xmm1, %zmm1
vbroadcastss %xmm0, %zmm7
vmovups 560(%rsp), %zmm0 # 64-byte Reload
vfmadd213ps %zmm12, %zmm16, %zmm14 # zmm14 = (zmm16 * zmm14) + zmm12
vmovapd %zmm8, %zmm12
vfmadd213ps %zmm22, %zmm16, %zmm1 # zmm1 = (zmm16 * zmm1) + zmm22
vfmadd213ps %zmm9, %zmm16, %zmm7 # zmm7 = (zmm16 * zmm7) + zmm9
vinsertps $76, %xmm4, %xmm28, %xmm9 # xmm9 = xmm4[1],xmm28[1],zero,zero
vextractf128 $1, %ymm0, %xmm0
vbroadcastss %xmm0, %zmm11
vmovaps %ymm29, %ymm0
vpermt2ps %ymm20, %ymm31, %ymm0
vfmadd213ps 224(%rsp), %zmm16, %zmm11 # 64-byte Folded Reload
# zmm11 = (zmm16 * zmm11) + mem
vmovaps %zmm13, %zmm16
vmovaps .LCPI0_18(%rip), %zmm29 # zmm29 = [1,17,2,3,5,21,6,7,9,25,10,11,13,29,14,15]
vmovaps .LCPI0_15(%rip), %zmm20 # zmm20 = [3,19,2,3,7,23,6,7,11,27,10,11,15,31,14,15]
vextractf128 $1, %ymm0, %xmm0
vshufps $212, 880(%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
# xmm0 = xmm0[0,1],mem[1,3]
vblendps $8, %xmm3, %xmm0, %xmm0 # xmm0 = xmm0[0,1,2],xmm3[3]
vunpcklps %xmm5, %xmm14, %xmm3 # xmm3 = xmm14[0],xmm5[0],xmm14[1],xmm5[1]
vmovups %ymm0, 224(%rsp) # 32-byte Spill
vunpcklps %xmm11, %xmm13, %xmm0 # xmm0 = xmm13[0],xmm11[0],xmm13[1],xmm11[1]
vpermt2pd %zmm0, %zmm8, %zmm3
vunpcklps %xmm26, %xmm2, %xmm0 # xmm0 = xmm2[0],xmm26[0],xmm2[1],xmm26[1]
vpermt2pd %zmm0, %zmm8, %zmm6
vmovaps %xmm18, %xmm0
vunpcklps %xmm28, %xmm4, %xmm8 # xmm8 = xmm4[0],xmm28[0],xmm4[1],xmm28[1]
vmovapd %zmm3, %zmm6 {%k4}
vmovddup .LCPI0_194(%rip), %xmm3 # xmm3 = [4,0,4,0]
# xmm3 = mem[0,0]
vpermt2ps %xmm21, %xmm3, %xmm0
vunpcklps %xmm27, %xmm7, %xmm3 # xmm3 = xmm7[0],xmm27[0],xmm7[1],xmm27[1]
vinsertf128 $1, %xmm0, %ymm0, %ymm0
vinsertf128 $1, %xmm3, %ymm0, %ymm3
vblendps $192, %ymm0, %ymm3, %ymm0 # ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
vmovlhps %xmm1, %xmm17, %xmm3 # xmm3 = xmm17[0],xmm1[0]
vshufps $36, %xmm3, %xmm8, %xmm3 # xmm3 = xmm8[0,1],xmm3[2,0]
vinsertps $76, %xmm14, %xmm5, %xmm8 # xmm8 = xmm14[1],xmm5[1],zero,zero
vblendps $15, %ymm3, %ymm0, %ymm0 # ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
vinsertps $76, %xmm13, %xmm11, %xmm3 # xmm3 = xmm13[1],xmm11[1],zero,zero
vpermt2pd %zmm3, %zmm12, %zmm8
vinsertps $76, %xmm2, %xmm26, %xmm3 # xmm3 = xmm2[1],xmm26[1],zero,zero
vpermt2pd %zmm3, %zmm12, %zmm10
vunpcklps %xmm18, %xmm21, %xmm3 # xmm3 = xmm21[0],xmm18[0],xmm21[1],xmm18[1]
vinsertf128 $1, %xmm3, %ymm0, %ymm3
vmovapd %zmm8, %zmm10 {%k4}
vinsertps $76, %xmm7, %xmm27, %xmm8 # xmm8 = xmm7[1],xmm27[1],zero,zero
vinsertf128 $1, %xmm8, %ymm0, %ymm8
vinsertf64x4 $0, %ymm0, %zmm6, %zmm0
vunpckhps %xmm23, %xmm24, %xmm6 # xmm6 = xmm24[2],xmm23[2],xmm24[3],xmm23[3]
vblendps $192, %ymm3, %ymm8, %ymm3 # ymm3 = ymm8[0,1,2,3,4,5],ymm3[6,7]
vunpcklps %xmm17, %xmm1, %xmm8 # xmm8 = xmm1[0],xmm17[0],xmm1[1],xmm17[1]
vmovupd %zmm0, 1008(%rsp) # 64-byte Spill
vblendps $3, %xmm9, %xmm8, %xmm8 # xmm8 = xmm9[0,1],xmm8[2,3]
vunpckhps %xmm28, %xmm4, %xmm9 # xmm9 = xmm4[2],xmm28[2],xmm4[3],xmm28[3]
vblendps $15, %ymm8, %ymm3, %ymm0 # ymm0 = ymm8[0,1,2,3],ymm3[4,5,6,7]
vunpckhps %xmm11, %xmm13, %xmm3 # xmm3 = xmm13[2],xmm11[2],xmm13[3],xmm11[3]
vunpckhps %xmm5, %xmm14, %xmm8 # xmm8 = xmm14[2],xmm5[2],xmm14[3],xmm5[3]
vpermt2pd %zmm3, %zmm12, %zmm8
vunpckhps %xmm26, %xmm2, %xmm3 # xmm3 = xmm2[2],xmm26[2],xmm2[3],xmm26[3]
vpermt2pd %zmm3, %zmm12, %zmm6
vinsertps $179, %xmm18, %xmm21, %xmm3 # xmm3 = zero,zero,xmm21[2],xmm18[2]
vinsertf128 $1, %xmm3, %ymm0, %ymm3
vmovapd %zmm8, %zmm6 {%k4}
vunpckhps %xmm27, %xmm7, %xmm8 # xmm8 = xmm7[2],xmm27[2],xmm7[3],xmm27[3]
vinsertf128 $1, %xmm8, %ymm0, %ymm8
vinsertf64x4 $0, %ymm0, %zmm10, %zmm0
vmovaps .LCPI0_19(%rip), %zmm10 # zmm10 = [0,1,0,16,4,5,4,20,8,9,8,24,12,13,12,28]
vblendps $192, %ymm3, %ymm8, %ymm3 # ymm3 = ymm8[0,1,2,3,4,5],ymm3[6,7]
vinsertps $179, %xmm17, %xmm1, %xmm8 # xmm8 = zero,zero,xmm1[2],xmm17[2]
vmovupd %zmm0, 1520(%rsp) # 64-byte Spill
vblendps $3, %xmm9, %xmm8, %xmm8 # xmm8 = xmm9[0,1],xmm8[2,3]
vblendps $15, %ymm8, %ymm3, %ymm9 # ymm9 = ymm8[0,1,2,3],ymm3[4,5,6,7]
vmovapd .LCPI0_20(%rip), %zmm3 # zmm3 = [0,8,0,8,4,12,4,13]
vunpcklps %zmm5, %zmm14, %zmm8 # zmm8 = zmm14[0],zmm5[0],zmm14[1],zmm5[1],zmm14[4],zmm5[4],zmm14[5],zmm5[5],zmm14[8],zmm5[8],zmm14[9],zmm5[9],zmm14[12],zmm5[12],zmm14[13],zmm5[13]
vmovups %zmm8, 2544(%rsp) # 64-byte Spill
vinsertf64x4 $0, %ymm9, %zmm6, %zmm6
vmovaps %zmm2, %zmm9
vmovupd %zmm6, 32(%rsp) # 64-byte Spill
vunpckhps %zmm23, %zmm24, %zmm6 # zmm6 = zmm24[2],zmm23[2],zmm24[3],zmm23[3],zmm24[6],zmm23[6],zmm24[7],zmm23[7],zmm24[10],zmm23[10],zmm24[11],zmm23[11],zmm24[14],zmm23[14],zmm24[15],zmm23[15]
vpermt2ps %zmm11, %zmm10, %zmm16
vpermt2ps %zmm26, %zmm10, %zmm25
vmovaps %zmm10, %zmm12
vpermt2pd %zmm16, %zmm3, %zmm8
vshufpd $32, %zmm25, %zmm19, %zmm0 # zmm0 = zmm19[0],zmm25[0],zmm19[2],zmm25[2],zmm19[4],zmm25[5],zmm19[6],zmm25[6]
vmovapd %zmm8, %zmm0 {%k4}
vmovaps %zmm21, %zmm8
vpermt2ps %zmm18, %zmm10, %zmm8
vunpcklps %zmm27, %zmm7, %zmm10 # zmm10 = zmm7[0],zmm27[0],zmm7[1],zmm27[1],zmm7[4],zmm27[4],zmm7[5],zmm27[5],zmm7[8],zmm27[8],zmm7[9],zmm27[9],zmm7[12],zmm27[12],zmm7[13],zmm27[13]
vmovups %zmm10, 816(%rsp) # 64-byte Spill
vshuff64x2 $170, %zmm10, %zmm10, %zmm10 # zmm10 = zmm10[4,5,4,5,4,5,4,5]
vmovups %zmm8, 352(%rsp) # 64-byte Spill
vshuff64x2 $170, %zmm8, %zmm8, %zmm8 # zmm8 = zmm8[4,5,4,5,4,5,4,5]
vblendpd $8, %ymm8, %ymm10, %ymm8 # ymm8 = ymm10[0,1,2],ymm8[3]
vmovaps %zmm1, %zmm10
vpermt2ps %zmm17, %zmm12, %zmm10
vunpcklps %zmm28, %zmm4, %zmm12 # zmm12 = zmm4[0],zmm28[0],zmm4[1],zmm28[1],zmm4[4],zmm28[4],zmm4[5],zmm28[5],zmm4[8],zmm28[8],zmm4[9],zmm28[9],zmm4[12],zmm28[12],zmm4[13],zmm28[13]
vmovups %zmm12, 3120(%rsp) # 64-byte Spill
vextractf32x4 $2, %zmm12, %xmm12
vmovups %zmm10, 3184(%rsp) # 64-byte Spill
vextractf32x4 $2, %zmm10, %xmm10
vblendps $3, %xmm12, %xmm10, %xmm10 # xmm10 = xmm12[0,1],xmm10[2,3]
vblendpd $3, %ymm10, %ymm8, %ymm8 # ymm8 = ymm10[0,1],ymm8[2,3]
vunpcklps %zmm17, %zmm1, %zmm10 # zmm10 = zmm1[0],zmm17[0],zmm1[1],zmm17[1],zmm1[4],zmm17[4],zmm1[5],zmm17[5],zmm1[8],zmm17[8],zmm1[9],zmm17[9],zmm1[12],zmm17[12],zmm1[13],zmm17[13]
vmovapd %ymm8, %ymm22
vmovaps %zmm4, %zmm8
vpermt2ps %zmm28, %zmm29, %zmm8
vmovups %zmm10, 3056(%rsp) # 64-byte Spill
vextractf32x4 $2, %zmm10, %xmm10
vextractf32x4 $2, %zmm8, %xmm12
vmovups %zmm8, 2992(%rsp) # 64-byte Spill
vblendps $3, %xmm12, %xmm10, %xmm8 # xmm8 = xmm12[0,1],xmm10[2,3]
vmovaps %zmm13, %zmm12
vunpckhps %zmm5, %zmm14, %zmm10 # zmm10 = zmm14[2],zmm5[2],zmm14[3],zmm5[3],zmm14[6],zmm5[6],zmm14[7],zmm5[7],zmm14[10],zmm5[10],zmm14[11],zmm5[11],zmm14[14],zmm5[14],zmm14[15],zmm5[15]
vmovups %ymm8, 1424(%rsp) # 32-byte Spill
vmovaps .LCPI0_17(%rip), %zmm8 # zmm8 = [0,1,2,18,4,5,6,22,8,9,10,26,12,13,14,30]
vmovups %zmm10, 2480(%rsp) # 64-byte Spill
vpermt2ps %zmm11, %zmm8, %zmm12
vpermt2ps %zmm26, %zmm8, %zmm9
vpermt2pd %zmm12, %zmm3, %zmm10
vmovaps %zmm21, %zmm3
vpermt2ps %zmm18, %zmm8, %zmm3
vshufpd $32, %zmm9, %zmm6, %zmm15 # zmm15 = zmm6[0],zmm9[0],zmm6[2],zmm9[2],zmm6[4],zmm9[5],zmm6[6],zmm9[6]
vmovapd %zmm10, %zmm15 {%k4}
vmovups %zmm3, 2928(%rsp) # 64-byte Spill
vshuff64x2 $170, %zmm3, %zmm3, %zmm10 # zmm10 = zmm3[4,5,4,5,4,5,4,5]
vunpckhps %zmm27, %zmm7, %zmm3 # zmm3 = zmm7[2],zmm27[2],zmm7[3],zmm27[3],zmm7[6],zmm27[6],zmm7[7],zmm27[7],zmm7[10],zmm27[10],zmm7[11],zmm27[11],zmm7[14],zmm27[14],zmm7[15],zmm27[15]
vmovups %zmm3, 2864(%rsp) # 64-byte Spill
vshuff64x2 $170, %zmm3, %zmm3, %zmm3 # zmm3 = zmm3[4,5,4,5,4,5,4,5]
vinsertf64x4 $0, %ymm22, %zmm0, %zmm0
vblendpd $8, %ymm10, %ymm3, %ymm3 # ymm3 = ymm3[0,1,2],ymm10[3]
vmovaps %zmm1, %zmm10
vpermt2ps %zmm17, %zmm8, %zmm10
vunpckhps %zmm28, %zmm4, %zmm8 # zmm8 = zmm4[2],zmm28[2],zmm4[3],zmm28[3],zmm4[6],zmm28[6],zmm4[7],zmm28[7],zmm4[10],zmm28[10],zmm4[11],zmm28[11],zmm4[14],zmm28[14],zmm4[15],zmm28[15]
vmovupd %zmm0, 944(%rsp) # 64-byte Spill
vunpcklps %ymm17, %ymm1, %ymm0 # ymm0 = ymm1[0],ymm17[0],ymm1[1],ymm17[1],ymm1[4],ymm17[4],ymm1[5],ymm17[5]
vmovups %zmm8, 2736(%rsp) # 64-byte Spill
vextractf32x4 $2, %zmm8, %xmm8
vpermpd $170, %ymm0, %ymm0 # ymm0 = ymm0[2,2,2,2]
vmovups %zmm10, 2800(%rsp) # 64-byte Spill
vextractf32x4 $2, %zmm10, %xmm10
vblendps $3, %xmm8, %xmm10, %xmm8 # xmm8 = xmm8[0,1],xmm10[2,3]
vblendpd $3, %ymm8, %ymm3, %ymm10 # ymm10 = ymm8[0,1],ymm3[2,3]
vmovaps %zmm4, %zmm8
vpermt2ps %zmm28, %zmm20, %zmm8
vunpckhps %zmm17, %zmm1, %zmm3 # zmm3 = zmm1[2],zmm17[2],zmm1[3],zmm17[3],zmm1[6],zmm17[6],zmm1[7],zmm17[7],zmm1[10],zmm17[10],zmm1[11],zmm17[11],zmm1[14],zmm17[14],zmm1[15],zmm17[15]
vmovups %zmm3, 2672(%rsp) # 64-byte Spill
vextractf32x4 $2, %zmm3, %xmm3
vinsertf64x4 $0, %ymm10, %zmm15, %zmm22
vmovaps %zmm7, %zmm10
vpermt2ps %zmm27, %zmm29, %zmm10
vmovaps %zmm7, %zmm15
vpermt2ps %zmm27, %zmm20, %zmm15
vmovaps .LCPI0_26(%rip), %ymm20 # ymm20 = [0,1,0,8,4,5,4,12]
vunpcklps %ymm5, %ymm14, %ymm29 # ymm29 = ymm14[0],ymm5[0],ymm14[1],ymm5[1],ymm14[4],ymm5[4],ymm14[5],ymm5[5]
vmovups %zmm8, 2608(%rsp) # 64-byte Spill
vextractf32x4 $2, %zmm8, %xmm8
vblendps $3, %xmm8, %xmm3, %xmm3 # xmm3 = xmm8[0,1],xmm3[2,3]
vmovups %ymm3, 2448(%rsp) # 32-byte Spill
vunpcklps %ymm28, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm28[0],ymm4[1],ymm28[1],ymm4[4],ymm28[4],ymm4[5],ymm28[5]
vextractf128 $1, %ymm3, %xmm3
vblendps $3, %xmm3, %xmm0, %xmm0 # xmm0 = xmm3[0,1],xmm0[2,3]
vunpcklps %zmm18, %zmm21, %zmm3 # zmm3 = zmm21[0],zmm18[0],zmm21[1],zmm18[1],zmm21[4],zmm18[4],zmm21[5],zmm18[5],zmm21[8],zmm18[8],zmm21[9],zmm18[9],zmm21[12],zmm18[12],zmm21[13],zmm18[13]
vmovups %zmm3, 3376(%rsp) # 64-byte Spill
vshuff64x2 $170, %zmm3, %zmm3, %zmm3 # zmm3 = zmm3[4,5,4,5,4,5,4,5]
vshuff64x2 $170, %zmm10, %zmm10, %zmm8 # zmm8 = zmm10[4,5,4,5,4,5,4,5]
vblendpd $8, %ymm3, %ymm8, %ymm3 # ymm3 = ymm8[0,1,2],ymm3[3]
vshuff64x2 $170, %zmm15, %zmm15, %zmm8 # zmm8 = zmm15[4,5,4,5,4,5,4,5]
vmovapd %ymm3, %ymm30
vunpckhps %zmm18, %zmm21, %zmm3 # zmm3 = zmm21[2],zmm18[2],zmm21[3],zmm18[3],zmm21[6],zmm18[6],zmm21[7],zmm18[7],zmm21[10],zmm18[10],zmm21[11],zmm18[11],zmm21[14],zmm18[14],zmm21[15],zmm18[15]
vmovups %zmm3, 3312(%rsp) # 64-byte Spill
vshuff64x2 $170, %zmm3, %zmm3, %zmm3 # zmm3 = zmm3[4,5,4,5,4,5,4,5]
vblendpd $8, %ymm3, %ymm8, %ymm3 # ymm3 = ymm8[0,1,2],ymm3[3]
vunpcklps %ymm27, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm27[0],ymm7[1],ymm27[1],ymm7[4],ymm27[4],ymm7[5],ymm27[5]
vmovupd %ymm3, 2416(%rsp) # 32-byte Spill
vunpcklpd %ymm21, %ymm18, %ymm3 # ymm3 = ymm18[0],ymm21[0],ymm18[2],ymm21[2]
vshufps $36, %ymm3, %ymm8, %ymm3 # ymm3 = ymm8[0,1],ymm3[2,0],ymm8[4,5],ymm3[6,4]
vmovups 2160(%rsp), %zmm8 # 64-byte Reload
vblendps $15, %ymm0, %ymm3, %ymm0 # ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
vmovaps %zmm2, %zmm3
vinsertf64x4 $1, %ymm29, %zmm0, %zmm29
vpermt2ps %zmm26, %zmm8, %zmm3
vunpcklps %ymm23, %ymm24, %ymm8 # ymm8 = ymm24[0],ymm23[0],ymm24[1],ymm23[1],ymm24[4],ymm23[4],ymm24[5],ymm23[5]
vshuff64x2 $85, %zmm8, %zmm8, %zmm8 # zmm8 = zmm8[2,3,2,3,2,3,2,3]
vshufpd $32, %zmm3, %zmm8, %zmm3 # zmm3 = zmm8[0],zmm3[0],zmm8[2],zmm3[2],zmm8[4],zmm3[5],zmm8[6],zmm3[6]
vmovaps %ymm13, %ymm8
vpermt2ps %ymm11, %ymm20, %ymm8
vmovaps .LCPI0_24(%rip), %ymm20 # ymm20 = [0,1,2,10,4,5,6,14]
vinsertf64x4 $1, %ymm8, %zmm0, %zmm8
vshufpd $128, %zmm8, %zmm29, %zmm3 {%k4} # zmm3 {%k4} = zmm29[0],zmm8[0],zmm29[2],zmm8[2],zmm29[4],zmm8[4],zmm29[6],zmm8[7]
vunpckhps %ymm27, %ymm7, %ymm8 # ymm8 = ymm7[2],ymm27[2],ymm7[3],ymm27[3],ymm7[6],ymm27[6],ymm7[7],ymm27[7]
vunpckhps %ymm5, %ymm14, %ymm29 # ymm29 = ymm14[2],ymm5[2],ymm14[3],ymm5[3],ymm14[6],ymm5[6],ymm14[7],ymm5[7]
vinsertf64x4 $0, %ymm0, %zmm3, %zmm0
vunpckhps %ymm28, %ymm4, %ymm3 # ymm3 = ymm4[2],ymm28[2],ymm4[3],ymm28[3],ymm4[6],ymm28[6],ymm4[7],ymm28[7]
vextractf128 $1, %ymm3, %xmm3
vmovupd %zmm0, 1456(%rsp) # 64-byte Spill
vunpckhps %ymm17, %ymm1, %ymm0 # ymm0 = ymm1[2],ymm17[2],ymm1[3],ymm17[3],ymm1[6],ymm17[6],ymm1[7],ymm17[7]
vpermpd $170, %ymm0, %ymm0 # ymm0 = ymm0[2,2,2,2]
vblendps $3, %xmm3, %xmm0, %xmm0 # xmm0 = xmm3[0,1],xmm0[2,3]
vunpckhpd %ymm21, %ymm18, %ymm3 # ymm3 = ymm18[1],ymm21[1],ymm18[3],ymm21[3]
vshufps $36, %ymm3, %ymm8, %ymm3 # ymm3 = ymm8[0,1],ymm3[2,0],ymm8[4,5],ymm3[6,4]
vmovups 2224(%rsp), %zmm8 # 64-byte Reload
vblendps $15, %ymm0, %ymm3, %ymm0 # ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
vmovaps %zmm2, %zmm3
vinsertf64x4 $1, %ymm29, %zmm0, %zmm29
vpermt2ps %zmm26, %zmm8, %zmm3
vunpckhps %ymm23, %ymm24, %ymm8 # ymm8 = ymm24[2],ymm23[2],ymm24[3],ymm23[3],ymm24[6],ymm23[6],ymm24[7],ymm23[7]
vshuff64x2 $85, %zmm8, %zmm8, %zmm8 # zmm8 = zmm8[2,3,2,3,2,3,2,3]
vshufpd $32, %zmm3, %zmm8, %zmm3 # zmm3 = zmm8[0],zmm3[0],zmm8[2],zmm3[2],zmm8[4],zmm3[5],zmm8[6],zmm3[6]
vmovaps %ymm13, %ymm8
vpermt2ps %ymm11, %ymm20, %ymm8
vinsertf64x4 $1, %ymm8, %zmm0, %zmm8
vshufpd $128, %zmm8, %zmm29, %zmm3 {%k4} # zmm3 {%k4} = zmm29[0],zmm8[0],zmm29[2],zmm8[2],zmm29[4],zmm8[4],zmm29[6],zmm8[7]
vmovsd .LCPI0_195(%rip), %xmm8 # xmm8 = [3,7,0,0]
vmovaps %ymm14, %ymm29
vinsertf64x4 $0, %ymm0, %zmm3, %zmm0
vmovaps %xmm7, %xmm3
vmovupd %zmm0, 160(%rsp) # 64-byte Spill
vunpckhps %xmm18, %xmm21, %xmm0 # xmm0 = xmm21[2],xmm18[2],xmm21[3],xmm18[3]
vinsertf128 $1, %xmm0, %ymm0, %ymm0
vpermt2ps %xmm27, %xmm8, %xmm3
vshufps $51, %xmm4, %xmm28, %xmm8 # xmm8 = xmm28[3,0],xmm4[3,0]
vinsertf128 $1, %xmm3, %ymm0, %ymm3
vblendps $192, %ymm0, %ymm3, %ymm0 # ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
vunpckhps %xmm17, %xmm1, %xmm3 # xmm3 = xmm1[2],xmm17[2],xmm1[3],xmm17[3]
vshufps $226, %xmm3, %xmm8, %xmm3 # xmm3 = xmm8[2,0],xmm3[2,3]
vbroadcastsd .LCPI0_30(%rip), %ymm8 # ymm8 = [5,13,5,13,5,13,5,13]
vblendps $15, %ymm3, %ymm0, %ymm0 # ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
vmovaps .LCPI0_31(%rip), %ymm3 # ymm3 = [1,9,2,3,5,13,u,u]
vmovaps %ymm0, %ymm20
vmovaps %ymm7, %ymm0
vpermt2ps %ymm27, %ymm3, %ymm0
vunpcklps %ymm18, %ymm21, %ymm3 # ymm3 = ymm21[0],ymm18[0],ymm21[1],ymm18[1],ymm21[4],ymm18[4],ymm21[5],ymm18[5]
vblendps $192, %ymm3, %ymm0, %ymm0 # ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
vmovaps %ymm1, %ymm3
vpermt2ps %ymm17, %ymm8, %ymm3
vmovaps %ymm4, %ymm8
vpermt2ps %ymm28, %ymm31, %ymm8
vmovaps .LCPI0_27(%rip), %ymm31 # ymm31 = [1,9,2,3,5,13,6,7]
vextractf128 $1, %ymm8, %xmm8
vblendps $3, %xmm8, %xmm3, %xmm3 # xmm3 = xmm8[0,1],xmm3[2,3]
vmovups 2096(%rsp), %zmm8 # 64-byte Reload
vblendps $15, %ymm3, %ymm0, %ymm0 # ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
vmovaps %zmm24, %zmm3
vpermt2ps %ymm5, %ymm31, %ymm29
vmovapd .LCPI0_16(%rip), %zmm31 # zmm31 = [2,10,2,10,6,15,6,14]
vinsertf64x4 $1, %ymm29, %zmm0, %zmm29
vpermt2ps %zmm23, %zmm8, %zmm3
vunpcklps %ymm26, %ymm2, %ymm8 # ymm8 = ymm2[0],ymm26[0],ymm2[1],ymm26[1],ymm2[4],ymm26[4],ymm2[5],ymm26[5]
vshuff64x2 $85, %zmm8, %zmm8, %zmm8 # zmm8 = zmm8[2,3,2,3,2,3,2,3]
vpermt2pd %zmm25, %zmm31, %zmm19
vpermt2pd %zmm9, %zmm31, %zmm6
vmovaps %zmm24, %zmm9
vmovaps %zmm14, %zmm25
vshufpd $32, %zmm8, %zmm3, %zmm3 # zmm3 = zmm3[0],zmm8[0],zmm3[2],zmm8[2],zmm3[4],zmm8[5],zmm3[6],zmm8[6]
vunpcklps %ymm11, %ymm13, %ymm8 # ymm8 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[4],ymm11[4],ymm13[5],ymm11[5]
vinsertf64x4 $1, %ymm8, %zmm0, %zmm8
vshufpd $128, %zmm8, %zmm29, %zmm3 {%k4} # zmm3 {%k4} = zmm29[0],zmm8[0],zmm29[2],zmm8[2],zmm29[4],zmm8[4],zmm29[6],zmm8[7]
vbroadcastsd .LCPI0_32(%rip), %zmm8 # zmm8 = [3,19,3,19,3,19,3,19,3,19,3,19,3,19,3,19]
vmovups 1584(%rsp), %zmm29 # 64-byte Reload
vinsertf64x4 $0, %ymm0, %zmm3, %zmm0
vmovaps %zmm24, %zmm3
vmovupd %zmm0, 1072(%rsp) # 64-byte Spill
vmovupd 2544(%rsp), %zmm0 # 64-byte Reload
vpermt2ps %zmm23, %zmm8, %zmm3
vshufpd $128, %zmm16, %zmm0, %zmm19 {%k4} # zmm19 {%k4} = zmm0[0],zmm16[0],zmm0[2],zmm16[2],zmm0[4],zmm16[4],zmm0[6],zmm16[7]
vmovaps %zmm2, %zmm0
vpermt2ps %zmm26, %zmm8, %zmm0
vmovaps %zmm8, %zmm16
vshufpd $32, %zmm0, %zmm3, %zmm3 # zmm3 = zmm3[0],zmm0[0],zmm3[2],zmm0[2],zmm3[4],zmm0[5],zmm3[6],zmm0[6]
vmovaps %zmm13, %zmm0
vpermt2ps %zmm11, %zmm8, %zmm0
vmovaps %zmm14, %zmm8
vpermt2ps %zmm5, %zmm16, %zmm8
vmovaps .LCPI0_10(%rip), %ymm16 # ymm16 = [3,11,2,3,7,15,6,7]
vshufpd $128, %zmm0, %zmm8, %zmm3 {%k4} # zmm3 {%k4} = zmm8[0],zmm0[0],zmm8[2],zmm0[2],zmm8[4],zmm0[4],zmm8[6],zmm0[7]
vmovupd 2480(%rsp), %zmm0 # 64-byte Reload
vmovups 2352(%rsp), %zmm8 # 64-byte Reload
vshufpd $128, %zmm12, %zmm0, %zmm6 {%k4} # zmm6 {%k4} = zmm0[0],zmm12[0],zmm0[2],zmm12[2],zmm0[4],zmm12[4],zmm0[6],zmm12[7]
vmovaps %zmm24, %zmm0
vpermt2ps %zmm23, %zmm8, %zmm24
vunpckhps %ymm26, %ymm2, %ymm8 # ymm8 = ymm2[2],ymm26[2],ymm2[3],ymm26[3],ymm2[6],ymm26[6],ymm2[7],ymm26[7]
vunpckhps %ymm11, %ymm13, %ymm12 # ymm12 = ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[6],ymm11[6],ymm13[7],ymm11[7]
vshuff64x2 $85, %zmm8, %zmm8, %zmm8 # zmm8 = zmm8[2,3,2,3,2,3,2,3]
vinsertf64x4 $1, %ymm12, %zmm0, %zmm12
vshufpd $32, %zmm8, %zmm24, %zmm8 # zmm8 = zmm24[0],zmm8[0],zmm24[2],zmm8[2],zmm24[4],zmm8[5],zmm24[6],zmm8[6]
vmovaps %zmm14, %zmm24
vpermt2ps %ymm5, %ymm16, %ymm14
vmovaps .LCPI0_18(%rip), %zmm16 # zmm16 = [1,17,2,3,5,21,6,7,9,25,10,11,13,29,14,15]
vinsertf64x4 $1, %ymm14, %zmm0, %zmm14
vshufpd $128, %zmm12, %zmm14, %zmm8 {%k4} # zmm8 {%k4} = zmm14[0],zmm12[0],zmm14[2],zmm12[2],zmm14[4],zmm12[4],zmm14[6],zmm12[7]
vunpcklps %zmm11, %zmm13, %zmm12 # zmm12 = zmm13[0],zmm11[0],zmm13[1],zmm11[1],zmm13[4],zmm11[4],zmm13[5],zmm11[5],zmm13[8],zmm11[8],zmm13[9],zmm11[9],zmm13[12],zmm11[12],zmm13[13],zmm11[13]
vunpckhps %zmm11, %zmm13, %zmm11 # zmm11 = zmm13[2],zmm11[2],zmm13[3],zmm11[3],zmm13[6],zmm11[6],zmm13[7],zmm11[7],zmm13[10],zmm11[10],zmm13[11],zmm11[11],zmm13[14],zmm11[14],zmm13[15],zmm11[15]
vunpcklps %zmm26, %zmm2, %zmm13 # zmm13 = zmm2[0],zmm26[0],zmm2[1],zmm26[1],zmm2[4],zmm26[4],zmm2[5],zmm26[5],zmm2[8],zmm26[8],zmm2[9],zmm26[9],zmm2[12],zmm26[12],zmm2[13],zmm26[13]
vunpckhps %zmm26, %zmm2, %zmm2 # zmm2 = zmm2[2],zmm26[2],zmm2[3],zmm26[3],zmm2[6],zmm26[6],zmm2[7],zmm26[7],zmm2[10],zmm26[10],zmm2[11],zmm26[11],zmm2[14],zmm26[14],zmm2[15],zmm26[15]
vinsertf64x4 $0, %ymm20, %zmm3, %zmm26
vunpckhps %ymm18, %ymm21, %ymm3 # ymm3 = ymm21[2],ymm18[2],ymm21[3],ymm18[3],ymm21[6],ymm18[6],ymm21[7],ymm18[7]
vmovapd %zmm22, %zmm20
vmovups 1648(%rsp), %zmm21 # 64-byte Reload
vpermt2ps %zmm23, %zmm16, %zmm0
vpermt2ps %zmm5, %zmm16, %zmm24
vmovapd .LCPI0_20(%rip), %zmm16 # zmm16 = [0,8,0,8,4,12,4,13]
vshufpd $32, %zmm13, %zmm0, %zmm14 # zmm14 = zmm0[0],zmm13[0],zmm0[2],zmm13[2],zmm0[4],zmm13[5],zmm0[6],zmm13[6]
vpermt2pd %zmm13, %zmm31, %zmm0
vbroadcastsd .LCPI0_13(%rip), %ymm13 # ymm13 = [7,15,7,15,7,15,7,15]
vshufpd $128, %zmm12, %zmm24, %zmm0 {%k4} # zmm0 {%k4} = zmm24[0],zmm12[0],zmm24[2],zmm12[2],zmm24[4],zmm12[4],zmm24[6],zmm12[7]
vpermt2pd %zmm12, %zmm16, %zmm24
vmovaps .LCPI0_15(%rip), %zmm12 # zmm12 = [3,19,2,3,7,23,6,7,11,27,10,11,15,31,14,15]
vpermt2ps %ymm17, %ymm13, %ymm1
vmovups 560(%rsp), %zmm17 # 64-byte Reload
vmovapd %zmm24, %zmm14 {%k4}
vmovups 1008(%rsp), %zmm24 # 64-byte Reload
vpermt2ps %zmm23, %zmm12, %zmm9
vpermt2ps %zmm5, %zmm12, %zmm25
vmovapd %ymm30, %ymm5
vblendpd $3, 1424(%rsp), %ymm5, %ymm5 # 32-byte Folded Reload
# ymm5 = mem[0,1],ymm5[2,3]
vmovups 1712(%rsp), %zmm23 # 64-byte Reload
vmovups 944(%rsp), %zmm30 # 64-byte Reload
vshufpd $32, %zmm2, %zmm9, %zmm12 # zmm12 = zmm9[0],zmm2[0],zmm9[2],zmm2[2],zmm9[4],zmm2[5],zmm9[6],zmm2[6]
vpermt2pd %zmm2, %zmm31, %zmm9
vmovupd 2416(%rsp), %ymm2 # 32-byte Reload
vmovups 32(%rsp), %zmm31 # 64-byte Reload
vblendpd $3, 2448(%rsp), %ymm2, %ymm2 # 32-byte Folded Reload
# ymm2 = mem[0,1],ymm2[2,3]
vshufpd $128, %zmm11, %zmm25, %zmm9 {%k4} # zmm9 {%k4} = zmm25[0],zmm11[0],zmm25[2],zmm11[2],zmm25[4],zmm11[4],zmm25[6],zmm11[7]
vpermt2pd %zmm11, %zmm16, %zmm25
vmovaps .LCPI0_14(%rip), %ymm11 # ymm11 = [3,11,2,3,7,15,u,u]
vmovapd %zmm25, %zmm12 {%k4}
vinsertf64x4 $0, %ymm2, %zmm12, %zmm18
vextractf64x4 $1, %zmm10, %ymm2
vmovups 688(%rsp), %zmm10 # 64-byte Reload
vmovups 416(%rsp), %zmm12 # 64-byte Reload
vmovups 1456(%rsp), %zmm25 # 64-byte Reload
vpermt2ps %ymm27, %ymm11, %ymm7
vmovups 752(%rsp), %ymm11 # 32-byte Reload
vinsertf64x4 $0, %ymm5, %zmm14, %zmm27
vblendps $192, %ymm3, %ymm7, %ymm3 # ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7]
vmovups 1136(%rsp), %zmm7 # 64-byte Reload
vpermt2ps %ymm28, %ymm11, %ymm4
vmovups 1328(%rsp), %zmm11 # 64-byte Reload
vmovups 288(%rsp), %zmm28 # 64-byte Reload
vextractf128 $1, %ymm4, %xmm4
vblendps $3, %xmm4, %xmm1, %xmm1 # xmm1 = xmm4[0,1],xmm1[2,3]
vmovupd 3120(%rsp), %zmm4 # 64-byte Reload
vblendps $15, %ymm1, %ymm3, %ymm1 # ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
vmovupd 816(%rsp), %zmm3 # 64-byte Reload
vinsertf64x4 $0, %ymm1, %zmm8, %zmm13
vmovupd 352(%rsp), %zmm1 # 64-byte Reload
vmovups 96(%rsp), %zmm8 # 64-byte Reload
vextractf32x4 $3, %zmm4, %xmm4
vextractf64x4 $1, %zmm3, %ymm3
vextractf64x4 $1, %zmm1, %ymm1
vblendpd $8, %ymm1, %ymm3, %ymm1 # ymm1 = ymm3[0,1,2],ymm1[3]
vmovupd 3184(%rsp), %zmm3 # 64-byte Reload
vextractf32x4 $3, %zmm3, %xmm3
vblendpd $1, %xmm4, %xmm3, %xmm3 # xmm3 = xmm4[0],xmm3[1]
vmovupd 2736(%rsp), %zmm4 # 64-byte Reload
vblendpd $3, %ymm3, %ymm1, %ymm1 # ymm1 = ymm3[0,1],ymm1[2,3]
vmovupd 2992(%rsp), %zmm3 # 64-byte Reload
vinsertf64x4 $0, %ymm1, %zmm19, %zmm14
vmovupd 3376(%rsp), %zmm1 # 64-byte Reload
vextractf32x4 $3, %zmm4, %xmm4
vextractf32x4 $3, %zmm3, %xmm3
vextractf64x4 $1, %zmm1, %ymm1
vblendpd $8, %ymm1, %ymm2, %ymm1 # ymm1 = ymm2[0,1,2],ymm1[3]
vmovupd 3056(%rsp), %zmm2 # 64-byte Reload
vextractf32x4 $3, %zmm2, %xmm2
vblendpd $1, %xmm3, %xmm2, %xmm2 # xmm2 = xmm3[0],xmm2[1]
vmovupd 2864(%rsp), %zmm3 # 64-byte Reload
vblendpd $3, %ymm2, %ymm1, %ymm1 # ymm1 = ymm2[0,1],ymm1[2,3]
vmovupd 2928(%rsp), %zmm2 # 64-byte Reload
vinsertf64x4 $0, %ymm1, %zmm0, %zmm19
vmovaps .LCPI0_120(%rip), %ymm0 # ymm0 = [0,1,2,3,13,u,u,u]
vmovups 224(%rsp), %ymm1 # 32-byte Reload
vextractf64x4 $1, %zmm3, %ymm3
vextractf64x4 $1, %zmm2, %ymm2
vblendpd $8, %ymm2, %ymm3, %ymm2 # ymm2 = ymm3[0,1,2],ymm2[3]
vmovupd 2800(%rsp), %zmm3 # 64-byte Reload
vextractf32x4 $3, %zmm3, %xmm3
vblendpd $1, %xmm4, %xmm3, %xmm3 # xmm3 = xmm4[0],xmm3[1]
vmovupd 2608(%rsp), %zmm4 # 64-byte Reload
vblendpd $3, %ymm3, %ymm2, %ymm2 # ymm2 = ymm3[0,1],ymm2[2,3]
vextractf64x4 $1, %zmm15, %ymm3
vmovups 1264(%rsp), %zmm15 # 64-byte Reload
vinsertf64x4 $0, %ymm2, %zmm6, %zmm22
vmovupd 3312(%rsp), %zmm2 # 64-byte Reload
vmovaps .LCPI0_122(%rip), %ymm6 # ymm6 = [0,1,2,3,4,5,6,13]
vextractf32x4 $3, %zmm4, %xmm4
vpermt2ps %ymm15, %ymm0, %ymm1
vextractf64x4 $1, %zmm2, %ymm2
vblendpd $8, %ymm2, %ymm3, %ymm2 # ymm2 = ymm3[0,1,2],ymm2[3]
vmovupd 2672(%rsp), %zmm3 # 64-byte Reload
vblendps $32, %ymm10, %ymm1, %ymm0 # ymm0 = ymm1[0,1,2,3,4],ymm10[5],ymm1[6,7]
vmovaps .LCPI0_121(%rip), %ymm1 # ymm1 = [0,1,2,3,4,5,13,u]
vextractf32x4 $3, %zmm3, %xmm3
vpermt2ps %ymm8, %ymm1, %ymm0
vmovups 624(%rsp), %zmm1 # 64-byte Reload
vblendpd $1, %xmm4, %xmm3, %xmm3 # xmm3 = xmm4[0],xmm3[1]
vmovaps .LCPI0_123(%rip), %zmm4 # zmm4 = [0,1,2,3,4,5,6,7,21,u,u,u,u,u,u,u]
vblendpd $3, %ymm3, %ymm2, %ymm2 # ymm2 = ymm3[0,1],ymm2[2,3]
vmovaps .LCPI0_124(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,21,u,u,u,u,u,u]
vinsertf64x4 $0, %ymm2, %zmm9, %zmm16
vmovdqu64 1200(%rsp), %zmm9 # 64-byte Reload
.loc 1 222 51 # 03-matrix-multiplication-cpu.py:222:51
vpcmpgtd .LCPI0_106(%rip){1to16}, %zmm9, %k1
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vpermt2ps %ymm1, %ymm6, %ymm0
vpermt2ps %zmm21, %zmm4, %zmm0
vmovaps .LCPI0_125(%rip), %zmm4 # zmm4 = [0,1,2,3,4,5,6,7,8,9,21,u,u,u,u,u]
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vmovups (%rsi,%rax), %zmm2 {%k1} {z}
.loc 1 222 51 is_stmt 0 # 03-matrix-multiplication-cpu.py:222:51
vpcmpgtd .LCPI0_7(%rip){1to16}, %zmm9, %k1
vmovups 1840(%rsp), %zmm9 # 64-byte Reload
movq -48(%rsp), %rax # 8-byte Reload
.loc 1 224 35 is_stmt 1 # 03-matrix-multiplication-cpu.py:224:35
vpermt2ps %zmm29, %zmm3, %zmm0
vmovaps .LCPI0_126(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,9,10,21,u,u,u,u]
vpermt2ps %zmm23, %zmm4, %zmm0
vmovaps .LCPI0_127(%rip), %zmm4 # zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,21,u,u,u]
vpermt2ps %zmm7, %zmm3, %zmm0
vmovaps .LCPI0_128(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,21,u,u]
vpermt2ps %zmm11, %zmm4, %zmm0
vmovaps .LCPI0_129(%rip), %zmm4 # zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,21,u]
vpermt2ps %zmm28, %zmm3, %zmm0
vmovaps .LCPI0_130(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,21]
vpermt2ps %zmm12, %zmm4, %zmm0
vpermt2ps %zmm17, %zmm3, %zmm0
vshufps $255, %zmm2, %zmm2, %zmm3 # zmm3 = zmm2[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
vshuff64x2 $170, %zmm3, %zmm3, %zmm4 # zmm4 = zmm3[4,5,4,5,4,5,4,5]
vshuff64x2 $255, %zmm3, %zmm3, %zmm3 # zmm3 = zmm3[6,7,6,7,6,7,6,7]
vfmadd231ps %zmm4, %zmm0, %zmm18 # zmm18 = (zmm0 * zmm4) + zmm18
vshufps $170, %zmm2, %zmm2, %zmm4 # zmm4 = zmm2[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14]
vfmadd231ps %zmm3, %zmm0, %zmm16 # zmm16 = (zmm0 * zmm3) + zmm16
vshuff64x2 $170, %zmm4, %zmm4, %zmm5 # zmm5 = zmm4[4,5,4,5,4,5,4,5]
vshuff64x2 $255, %zmm4, %zmm4, %zmm3 # zmm3 = zmm4[6,7,6,7,6,7,6,7]
vshufps $255, %xmm2, %xmm2, %xmm4 # xmm4 = xmm2[3,3,3,3]
vfmadd231ps %zmm5, %zmm0, %zmm20 # zmm20 = (zmm0 * zmm5) + zmm20
vshufps $85, %zmm2, %zmm2, %zmm5 # zmm5 = zmm2[1,1,1,1,5,5,5,5,9,9,9,9,13,13,13,13]
vfmadd231ps %zmm3, %zmm0, %zmm22 # zmm22 = (zmm0 * zmm3) + zmm22
vshuff64x2 $170, %zmm5, %zmm5, %zmm6 # zmm6 = zmm5[4,5,4,5,4,5,4,5]
vshuff64x2 $255, %zmm5, %zmm5, %zmm3 # zmm3 = zmm5[6,7,6,7,6,7,6,7]
vmovups %zmm16, 944(%rsp) # 64-byte Spill
vmovaps 2032(%rsp), %xmm16 # 16-byte Reload
vmovaps .LCPI0_132(%rip), %ymm5 # ymm5 = [0,1,2,3,4,14,u,u]
vfmadd231ps %zmm6, %zmm0, %zmm27 # zmm27 = (zmm0 * zmm6) + zmm27
vextractf32x4 $2, %zmm2, %xmm6
vfmadd231ps %zmm3, %zmm0, %zmm19 # zmm19 = (zmm0 * zmm3) + zmm19
vextractf32x4 $3, %zmm2, %xmm3
vmovups %zmm20, 816(%rsp) # 64-byte Spill
vmovups 1520(%rsp), %zmm20 # 64-byte Reload
vbroadcastss %xmm6, %zmm6
vbroadcastss %xmm3, %zmm3
vfmadd231ps %zmm6, %zmm0, %zmm30 # zmm30 = (zmm0 * zmm6) + zmm30
vshufps $170, %xmm2, %xmm2, %xmm6 # xmm6 = xmm2[2,2,2,2]
vfmadd231ps %zmm3, %zmm0, %zmm14 # zmm14 = (zmm0 * zmm3) + zmm14
vshufps $255, %ymm2, %ymm2, %ymm3 # ymm3 = ymm2[3,3,3,3,7,7,7,7]
vbroadcastsd %xmm6, %zmm6
vshuff64x2 $85, %zmm3, %zmm3, %zmm3 # zmm3 = zmm3[2,3,2,3,2,3,2,3]
vfmadd231ps %zmm6, %zmm0, %zmm31 # zmm31 = (zmm0 * zmm6) + zmm31
vmovshdup %xmm2, %xmm6 # xmm6 = xmm2[1,1,3,3]
vfmadd231ps %zmm3, %zmm0, %zmm13 # zmm13 = (zmm0 * zmm3) + zmm13
vshufps $85, %ymm2, %ymm2, %ymm3 # ymm3 = ymm2[1,1,1,1,5,5,5,5]
vbroadcastsd %xmm6, %zmm6
vshuff64x2 $85, %zmm3, %zmm3, %zmm3 # zmm3 = zmm3[2,3,2,3,2,3,2,3]
vmovups %zmm14, 352(%rsp) # 64-byte Spill
vmovaps %zmm11, %zmm14
vfmadd231ps %zmm6, %zmm0, %zmm20 # zmm20 = (zmm0 * zmm6) + zmm20
vbroadcastss %xmm2, %zmm6
vmovups %zmm13, 32(%rsp) # 64-byte Spill
vmovups 1776(%rsp), %zmm13 # 64-byte Reload
vfmadd231ps %zmm6, %zmm0, %zmm24 # zmm24 = (zmm0 * zmm6) + zmm24
vmovups 1072(%rsp), %zmm6 # 64-byte Reload
vfmadd231ps %zmm3, %zmm0, %zmm6 # zmm6 = (zmm0 * zmm3) + zmm6
vbroadcastsd %xmm4, %zmm3
vfmadd231ps %zmm3, %zmm0, %zmm26 # zmm26 = (zmm0 * zmm3) + zmm26
vshufps $170, %ymm2, %ymm2, %ymm3 # ymm3 = ymm2[2,2,2,2,6,6,6,6]
vextractf128 $1, %ymm2, %xmm2
vshuff64x2 $85, %zmm3, %zmm3, %zmm3 # zmm3 = zmm3[2,3,2,3,2,3,2,3]
vbroadcastss %xmm2, %zmm2
vmovups %zmm26, 224(%rsp) # 64-byte Spill
vmovups 160(%rsp), %zmm26 # 64-byte Reload
vfmadd231ps %zmm2, %zmm0, %zmm25 # zmm25 = (zmm0 * zmm2) + zmm25
vmovapd .LCPI0_131(%rip), %ymm2 # ymm2 = [0,1,7,u]
vfmadd231ps %zmm3, %zmm0, %zmm26 # zmm26 = (zmm0 * zmm3) + zmm26
vunpckhps %ymm13, %ymm9, %ymm0 # ymm0 = ymm9[2],ymm13[2],ymm9[3],ymm13[3],ymm9[6],ymm13[6],ymm9[7],ymm13[7]
vextractf128 $1, %ymm0, %xmm0
vblendps $12, 880(%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
# xmm0 = xmm0[0,1],mem[2,3]
vinsertps $176, %xmm16, %xmm0, %xmm0 # xmm0 = xmm0[0,1,2],xmm16[2]
vpermt2pd %ymm15, %ymm2, %ymm0
vmovaps %zmm23, %zmm15
vpermt2ps %ymm10, %ymm5, %ymm0
vblendps $192, %ymm8, %ymm0, %ymm10 # ymm10 = ymm0[0,1,2,3,4,5],ymm8[6,7]
vmovaps .LCPI0_133(%rip), %ymm0 # ymm0 = [0,1,2,3,4,5,6,14]
vmovaps %zmm12, %zmm8
vpermt2ps %ymm1, %ymm0, %ymm10
vmovapd .LCPI0_134(%rip), %zmm0 # zmm0 = [0,1,2,3,11,u,u,u]
vpermt2pd %zmm21, %zmm0, %zmm10
vmovaps .LCPI0_135(%rip), %zmm0 # zmm0 = [0,1,2,3,4,5,6,7,8,22,u,u,u,u,u,u]
vmovups 816(%rsp), %zmm21 # 64-byte Reload
vpermt2ps %zmm29, %zmm0, %zmm10
vmovapd .LCPI0_136(%rip), %zmm0 # zmm0 = [0,1,2,3,4,11,u,u]
vpermt2pd %zmm23, %zmm0, %zmm10
vmovaps .LCPI0_137(%rip), %zmm0 # zmm0 = [0,1,2,3,4,5,6,7,8,9,10,22,u,u,u,u]
vmovaps %zmm9, %zmm23
vpermt2ps %zmm7, %zmm0, %zmm10
vmovapd .LCPI0_138(%rip), %zmm0 # zmm0 = [0,1,2,3,4,5,11,u]
vpermt2pd %zmm11, %zmm0, %zmm10
vmovaps .LCPI0_139(%rip), %zmm0 # zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,22,u,u]
vmovups 352(%rsp), %zmm11 # 64-byte Reload
vpermt2ps %zmm28, %zmm0, %zmm10
vmovapd .LCPI0_140(%rip), %zmm0 # zmm0 = [0,1,2,3,4,5,6,11]
vmovaps %zmm17, %zmm28
vpermt2pd %zmm12, %zmm0, %zmm10
vmovaps .LCPI0_141(%rip), %zmm0 # zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,22]
vpermt2ps %zmm17, %zmm0, %zmm10
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vmovups (%rsi,%rax), %zmm0 {%k1} {z}
vmovaps %zmm27, %zmm17
vmovups 32(%rsp), %zmm27 # 64-byte Reload
movq 8(%rsp), %rax # 8-byte Reload
vbroadcastss %xmm0, %zmm2
vmovshdup %xmm0, %xmm1 # xmm1 = xmm0[1,1,3,3]
vshufps $170, %xmm0, %xmm0, %xmm3 # xmm3 = xmm0[2,2,2,2]
vextractf128 $1, %ymm0, %xmm7
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm2, %zmm10, %zmm24 # zmm24 = (zmm10 * zmm2) + zmm24
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vbroadcastsd %xmm1, %zmm2
vextractf32x4 $2, %zmm0, %xmm1
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm2, %zmm10, %zmm20 # zmm20 = (zmm10 * zmm2) + zmm20
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vbroadcastsd %xmm3, %zmm2
vmovups %zmm24, 1008(%rsp) # 64-byte Spill
vmovaps %zmm31, %zmm24
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm2, %zmm10, %zmm24 # zmm24 = (zmm10 * zmm2) + zmm24
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vbroadcastss %xmm1, %zmm2
vshufps $255, %xmm0, %xmm0, %xmm1 # xmm1 = xmm0[3,3,3,3]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm2, %zmm10, %zmm30 # zmm30 = (zmm10 * zmm2) + zmm30
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $85, %zmm0, %zmm0, %zmm2 # zmm2 = zmm0[1,1,1,1,5,5,5,5,9,9,9,9,13,13,13,13]
vmovups %zmm20, 1520(%rsp) # 64-byte Spill
vmovups 224(%rsp), %zmm20 # 64-byte Reload
vshuff64x2 $170, %zmm2, %zmm2, %zmm3 # zmm3 = zmm2[4,5,4,5,4,5,4,5]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm3, %zmm10, %zmm17 # zmm17 = (zmm10 * zmm3) + zmm17
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $170, %zmm0, %zmm0, %zmm3 # zmm3 = zmm0[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14]
vshuff64x2 $170, %zmm3, %zmm3, %zmm4 # zmm4 = zmm3[4,5,4,5,4,5,4,5]
vmovaps %zmm30, %zmm31
vmovaps %zmm6, %zmm30
vmovaps %zmm13, %zmm6
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm4, %zmm10, %zmm21 # zmm21 = (zmm10 * zmm4) + zmm21
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $255, %zmm0, %zmm0, %zmm4 # zmm4 = zmm0[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
vshuff64x2 $170, %zmm4, %zmm4, %zmm5 # zmm5 = zmm4[4,5,4,5,4,5,4,5]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm5, %zmm10, %zmm18 # zmm18 = (zmm10 * zmm5) + zmm18
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vbroadcastss %xmm7, %zmm5
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vunpcklps %zmm13, %zmm9, %zmm7 # zmm7 = zmm9[0],zmm13[0],zmm9[1],zmm13[1],zmm9[4],zmm13[4],zmm9[5],zmm13[5],zmm9[8],zmm13[8],zmm9[9],zmm13[9],zmm9[12],zmm13[12],zmm9[13],zmm13[13]
vfmadd231ps %zmm5, %zmm10, %zmm25 # zmm25 = (zmm10 * zmm5) + zmm25
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $170, %ymm0, %ymm0, %ymm5 # ymm5 = ymm0[2,2,2,2,6,6,6,6]
vshuff64x2 $85, %zmm5, %zmm5, %zmm5 # zmm5 = zmm5[2,3,2,3,2,3,2,3]
vmovaps %zmm18, %zmm29
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vunpckhps %zmm13, %zmm9, %zmm18 # zmm18 = zmm9[2],zmm13[2],zmm9[3],zmm13[3],zmm9[6],zmm13[6],zmm9[7],zmm13[7],zmm9[10],zmm13[10],zmm9[11],zmm13[11],zmm9[14],zmm13[14],zmm9[15],zmm13[15]
vfmadd231ps %zmm5, %zmm10, %zmm26 # zmm26 = (zmm10 * zmm5) + zmm26
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vbroadcastsd %xmm1, %zmm5
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vmovaps %zmm9, %zmm1
vmovups %zmm25, 1456(%rsp) # 64-byte Spill
vmovups 1584(%rsp), %zmm25 # 64-byte Reload
vfmadd231ps %zmm5, %zmm10, %zmm20 # zmm20 = (zmm10 * zmm5) + zmm20
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $85, %ymm0, %ymm0, %ymm5 # ymm5 = ymm0[1,1,1,1,5,5,5,5]
vshuff64x2 $85, %zmm5, %zmm5, %zmm5 # zmm5 = zmm5[2,3,2,3,2,3,2,3]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm5, %zmm10, %zmm30 # zmm30 = (zmm10 * zmm5) + zmm30
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $255, %ymm0, %ymm0, %ymm5 # ymm5 = ymm0[3,3,3,3,7,7,7,7]
vextractf32x4 $3, %zmm0, %xmm0
vshuff64x2 $85, %zmm5, %zmm5, %zmm5 # zmm5 = zmm5[2,3,2,3,2,3,2,3]
vbroadcastss %xmm0, %zmm0
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm5, %zmm10, %zmm27 # zmm27 = (zmm10 * zmm5) + zmm27
vfmadd231ps %zmm0, %zmm10, %zmm11 # zmm11 = (zmm10 * zmm0) + zmm11
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshuff64x2 $255, %zmm2, %zmm2, %zmm0 # zmm0 = zmm2[6,7,6,7,6,7,6,7]
vmovups 752(%rsp), %ymm2 # 32-byte Reload
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm0, %zmm10, %zmm19 # zmm19 = (zmm10 * zmm0) + zmm19
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshuff64x2 $255, %zmm3, %zmm3, %zmm0 # zmm0 = zmm3[6,7,6,7,6,7,6,7]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vmovaps .LCPI0_144(%rip), %ymm3 # ymm3 = [0,1,2,3,4,5,15,u]
vfmadd231ps %zmm0, %zmm10, %zmm22 # zmm22 = (zmm10 * zmm0) + zmm22
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshuff64x2 $255, %zmm4, %zmm4, %zmm0 # zmm0 = zmm4[6,7,6,7,6,7,6,7]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd213ps 944(%rsp), %zmm0, %zmm10 # 64-byte Folded Reload
# zmm10 = (zmm0 * zmm10) + mem
vmovaps .LCPI0_143(%rip), %ymm4 # ymm4 = [0,1,2,3,4,15,u,u]
vpermt2ps %ymm13, %ymm2, %ymm9
vmovaps %xmm16, %xmm13
vmovaps .LCPI0_142(%rip), %ymm2 # ymm2 = [0,1,2,3,15,u,u,u]
vmovups 1648(%rsp), %zmm16 # 64-byte Reload
vextractf128 $1, %ymm9, %xmm0
vshufps $244, 880(%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
# xmm0 = xmm0[0,1],mem[3,3]
vmovdqu64 1200(%rsp), %zmm9 # 64-byte Reload
.loc 1 222 51 # 03-matrix-multiplication-cpu.py:222:51
vpcmpgtd .LCPI0_105(%rip){1to16}, %zmm9, %k1
vpcmpgtd .LCPI0_2(%rip){1to16}, %zmm9, %k2
vpcmpgtd .LCPI0_3(%rip){1to16}, %zmm9, %k5
vpcmpgtd .LCPI0_4(%rip){1to16}, %zmm9, %k6
vpcmpgtd .LCPI0_5(%rip){1to16}, %zmm9, %k7
vpcmpgtd .LCPI0_6(%rip){1to16}, %zmm9, %k3
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vblendps $8, %xmm13, %xmm0, %xmm0 # xmm0 = xmm0[0,1,2],xmm13[3]
vmovups 1264(%rsp), %zmm13 # 64-byte Reload
vpermt2ps %ymm13, %ymm2, %ymm0
vpermt2ps 688(%rsp), %ymm4, %ymm0 # 32-byte Folded Reload
vmovaps .LCPI0_146(%rip), %zmm2 # zmm2 = [0,1,2,3,4,5,6,7,8,23,u,u,u,u,u,u]
vpermt2ps 96(%rsp), %ymm3, %ymm0 # 32-byte Folded Reload
vmovaps .LCPI0_145(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,23,u,u,u,u,u,u,u]
vblendps $128, 624(%rsp), %ymm0, %ymm12 # 32-byte Folded Reload
# ymm12 = ymm0[0,1,2,3,4,5,6],mem[7]
vmovaps .LCPI0_149(%rip), %zmm0 # zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,23,u,u,u]
vpermt2ps %zmm16, %zmm3, %zmm12
vmovaps .LCPI0_147(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,9,23,u,u,u,u,u]
vpermt2ps %zmm25, %zmm2, %zmm12
vmovaps .LCPI0_148(%rip), %zmm2 # zmm2 = [0,1,2,3,4,5,6,7,8,9,10,23,u,u,u,u]
vpermt2ps %zmm15, %zmm3, %zmm12
vmovaps .LCPI0_150(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,23,u,u]
vmovups 1520(%rsp), %zmm15 # 64-byte Reload
vpermt2ps 1136(%rsp), %zmm2, %zmm12 # 64-byte Folded Reload
vmovaps .LCPI0_152(%rip), %zmm2 # zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,23]
vpermt2ps %zmm14, %zmm0, %zmm12
vpermt2ps 288(%rsp), %zmm3, %zmm12 # 64-byte Folded Reload
vmovaps .LCPI0_151(%rip), %zmm0 # zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,23,u]
vmovups 1008(%rsp), %zmm14 # 64-byte Reload
vpermt2ps %zmm8, %zmm0, %zmm12
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vmovups (%rsi,%rax), %zmm0 {%k1} {z}
.loc 1 222 51 is_stmt 0 # 03-matrix-multiplication-cpu.py:222:51
vpcmpgtd .LCPI0_103(%rip){1to16}, %zmm9, %k1
vmovaps %zmm31, %zmm8
vmovups 1136(%rsp), %zmm31 # 64-byte Reload
movq -40(%rsp), %rax # 8-byte Reload
.loc 1 224 35 is_stmt 1 # 03-matrix-multiplication-cpu.py:224:35
vpermt2ps %zmm28, %zmm2, %zmm12
vmovupd 1328(%rsp), %zmm28 # 64-byte Reload
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $255, %zmm0, %zmm0, %zmm2 # zmm2 = zmm0[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
kmovw %k1, 1008(%rsp) # 2-byte Spill
.loc 1 222 51 is_stmt 0 # 03-matrix-multiplication-cpu.py:222:51
vpcmpgtd .LCPI0_1(%rip){1to16}, %zmm9, %k1
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshuff64x2 $170, %zmm2, %zmm2, %zmm3 # zmm3 = zmm2[4,5,4,5,4,5,4,5]
vshuff64x2 $255, %zmm2, %zmm2, %zmm2 # zmm2 = zmm2[6,7,6,7,6,7,6,7]
.loc 1 224 35 is_stmt 1 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm3, %zmm12, %zmm29 # zmm29 = (zmm12 * zmm3) + zmm29
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $170, %zmm0, %zmm0, %zmm3 # zmm3 = zmm0[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm2, %zmm12, %zmm10 # zmm10 = (zmm12 * zmm2) + zmm10
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshuff64x2 $170, %zmm3, %zmm3, %zmm4 # zmm4 = zmm3[4,5,4,5,4,5,4,5]
vshuff64x2 $255, %zmm3, %zmm3, %zmm2 # zmm2 = zmm3[6,7,6,7,6,7,6,7]
vextractf32x4 $3, %zmm0, %xmm3
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm4, %zmm12, %zmm21 # zmm21 = (zmm12 * zmm4) + zmm21
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $85, %zmm0, %zmm0, %zmm4 # zmm4 = zmm0[1,1,1,1,5,5,5,5,9,9,9,9,13,13,13,13]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm2, %zmm12, %zmm22 # zmm22 = (zmm12 * zmm2) + zmm22
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshuff64x2 $255, %zmm4, %zmm4, %zmm2 # zmm2 = zmm4[6,7,6,7,6,7,6,7]
vshuff64x2 $170, %zmm4, %zmm4, %zmm5 # zmm5 = zmm4[4,5,4,5,4,5,4,5]
vshufps $255, %xmm0, %xmm0, %xmm4 # xmm4 = xmm0[3,3,3,3]
kmovw %k1, 880(%rsp) # 2-byte Spill
.loc 1 222 51 is_stmt 0 # 03-matrix-multiplication-cpu.py:222:51
vpcmpgtd .LCPI0_104(%rip){1to16}, %zmm9, %k1
.loc 1 224 35 is_stmt 1 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm2, %zmm12, %zmm19 # zmm19 = (zmm12 * zmm2) + zmm19
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vbroadcastss %xmm3, %zmm2
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vmovaps .LCPI0_18(%rip), %zmm3 # zmm3 = [1,17,2,3,5,21,6,7,9,25,10,11,13,29,14,15]
vfmadd231ps %zmm5, %zmm12, %zmm17 # zmm17 = (zmm12 * zmm5) + zmm17
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vextractf32x4 $2, %zmm0, %xmm5
vmovups %zmm21, 816(%rsp) # 64-byte Spill
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm2, %zmm12, %zmm11 # zmm11 = (zmm12 * zmm2) + zmm11
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $255, %ymm0, %ymm0, %ymm2 # ymm2 = ymm0[3,3,3,3,7,7,7,7]
vbroadcastss %xmm5, %zmm5
vshuff64x2 $85, %zmm2, %zmm2, %zmm2 # zmm2 = zmm2[2,3,2,3,2,3,2,3]
vmovaps %zmm17, %zmm21
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vmovaps %zmm7, %zmm17
vfmadd231ps %zmm5, %zmm12, %zmm8 # zmm8 = (zmm12 * zmm5) + zmm8
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $170, %xmm0, %xmm0, %xmm5 # xmm5 = xmm0[2,2,2,2]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm2, %zmm12, %zmm27 # zmm27 = (zmm12 * zmm2) + zmm27
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $85, %ymm0, %ymm0, %ymm2 # ymm2 = ymm0[1,1,1,1,5,5,5,5]
vbroadcastsd %xmm5, %zmm5
vshuff64x2 $85, %zmm2, %zmm2, %zmm2 # zmm2 = zmm2[2,3,2,3,2,3,2,3]
vmovups %zmm11, 352(%rsp) # 64-byte Spill
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm5, %zmm12, %zmm24 # zmm24 = (zmm12 * zmm5) + zmm24
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vmovshdup %xmm0, %xmm5 # xmm5 = xmm0[1,1,3,3]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm2, %zmm12, %zmm30 # zmm30 = (zmm12 * zmm2) + zmm30
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vbroadcastsd %xmm4, %zmm2
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vpermt2ps %zmm6, %zmm3, %zmm1
vmovups 1968(%rsp), %zmm3 # 64-byte Reload
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vbroadcastsd %xmm5, %zmm5
vmovups %zmm27, 32(%rsp) # 64-byte Spill
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm2, %zmm12, %zmm20 # zmm20 = (zmm12 * zmm2) + zmm20
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $170, %ymm0, %ymm0, %ymm2 # ymm2 = ymm0[2,2,2,2,6,6,6,6]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm5, %zmm12, %zmm15 # zmm15 = (zmm12 * zmm5) + zmm15
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vbroadcastss %xmm0, %zmm5
vextractf128 $1, %ymm0, %xmm0
vshuff64x2 $85, %zmm2, %zmm2, %zmm2 # zmm2 = zmm2[2,3,2,3,2,3,2,3]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm5, %zmm12, %zmm14 # zmm14 = (zmm12 * zmm5) + zmm14
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vbroadcastss %xmm0, %zmm0
vmovups 1904(%rsp), %zmm5 # 64-byte Reload
vmovups %zmm30, 1072(%rsp) # 64-byte Spill
vmovupd 1712(%rsp), %zmm30 # 64-byte Reload
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm2, %zmm12, %zmm26 # zmm26 = (zmm12 * zmm2) + zmm26
vmovaps .LCPI0_15(%rip), %zmm2 # zmm2 = [3,19,2,3,7,23,6,7,11,27,10,11,15,31,14,15]
vmovaps %zmm1, %zmm11
vfmadd213ps 1456(%rsp), %zmm0, %zmm12 # 64-byte Folded Reload
# zmm12 = (zmm0 * zmm12) + mem
vmovaps .LCPI0_34(%rip), %xmm0 # xmm0 = [8,9,25,u]
vmovaps %zmm26, %zmm27
vmovups 288(%rsp), %zmm26 # 64-byte Reload
vpermt2ps %zmm6, %zmm2, %zmm23
vmovapd .LCPI0_58(%rip), %xmm2 # xmm2 = [6,14]
vmovaps %zmm18, %zmm6
vpermt2ps %zmm3, %zmm0, %zmm11
vmovapd .LCPI0_41(%rip), %xmm0 # xmm0 = [4,13]
vmovaps %zmm23, %zmm9
vpermt2pd %zmm3, %zmm2, %zmm17
vmovaps .LCPI0_67(%rip), %xmm2 # xmm2 = [12,13,29,u]
vpermt2pd %zmm3, %zmm0, %zmm6
vmovaps .LCPI0_49(%rip), %xmm0 # xmm0 = [8,9,27,u]
vpermt2ps %zmm3, %zmm2, %zmm1
vmovapd .LCPI0_78(%rip), %xmm2 # xmm2 = [6,15]
vpermt2ps %zmm3, %zmm0, %zmm9
vmovaps .LCPI0_73(%rip), %zmm0 # zmm0 = [0,1,2,3,4,5,6,7,29,u,u,u,u,u,u,u]
vpermt2pd %zmm3, %zmm2, %zmm18
vmovaps .LCPI0_90(%rip), %xmm2 # xmm2 = [12,13,31,u]
vpermt2ps %zmm3, %zmm2, %zmm23
vmovapd .LCPI0_153(%rip), %xmm2 # xmm2 = [4,12]
vpermt2pd %zmm3, %zmm2, %zmm7
vmovaps .LCPI0_35(%rip), %xmm2 # xmm2 = [0,1,2,25]
vmovups 688(%rsp), %zmm3 # 64-byte Reload
vpermt2ps %zmm5, %zmm2, %zmm11
vmovaps .LCPI0_42(%rip), %xmm2 # xmm2 = [0,1,2,26]
vpermt2ps %zmm5, %zmm2, %zmm6
vmovaps .LCPI0_59(%rip), %xmm2 # xmm2 = [0,1,2,28]
vpermt2ps %zmm5, %zmm2, %zmm17
vmovaps .LCPI0_68(%rip), %xmm2 # xmm2 = [0,1,2,29]
vpermt2ps %zmm5, %zmm2, %zmm1
vmovaps .LCPI0_79(%rip), %xmm2 # xmm2 = [0,1,2,30]
vpermt2ps %zmm5, %zmm2, %zmm18
vmovaps .LCPI0_91(%rip), %xmm2 # xmm2 = [0,1,2,31]
vpermt2ps %zmm5, %zmm2, %zmm23
vmovaps .LCPI0_50(%rip), %xmm2 # xmm2 = [0,1,2,27]
vpermt2ps %zmm5, %zmm2, %zmm9
vmovaps .LCPI0_154(%rip), %xmm2 # xmm2 = [0,1,2,24]
vpermt2ps %zmm5, %zmm2, %zmm7
vmovaps .LCPI0_36(%rip), %ymm2 # ymm2 = [0,1,2,3,25,u,u,u]
vmovups 624(%rsp), %zmm5 # 64-byte Reload
vpermt2ps %zmm13, %zmm2, %zmm11
vmovapd .LCPI0_43(%rip), %ymm2 # ymm2 = [0,1,13,u]
vpermt2pd %zmm13, %zmm2, %zmm6
vmovapd .LCPI0_60(%rip), %ymm2 # ymm2 = [0,1,14,u]
vpermt2pd %zmm13, %zmm2, %zmm17
vmovaps .LCPI0_69(%rip), %ymm2 # ymm2 = [0,1,2,3,29,u,u,u]
vpermt2ps %zmm13, %zmm2, %zmm1
vmovaps .LCPI0_51(%rip), %ymm2 # ymm2 = [0,1,2,3,27,u,u,u]
vpermt2ps %zmm13, %zmm2, %zmm9
vmovapd .LCPI0_80(%rip), %ymm2 # ymm2 = [0,1,15,u]
vpermt2pd %zmm13, %zmm2, %zmm18
vmovaps .LCPI0_92(%rip), %ymm2 # ymm2 = [0,1,2,3,31,u,u,u]
vpermt2ps %zmm13, %zmm2, %zmm23
vmovapd .LCPI0_155(%rip), %ymm2 # ymm2 = [0,1,12,u]
vpermt2pd %zmm13, %zmm2, %zmm7
vmovaps .LCPI0_37(%rip), %ymm2 # ymm2 = [0,1,2,3,4,25,u,u]
vmovups 96(%rsp), %zmm13 # 64-byte Reload
vpermt2ps %zmm3, %zmm2, %zmm11
vmovaps .LCPI0_44(%rip), %ymm2 # ymm2 = [0,1,2,3,4,26,u,u]
vpermt2ps %zmm3, %zmm2, %zmm6
vmovaps .LCPI0_52(%rip), %ymm2 # ymm2 = [0,1,2,3,4,27,u,u]
vpermt2ps %zmm3, %zmm2, %zmm9
vmovaps .LCPI0_61(%rip), %ymm2 # ymm2 = [0,1,2,3,4,28,u,u]
vpermt2ps %zmm3, %zmm2, %zmm17
vmovaps .LCPI0_70(%rip), %ymm2 # ymm2 = [0,1,2,3,4,29,u,u]
vpermt2ps %zmm3, %zmm2, %zmm1
vmovaps .LCPI0_81(%rip), %ymm2 # ymm2 = [0,1,2,3,4,30,u,u]
vpermt2ps %zmm3, %zmm2, %zmm18
vmovaps .LCPI0_93(%rip), %ymm2 # ymm2 = [0,1,2,3,4,31,u,u]
vpermt2ps %zmm3, %zmm2, %zmm23
vmovaps .LCPI0_156(%rip), %ymm2 # ymm2 = [0,1,2,3,4,24,u,u]
vpermt2ps %zmm3, %zmm2, %zmm7
vmovaps .LCPI0_38(%rip), %ymm2 # ymm2 = [0,1,2,3,4,5,25,u]
vmovapd .LCPI0_47(%rip), %zmm3 # zmm3 = [0,1,2,3,13,u,u,u]
vpermt2ps %zmm13, %zmm2, %zmm11
vmovapd .LCPI0_45(%rip), %ymm2 # ymm2 = [0,1,2,13]
vpermt2pd %zmm13, %zmm2, %zmm6
vmovapd .LCPI0_62(%rip), %ymm2 # ymm2 = [0,1,2,14]
vpermt2pd %zmm13, %zmm2, %zmm17
vmovaps .LCPI0_71(%rip), %ymm2 # ymm2 = [0,1,2,3,4,5,29,u]
vpermt2ps %zmm13, %zmm2, %zmm1
vmovapd .LCPI0_82(%rip), %ymm2 # ymm2 = [0,1,2,15]
vpermt2pd %zmm13, %zmm2, %zmm18
vmovaps .LCPI0_94(%rip), %ymm2 # ymm2 = [0,1,2,3,4,5,31,u]
vpermt2ps %zmm13, %zmm2, %zmm23
vmovaps .LCPI0_53(%rip), %ymm2 # ymm2 = [0,1,2,3,4,5,27,u]
vpermt2ps %zmm13, %zmm2, %zmm9
vmovapd .LCPI0_157(%rip), %ymm2 # ymm2 = [0,1,2,12]
vpermt2pd %zmm13, %zmm2, %zmm7
vmovaps .LCPI0_39(%rip), %ymm2 # ymm2 = [0,1,2,3,4,5,6,25]
vpermt2ps %zmm5, %zmm2, %zmm11
vmovaps .LCPI0_46(%rip), %ymm2 # ymm2 = [0,1,2,3,4,5,6,26]
vpermt2ps %zmm5, %zmm2, %zmm6
vmovaps .LCPI0_63(%rip), %ymm2 # ymm2 = [0,1,2,3,4,5,6,28]
vpermt2pd %zmm16, %zmm3, %zmm6
vmovaps .LCPI0_48(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,26,u,u,u,u,u,u]
vpermt2ps %zmm5, %zmm2, %zmm17
vmovaps .LCPI0_72(%rip), %ymm2 # ymm2 = [0,1,2,3,4,5,6,29]
vpermt2ps %zmm25, %zmm3, %zmm6
vmovaps .LCPI0_74(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,29,u,u,u,u,u,u]
vpermt2ps %zmm5, %zmm2, %zmm1
vmovaps .LCPI0_54(%rip), %ymm2 # ymm2 = [0,1,2,3,4,5,6,27]
vpermt2ps %zmm16, %zmm0, %zmm1
vmovapd .LCPI0_84(%rip), %zmm0 # zmm0 = [0,1,2,3,15,u,u,u]
vpermt2ps %zmm5, %zmm2, %zmm9
vmovaps .LCPI0_83(%rip), %ymm2 # ymm2 = [0,1,2,3,4,5,6,30]
vpermt2ps %zmm25, %zmm3, %zmm1
vmovaps .LCPI0_56(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,27,u,u,u,u,u,u]
vpermt2ps %zmm5, %zmm2, %zmm18
vmovaps .LCPI0_95(%rip), %ymm2 # ymm2 = [0,1,2,3,4,5,6,31]
vpermt2pd %zmm16, %zmm0, %zmm18
vmovaps .LCPI0_96(%rip), %zmm0 # zmm0 = [0,1,2,3,4,5,6,7,31,u,u,u,u,u,u,u]
vpermt2ps %zmm5, %zmm2, %zmm23
vmovaps .LCPI0_158(%rip), %ymm2 # ymm2 = [0,1,2,3,4,5,6,24]
vpermt2ps %zmm16, %zmm0, %zmm23
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vmovups (%rsi,%rax), %zmm0 {%k3} {z}
kmovw -114(%rsp), %k3 # 2-byte Reload
movq 16(%rsp), %rax # 8-byte Reload
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vpermt2ps %zmm5, %zmm2, %zmm7
vmovaps .LCPI0_40(%rip), %zmm5 # zmm5 = [0,1,2,3,4,5,6,7,25,u,u,u,u,u,u,u]
vshuff64x2 $228, %zmm16, %zmm7, %zmm13 # zmm13 = zmm7[0,1,2,3],zmm16[4,5,6,7]
vmovaps .LCPI0_64(%rip), %zmm7 # zmm7 = [0,1,2,3,4,5,6,7,8,28,u,u,u,u,u,u]
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vbroadcastss %xmm0, %zmm2
vshufps $170, %xmm0, %xmm0, %xmm4 # xmm4 = xmm0[2,2,2,2]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vpermt2ps %zmm16, %zmm5, %zmm11
vmovaps .LCPI0_55(%rip), %zmm5 # zmm5 = [0,1,2,3,4,5,6,7,27,u,u,u,u,u,u,u]
.loc 1 221 20 # 03-matrix-multiplication-cpu.py:221:20
vmovaps %zmm25, %zmm11 {%k3}
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vpermt2ps %zmm16, %zmm5, %zmm9
vshuff64x2 $244, %zmm16, %zmm17, %zmm5 # zmm5 = zmm17[0,1,2,3],zmm16[6,7,6,7]
vmovaps .LCPI0_161(%rip), %zmm16 # zmm16 = [0,1,2,3,4,5,6,7,8,9,10,24,u,u,u,u]
vmovups 560(%rsp), %zmm17 # 64-byte Reload
vpermt2ps %zmm25, %zmm7, %zmm5
vmovaps .LCPI0_85(%rip), %zmm7 # zmm7 = [0,1,2,3,4,5,6,7,8,30,u,u,u,u,u,u]
vpermt2ps %zmm25, %zmm3, %zmm9
vmovaps .LCPI0_159(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,24,u,u,u,u,u,u]
vpermt2ps %zmm25, %zmm7, %zmm18
vmovaps .LCPI0_97(%rip), %zmm7 # zmm7 = [0,1,2,3,4,5,6,7,8,31,u,u,u,u,u,u]
vpermt2ps %zmm25, %zmm3, %zmm13
vmovapd .LCPI0_160(%rip), %zmm3 # zmm3 = [0,1,2,3,4,12,u,u]
vpermt2ps %zmm25, %zmm7, %zmm23
vmovapd .LCPI0_162(%rip), %zmm7 # zmm7 = [0,1,2,3,4,5,12,u]
vmovupd 416(%rsp), %zmm25 # 64-byte Reload
vpermt2pd %zmm30, %zmm3, %zmm13
vmovaps .LCPI0_165(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,24]
vpermt2ps %zmm31, %zmm16, %zmm13
vmovaps .LCPI0_163(%rip), %zmm16 # zmm16 = [0,1,2,3,4,5,6,7,8,9,10,11,12,24,u,u]
vpermt2pd %zmm28, %zmm7, %zmm13
vmovapd .LCPI0_164(%rip), %zmm7 # zmm7 = [0,1,2,3,4,5,6,12]
vpermt2ps %zmm26, %zmm16, %zmm13
vmovaps %zmm21, %zmm16
vmovups 816(%rsp), %zmm21 # 64-byte Reload
vpermt2pd %zmm25, %zmm7, %zmm13
vpermt2ps %zmm17, %zmm3, %zmm13
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vmovshdup %xmm0, %xmm3 # xmm3 = xmm0[1,1,3,3]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm2, %zmm13, %zmm14 # zmm14 = (zmm13 * zmm2) + zmm14
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vbroadcastsd %xmm3, %zmm2
vextractf32x4 $2, %zmm0, %xmm3
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm2, %zmm13, %zmm15 # zmm15 = (zmm13 * zmm2) + zmm15
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vbroadcastsd %xmm4, %zmm2
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm2, %zmm13, %zmm24 # zmm24 = (zmm13 * zmm2) + zmm24
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vbroadcastss %xmm3, %zmm2
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm2, %zmm13, %zmm8 # zmm8 = (zmm13 * zmm2) + zmm8
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $85, %zmm0, %zmm0, %zmm2 # zmm2 = zmm0[1,1,1,1,5,5,5,5,9,9,9,9,13,13,13,13]
vshuff64x2 $170, %zmm2, %zmm2, %zmm3 # zmm3 = zmm2[4,5,4,5,4,5,4,5]
vshuff64x2 $255, %zmm2, %zmm2, %zmm2 # zmm2 = zmm2[6,7,6,7,6,7,6,7]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm3, %zmm13, %zmm16 # zmm16 = (zmm13 * zmm3) + zmm16
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $170, %zmm0, %zmm0, %zmm3 # zmm3 = zmm0[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm2, %zmm13, %zmm19 # zmm19 = (zmm13 * zmm2) + zmm19
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshuff64x2 $170, %zmm3, %zmm3, %zmm4 # zmm4 = zmm3[4,5,4,5,4,5,4,5]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm4, %zmm13, %zmm21 # zmm21 = (zmm13 * zmm4) + zmm21
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $255, %zmm0, %zmm0, %zmm4 # zmm4 = zmm0[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
vshuff64x2 $170, %zmm4, %zmm4, %zmm7 # zmm7 = zmm4[4,5,4,5,4,5,4,5]
vshuff64x2 $255, %zmm4, %zmm4, %zmm4 # zmm4 = zmm4[6,7,6,7,6,7,6,7]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm7, %zmm13, %zmm29 # zmm29 = (zmm13 * zmm7) + zmm29
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vextractf128 $1, %ymm0, %xmm7
vbroadcastss %xmm7, %zmm7
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm7, %zmm13, %zmm12 # zmm12 = (zmm13 * zmm7) + zmm12
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $170, %ymm0, %ymm0, %ymm7 # ymm7 = ymm0[2,2,2,2,6,6,6,6]
vshuff64x2 $85, %zmm7, %zmm7, %zmm7 # zmm7 = zmm7[2,3,2,3,2,3,2,3]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm7, %zmm13, %zmm27 # zmm27 = (zmm13 * zmm7) + zmm27
vmovups %zmm27, 160(%rsp) # 64-byte Spill
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $255, %xmm0, %xmm0, %xmm27 # xmm27 = xmm0[3,3,3,3]
vbroadcastsd %xmm27, %zmm7
vmovups 32(%rsp), %zmm27 # 64-byte Reload
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm7, %zmm13, %zmm20 # zmm20 = (zmm13 * zmm7) + zmm20
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $85, %ymm0, %ymm0, %ymm7 # ymm7 = ymm0[1,1,1,1,5,5,5,5]
vshuff64x2 $85, %zmm7, %zmm7, %zmm7 # zmm7 = zmm7[2,3,2,3,2,3,2,3]
vmovups %zmm20, 224(%rsp) # 64-byte Spill
vmovups 1072(%rsp), %zmm20 # 64-byte Reload
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm7, %zmm13, %zmm20 # zmm20 = (zmm13 * zmm7) + zmm20
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $255, %ymm0, %ymm0, %ymm7 # ymm7 = ymm0[3,3,3,3,7,7,7,7]
vextractf32x4 $3, %zmm0, %xmm0
vshuff64x2 $85, %zmm7, %zmm7, %zmm7 # zmm7 = zmm7[2,3,2,3,2,3,2,3]
vbroadcastss %xmm0, %zmm0
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm7, %zmm13, %zmm27 # zmm27 = (zmm13 * zmm7) + zmm27
vmovapd .LCPI0_65(%rip), %zmm7 # zmm7 = [0,1,2,3,4,14,u,u]
vmovups %zmm27, 32(%rsp) # 64-byte Spill
vmovups 352(%rsp), %zmm27 # 64-byte Reload
vpermt2pd %zmm30, %zmm7, %zmm5
vmovapd .LCPI0_86(%rip), %zmm7 # zmm7 = [0,1,2,3,4,15,u,u]
vfmadd231ps %zmm0, %zmm13, %zmm27 # zmm27 = (zmm13 * zmm0) + zmm27
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshuff64x2 $255, %zmm3, %zmm3, %zmm0 # zmm0 = zmm3[6,7,6,7,6,7,6,7]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vmovaps .LCPI0_75(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,9,29,u,u,u,u,u]
vfmadd231ps %zmm0, %zmm13, %zmm22 # zmm22 = (zmm13 * zmm0) + zmm22
vfmadd213ps %zmm10, %zmm4, %zmm13 # zmm13 = (zmm4 * zmm13) + zmm10
vmovaps .LCPI0_166(%rip), %zmm10 # zmm10 = [0,1,2,3,4,5,6,7,8,9,25,u,u,u,u,u]
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vmovups (%rsi,%rax), %zmm0 {%k1} {z}
kmovw -116(%rsp), %k1 # 2-byte Reload
movq -24(%rsp), %rax # 8-byte Reload
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vpermt2pd %zmm30, %zmm7, %zmm18
vmovaps .LCPI0_167(%rip), %zmm7 # zmm7 = [0,1,2,3,4,5,6,7,8,9,10,25,u,u,u,u]
.loc 1 221 20 # 03-matrix-multiplication-cpu.py:221:20
vmovapd %zmm30, %zmm6 {%k1}
kmovw -118(%rsp), %k1 # 2-byte Reload
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vpermt2ps %zmm30, %zmm3, %zmm1
vmovaps .LCPI0_98(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,9,31,u,u,u,u,u]
vpermt2ps %zmm30, %zmm10, %zmm11
vmovaps .LCPI0_169(%rip), %zmm10 # zmm10 = [0,1,2,3,4,5,6,7,8,9,10,11,12,25,u,u]
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $170, %zmm0, %zmm0, %zmm2 # zmm2 = zmm0[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vpermt2ps %zmm31, %zmm7, %zmm11
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshuff64x2 $170, %zmm2, %zmm2, %zmm7 # zmm7 = zmm2[4,5,4,5,4,5,4,5]
vshuff64x2 $255, %zmm2, %zmm2, %zmm2 # zmm2 = zmm2[6,7,6,7,6,7,6,7]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vpermt2ps %zmm30, %zmm3, %zmm23
vmovaps .LCPI0_57(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,9,27,u,u,u,u,u]
vpermt2ps %zmm30, %zmm3, %zmm9
vmovaps .LCPI0_168(%rip), %zmm30 # zmm30 = [0,1,2,3,4,5,6,7,8,9,10,11,25,u,u,u]
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $255, %zmm0, %zmm0, %zmm3 # zmm3 = zmm0[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
vshuff64x2 $170, %zmm3, %zmm3, %zmm4 # zmm4 = zmm3[4,5,4,5,4,5,4,5]
vshuff64x2 $255, %zmm3, %zmm3, %zmm3 # zmm3 = zmm3[6,7,6,7,6,7,6,7]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vpermt2ps %zmm28, %zmm30, %zmm11
vpermt2ps %zmm26, %zmm10, %zmm11
vmovaps .LCPI0_170(%rip), %zmm26 # zmm26 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,25,u]
vmovaps .LCPI0_171(%rip), %zmm10 # zmm10 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,25]
vpermt2ps %zmm25, %zmm26, %zmm11
vpermt2ps %zmm17, %zmm10, %zmm11
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vmovups (%rsi,%rax), %zmm10 {%k7} {z}
movq 544(%rsp), %rax # 8-byte Reload
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm3, %zmm11, %zmm13 # zmm13 = (zmm11 * zmm3) + zmm13
vmovaps .LCPI0_66(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,9,10,28,u,u,u,u]
vfmadd231ps %zmm4, %zmm11, %zmm29 # zmm29 = (zmm11 * zmm4) + zmm29
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $85, %zmm0, %zmm0, %zmm4 # zmm4 = zmm0[1,1,1,1,5,5,5,5,9,9,9,9,13,13,13,13]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm7, %zmm11, %zmm21 # zmm21 = (zmm11 * zmm7) + zmm21
vfmadd231ps %zmm2, %zmm11, %zmm22 # zmm22 = (zmm11 * zmm2) + zmm22
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshuff64x2 $170, %zmm4, %zmm4, %zmm7 # zmm7 = zmm4[4,5,4,5,4,5,4,5]
vshuff64x2 $255, %zmm4, %zmm4, %zmm2 # zmm2 = zmm4[6,7,6,7,6,7,6,7]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm7, %zmm11, %zmm16 # zmm16 = (zmm11 * zmm7) + zmm16
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vextractf32x4 $2, %zmm0, %xmm7
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm2, %zmm11, %zmm19 # zmm19 = (zmm11 * zmm2) + zmm19
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vextractf32x4 $3, %zmm0, %xmm2
vbroadcastss %xmm10, %zmm4
vmovups %zmm29, 96(%rsp) # 64-byte Spill
vbroadcastss %xmm7, %zmm7
vbroadcastss %xmm2, %zmm2
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm7, %zmm11, %zmm8 # zmm8 = (zmm11 * zmm7) + zmm8
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $170, %xmm0, %xmm0, %xmm7 # xmm7 = xmm0[2,2,2,2]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm2, %zmm11, %zmm27 # zmm27 = (zmm11 * zmm2) + zmm27
vmovaps %zmm16, %zmm29
vmovups 32(%rsp), %zmm16 # 64-byte Reload
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $255, %ymm0, %ymm0, %ymm2 # ymm2 = ymm0[3,3,3,3,7,7,7,7]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vpermt2ps %zmm31, %zmm3, %zmm5
vmovaps .LCPI0_87(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,9,10,30,u,u,u,u]
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vbroadcastsd %xmm7, %zmm7
vshuff64x2 $85, %zmm2, %zmm2, %zmm2 # zmm2 = zmm2[2,3,2,3,2,3,2,3]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm7, %zmm11, %zmm24 # zmm24 = (zmm11 * zmm7) + zmm24
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vmovshdup %xmm0, %xmm7 # xmm7 = xmm0[1,1,3,3]
vbroadcastsd %xmm7, %zmm7
vmovups %zmm27, 352(%rsp) # 64-byte Spill
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vmovaps .LCPI0_172(%rip), %zmm27 # zmm27 = [0,1,2,3,4,5,6,7,8,9,10,26,u,u,u,u]
vmovups %zmm8, 944(%rsp) # 64-byte Spill
vmovapd %zmm28, %zmm8
vfmadd231ps %zmm7, %zmm11, %zmm15 # zmm15 = (zmm11 * zmm7) + zmm15
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vbroadcastss %xmm0, %zmm7
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm7, %zmm11, %zmm14 # zmm14 = (zmm11 * zmm7) + zmm14
vmovaps .LCPI0_76(%rip), %zmm7 # zmm7 = [0,1,2,3,4,5,6,7,8,9,10,29,u,u,u,u]
vfmadd231ps %zmm2, %zmm11, %zmm16 # zmm16 = (zmm11 * zmm2) + zmm16
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $85, %ymm0, %ymm0, %ymm2 # ymm2 = ymm0[1,1,1,1,5,5,5,5]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vpermt2ps %zmm31, %zmm3, %zmm18
vmovaps .LCPI0_99(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,9,10,31,u,u,u,u]
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshuff64x2 $85, %zmm2, %zmm2, %zmm2 # zmm2 = zmm2[2,3,2,3,2,3,2,3]
vmovaps %zmm15, %zmm26
vmovups (%rsi,%rax), %zmm15 {%k6} {z}
movq -16(%rsp), %rax # 8-byte Reload
vmovaps %zmm14, %zmm30
vmovups 160(%rsp), %zmm14 # 64-byte Reload
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vpermt2ps %zmm31, %zmm7, %zmm1
vmovaps %zmm31, %zmm7
vpermt2ps %zmm7, %zmm27, %zmm6
.loc 1 221 20 # 03-matrix-multiplication-cpu.py:221:20
vmovaps %zmm7, %zmm9 {%k1}
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vmovapd .LCPI0_173(%rip), %zmm7 # zmm7 = [0,1,2,3,4,5,13,u]
vmovapd .LCPI0_175(%rip), %zmm27 # zmm27 = [0,1,2,3,4,5,6,13]
vpermt2ps %zmm31, %zmm3, %zmm23
vmovaps .LCPI0_100(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,31,u,u,u]
vmovaps %zmm20, %zmm31
vmovups 224(%rsp), %zmm20 # 64-byte Reload
vfmadd231ps %zmm2, %zmm11, %zmm31 # zmm31 = (zmm11 * zmm2) + zmm31
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $255, %xmm0, %xmm0, %xmm2 # xmm2 = xmm0[3,3,3,3]
vbroadcastsd %xmm2, %zmm2
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vpermt2pd %zmm8, %zmm7, %zmm6
vmovaps .LCPI0_174(%rip), %zmm7 # zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,26,u,u]
vpermt2ps %zmm28, %zmm3, %zmm23
vmovups 288(%rsp), %zmm28 # 64-byte Reload
vmovaps .LCPI0_101(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,31,u,u]
vfmadd231ps %zmm2, %zmm11, %zmm20 # zmm20 = (zmm11 * zmm2) + zmm20
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vextractf128 $1, %ymm0, %xmm2
vshufps $170, %ymm0, %ymm0, %ymm0 # ymm0 = ymm0[2,2,2,2,6,6,6,6]
vshuff64x2 $85, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[2,3,2,3,2,3,2,3]
vbroadcastss %xmm2, %zmm2
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm0, %zmm11, %zmm14 # zmm14 = (zmm11 * zmm0) + zmm14
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $255, %zmm10, %zmm10, %zmm0 # zmm0 = zmm10[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd213ps %zmm12, %zmm2, %zmm11 # zmm11 = (zmm2 * zmm11) + zmm12
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $170, %xmm10, %xmm10, %xmm12 # xmm12 = xmm10[2,2,2,2]
vshufps $170, %zmm10, %zmm10, %zmm2 # zmm2 = zmm10[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vpermt2ps %zmm28, %zmm7, %zmm6
vmovaps .LCPI0_102(%rip), %zmm7 # zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,31,u]
vpermt2ps %zmm28, %zmm3, %zmm23
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $85, %zmm10, %zmm10, %zmm3 # zmm3 = zmm10[1,1,1,1,5,5,5,5,9,9,9,9,13,13,13,13]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vpermt2pd %zmm25, %zmm27, %zmm6
vmovaps %zmm17, %zmm27
vpermt2ps %zmm25, %zmm7, %zmm23
vmovaps .LCPI0_176(%rip), %zmm7 # zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,26]
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshuff64x2 $170, %zmm3, %zmm3, %zmm25 # zmm25 = zmm3[4,5,4,5,4,5,4,5]
vshuff64x2 $255, %zmm3, %zmm3, %zmm3 # zmm3 = zmm3[6,7,6,7,6,7,6,7]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vpermt2ps %zmm17, %zmm7, %zmm6
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vmovshdup %xmm10, %xmm7 # xmm7 = xmm10[1,1,3,3]
vmovups 944(%rsp), %zmm17 # 64-byte Reload
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm4, %zmm6, %zmm30 # zmm30 = (zmm6 * zmm4) + zmm30
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vbroadcastsd %xmm7, %zmm4
vextractf32x4 $2, %zmm10, %xmm7
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm25, %zmm6, %zmm29 # zmm29 = (zmm6 * zmm25) + zmm29
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshuff64x2 $170, %zmm0, %zmm0, %zmm25 # zmm25 = zmm0[4,5,4,5,4,5,4,5]
vshuff64x2 $255, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[6,7,6,7,6,7,6,7]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm3, %zmm6, %zmm19 # zmm19 = (zmm6 * zmm3) + zmm19
vmovapd .LCPI0_88(%rip), %zmm3 # zmm3 = [0,1,2,3,4,5,15,u]
vfmadd231ps %zmm4, %zmm6, %zmm26 # zmm26 = (zmm6 * zmm4) + zmm26
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vbroadcastsd %xmm12, %zmm4
vshuff64x2 $170, %zmm2, %zmm2, %zmm12 # zmm12 = zmm2[4,5,4,5,4,5,4,5]
vshuff64x2 $255, %zmm2, %zmm2, %zmm2 # zmm2 = zmm2[6,7,6,7,6,7,6,7]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm4, %zmm6, %zmm24 # zmm24 = (zmm6 * zmm4) + zmm24
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vbroadcastss %xmm7, %zmm4
vshufps $255, %xmm10, %xmm10, %xmm7 # xmm7 = xmm10[3,3,3,3]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm12, %zmm6, %zmm21 # zmm21 = (zmm6 * zmm12) + zmm21
vmovups 96(%rsp), %zmm12 # 64-byte Reload
vfmadd231ps %zmm2, %zmm6, %zmm22 # zmm22 = (zmm6 * zmm2) + zmm22
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $85, %zmm15, %zmm15, %zmm2 # zmm2 = zmm15[1,1,1,1,5,5,5,5,9,9,9,9,13,13,13,13]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm4, %zmm6, %zmm17 # zmm17 = (zmm6 * zmm4) + zmm17
vmovups %zmm24, 32(%rsp) # 64-byte Spill
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vextractf32x4 $1, %ymm10, %xmm24
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vpermt2pd %zmm8, %zmm3, %zmm18
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $255, %zmm15, %zmm15, %zmm3 # zmm3 = zmm15[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
vbroadcastss %xmm24, %zmm4
vmovups 416(%rsp), %zmm24 # 64-byte Reload
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm4, %zmm6, %zmm11 # zmm11 = (zmm6 * zmm4) + zmm11
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $170, %ymm10, %ymm10, %ymm4 # ymm4 = ymm10[2,2,2,2,6,6,6,6]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm25, %zmm6, %zmm12 # zmm12 = (zmm6 * zmm25) + zmm12
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshuff64x2 $85, %zmm4, %zmm4, %zmm4 # zmm4 = zmm4[2,3,2,3,2,3,2,3]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm4, %zmm6, %zmm14 # zmm14 = (zmm6 * zmm4) + zmm14
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vbroadcastsd %xmm7, %zmm4
vmovups (%rsi,%rax), %zmm7 {%k5} {z}
movw $-32768, %ax # imm = 0x8000
kmovd %eax, %k1
movq 552(%rsp), %rax # 8-byte Reload
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm4, %zmm6, %zmm20 # zmm20 = (zmm6 * zmm4) + zmm20
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $85, %ymm10, %ymm10, %ymm4 # ymm4 = ymm10[1,1,1,1,5,5,5,5]
.loc 1 221 20 # 03-matrix-multiplication-cpu.py:221:20
vmovaps %zmm27, %zmm23 {%k1}
kmovw 880(%rsp), %k1 # 2-byte Reload
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshuff64x2 $85, %zmm4, %zmm4, %zmm4 # zmm4 = zmm4[2,3,2,3,2,3,2,3]
vmovups %zmm14, 160(%rsp) # 64-byte Spill
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm4, %zmm6, %zmm31 # zmm31 = (zmm6 * zmm4) + zmm31
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $255, %ymm10, %ymm10, %ymm4 # ymm4 = ymm10[3,3,3,3,7,7,7,7]
vshuff64x2 $85, %zmm4, %zmm4, %zmm4 # zmm4 = zmm4[2,3,2,3,2,3,2,3]
vmovups (%rsi,%rax), %zmm14 {%k2} {z}
movq -8(%rsp), %rax # 8-byte Reload
vmovaps %zmm20, %zmm25
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vmovaps .LCPI0_177(%rip), %zmm20 # zmm20 = [0,1,2,3,4,5,6,7,8,9,10,11,27,u,u,u]
vfmadd231ps %zmm4, %zmm6, %zmm16 # zmm16 = (zmm6 * zmm4) + zmm16
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vextractf32x4 $3, %zmm10, %xmm4
vmovups %zmm31, 1072(%rsp) # 64-byte Spill
vmovups 352(%rsp), %zmm31 # 64-byte Reload
vbroadcastss %xmm4, %zmm4
vmovups (%rsi,%rax), %zmm10 {%k1} {z}
kmovw -120(%rsp), %k1 # 2-byte Reload
movq 24(%rsp), %rax # 8-byte Reload
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vpermt2ps %zmm8, %zmm20, %zmm9
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vbroadcastss %xmm15, %zmm20
.loc 1 221 20 # 03-matrix-multiplication-cpu.py:221:20
vmovapd %zmm8, %zmm5 {%k1}
kmovw 1008(%rsp), %k1 # 2-byte Reload
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm4, %zmm6, %zmm31 # zmm31 = (zmm6 * zmm4) + zmm31
vfmadd213ps %zmm13, %zmm0, %zmm6 # zmm6 = (zmm0 * zmm6) + zmm13
vmovaps .LCPI0_77(%rip), %zmm13 # zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,29,u,u,u]
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshuff64x2 $170, %zmm3, %zmm3, %zmm4 # zmm4 = zmm3[4,5,4,5,4,5,4,5]
vshufps $170, %zmm15, %zmm15, %zmm0 # zmm0 = zmm15[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14]
vshuff64x2 $255, %zmm3, %zmm3, %zmm3 # zmm3 = zmm3[6,7,6,7,6,7,6,7]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vpermt2ps %zmm8, %zmm13, %zmm1
vmovaps .LCPI0_178(%rip), %zmm8 # zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,27,u,u]
vmovaps %zmm30, %zmm13
vmovups 1072(%rsp), %zmm30 # 64-byte Reload
vpermt2ps %zmm28, %zmm8, %zmm9
vmovaps .LCPI0_179(%rip), %zmm8 # zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,27,u]
vpermt2ps %zmm24, %zmm8, %zmm9
vmovaps .LCPI0_180(%rip), %zmm8 # zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,27]
vpermt2ps %zmm27, %zmm8, %zmm9
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vmovshdup %xmm15, %xmm8 # xmm8 = xmm15[1,1,3,3]
vmovaps %zmm19, %zmm27
vmovaps %zmm24, %zmm19
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm4, %zmm9, %zmm12 # zmm12 = (zmm9 * zmm4) + zmm12
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshuff64x2 $170, %zmm2, %zmm2, %zmm4 # zmm4 = zmm2[4,5,4,5,4,5,4,5]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm20, %zmm9, %zmm13 # zmm13 = (zmm9 * zmm20) + zmm13
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vmovups (%rsi,%rax), %zmm20 {%k1} {z}
kmovw -122(%rsp), %k1 # 2-byte Reload
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm3, %zmm9, %zmm6 # zmm6 = (zmm9 * zmm3) + zmm6
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $85, %zmm7, %zmm7, %zmm3 # zmm3 = zmm7[1,1,1,1,5,5,5,5,9,9,9,9,13,13,13,13]
.loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22
addq 504(%rsp), %rsi # 8-byte Folded Reload
decl %ebx
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm4, %zmm9, %zmm29 # zmm29 = (zmm9 * zmm4) + zmm29
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vextractf32x4 $2, %zmm15, %xmm4
vmovups %zmm12, 96(%rsp) # 64-byte Spill
vshuff64x2 $170, %zmm0, %zmm0, %zmm12 # zmm12 = zmm0[4,5,4,5,4,5,4,5]
vbroadcastss %xmm4, %zmm4
vshuff64x2 $255, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[6,7,6,7,6,7,6,7]
.loc 1 221 20 # 03-matrix-multiplication-cpu.py:221:20
vmovaps %zmm28, %zmm1 {%k1}
kmovw -124(%rsp), %k1 # 2-byte Reload
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm12, %zmm9, %zmm21 # zmm21 = (zmm9 * zmm12) + zmm21
vfmadd231ps %zmm4, %zmm9, %zmm17 # zmm17 = (zmm9 * zmm4) + zmm17
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $170, %xmm15, %xmm15, %xmm4 # xmm4 = xmm15[2,2,2,2]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm0, %zmm9, %zmm22 # zmm22 = (zmm9 * zmm0) + zmm22
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshuff64x2 $255, %zmm2, %zmm2, %zmm0 # zmm0 = zmm2[6,7,6,7,6,7,6,7]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vmovaps .LCPI0_89(%rip), %zmm2 # zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,30,u,u]
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vbroadcastsd %xmm4, %zmm4
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm0, %zmm9, %zmm27 # zmm27 = (zmm9 * zmm0) + zmm27
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vextractf32x4 $3, %zmm15, %xmm0
vmovaps %zmm21, %zmm12
vmovups 32(%rsp), %zmm21 # 64-byte Reload
vbroadcastss %xmm0, %zmm0
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm0, %zmm9, %zmm31 # zmm31 = (zmm9 * zmm0) + zmm31
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $255, %ymm15, %ymm15, %ymm0 # ymm0 = ymm15[3,3,3,3,7,7,7,7]
vshuff64x2 $85, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[2,3,2,3,2,3,2,3]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vpermt2ps %zmm28, %zmm2, %zmm18
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $170, %ymm15, %ymm15, %ymm2 # ymm2 = ymm15[2,2,2,2,6,6,6,6]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm0, %zmm9, %zmm16 # zmm16 = (zmm9 * zmm0) + zmm16
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $85, %ymm15, %ymm15, %ymm0 # ymm0 = ymm15[1,1,1,1,5,5,5,5]
vshuff64x2 $85, %zmm2, %zmm2, %zmm2 # zmm2 = zmm2[2,3,2,3,2,3,2,3]
vshuff64x2 $85, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[2,3,2,3,2,3,2,3]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm4, %zmm9, %zmm21 # zmm21 = (zmm9 * zmm4) + zmm21
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vbroadcastsd %xmm8, %zmm4
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vmovaps .LCPI0_181(%rip), %zmm8 # zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,28,u,u]
vfmadd231ps %zmm0, %zmm9, %zmm30 # zmm30 = (zmm9 * zmm0) + zmm30
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $255, %xmm15, %xmm15, %xmm0 # xmm0 = xmm15[3,3,3,3]
vbroadcastsd %xmm0, %zmm0
.loc 1 221 20 # 03-matrix-multiplication-cpu.py:221:20
vmovapd %zmm19, %zmm18 {%k1}
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm4, %zmm9, %zmm26 # zmm26 = (zmm9 * zmm4) + zmm26
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vbroadcastss %xmm7, %zmm4
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm0, %zmm9, %zmm25 # zmm25 = (zmm9 * zmm0) + zmm25
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vextractf128 $1, %ymm15, %xmm0
vmovups 160(%rsp), %zmm15 # 64-byte Reload
vbroadcastss %xmm0, %zmm0
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vpermt2ps %zmm28, %zmm8, %zmm5
vmovapd .LCPI0_182(%rip), %zmm8 # zmm8 = [0,1,2,3,4,5,6,14]
vmovups 560(%rsp), %zmm28 # 64-byte Reload
vfmadd231ps %zmm2, %zmm9, %zmm15 # zmm15 = (zmm9 * zmm2) + zmm15
vfmadd213ps %zmm11, %zmm0, %zmm9 # zmm9 = (zmm0 * zmm9) + zmm11
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vmovshdup %xmm7, %xmm11 # xmm11 = xmm7[1,1,3,3]
vshufps $170, %zmm7, %zmm7, %zmm0 # zmm0 = zmm7[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14]
vshufps $255, %zmm7, %zmm7, %zmm2 # zmm2 = zmm7[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vpermt2pd %zmm24, %zmm8, %zmm5
vmovaps .LCPI0_183(%rip), %zmm8 # zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,28]
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vextractf32x4 $1, %ymm7, %xmm24
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vpermt2ps %zmm28, %zmm8, %zmm5
vmovaps %zmm30, %zmm8
vfmadd231ps %zmm4, %zmm5, %zmm13 # zmm13 = (zmm5 * zmm4) + zmm13
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vbroadcastsd %xmm11, %zmm4
vextractf32x4 $2, %zmm7, %xmm11
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm4, %zmm5, %zmm26 # zmm26 = (zmm5 * zmm4) + zmm26
vmovaps %zmm13, %zmm30
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $170, %xmm7, %xmm7, %xmm13 # xmm13 = xmm7[2,2,2,2]
vbroadcastsd %xmm13, %zmm4
vmovaps %zmm21, %zmm13
vshuff64x2 $170, %zmm3, %zmm3, %zmm21 # zmm21 = zmm3[4,5,4,5,4,5,4,5]
vshuff64x2 $255, %zmm3, %zmm3, %zmm3 # zmm3 = zmm3[6,7,6,7,6,7,6,7]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm4, %zmm5, %zmm13 # zmm13 = (zmm5 * zmm4) + zmm13
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vbroadcastss %xmm11, %zmm4
vmovaps %zmm17, %zmm11
vshuff64x2 $170, %zmm0, %zmm0, %zmm17 # zmm17 = zmm0[4,5,4,5,4,5,4,5]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm21, %zmm5, %zmm29 # zmm29 = (zmm5 * zmm21) + zmm29
vmovaps %zmm12, %zmm21
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $255, %xmm7, %xmm7, %xmm12 # xmm12 = xmm7[3,3,3,3]
vshuff64x2 $255, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[6,7,6,7,6,7,6,7]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm17, %zmm5, %zmm21 # zmm21 = (zmm5 * zmm17) + zmm21
vmovups 96(%rsp), %zmm17 # 64-byte Reload
vfmadd231ps %zmm4, %zmm5, %zmm11 # zmm11 = (zmm5 * zmm4) + zmm11
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshuff64x2 $170, %zmm2, %zmm2, %zmm4 # zmm4 = zmm2[4,5,4,5,4,5,4,5]
vshuff64x2 $255, %zmm2, %zmm2, %zmm2 # zmm2 = zmm2[6,7,6,7,6,7,6,7]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm0, %zmm5, %zmm22 # zmm22 = (zmm5 * zmm0) + zmm22
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $85, %zmm14, %zmm14, %zmm0 # zmm0 = zmm14[1,1,1,1,5,5,5,5,9,9,9,9,13,13,13,13]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm4, %zmm5, %zmm17 # zmm17 = (zmm5 * zmm4) + zmm17
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vbroadcastss %xmm24, %zmm4
vmovaps %zmm31, %zmm24
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm4, %zmm5, %zmm9 # zmm9 = (zmm5 * zmm4) + zmm9
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $170, %ymm7, %ymm7, %ymm4 # ymm4 = ymm7[2,2,2,2,6,6,6,6]
vshuff64x2 $85, %zmm4, %zmm4, %zmm4 # zmm4 = zmm4[2,3,2,3,2,3,2,3]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm4, %zmm5, %zmm15 # zmm15 = (zmm5 * zmm4) + zmm15
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vbroadcastsd %xmm12, %zmm4
vmovaps %zmm27, %zmm12
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm3, %zmm5, %zmm12 # zmm12 = (zmm5 * zmm3) + zmm12
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshuff64x2 $170, %zmm0, %zmm0, %zmm27 # zmm27 = zmm0[4,5,4,5,4,5,4,5]
vshuff64x2 $255, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[6,7,6,7,6,7,6,7]
vshufps $255, %zmm14, %zmm14, %zmm3 # zmm3 = zmm14[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm4, %zmm5, %zmm25 # zmm25 = (zmm5 * zmm4) + zmm25
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $85, %ymm7, %ymm7, %ymm4 # ymm4 = ymm7[1,1,1,1,5,5,5,5]
vshuff64x2 $85, %zmm4, %zmm4, %zmm4 # zmm4 = zmm4[2,3,2,3,2,3,2,3]
vmovups %zmm15, 160(%rsp) # 64-byte Spill
vmovaps %zmm28, %zmm15
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm4, %zmm5, %zmm8 # zmm8 = (zmm5 * zmm4) + zmm8
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $255, %ymm7, %ymm7, %ymm4 # ymm4 = ymm7[3,3,3,3,7,7,7,7]
vextractf32x4 $3, %zmm7, %xmm7
vshuff64x2 $85, %zmm4, %zmm4, %zmm4 # zmm4 = zmm4[2,3,2,3,2,3,2,3]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm4, %zmm5, %zmm16 # zmm16 = (zmm5 * zmm4) + zmm16
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vbroadcastss %xmm7, %zmm4
vmovaps %zmm13, %zmm7
vextractf32x4 $2, %zmm14, %xmm13
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm4, %zmm5, %zmm24 # zmm24 = (zmm5 * zmm4) + zmm24
vfmadd213ps %zmm6, %zmm2, %zmm5 # zmm5 = (zmm2 * zmm5) + zmm6
vmovaps .LCPI0_184(%rip), %zmm6 # zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,29,u]
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshuff64x2 $170, %zmm3, %zmm3, %zmm4 # zmm4 = zmm3[4,5,4,5,4,5,4,5]
vshufps $170, %zmm14, %zmm14, %zmm2 # zmm2 = zmm14[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14]
vshuff64x2 $255, %zmm3, %zmm3, %zmm3 # zmm3 = zmm3[6,7,6,7,6,7,6,7]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vpermt2ps %zmm19, %zmm6, %zmm1
vmovaps .LCPI0_185(%rip), %zmm6 # zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,29]
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $170, %xmm14, %xmm14, %xmm19 # xmm19 = xmm14[2,2,2,2]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vpermt2ps %zmm28, %zmm6, %zmm1
vmovups 160(%rsp), %zmm28 # 64-byte Reload
vmovaps %zmm26, %zmm6
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshuff64x2 $170, %zmm2, %zmm2, %zmm26 # zmm26 = zmm2[4,5,4,5,4,5,4,5]
vshuff64x2 $255, %zmm2, %zmm2, %zmm2 # zmm2 = zmm2[6,7,6,7,6,7,6,7]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm0, %zmm1, %zmm12 # zmm12 = (zmm1 * zmm0) + zmm12
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vextractf32x4 $3, %zmm14, %xmm0
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm4, %zmm1, %zmm17 # zmm17 = (zmm1 * zmm4) + zmm17
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vbroadcastss %xmm13, %zmm4
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm26, %zmm1, %zmm21 # zmm21 = (zmm1 * zmm26) + zmm21
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vbroadcastss %xmm14, %zmm26
vmovshdup %xmm14, %xmm13 # xmm13 = xmm14[1,1,3,3]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm2, %zmm1, %zmm22 # zmm22 = (zmm1 * zmm2) + zmm22
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $170, %ymm14, %ymm14, %ymm2 # ymm2 = ymm14[2,2,2,2,6,6,6,6]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm3, %zmm1, %zmm5 # zmm5 = (zmm1 * zmm3) + zmm5
vfmadd231ps %zmm27, %zmm1, %zmm29 # zmm29 = (zmm1 * zmm27) + zmm29
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $85, %zmm10, %zmm10, %zmm3 # zmm3 = zmm10[1,1,1,1,5,5,5,5,9,9,9,9,13,13,13,13]
vshuff64x2 $85, %zmm2, %zmm2, %zmm2 # zmm2 = zmm2[2,3,2,3,2,3,2,3]
vbroadcastss %xmm0, %zmm0
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm4, %zmm1, %zmm11 # zmm11 = (zmm1 * zmm4) + zmm11
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vbroadcastsd %xmm19, %zmm4
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vmovaps .LCPI0_186(%rip), %zmm19 # zmm19 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,30]
vfmadd231ps %zmm26, %zmm1, %zmm30 # zmm30 = (zmm1 * zmm26) + zmm30
vfmadd231ps %zmm2, %zmm1, %zmm28 # zmm28 = (zmm1 * zmm2) + zmm28
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $255, %zmm10, %zmm10, %zmm2 # zmm2 = zmm10[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm0, %zmm1, %zmm24 # zmm24 = (zmm1 * zmm0) + zmm24
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $255, %ymm14, %ymm14, %ymm0 # ymm0 = ymm14[3,3,3,3,7,7,7,7]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm4, %zmm1, %zmm7 # zmm7 = (zmm1 * zmm4) + zmm7
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vbroadcastsd %xmm13, %zmm4
vshufps $170, %xmm10, %xmm10, %xmm13 # xmm13 = xmm10[2,2,2,2]
vshuff64x2 $85, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[2,3,2,3,2,3,2,3]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm4, %zmm1, %zmm6 # zmm6 = (zmm1 * zmm4) + zmm6
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vbroadcastss %xmm10, %zmm4
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm0, %zmm1, %zmm16 # zmm16 = (zmm1 * zmm0) + zmm16
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $85, %ymm14, %ymm14, %ymm0 # ymm0 = ymm14[1,1,1,1,5,5,5,5]
vshuff64x2 $85, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[2,3,2,3,2,3,2,3]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm0, %zmm1, %zmm8 # zmm8 = (zmm1 * zmm0) + zmm8
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $255, %xmm14, %xmm14, %xmm0 # xmm0 = xmm14[3,3,3,3]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vpermt2ps %zmm15, %zmm19, %zmm18
vmovaps %zmm12, %zmm19
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vbroadcastsd %xmm0, %zmm0
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm0, %zmm1, %zmm25 # zmm25 = (zmm1 * zmm0) + zmm25
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vextractf128 $1, %ymm14, %xmm0
vshuff64x2 $170, %zmm3, %zmm3, %zmm14 # zmm14 = zmm3[4,5,4,5,4,5,4,5]
vshuff64x2 $255, %zmm3, %zmm3, %zmm3 # zmm3 = zmm3[6,7,6,7,6,7,6,7]
vbroadcastss %xmm0, %zmm0
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm4, %zmm18, %zmm30 # zmm30 = (zmm18 * zmm4) + zmm30
vfmadd231ps %zmm14, %zmm18, %zmm29 # zmm29 = (zmm18 * zmm14) + zmm29
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshuff64x2 $170, %zmm2, %zmm2, %zmm14 # zmm14 = zmm2[4,5,4,5,4,5,4,5]
vshuff64x2 $255, %zmm2, %zmm2, %zmm2 # zmm2 = zmm2[6,7,6,7,6,7,6,7]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm3, %zmm18, %zmm19 # zmm19 = (zmm18 * zmm3) + zmm19
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $85, %zmm20, %zmm20, %zmm3 # zmm3 = zmm20[1,1,1,1,5,5,5,5,9,9,9,9,13,13,13,13]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd213ps %zmm9, %zmm0, %zmm1 # zmm1 = (zmm0 * zmm1) + zmm9
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vmovshdup %xmm10, %xmm9 # xmm9 = xmm10[1,1,3,3]
vshufps $170, %zmm10, %zmm10, %zmm0 # zmm0 = zmm10[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm14, %zmm18, %zmm17 # zmm17 = (zmm18 * zmm14) + zmm17
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vbroadcastsd %xmm9, %zmm4
vextractf32x4 $2, %zmm10, %xmm9
vshuff64x2 $170, %zmm0, %zmm0, %zmm15 # zmm15 = zmm0[4,5,4,5,4,5,4,5]
vshuff64x2 $255, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[6,7,6,7,6,7,6,7]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm4, %zmm18, %zmm6 # zmm6 = (zmm18 * zmm4) + zmm6
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vbroadcastsd %xmm13, %zmm4
vextractf128 $1, %ymm10, %xmm13
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm0, %zmm18, %zmm22 # zmm22 = (zmm18 * zmm0) + zmm22
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $170, %zmm20, %zmm20, %zmm0 # zmm0 = zmm20[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm15, %zmm18, %zmm21 # zmm21 = (zmm18 * zmm15) + zmm21
vmovaps %zmm17, %zmm15
vfmadd231ps %zmm4, %zmm18, %zmm7 # zmm7 = (zmm18 * zmm4) + zmm7
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vbroadcastss %xmm9, %zmm4
vmovaps %zmm11, %zmm9
vshufps $255, %xmm10, %xmm10, %xmm11 # xmm11 = xmm10[3,3,3,3]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm4, %zmm18, %zmm9 # zmm9 = (zmm18 * zmm4) + zmm9
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vbroadcastss %xmm13, %zmm4
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm4, %zmm18, %zmm1 # zmm1 = (zmm18 * zmm4) + zmm1
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $170, %ymm10, %ymm10, %ymm4 # ymm4 = ymm10[2,2,2,2,6,6,6,6]
vshuff64x2 $85, %zmm4, %zmm4, %zmm4 # zmm4 = zmm4[2,3,2,3,2,3,2,3]
vmovaps %zmm9, %zmm12
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm4, %zmm18, %zmm28 # zmm28 = (zmm18 * zmm4) + zmm28
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vbroadcastsd %xmm11, %zmm4
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm4, %zmm18, %zmm25 # zmm25 = (zmm18 * zmm4) + zmm25
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $85, %ymm10, %ymm10, %ymm4 # ymm4 = ymm10[1,1,1,1,5,5,5,5]
vshuff64x2 $85, %zmm4, %zmm4, %zmm4 # zmm4 = zmm4[2,3,2,3,2,3,2,3]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm4, %zmm18, %zmm8 # zmm8 = (zmm18 * zmm4) + zmm8
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $255, %ymm10, %ymm10, %ymm4 # ymm4 = ymm10[3,3,3,3,7,7,7,7]
vextractf32x4 $3, %zmm10, %xmm10
vshuff64x2 $85, %zmm4, %zmm4, %zmm4 # zmm4 = zmm4[2,3,2,3,2,3,2,3]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm4, %zmm18, %zmm16 # zmm16 = (zmm18 * zmm4) + zmm16
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vbroadcastss %xmm10, %zmm4
vshufps $255, %zmm20, %zmm20, %zmm10 # zmm10 = zmm20[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm4, %zmm18, %zmm24 # zmm24 = (zmm18 * zmm4) + zmm24
vfmadd213ps %zmm5, %zmm2, %zmm18 # zmm18 = (zmm2 * zmm18) + zmm5
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshuff64x2 $170, %zmm0, %zmm0, %zmm5 # zmm5 = zmm0[4,5,4,5,4,5,4,5]
vshuff64x2 $170, %zmm10, %zmm10, %zmm4 # zmm4 = zmm10[4,5,4,5,4,5,4,5]
vshuff64x2 $255, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[6,7,6,7,6,7,6,7]
vshuff64x2 $255, %zmm10, %zmm10, %zmm2 # zmm2 = zmm10[6,7,6,7,6,7,6,7]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm4, %zmm23, %zmm15 # zmm15 = (zmm23 * zmm4) + zmm15
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshuff64x2 $170, %zmm3, %zmm3, %zmm4 # zmm4 = zmm3[4,5,4,5,4,5,4,5]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm0, %zmm23, %zmm22 # zmm22 = (zmm23 * zmm0) + zmm22
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshuff64x2 $255, %zmm3, %zmm3, %zmm0 # zmm0 = zmm3[6,7,6,7,6,7,6,7]
vshufps $255, %xmm20, %xmm20, %xmm3 # xmm3 = xmm20[3,3,3,3]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm5, %zmm23, %zmm21 # zmm21 = (zmm23 * zmm5) + zmm21
vfmadd231ps %zmm4, %zmm23, %zmm29 # zmm29 = (zmm23 * zmm4) + zmm29
vfmadd231ps %zmm0, %zmm23, %zmm19 # zmm19 = (zmm23 * zmm0) + zmm19
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vextractf32x4 $3, %zmm20, %xmm0
vextractf32x4 $2, %zmm20, %xmm4
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm2, %zmm23, %zmm18 # zmm18 = (zmm23 * zmm2) + zmm18
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vextractf32x4 $1, %ymm20, %xmm2
vbroadcastss %xmm0, %zmm0
vbroadcastss %xmm4, %zmm4
vmovaps %zmm29, %zmm9
vmovaps %zmm24, %zmm29
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm0, %zmm23, %zmm29 # zmm29 = (zmm23 * zmm0) + zmm29
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $255, %ymm20, %ymm20, %ymm0 # ymm0 = ymm20[3,3,3,3,7,7,7,7]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm4, %zmm23, %zmm12 # zmm12 = (zmm23 * zmm4) + zmm12
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $170, %xmm20, %xmm20, %xmm4 # xmm4 = xmm20[2,2,2,2]
vshuff64x2 $85, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[2,3,2,3,2,3,2,3]
vbroadcastsd %xmm4, %zmm4
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm0, %zmm23, %zmm16 # zmm16 = (zmm23 * zmm0) + zmm16
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $85, %ymm20, %ymm20, %ymm0 # ymm0 = ymm20[1,1,1,1,5,5,5,5]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm4, %zmm23, %zmm7 # zmm7 = (zmm23 * zmm4) + zmm7
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vmovshdup %xmm20, %xmm4 # xmm4 = xmm20[1,1,3,3]
vshuff64x2 $85, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[2,3,2,3,2,3,2,3]
vbroadcastsd %xmm4, %zmm4
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm0, %zmm23, %zmm8 # zmm8 = (zmm23 * zmm0) + zmm8
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vbroadcastsd %xmm3, %zmm0
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm4, %zmm23, %zmm6 # zmm6 = (zmm23 * zmm4) + zmm6
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vbroadcastss %xmm20, %zmm4
vmovups %zmm7, 32(%rsp) # 64-byte Spill
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm0, %zmm23, %zmm25 # zmm25 = (zmm23 * zmm0) + zmm25
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshufps $170, %ymm20, %ymm20, %ymm0 # ymm0 = ymm20[2,2,2,2,6,6,6,6]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm4, %zmm23, %zmm30 # zmm30 = (zmm23 * zmm4) + zmm30
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vshuff64x2 $85, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[2,3,2,3,2,3,2,3]
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm0, %zmm23, %zmm28 # zmm28 = (zmm23 * zmm0) + zmm28
.loc 1 222 20 # 03-matrix-multiplication-cpu.py:222:20
vbroadcastss %xmm2, %zmm0
vmovups %zmm25, 224(%rsp) # 64-byte Spill
.loc 1 224 35 # 03-matrix-multiplication-cpu.py:224:35
vfmadd231ps %zmm0, %zmm23, %zmm1 # zmm1 = (zmm23 * zmm0) + zmm1
.loc 1 217 22 # 03-matrix-multiplication-cpu.py:217:22
jne .LBB0_3
jmp .LBB0_4
.LBB0_1:
.loc 1 0 22 is_stmt 0 # 03-matrix-multiplication-cpu.py:0:22
vpxor %xmm0, %xmm0, %xmm0
vxorpd %xmm30, %xmm30, %xmm30
vxorps %xmm6, %xmm6, %xmm6
vpxor %xmm1, %xmm1, %xmm1
vxorps %xmm8, %xmm8, %xmm8
vpxord %xmm28, %xmm28, %xmm28
vxorps %xmm16, %xmm16, %xmm16
vxorps %xmm12, %xmm12, %xmm12
vpxor %xmm9, %xmm9, %xmm9
vpxord %xmm21, %xmm21, %xmm21
vpxor %xmm15, %xmm15, %xmm15
vpxord %xmm29, %xmm29, %xmm29
vxorps %xmm19, %xmm19, %xmm19
vpxord %xmm22, %xmm22, %xmm22
vxorps %xmm18, %xmm18, %xmm18
vmovdqu64 %zmm0, 32(%rsp) # 64-byte Spill
vmovdqu64 %zmm0, 224(%rsp) # 64-byte Spill
.LBB0_4: # %._crit_edge
.loc 1 239 21 is_stmt 1 # 03-matrix-multiplication-cpu.py:239:21
vbroadcasti32x4 .LCPI0_21(%rip), %zmm5 # zmm5 = [0,8,0,8,0,8,0,8]
# zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
vmovdqa64 %zmm9, %zmm4
vpunpckldq %xmm4, %xmm12, %xmm24 # xmm24 = xmm12[0],xmm4[0],xmm12[1],xmm4[1]
vmovdqa64 %zmm21, %zmm9
vmovdqa64 %zmm15, %zmm20
vpunpckldq %xmm20, %xmm9, %xmm0 # xmm0 = xmm9[0],xmm20[0],xmm9[1],xmm20[1]
vmovdqa64 %zmm29, %zmm26
vpunpckldq %xmm19, %xmm26, %xmm13 # xmm13 = xmm26[0],xmm19[0],xmm26[1],xmm19[1]
vinsertps $76, %xmm12, %xmm4, %xmm14 # xmm14 = xmm12[1],xmm4[1],zero,zero
vinsertps $76, %xmm26, %xmm19, %xmm2 # xmm2 = xmm26[1],xmm19[1],zero,zero
vmovdqa64 %zmm21, %zmm3
movb $-64, %al
vmovaps %zmm12, %zmm23
vmovsd .LCPI0_195(%rip), %xmm17 # xmm17 = [3,7,0,0]
vinsertps $76, %xmm30, %xmm6, %xmm11 # xmm11 = xmm30[1],xmm6[1],zero,zero
vunpckhps %xmm6, %xmm30, %xmm27 # xmm27 = xmm30[2],xmm6[2],xmm30[3],xmm6[3]
vinsertps $76, %xmm1, %xmm8, %xmm7 # xmm7 = xmm1[1],xmm8[1],zero,zero
vmovdqa64 %zmm22, %zmm25
movq 480(%rsp), %r14 # 8-byte Reload
movl -108(%rsp), %r12d # 4-byte Reload
movq $-1, %r15
movq 496(%rsp), %r13 # 8-byte Reload
kmovd %eax, %k1
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33
movl 3520(%rsp), %eax
vpbroadcastd %eax, %xmm10
movl -112(%rsp), %eax # 4-byte Reload
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21
vpermt2pd %zmm0, %zmm5, %zmm24
vpunpckldq %xmm18, %xmm22, %xmm0 # xmm0 = xmm22[0],xmm18[0],xmm22[1],xmm18[1]
vpermi2ps %xmm8, %xmm1, %xmm17
vpermt2pd %zmm0, %zmm5, %zmm13
vinsertps $76, %xmm9, %xmm20, %xmm0 # xmm0 = xmm9[1],xmm20[1],zero,zero
vpermt2pd %zmm0, %zmm5, %zmm14
vinsertps $76, %xmm22, %xmm18, %xmm0 # xmm0 = xmm22[1],xmm18[1],zero,zero
vpermt2pd %zmm0, %zmm5, %zmm2
vunpckhps %xmm20, %xmm9, %xmm0 # xmm0 = xmm9[2],xmm20[2],xmm9[3],xmm20[3]
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33
vpbroadcastd %eax, %zmm31
vmovupd %zmm2, 96(%rsp) # 64-byte Spill
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21
vunpckhps %xmm4, %xmm12, %xmm2 # xmm2 = xmm12[2],xmm4[2],xmm12[3],xmm4[3]
vpermt2pd %zmm0, %zmm5, %zmm2
vunpckhps %xmm18, %xmm22, %xmm0 # xmm0 = xmm22[2],xmm18[2],xmm22[3],xmm18[3]
vmovupd %zmm2, 160(%rsp) # 64-byte Spill
vunpckhps %xmm19, %xmm26, %xmm2 # xmm2 = xmm26[2],xmm19[2],xmm26[3],xmm19[3]
vpermt2pd %zmm0, %zmm5, %zmm2
vbroadcastsd .LCPI0_32(%rip), %zmm0 # zmm0 = [3,19,3,19,3,19,3,19,3,19,3,19,3,19,3,19]
vshufps $51, %xmm30, %xmm6, %xmm5 # xmm5 = xmm6[3,0],xmm30[3,0]
vmovaps %xmm5, 2096(%rsp) # 16-byte Spill
vunpcklps %ymm6, %ymm30, %ymm5 # ymm5 = ymm30[0],ymm6[0],ymm30[1],ymm6[1],ymm30[4],ymm6[4],ymm30[5],ymm6[5]
vmovups %ymm5, 2224(%rsp) # 32-byte Spill
vunpckhps %ymm6, %ymm30, %ymm5 # ymm5 = ymm30[2],ymm6[2],ymm30[3],ymm6[3],ymm30[6],ymm6[6],ymm30[7],ymm6[7]
vmovups %ymm5, 1456(%rsp) # 32-byte Spill
vunpcklps %zmm6, %zmm30, %zmm5 # zmm5 = zmm30[0],zmm6[0],zmm30[1],zmm6[1],zmm30[4],zmm6[4],zmm30[5],zmm6[5],zmm30[8],zmm6[8],zmm30[9],zmm6[9],zmm30[12],zmm6[12],zmm30[13],zmm6[13]
vmovups %zmm5, 880(%rsp) # 64-byte Spill
vunpckhps %zmm6, %zmm30, %zmm5 # zmm5 = zmm30[2],zmm6[2],zmm30[3],zmm6[3],zmm30[6],zmm6[6],zmm30[7],zmm6[7],zmm30[10],zmm6[10],zmm30[11],zmm6[11],zmm30[14],zmm6[14],zmm30[15],zmm6[15]
vmovupd %zmm2, 288(%rsp) # 64-byte Spill
vmovaps %zmm12, %zmm2
vmovups %zmm5, 1200(%rsp) # 64-byte Spill
vbroadcastsd .LCPI0_30(%rip), %ymm5 # ymm5 = [5,13,5,13,5,13,5,13]
vpermt2ps %zmm4, %zmm0, %zmm2
vpermt2ps %zmm15, %zmm0, %zmm3
vshufpd $32, %zmm3, %zmm2, %zmm3 # zmm3 = zmm2[0],zmm3[0],zmm2[2],zmm3[2],zmm2[4],zmm3[5],zmm2[6],zmm3[6]
vmovdqa64 %zmm29, %zmm2
vpermt2ps %zmm19, %zmm0, %zmm2
vpermi2ps %zmm18, %zmm22, %zmm0
vpermi2ps %ymm8, %ymm1, %ymm5
vmovups %ymm5, 2160(%rsp) # 32-byte Spill
vbroadcastsd .LCPI0_13(%rip), %ymm5 # ymm5 = [7,15,7,15,7,15,7,15]
vshufpd $128, %zmm0, %zmm2, %zmm3 {%k1} # zmm3 {%k1} = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[7]
vbroadcasti32x4 .LCPI0_25(%rip), %zmm0 # zmm0 = [4,5,4,20,4,5,4,20,4,5,4,20,4,5,4,20]
# zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
vunpcklps %ymm4, %ymm12, %ymm2 # ymm2 = ymm12[0],ymm4[0],ymm12[1],ymm4[1],ymm12[4],ymm4[4],ymm12[5],ymm4[5]
vmovupd %zmm3, 416(%rsp) # 64-byte Spill
vpermi2ps %ymm8, %ymm1, %ymm5
vpermi2ps %zmm15, %zmm21, %zmm0
vmovups %ymm5, 1520(%rsp) # 32-byte Spill
vunpcklps %xmm8, %xmm1, %xmm5 # xmm5 = xmm1[0],xmm8[0],xmm1[1],xmm8[1]
vshuff64x2 $85, %zmm2, %zmm2, %zmm2 # zmm2 = zmm2[2,3,2,3,2,3,2,3]
vshufpd $32, %zmm0, %zmm2, %zmm3 # zmm3 = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[5],zmm2[6],zmm0[6]
vbroadcastf128 .LCPI0_196(%rip), %ymm0 # ymm0 = [0,1,4,12,0,1,4,12]
# ymm0 = mem[0,1,0,1]
vunpcklps %ymm19, %ymm26, %ymm2 # ymm2 = ymm26[0],ymm19[0],ymm26[1],ymm19[1],ymm26[4],ymm19[4],ymm26[5],ymm19[5]
vpermi2ps %ymm18, %ymm22, %ymm0
vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
vinsertf64x4 $1, %ymm2, %zmm0, %zmm2
vshufpd $128, %zmm0, %zmm2, %zmm3 {%k1} # zmm3 {%k1} = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[7]
vmovaps .LCPI0_188(%rip), %ymm0 # ymm0 = [1,9,u,u,5,13,u,u]
vbroadcasti32x4 .LCPI0_28(%rip), %zmm2 # zmm2 = [5,21,6,7,5,21,6,7,5,21,6,7,5,21,6,7]
# zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
vmovupd %zmm3, 560(%rsp) # 64-byte Spill
vunpcklps %ymm20, %ymm9, %ymm3 # ymm3 = ymm9[0],ymm20[0],ymm9[1],ymm20[1],ymm9[4],ymm20[4],ymm9[5],ymm20[5]
vpermi2ps %ymm19, %ymm26, %ymm0
vpermi2ps %zmm4, %zmm12, %zmm2
vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
vshuff64x2 $85, %zmm3, %zmm3, %zmm3 # zmm3 = zmm3[2,3,2,3,2,3,2,3]
vshufpd $32, %zmm3, %zmm2, %zmm3 # zmm3 = zmm2[0],zmm3[0],zmm2[2],zmm3[2],zmm2[4],zmm3[5],zmm2[6],zmm3[6]
vunpcklps %ymm18, %ymm22, %ymm2 # ymm2 = ymm22[0],ymm18[0],ymm22[1],ymm18[1],ymm22[4],ymm18[4],ymm22[5],ymm18[5]
vinsertf64x4 $1, %ymm2, %zmm0, %zmm2
vshufpd $128, %zmm2, %zmm0, %zmm3 {%k1} # zmm3 {%k1} = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[7]
vbroadcasti32x4 .LCPI0_23(%rip), %zmm0 # zmm0 = [4,5,6,22,4,5,6,22,4,5,6,22,4,5,6,22]
# zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
vunpckhps %ymm4, %ymm12, %ymm2 # ymm2 = ymm12[2],ymm4[2],ymm12[3],ymm4[3],ymm12[6],ymm4[6],ymm12[7],ymm4[7]
vmovupd %zmm3, 688(%rsp) # 64-byte Spill
vpermi2ps %zmm15, %zmm21, %zmm0
vmovaps .LCPI0_19(%rip), %zmm15 # zmm15 = [0,1,0,16,4,5,4,20,8,9,8,24,12,13,12,28]
vshuff64x2 $85, %zmm2, %zmm2, %zmm2 # zmm2 = zmm2[2,3,2,3,2,3,2,3]
vshufpd $32, %zmm0, %zmm2, %zmm3 # zmm3 = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[5],zmm2[6],zmm0[6]
vbroadcastf128 .LCPI0_197(%rip), %ymm0 # ymm0 = [0,1,6,14,0,1,6,14]
# ymm0 = mem[0,1,0,1]
vunpckhps %ymm19, %ymm26, %ymm2 # ymm2 = ymm26[2],ymm19[2],ymm26[3],ymm19[3],ymm26[6],ymm19[6],ymm26[7],ymm19[7]
vpermi2ps %ymm18, %ymm22, %ymm0
vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
vinsertf64x4 $1, %ymm2, %zmm0, %zmm2
vshufpd $128, %zmm0, %zmm2, %zmm3 {%k1} # zmm3 {%k1} = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[7]
vmovaps .LCPI0_191(%rip), %ymm0 # ymm0 = [3,11,u,u,7,15,u,u]
vbroadcasti32x4 .LCPI0_11(%rip), %zmm2 # zmm2 = [7,23,6,7,7,23,6,7,7,23,6,7,7,23,6,7]
# zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
vmovupd %zmm3, 624(%rsp) # 64-byte Spill
vunpckhps %ymm20, %ymm9, %ymm3 # ymm3 = ymm9[2],ymm20[2],ymm9[3],ymm20[3],ymm9[6],ymm20[6],ymm9[7],ymm20[7]
vpermi2ps %ymm19, %ymm26, %ymm0
vpermi2ps %zmm4, %zmm12, %zmm2
vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
vshuff64x2 $85, %zmm3, %zmm3, %zmm3 # zmm3 = zmm3[2,3,2,3,2,3,2,3]
vshufpd $32, %zmm3, %zmm2, %zmm3 # zmm3 = zmm2[0],zmm3[0],zmm2[2],zmm3[2],zmm2[4],zmm3[5],zmm2[6],zmm3[6]
vunpckhps %ymm18, %ymm22, %ymm2 # ymm2 = ymm22[2],ymm18[2],ymm22[3],ymm18[3],ymm22[6],ymm18[6],ymm22[7],ymm18[7]
vinsertf64x4 $1, %ymm2, %zmm0, %zmm2
vshufpd $128, %zmm2, %zmm0, %zmm3 {%k1} # zmm3 {%k1} = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[7]
vmovaps .LCPI0_18(%rip), %zmm0 # zmm0 = [1,17,2,3,5,21,6,7,9,25,10,11,13,29,14,15]
vmovdqa64 %zmm29, %zmm2
vmovupd %zmm3, 1072(%rsp) # 64-byte Spill
vunpcklps %zmm4, %zmm12, %zmm3 # zmm3 = zmm12[0],zmm4[0],zmm12[1],zmm4[1],zmm12[4],zmm4[4],zmm12[5],zmm4[5],zmm12[8],zmm4[8],zmm12[9],zmm4[9],zmm12[12],zmm4[12],zmm12[13],zmm4[13]
vpermt2ps %zmm19, %zmm0, %zmm2
vmovups %zmm2, 352(%rsp) # 64-byte Spill
vmovaps %zmm0, %zmm2
vpunpckldq %zmm19, %zmm29, %zmm0 # zmm0 = zmm29[0],zmm19[0],zmm29[1],zmm19[1],zmm29[4],zmm19[4],zmm29[5],zmm19[5],zmm29[8],zmm19[8],zmm29[9],zmm19[9],zmm29[12],zmm19[12],zmm29[13],zmm19[13]
vpermt2ps %zmm4, %zmm2, %zmm23
vmovdqu64 %zmm0, 1264(%rsp) # 64-byte Spill
vpunpckhdq %zmm19, %zmm29, %zmm0 # zmm0 = zmm29[2],zmm19[2],zmm29[3],zmm19[3],zmm29[6],zmm19[6],zmm29[7],zmm19[7],zmm29[10],zmm19[10],zmm29[11],zmm19[11],zmm29[14],zmm19[14],zmm29[15],zmm19[15]
vunpckhps %zmm4, %zmm12, %zmm29 # zmm29 = zmm12[2],zmm4[2],zmm12[3],zmm4[3],zmm12[6],zmm4[6],zmm12[7],zmm4[7],zmm12[10],zmm4[10],zmm12[11],zmm4[11],zmm12[14],zmm4[14],zmm12[15],zmm4[15]
vmovdqu64 %zmm0, 1328(%rsp) # 64-byte Spill
vmovaps .LCPI0_15(%rip), %zmm0 # zmm0 = [3,19,2,3,7,23,6,7,11,27,10,11,15,31,14,15]
vpermt2ps %zmm4, %zmm0, %zmm12
vbroadcasti128 .LCPI0_29(%rip), %ymm4 # ymm4 = [5,13,6,7,5,13,6,7]
# ymm4 = mem[0,1,0,1]
vpermt2ps %zmm19, %zmm0, %zmm26
vpermi2ps %ymm6, %ymm30, %ymm4
vmovups %ymm4, 2288(%rsp) # 32-byte Spill
vbroadcasti128 .LCPI0_12(%rip), %ymm4 # ymm4 = [7,15,6,7,7,15,6,7]
# ymm4 = mem[0,1,0,1]
vpermi2ps %ymm6, %ymm30, %ymm4
vmovups %ymm4, 1712(%rsp) # 32-byte Spill
vmovaps %zmm30, %zmm4
vpermt2ps %zmm6, %zmm2, %zmm4
vpermi2ps %zmm8, %zmm1, %zmm2
vmovups %zmm2, 1904(%rsp) # 64-byte Spill
vunpckhps %xmm8, %xmm1, %xmm2 # xmm2 = xmm1[2],xmm8[2],xmm1[3],xmm8[3]
vmovups %zmm4, 1968(%rsp) # 64-byte Spill
vunpcklps %xmm6, %xmm30, %xmm4 # xmm4 = xmm30[0],xmm6[0],xmm30[1],xmm6[1]
vpermt2ps %zmm6, %zmm0, %zmm30
vmovaps %zmm9, %zmm6
vmovaps %xmm2, 2032(%rsp) # 16-byte Spill
vunpcklps %ymm8, %ymm1, %ymm2 # ymm2 = ymm1[0],ymm8[0],ymm1[1],ymm8[1],ymm1[4],ymm8[4],ymm1[5],ymm8[5]
vmovups %ymm2, 784(%rsp) # 32-byte Spill
vunpckhps %ymm8, %ymm1, %ymm2 # ymm2 = ymm1[2],ymm8[2],ymm1[3],ymm8[3],ymm1[6],ymm8[6],ymm1[7],ymm8[7]
vmovups %ymm2, 752(%rsp) # 32-byte Spill
vunpcklps %zmm8, %zmm1, %zmm2 # zmm2 = zmm1[0],zmm8[0],zmm1[1],zmm8[1],zmm1[4],zmm8[4],zmm1[5],zmm8[5],zmm1[8],zmm8[8],zmm1[9],zmm8[9],zmm1[12],zmm8[12],zmm1[13],zmm8[13]
vmovups %zmm2, 1840(%rsp) # 64-byte Spill
vunpckhps %zmm8, %zmm1, %zmm2 # zmm2 = zmm1[2],zmm8[2],zmm1[3],zmm8[3],zmm1[6],zmm8[6],zmm1[7],zmm8[7],zmm1[10],zmm8[10],zmm1[11],zmm8[11],zmm1[14],zmm8[14],zmm1[15],zmm8[15]
vpermt2ps %zmm8, %zmm0, %zmm1
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33
vpmulld %xmm10, %xmm31, %xmm0
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21
vunpcklps %zmm20, %zmm9, %zmm8 # zmm8 = zmm9[0],zmm20[0],zmm9[1],zmm20[1],zmm9[4],zmm20[4],zmm9[5],zmm20[5],zmm9[8],zmm20[8],zmm9[9],zmm20[9],zmm9[12],zmm20[12],zmm9[13],zmm20[13]
vmovups %zmm2, 1136(%rsp) # 64-byte Spill
vmovaps %zmm21, %zmm2
vpermt2ps %zmm20, %zmm15, %zmm2
vmovaps .LCPI0_17(%rip), %zmm21 # zmm21 = [0,1,2,18,4,5,6,22,8,9,10,26,12,13,14,30]
vmovups %zmm30, 1008(%rsp) # 64-byte Spill
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33
vpextrd $3, %xmm0, %eax
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21
vunpckhps %zmm20, %zmm9, %zmm0 # zmm0 = zmm9[2],zmm20[2],zmm9[3],zmm20[3],zmm9[6],zmm20[6],zmm9[7],zmm20[7],zmm9[10],zmm20[10],zmm9[11],zmm20[11],zmm9[14],zmm20[14],zmm9[15],zmm20[15]
vmovaps %zmm3, %zmm9
.loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21
cltq
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21
vshufpd $32, %zmm2, %zmm3, %zmm30 # zmm30 = zmm3[0],zmm2[0],zmm3[2],zmm2[2],zmm3[4],zmm2[5],zmm3[6],zmm2[6]
vmovapd .LCPI0_16(%rip), %zmm3 # zmm3 = [2,10,2,10,6,15,6,14]
vpermt2ps %zmm20, %zmm21, %zmm6
vpermt2pd %zmm2, %zmm3, %zmm9
vshufpd $32, %zmm8, %zmm23, %zmm2 # zmm2 = zmm23[0],zmm8[0],zmm23[2],zmm8[2],zmm23[4],zmm8[5],zmm23[6],zmm8[6]
vpermt2pd %zmm8, %zmm3, %zmm23
vmovupd %zmm2, 1584(%rsp) # 64-byte Spill
vshufpd $32, %zmm6, %zmm29, %zmm2 # zmm2 = zmm29[0],zmm6[0],zmm29[2],zmm6[2],zmm29[4],zmm6[5],zmm29[6],zmm6[6]
vpermt2pd %zmm6, %zmm3, %zmm29
vmovupd %zmm2, 1648(%rsp) # 64-byte Spill
vshufpd $32, %zmm0, %zmm12, %zmm2 # zmm2 = zmm12[0],zmm0[0],zmm12[2],zmm0[2],zmm12[4],zmm0[5],zmm12[6],zmm0[6]
vpermt2pd %zmm0, %zmm3, %zmm12
vmovdqu64 3248(%rsp), %zmm3 # 64-byte Reload
vmovupd %zmm2, 1776(%rsp) # 64-byte Spill
vmovapd %zmm24, %zmm2
vmovups 224(%rsp), %zmm24 # 64-byte Reload
vmovapd %zmm13, %zmm2 {%k1}
vmovupd %zmm9, 2352(%rsp) # 64-byte Spill
vmovupd %zmm29, 816(%rsp) # 64-byte Spill
vmovupd %zmm12, 944(%rsp) # 64-byte Spill
vmovups 32(%rsp), %zmm12 # 64-byte Reload
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33
vpshufd $85, %xmm3, %xmm0 # xmm0 = xmm3[1,1,1,1]
vextracti128 $1, %ymm3, %xmm6
vpshufd $85, %zmm3, %zmm20 # zmm20 = zmm3[1,1,1,1,5,5,5,5,9,9,9,9,13,13,13,13]
vpbroadcastq %xmm0, %zmm9
vpbroadcastd %xmm6, %zmm22
vpmulld %xmm10, %xmm6, %xmm6
vpmulld %xmm10, %xmm9, %xmm0
vpextrd $3, %xmm6, %edi
vpextrd $3, %xmm0, %ebp
vpshufd $250, %xmm3, %xmm0 # xmm0 = xmm3[2,2,3,3]
.loc 1 236 21 is_stmt 0 # 03-matrix-multiplication-cpu.py:236:21
movslq %edi, %rdi
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33
vpbroadcastq %xmm0, %zmm8
.loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21
leaq (%r13,%rdi,4), %rdi
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33
vpmulld %xmm10, %xmm8, %xmm0
vpextrd $3, %xmm0, %ebx
vpshufd $255, %xmm3, %xmm0 # xmm0 = xmm3[3,3,3,3]
vpbroadcastq %xmm0, %zmm29
.loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21
movslq %ebx, %rbx
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33
vpmulld %xmm10, %xmm29, %xmm0
.loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21
leaq (%r13,%rbx,4), %rbx
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33
vpextrd $3, %xmm0, %r11d
vpmulld %xmm10, %xmm22, %xmm0
.loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21
movslq %r11d, %r11
leaq (%r13,%r11,4), %r11
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33
vpextrd $3, %xmm0, %r8d
.loc 1 239 21 is_stmt 1 # 03-matrix-multiplication-cpu.py:239:21
vmovddup .LCPI0_194(%rip), %xmm0 # xmm0 = [4,0,4,0]
# xmm0 = mem[0,0]
.loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21
movslq %r8d, %r8
leaq (%r13,%r8,4), %r8
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21
vpermi2ps %xmm28, %xmm16, %xmm0
vinsertf128 $1, %xmm0, %ymm0, %ymm0
vinsertf128 $1, %xmm5, %ymm0, %ymm5
vblendps $192, %ymm0, %ymm5, %ymm0 # ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
vmovlhps %xmm12, %xmm24, %xmm5 # xmm5 = xmm24[0],xmm12[0]
vshufps $36, %xmm5, %xmm4, %xmm5 # xmm5 = xmm4[0,1],xmm5[2,0]
vblendps $15, %ymm5, %ymm0, %ymm0 # ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
vinsertf64x4 $0, %ymm0, %zmm2, %zmm13
vunpcklps %xmm24, %xmm12, %xmm0 # xmm0 = xmm12[0],xmm24[0],xmm12[1],xmm24[1]
vinsertf128 $1, %xmm7, %ymm0, %ymm5
vunpcklps %xmm16, %xmm28, %xmm7 # xmm7 = xmm28[0],xmm16[0],xmm28[1],xmm16[1]
vblendps $3, %xmm11, %xmm0, %xmm2 # xmm2 = xmm11[0,1],xmm0[2,3]
vinsertf128 $1, %xmm7, %ymm0, %ymm7
vblendps $192, %ymm7, %ymm5, %ymm0 # ymm0 = ymm5[0,1,2,3,4,5],ymm7[6,7]
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33
vpshufd $85, %ymm3, %ymm5 # ymm5 = ymm3[1,1,1,1,5,5,5,5]
vextracti128 $1, %ymm5, %xmm7
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21
vblendps $15, %ymm2, %ymm0, %ymm0 # ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
vinsertps $179, %xmm24, %xmm12, %xmm2 # xmm2 = zero,zero,xmm12[2],xmm24[2]
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33
vpmulld %xmm10, %xmm7, %xmm7
vpextrd $3, %xmm7, %r10d
vpshufd $170, %ymm3, %ymm7 # ymm7 = ymm3[2,2,2,2,6,6,6,6]
vextracti128 $1, %ymm7, %xmm11
vpmulld %xmm10, %xmm11, %xmm11
vpextrd $3, %xmm11, %r9d
vmovdqa 1392(%rsp), %xmm11 # 16-byte Reload
.loc 1 236 21 is_stmt 0 # 03-matrix-multiplication-cpu.py:236:21
movslq %r9d, %r9
leaq (%r13,%r9,4), %r9
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33
vpbroadcastd %xmm11, %zmm19
vpmulld %xmm10, %xmm19, %xmm6
vpextrd $3, %xmm6, %esi
vextracti32x4 $2, %zmm20, %xmm6
vpmulld %xmm10, %xmm6, %xmm6
.loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21
movslq %esi, %rsi
leaq (%r13,%rsi,4), %rsi
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33
vpextrd $3, %xmm6, %edx
vpshufd $170, %zmm3, %zmm6 # zmm6 = zmm3[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14]
vextracti32x4 $2, %zmm6, %xmm4
.loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21
movslq %edx, %rdx
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33
vpmulld %xmm10, %xmm4, %xmm4
.loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21
leaq (%r13,%rdx,4), %rdx
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33
vpextrd $3, %xmm4, %ecx
vmovupd 96(%rsp), %zmm4 # 64-byte Reload
.loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21
movslq %ecx, %rcx
leaq (%r13,%rcx,4), %rcx
.loc 1 239 21 is_stmt 1 # 03-matrix-multiplication-cpu.py:239:21
vmovapd %zmm4, %zmm14 {%k1}
vmovaps %xmm27, %xmm4
vinsertf64x4 $0, %ymm0, %zmm14, %zmm0
vblendps $3, %xmm4, %xmm2, %xmm2 # xmm2 = xmm4[0,1],xmm2[2,3]
vinsertf128 $1, 2032(%rsp), %ymm0, %ymm4 # 16-byte Folded Reload
vinsertps $179, %xmm16, %xmm28, %xmm27 # xmm27 = zero,zero,xmm28[2],xmm16[2]
vinsertf32x4 $1, %xmm27, %ymm0, %ymm14
vmovupd 1840(%rsp), %zmm27 # 64-byte Reload
vblendpd $8, %ymm14, %ymm4, %ymm14 # ymm14 = ymm4[0,1,2],ymm14[3]
.loc 1 238 33 # 03-matrix-multiplication-cpu.py:238:33
vpbroadcastd %r14d, %zmm4
xorl %r14d, %r14d
.loc 1 238 58 is_stmt 0 # 03-matrix-multiplication-cpu.py:238:58
cmpl 488(%rsp), %r12d # 4-byte Folded Reload
.loc 1 238 33 # 03-matrix-multiplication-cpu.py:238:33
vpcmpgtd %zmm9, %zmm4, %k3
vpcmpgtd %zmm31, %zmm4, %k2
vpcmpgtd %zmm29, %zmm4, %k5
vpcmpgtd %zmm8, %zmm4, %k4
vmovupd 288(%rsp), %zmm9 # 64-byte Reload
.loc 1 239 21 is_stmt 1 # 03-matrix-multiplication-cpu.py:239:21
vblendpd $3, %ymm2, %ymm14, %ymm2 # ymm2 = ymm2[0,1],ymm14[2,3]
vmovaps 2096(%rsp), %xmm14 # 16-byte Reload
vmovupd 2352(%rsp), %zmm29 # 64-byte Reload
.loc 1 238 33 # 03-matrix-multiplication-cpu.py:238:33
kunpckwd %k2, %k3, %k2
kunpckwd %k4, %k5, %k3
kunpckdq %k2, %k3, %k2
.loc 1 238 39 is_stmt 0 # 03-matrix-multiplication-cpu.py:238:39
kshiftrq $15, %k2, %k2
cmovgeq %r14, %r15
.loc 1 236 21 is_stmt 1 # 03-matrix-multiplication-cpu.py:236:21
leaq (%r13,%rax,4), %r14
.loc 1 236 52 is_stmt 0 # 03-matrix-multiplication-cpu.py:236:52
movslq %r12d, %rax
.loc 1 238 39 is_stmt 1 # 03-matrix-multiplication-cpu.py:238:39
kmovq %r15, %k0
kandq %k0, %k2, %k2
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21
vpmovm2d %k2, %zmm8
vpbroadcastd %xmm8, %zmm8
kshiftrq $32, %k2, %k4
vpmovd2m %zmm8, %k3
vmovupd 160(%rsp), %zmm8 # 64-byte Reload
vmovups %zmm13, (%r14,%rax,4) {%k3}
.loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21
movslq %ebp, %r14
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21
kshiftrq $16, %k2, %k3
kshiftrq $48, %k2, %k2
.loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21
leaq (%r13,%r14,4), %r14
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21
vmovapd %zmm9, %zmm8 {%k1}
vunpckhps %xmm16, %xmm28, %xmm9 # xmm9 = xmm28[2],xmm16[2],xmm28[3],xmm16[3]
vinsertf64x4 $0, %ymm2, %zmm8, %zmm8
vinsertf32x4 $1, %xmm17, %ymm0, %ymm2
vpmovm2d %k3, %zmm17
vpbroadcastd %xmm17, %zmm17
vinsertf128 $1, %xmm9, %ymm0, %ymm9
vpmovd2m %zmm17, %k3
vmovups 1968(%rsp), %zmm17 # 64-byte Reload
vmovups %zmm0, (%r14,%rax,4) {%k3}
vpmovm2d %k4, %zmm0
vpbroadcastd %xmm0, %zmm0
.loc 1 238 33 # 03-matrix-multiplication-cpu.py:238:33
vpcmpgtd %zmm22, %zmm4, %k3
vmovups 880(%rsp), %zmm22 # 64-byte Reload
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21
vblendpd $8, %ymm9, %ymm2, %ymm2 # ymm2 = ymm2[0,1,2],ymm9[3]
vunpckhps %xmm24, %xmm12, %xmm9 # xmm9 = xmm12[2],xmm24[2],xmm12[3],xmm24[3]
vpmovd2m %zmm0, %k4
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33
vshufi64x2 $85, %zmm5, %zmm5, %zmm0 # zmm0 = zmm5[2,3,2,3,2,3,2,3]
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21
vunpckhps %ymm24, %ymm12, %ymm5 # ymm5 = ymm12[2],ymm24[2],ymm12[3],ymm24[3],ymm12[6],ymm24[6],ymm12[7],ymm24[7]
vshufps $226, %xmm9, %xmm14, %xmm9 # xmm9 = xmm14[2,0],xmm9[2,3]
vmovups 784(%rsp), %ymm14 # 32-byte Reload
vmovups %zmm8, (%rbx,%rax,4) {%k4}
vpermpd $170, %ymm5, %ymm5 # ymm5 = ymm5[2,2,2,2]
vblendpd $3, %ymm9, %ymm2, %ymm2 # ymm2 = ymm9[0,1],ymm2[2,3]
vmovupd 416(%rsp), %zmm9 # 64-byte Reload
.loc 1 238 33 # 03-matrix-multiplication-cpu.py:238:33
vpcmpgtd %zmm0, %zmm4, %k4
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21
vpmovm2d %k2, %zmm0
vpbroadcastd %xmm0, %zmm0
vpmovd2m %zmm0, %k2
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33
vshufi64x2 $85, %zmm7, %zmm7, %zmm0 # zmm0 = zmm7[2,3,2,3,2,3,2,3]
.loc 1 238 33 # 03-matrix-multiplication-cpu.py:238:33
kunpckwd %k3, %k4, %k3
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21
vmovaps %zmm12, %zmm7
vpermt2ps %zmm24, %zmm15, %zmm7
vinsertf64x4 $0, %ymm2, %zmm9, %zmm13
vmovups 2224(%rsp), %ymm2 # 32-byte Reload
vunpcklps %ymm24, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm24[0],ymm12[1],ymm24[1],ymm12[4],ymm24[4],ymm12[5],ymm24[5]
vpermpd $170, %ymm9, %ymm9 # ymm9 = ymm9[2,2,2,2]
vmovups %zmm13, (%r11,%rax,4) {%k2}
.loc 1 238 33 # 03-matrix-multiplication-cpu.py:238:33
vpcmpgtd %zmm0, %zmm4, %k2
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33
vpshufd $255, %ymm3, %ymm0 # ymm0 = ymm3[3,3,3,3,7,7,7,7]
vshufi64x2 $85, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[2,3,2,3,2,3,2,3]
.loc 1 238 33 # 03-matrix-multiplication-cpu.py:238:33
vpcmpgtd %zmm0, %zmm4, %k5
kunpckwd %k2, %k5, %k2
kunpckdq %k3, %k2, %k2
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21
vextractf128 $1, %ymm2, %xmm2
.loc 1 238 39 # 03-matrix-multiplication-cpu.py:238:39
kshiftrq $15, %k2, %k2
kandq %k0, %k2, %k5
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21
vblendps $3, %xmm2, %xmm9, %xmm2 # xmm2 = xmm2[0,1],xmm9[2,3]
vunpcklpd %ymm28, %ymm16, %ymm9 # ymm9 = ymm16[0],ymm28[0],ymm16[2],ymm28[2]
vpmovm2d %k5, %zmm0
vpbroadcastd %xmm0, %zmm0
kshiftrq $32, %k5, %k6
vshufps $36, %ymm9, %ymm14, %ymm9 # ymm9 = ymm14[0,1],ymm9[2,0],ymm14[4,5],ymm9[6,4]
vpmovd2m %zmm0, %k2
vblendps $15, %ymm2, %ymm9, %ymm2 # ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
vmovups 560(%rsp), %zmm9 # 64-byte Reload
vinsertf64x4 $0, %ymm2, %zmm9, %zmm14
vmovddup .LCPI0_30(%rip), %xmm9 # xmm9 = [5,13,5,13]
# xmm9 = mem[0,0]
vmovups 2288(%rsp), %ymm2 # 32-byte Reload
vmovups %zmm14, (%r8,%rax,4) {%k2}
.loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21
movslq %r10d, %r8
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21
kshiftrq $16, %k5, %k2
kshiftrq $48, %k5, %k5
vmovups 1200(%rsp), %zmm14 # 64-byte Reload
.loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21
leaq (%r13,%r8,4), %r8
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21
vpermi2ps %ymm24, %ymm12, %ymm9
vextractf128 $1, %ymm2, %xmm2
vblendps $3, %xmm2, %xmm9, %xmm2 # xmm2 = xmm2[0,1],xmm9[2,3]
vunpcklps %ymm16, %ymm28, %ymm9 # ymm9 = ymm28[0],ymm16[0],ymm28[1],ymm16[1],ymm28[4],ymm16[4],ymm28[5],ymm16[5]
vblendps $63, 2160(%rsp), %ymm9, %ymm9 # 32-byte Folded Reload
# ymm9 = mem[0,1,2,3,4,5],ymm9[6,7]
vblendps $15, %ymm2, %ymm9, %ymm0 # ymm0 = ymm2[0,1,2,3],ymm9[4,5,6,7]
vmovups 688(%rsp), %zmm2 # 64-byte Reload
vunpcklps %zmm16, %zmm28, %zmm9 # zmm9 = zmm28[0],zmm16[0],zmm28[1],zmm16[1],zmm28[4],zmm16[4],zmm28[5],zmm16[5],zmm28[8],zmm16[8],zmm28[9],zmm16[9],zmm28[12],zmm16[12],zmm28[13],zmm16[13]
vinsertf64x4 $0, %ymm0, %zmm2, %zmm0
vpmovm2d %k2, %zmm2
vpbroadcastd %xmm2, %zmm2
vpmovd2m %zmm2, %k2
vmovups %zmm0, (%r8,%rax,4) {%k2}
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33
vpmulld %xmm10, %xmm11, %xmm0
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21
vmovapd .LCPI0_20(%rip), %zmm11 # zmm11 = [0,8,0,8,4,12,4,13]
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33
vpextrd $3, %xmm0, %r8d
vshufi64x2 $255, %zmm20, %zmm20, %zmm0 # zmm0 = zmm20[6,7,6,7,6,7,6,7]
.loc 1 236 21 is_stmt 0 # 03-matrix-multiplication-cpu.py:236:21
movslq %r8d, %r8
.loc 1 238 33 is_stmt 1 # 03-matrix-multiplication-cpu.py:238:33
vpcmpgtd %zmm0, %zmm4, %k2
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33
vshufi64x2 $255, %zmm6, %zmm6, %zmm0 # zmm0 = zmm6[6,7,6,7,6,7,6,7]
.loc 1 236 21 is_stmt 0 # 03-matrix-multiplication-cpu.py:236:21
leaq (%r13,%r8,4), %r8
.loc 1 238 33 is_stmt 1 # 03-matrix-multiplication-cpu.py:238:33
vpcmpgtd %zmm0, %zmm4, %k3
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33
vpshufd $255, %zmm3, %zmm0 # zmm0 = zmm3[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
vmovups 752(%rsp), %ymm3 # 32-byte Reload
vshufi64x2 $255, %zmm0, %zmm0, %zmm2 # zmm2 = zmm0[6,7,6,7,6,7,6,7]
vshufi64x2 $170, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[4,5,4,5,4,5,4,5]
.loc 1 238 33 # 03-matrix-multiplication-cpu.py:238:33
vpcmpgtd %zmm2, %zmm4, %k4
vmovups 1456(%rsp), %ymm2 # 32-byte Reload
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21
vextractf128 $1, %ymm2, %xmm2
vblendps $3, %xmm2, %xmm5, %xmm2 # xmm2 = xmm2[0,1],xmm5[2,3]
vunpckhpd %ymm28, %ymm16, %ymm5 # ymm5 = ymm16[1],ymm28[1],ymm16[3],ymm28[3]
vshufps $36, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0,1],ymm5[2,0],ymm3[4,5],ymm5[6,4]
vmovups 624(%rsp), %zmm3 # 64-byte Reload
vblendps $15, %ymm2, %ymm5, %ymm2 # ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
vpmovm2d %k6, %zmm5
vpbroadcastd %xmm5, %zmm5
vpmovd2m %zmm5, %k6
vmovddup .LCPI0_13(%rip), %xmm5 # xmm5 = [7,15,7,15]
# xmm5 = mem[0,0]
vinsertf64x4 $0, %ymm2, %zmm3, %zmm2
vmovups 1072(%rsp), %zmm3 # 64-byte Reload
vmovups %zmm2, (%r9,%rax,4) {%k6}
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33
vshufi64x2 $170, %zmm20, %zmm20, %zmm2 # zmm2 = zmm20[4,5,4,5,4,5,4,5]
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21
vpermi2ps %ymm24, %ymm12, %ymm5
.loc 1 238 33 # 03-matrix-multiplication-cpu.py:238:33
vpcmpgtd %zmm19, %zmm4, %k6
vmovupd 1904(%rsp), %zmm19 # 64-byte Reload
vpcmpgtd %zmm2, %zmm4, %k7
vmovups 1712(%rsp), %ymm2 # 32-byte Reload
kunpckwd %k6, %k7, %k6
vpcmpgtd %zmm0, %zmm4, %k7
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21
vmovdqa64 %zmm25, %zmm0
vpermt2ps %zmm18, %zmm15, %zmm0
vpermi2ps %zmm16, %zmm28, %zmm15
vextractf128 $1, %ymm2, %xmm2
vblendps $3, %xmm2, %xmm5, %xmm2 # xmm2 = xmm2[0,1],xmm5[2,3]
vunpckhps %ymm16, %ymm28, %ymm5 # ymm5 = ymm28[2],ymm16[2],ymm28[3],ymm16[3],ymm28[6],ymm16[6],ymm28[7],ymm16[7]
vblendps $63, 1520(%rsp), %ymm5, %ymm5 # 32-byte Folded Reload
# ymm5 = mem[0,1,2,3,4,5],ymm5[6,7]
vblendps $15, %ymm2, %ymm5, %ymm2 # ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
vpmovm2d %k5, %zmm5
vpbroadcastd %xmm5, %zmm5
vinsertf64x4 $0, %ymm2, %zmm3, %zmm2
vpmovd2m %zmm5, %k5
vmovupd 1584(%rsp), %zmm3 # 64-byte Reload
vmovups %zmm2, (%rdi,%rax,4) {%k5}
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33
vshufi64x2 $170, %zmm6, %zmm6, %zmm2 # zmm2 = zmm6[4,5,4,5,4,5,4,5]
.loc 1 238 33 # 03-matrix-multiplication-cpu.py:238:33
vpcmpgtd %zmm2, %zmm4, %k5
vmovupd 1264(%rsp), %zmm2 # 64-byte Reload
kunpckwd %k5, %k7, %k5
kunpckdq %k6, %k5, %k5
.loc 1 238 39 is_stmt 0 # 03-matrix-multiplication-cpu.py:238:39
kshiftrq $15, %k5, %k5
kandq %k0, %k5, %k5
.loc 1 239 21 is_stmt 1 # 03-matrix-multiplication-cpu.py:239:21
vshufpd $128, %zmm0, %zmm2, %zmm29 {%k1} # zmm29 {%k1} = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[7]
vpermt2pd %zmm0, %zmm11, %zmm2
vextractf32x4 $2, %zmm7, %xmm0
vmovapd %zmm2, %zmm30 {%k1}
vextractf32x4 $2, %zmm22, %xmm2
vblendps $3, %xmm2, %xmm0, %xmm0 # xmm0 = xmm2[0,1],xmm0[2,3]
vshuff64x2 $170, %zmm15, %zmm15, %zmm2 # zmm2 = zmm15[4,5,4,5,4,5,4,5]
vshuff64x2 $170, %zmm27, %zmm27, %zmm5 # zmm5 = zmm27[4,5,4,5,4,5,4,5]
vblendpd $8, %ymm2, %ymm5, %ymm2 # ymm2 = ymm5[0,1,2],ymm2[3]
vblendpd $3, %ymm0, %ymm2, %ymm0 # ymm0 = ymm0[0,1],ymm2[2,3]
vpmovm2d %k5, %zmm2
vpbroadcastd %xmm2, %zmm2
vpmovd2m %zmm2, %k6
vmovupd 352(%rsp), %zmm2 # 64-byte Reload
vinsertf64x4 $0, %ymm0, %zmm30, %zmm0
vmovupd 816(%rsp), %zmm30 # 64-byte Reload
vmovups %zmm0, (%rsi,%rax,4) {%k6}
vpunpckldq %zmm18, %zmm25, %zmm0 # zmm0 = zmm25[0],zmm18[0],zmm25[1],zmm18[1],zmm25[4],zmm18[4],zmm25[5],zmm18[5],zmm25[8],zmm18[8],zmm25[9],zmm18[9],zmm25[12],zmm18[12],zmm25[13],zmm18[13]
kshiftrq $16, %k5, %k6
vshufpd $128, %zmm0, %zmm2, %zmm23 {%k1} # zmm23 {%k1} = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[7]
vpermt2pd %zmm0, %zmm11, %zmm2
vunpcklps %zmm24, %zmm12, %zmm0 # zmm0 = zmm12[0],zmm24[0],zmm12[1],zmm24[1],zmm12[4],zmm24[4],zmm12[5],zmm24[5],zmm12[8],zmm24[8],zmm12[9],zmm24[9],zmm12[12],zmm24[12],zmm12[13],zmm24[13]
vextractf32x4 $2, %zmm0, %xmm5
vshuff64x2 $170, %zmm9, %zmm9, %zmm8 # zmm8 = zmm9[4,5,4,5,4,5,4,5]
vmovapd %zmm2, %zmm3 {%k1}
vextractf32x4 $2, %zmm17, %xmm2
vblendps $3, %xmm2, %xmm5, %xmm2 # xmm2 = xmm2[0,1],xmm5[2,3]
vshuff64x2 $170, %zmm19, %zmm19, %zmm5 # zmm5 = zmm19[4,5,4,5,4,5,4,5]
vblendpd $8, %ymm8, %ymm5, %ymm5 # ymm5 = ymm5[0,1,2],ymm8[3]
vextractf32x4 $2, %zmm14, %xmm8
vblendpd $3, %ymm2, %ymm5, %ymm2 # ymm2 = ymm2[0,1],ymm5[2,3]
vpmovm2d %k6, %zmm5
vpbroadcastd %xmm5, %zmm5
vpmovd2m %zmm5, %k6
vmovupd 1328(%rsp), %zmm5 # 64-byte Reload
vinsertf64x4 $0, %ymm2, %zmm3, %zmm2
vmovupd 1648(%rsp), %zmm3 # 64-byte Reload
vmovups %zmm2, (%rdx,%rax,4) {%k6}
vmovdqa64 %zmm25, %zmm2
vpermt2ps %zmm18, %zmm21, %zmm2
kshiftrq $32, %k5, %k6
vshufpd $128, %zmm2, %zmm5, %zmm30 {%k1} # zmm30 {%k1} = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[7]
vpermt2pd %zmm2, %zmm11, %zmm5
vpunpckhdq %zmm18, %zmm25, %zmm2 # zmm2 = zmm25[2],zmm18[2],zmm25[3],zmm18[3],zmm25[6],zmm18[6],zmm25[7],zmm18[7],zmm25[10],zmm18[10],zmm25[11],zmm18[11],zmm25[14],zmm18[14],zmm25[15],zmm18[15]
vmovupd 944(%rsp), %zmm25 # 64-byte Reload
vmovupd 1136(%rsp), %zmm18 # 64-byte Reload
vpermi2pd %zmm2, %zmm26, %zmm11
vmovapd %zmm5, %zmm3 {%k1}
vmovaps %zmm12, %zmm5
vpermt2ps %zmm24, %zmm21, %zmm5
vpermi2ps %zmm16, %zmm28, %zmm21
vshufpd $128, %zmm2, %zmm26, %zmm25 {%k1} # zmm25 {%k1} = zmm26[0],zmm2[0],zmm26[2],zmm2[2],zmm26[4],zmm2[4],zmm26[6],zmm2[7]
vshuff64x2 $170, %zmm18, %zmm18, %zmm13 # zmm13 = zmm18[4,5,4,5,4,5,4,5]
vextractf32x4 $2, %zmm5, %xmm2
vblendps $3, %xmm8, %xmm2, %xmm2 # xmm2 = xmm8[0,1],xmm2[2,3]
vshuff64x2 $170, %zmm21, %zmm21, %zmm8 # zmm8 = zmm21[4,5,4,5,4,5,4,5]
vblendpd $8, %ymm8, %ymm13, %ymm8 # ymm8 = ymm13[0,1,2],ymm8[3]
vmovupd 1776(%rsp), %zmm13 # 64-byte Reload
vblendpd $3, %ymm2, %ymm8, %ymm2 # ymm2 = ymm2[0,1],ymm8[2,3]
vpmovm2d %k6, %zmm8
vpbroadcastd %xmm8, %zmm8
vinsertf64x4 $0, %ymm2, %zmm3, %zmm2
vmovdqa 1408(%rsp), %xmm3 # 16-byte Reload
vpmovd2m %zmm8, %k6
vmovups %zmm2, (%rcx,%rax,4) {%k6}
vmovapd %zmm11, %zmm13 {%k1}
vunpckhps %zmm16, %zmm28, %zmm11 # zmm11 = zmm28[2],zmm16[2],zmm28[3],zmm16[3],zmm28[6],zmm16[6],zmm28[7],zmm16[7],zmm28[10],zmm16[10],zmm28[11],zmm16[11],zmm28[14],zmm16[14],zmm28[15],zmm16[15]
kshiftrq $48, %k5, %k1
vshuff64x2 $170, %zmm11, %zmm11, %zmm8 # zmm8 = zmm11[4,5,4,5,4,5,4,5]
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33
vpbroadcastd %xmm3, %zmm2
vpmulld %xmm10, %xmm3, %xmm3
.loc 1 238 33 # 03-matrix-multiplication-cpu.py:238:33
vpcmpgtd %zmm2, %zmm4, %k6
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33
vpmulld %xmm10, %xmm2, %xmm2
vpextrd $3, %xmm3, %ecx
vpextrd $3, %xmm2, %edi
vextracti32x4 $3, %zmm20, %xmm2
vmovups 1008(%rsp), %zmm20 # 64-byte Reload
.loc 1 236 21 is_stmt 0 # 03-matrix-multiplication-cpu.py:236:21
movslq %ecx, %rcx
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33
vpmulld %xmm10, %xmm2, %xmm2
.loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21
movslq %edi, %rdi
leaq (%r13,%rcx,4), %rcx
leaq (%r13,%rdi,4), %rdi
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33
vpextrd $3, %xmm2, %esi
vextracti32x4 $3, %zmm6, %xmm2
.loc 1 239 21 is_stmt 1 # 03-matrix-multiplication-cpu.py:239:21
vshuff64x2 $170, %zmm1, %zmm1, %zmm6 # zmm6 = zmm1[4,5,4,5,4,5,4,5]
vextractf32x4 $3, %zmm0, %xmm0
vextractf64x4 $1, %zmm1, %ymm1
.loc 1 236 33 # 03-matrix-multiplication-cpu.py:236:33
vpmulld %xmm10, %xmm2, %xmm2
.loc 1 236 21 is_stmt 0 # 03-matrix-multiplication-cpu.py:236:21
movslq %esi, %rsi
.loc 1 239 21 is_stmt 1 # 03-matrix-multiplication-cpu.py:239:21
vblendpd $8, %ymm8, %ymm6, %ymm6 # ymm6 = ymm6[0,1,2],ymm8[3]
.loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21
leaq (%r13,%rsi,4), %rsi
.loc 1 236 33 is_stmt 0 # 03-matrix-multiplication-cpu.py:236:33
vpextrd $3, %xmm2, %edx
.loc 1 239 21 is_stmt 1 # 03-matrix-multiplication-cpu.py:239:21
vunpckhps %zmm24, %zmm12, %zmm2 # zmm2 = zmm12[2],zmm24[2],zmm12[3],zmm24[3],zmm12[6],zmm24[6],zmm12[7],zmm24[7],zmm12[10],zmm24[10],zmm12[11],zmm24[11],zmm12[14],zmm24[14],zmm12[15],zmm24[15]
vextractf32x4 $2, %zmm20, %xmm3
vextractf32x4 $2, %zmm2, %xmm4
.loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21
movslq %edx, %rdx
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21
vextractf32x4 $3, %zmm2, %xmm2
.loc 1 236 21 # 03-matrix-multiplication-cpu.py:236:21
leaq (%r13,%rdx,4), %rdx
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21
vblendps $3, %xmm3, %xmm4, %xmm4 # xmm4 = xmm3[0,1],xmm4[2,3]
vblendpd $3, %ymm4, %ymm6, %ymm4 # ymm4 = ymm4[0,1],ymm6[2,3]
vpmovm2d %k1, %zmm6
vpbroadcastd %xmm6, %zmm6
vinsertf64x4 $0, %ymm4, %zmm13, %zmm4
vpmovd2m %zmm6, %k1
vextractf32x4 $3, %zmm22, %xmm6
vmovups %zmm4, (%r8,%rax,4) {%k1}
vextractf32x4 $3, %zmm7, %xmm4
.loc 1 238 33 # 03-matrix-multiplication-cpu.py:238:33
kunpckwd %k6, %k2, %k1
kunpckwd %k3, %k4, %k2
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21
vextractf64x4 $1, %zmm27, %ymm7
.loc 1 238 33 # 03-matrix-multiplication-cpu.py:238:33
kunpckdq %k1, %k2, %k1
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21
vblendps $3, %xmm6, %xmm4, %xmm4 # xmm4 = xmm6[0,1],xmm4[2,3]
vextractf64x4 $1, %zmm15, %ymm6
.loc 1 238 39 # 03-matrix-multiplication-cpu.py:238:39
kshiftrq $15, %k1, %k1
kandq %k0, %k1, %k0
.loc 1 239 21 # 03-matrix-multiplication-cpu.py:239:21
vblendpd $8, %ymm6, %ymm7, %ymm6 # ymm6 = ymm7[0,1,2],ymm6[3]
vblendpd $3, %ymm4, %ymm6, %ymm4 # ymm4 = ymm4[0,1],ymm6[2,3]
vpmovm2d %k0, %zmm6
vpbroadcastd %xmm6, %zmm6
vinsertf64x4 $0, %ymm4, %zmm29, %zmm4
vpmovd2m %zmm6, %k1
vextractf64x4 $1, %zmm9, %ymm6
vmovups %zmm4, (%rdi,%rax,4) {%k1}
vextractf32x4 $3, %zmm17, %xmm4
kshiftrq $16, %k0, %k1
vblendps $3, %xmm4, %xmm0, %xmm0 # xmm0 = xmm4[0,1],xmm0[2,3]
vextractf64x4 $1, %zmm19, %ymm4
vblendpd $8, %ymm6, %ymm4, %ymm4 # ymm4 = ymm4[0,1,2],ymm6[3]
vblendpd $3, %ymm0, %ymm4, %ymm0 # ymm0 = ymm0[0,1],ymm4[2,3]
vpmovm2d %k1, %zmm4
vpbroadcastd %xmm4, %zmm4
vinsertf64x4 $0, %ymm0, %zmm23, %zmm0
vpmovd2m %zmm4, %k1
vextractf32x4 $3, %zmm14, %xmm4
vmovups %zmm0, (%rsi,%rax,4) {%k1}
vextractf32x4 $3, %zmm5, %xmm0
vextractf64x4 $1, %zmm18, %ymm5
kshiftrq $32, %k0, %k1
kshiftrq $48, %k0, %k0
vblendps $3, %xmm4, %xmm0, %xmm0 # xmm0 = xmm4[0,1],xmm0[2,3]
vextractf64x4 $1, %zmm21, %ymm4
vblendpd $8, %ymm4, %ymm5, %ymm4 # ymm4 = ymm5[0,1,2],ymm4[3]
vblendpd $3, %ymm0, %ymm4, %ymm0 # ymm0 = ymm0[0,1],ymm4[2,3]
vpmovm2d %k1, %zmm4
vpbroadcastd %xmm4, %zmm4
vinsertf64x4 $0, %ymm0, %zmm30, %zmm0
vpmovd2m %zmm4, %k1
vmovups %zmm0, (%rdx,%rax,4) {%k1}
vextractf32x4 $3, %zmm20, %xmm0
vblendps $3, %xmm0, %xmm2, %xmm0 # xmm0 = xmm0[0,1],xmm2[2,3]
vextractf64x4 $1, %zmm11, %ymm2
vblendps $192, %ymm2, %ymm1, %ymm1 # ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
vblendps $15, %ymm0, %ymm1, %ymm0 # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
vpmovm2d %k0, %zmm1
vpbroadcastd %xmm1, %zmm1
vinsertf64x4 $0, %ymm0, %zmm25, %zmm0
vpmovd2m %zmm1, %k1
vmovups %zmm0, (%rcx,%rax,4) {%k1}
.loc 1 239 4 epilogue_begin is_stmt 0 # 03-matrix-multiplication-cpu.py:239:4
addq $3448, %rsp # imm = 0xD78
.cfi_def_cfa_offset 56
popq %rbx
.cfi_def_cfa_offset 48
popq %r12
.cfi_def_cfa_offset 40
popq %r13
.cfi_def_cfa_offset 32
popq %r14
.cfi_def_cfa_offset 24
popq %r15
.cfi_def_cfa_offset 16
popq %rbp
.cfi_def_cfa_offset 8
vzeroupper
retq
.Ltmp12:
.Lfunc_end0:
.size matmul_kernel, .Lfunc_end0-matmul_kernel
.cfi_endproc
# -- End function
.section .debug_abbrev,"",@progbits
# DWARF (v4, per .debug_info's version field) abbreviation table.
# Defines the layout templates (abbrev codes 1-4) that the DIEs in
# .debug_info reference. Each entry is: TAG, children flag, then
# (attribute, form) pairs, terminated by a 0,0 pair; the table itself
# is terminated by a final 0 byte (EOM(3)).
# Abbrev 1: compile unit — producer, language, name, line table,
#           comp dir, and the [low_pc, low_pc+high_pc) code range.
.byte 1 # Abbreviation Code
.byte 17 # DW_TAG_compile_unit
.byte 1 # DW_CHILDREN_yes
.byte 37 # DW_AT_producer
.byte 14 # DW_FORM_strp
.byte 19 # DW_AT_language
.byte 5 # DW_FORM_data2
.byte 3 # DW_AT_name
.byte 14 # DW_FORM_strp
.byte 16 # DW_AT_stmt_list
.byte 23 # DW_FORM_sec_offset
.byte 27 # DW_AT_comp_dir
.byte 14 # DW_FORM_strp
.byte 17 # DW_AT_low_pc
.byte 1 # DW_FORM_addr
.byte 18 # DW_AT_high_pc
.byte 6 # DW_FORM_data4
.byte 0 # EOM(1)
.byte 0 # EOM(2)
# Abbrev 2: abstract (inlinable) subprogram — name + DW_AT_inline flag,
# no address range of its own; concrete/inlined instances refer back to it.
.byte 2 # Abbreviation Code
.byte 46 # DW_TAG_subprogram
.byte 0 # DW_CHILDREN_no
.byte 3 # DW_AT_name
.byte 14 # DW_FORM_strp
.byte 32 # DW_AT_inline
.byte 11 # DW_FORM_data1
.byte 0 # EOM(1)
.byte 0 # EOM(2)
# Abbrev 3: concrete out-of-line subprogram instance — carries the real
# PC range and points at its abstract origin (abbrev 2) via a CU-relative
# 4-byte reference.
.byte 3 # Abbreviation Code
.byte 46 # DW_TAG_subprogram
.byte 1 # DW_CHILDREN_yes
.byte 17 # DW_AT_low_pc
.byte 1 # DW_FORM_addr
.byte 18 # DW_AT_high_pc
.byte 6 # DW_FORM_data4
.byte 49 # DW_AT_abstract_origin
.byte 19 # DW_FORM_ref4
.byte 0 # EOM(1)
.byte 0 # EOM(2)
# Abbrev 4: inlined subroutine — abstract origin plus a .debug_ranges
# offset (discontiguous PC ranges) and the call-site file/line/column.
.byte 4 # Abbreviation Code
.byte 29 # DW_TAG_inlined_subroutine
.byte 0 # DW_CHILDREN_no
.byte 49 # DW_AT_abstract_origin
.byte 19 # DW_FORM_ref4
.byte 85 # DW_AT_ranges
.byte 23 # DW_FORM_sec_offset
.byte 88 # DW_AT_call_file
.byte 11 # DW_FORM_data1
.byte 89 # DW_AT_call_line
.byte 11 # DW_FORM_data1
.byte 87 # DW_AT_call_column
.byte 11 # DW_FORM_data1
.byte 0 # EOM(1)
.byte 0 # EOM(2)
.byte 0 # EOM(3)
# ---------------------------------------------------------------------------
# DWARF .debug_info: the DIE tree for this compile unit, encoded against the
# abbreviation table in .debug_abbrev. Layout (offsets noted in the
# compiler's comments): CU header, then
#   compile_unit (abbrev 1)
#     subprogram "matmul_kernel" at offset 0x2a (abbrev 2, abstract)
#     subprogram concrete instance (abbrev 3) containing three
#       inlined_subroutine DIEs (abbrev 4), one per inline call site.
# The ".long 42" fields are DW_AT_abstract_origin references: 42 = 0x2a, the
# CU-relative offset of the abstract "matmul_kernel" DIE above.
# ---------------------------------------------------------------------------
.section .debug_info,"",@progbits
.Lcu_begin0:
.long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit
.Ldebug_info_start0:
.short 4 # DWARF version number
.long .debug_abbrev # Offset Into Abbrev. Section
.byte 8 # Address Size (in bytes)
.byte 1 # Abbrev [1] 0xb:0x5c DW_TAG_compile_unit
.long .Linfo_string0 # DW_AT_producer
.short 2 # DW_AT_language
.long .Linfo_string1 # DW_AT_name
.long .Lline_table_start0 # DW_AT_stmt_list
.long .Linfo_string2 # DW_AT_comp_dir
.quad .Lfunc_begin0 # DW_AT_low_pc
.long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc
.byte 2 # Abbrev [2] 0x2a:0x6 DW_TAG_subprogram
.long .Linfo_string3 # DW_AT_name
.byte 1 # DW_AT_inline
.byte 3 # Abbrev [3] 0x30:0x36 DW_TAG_subprogram
.quad .Lfunc_begin0 # DW_AT_low_pc
.long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc
.long 42 # DW_AT_abstract_origin
# Inline call site at 03-matrix-multiplication-cpu.py:189:27.
.byte 4 # Abbrev [4] 0x41:0xc DW_TAG_inlined_subroutine
.long 42 # DW_AT_abstract_origin
.long .Ldebug_ranges0 # DW_AT_ranges
.byte 1 # DW_AT_call_file
.byte 189 # DW_AT_call_line
.byte 27 # DW_AT_call_column
# Inline call site at 03-matrix-multiplication-cpu.py:190:27.
.byte 4 # Abbrev [4] 0x4d:0xc DW_TAG_inlined_subroutine
.long 42 # DW_AT_abstract_origin
.long .Ldebug_ranges1 # DW_AT_ranges
.byte 1 # DW_AT_call_file
.byte 190 # DW_AT_call_line
.byte 27 # DW_AT_call_column
# Inline call site at 03-matrix-multiplication-cpu.py:217:33.
.byte 4 # Abbrev [4] 0x59:0xc DW_TAG_inlined_subroutine
.long 42 # DW_AT_abstract_origin
.long .Ldebug_ranges2 # DW_AT_ranges
.byte 1 # DW_AT_call_file
.byte 217 # DW_AT_call_line
.byte 33 # DW_AT_call_column
.byte 0 # End Of Children Mark
.byte 0 # End Of Children Mark
.Ldebug_info_end0:
# ---------------------------------------------------------------------------
# DWARF .debug_ranges: non-contiguous address-range lists, one per inlined
# subroutine DIE above (DW_AT_ranges). Entries are (begin, end) pairs of
# offsets relative to .Lfunc_begin0, bracketed by the .Ltmp* labels the
# compiler dropped into the function body; a (0, 0) pair terminates each
# list.
# ---------------------------------------------------------------------------
.section .debug_ranges,"",@progbits
.Ldebug_ranges0:
.quad .Ltmp0-.Lfunc_begin0
.quad .Ltmp1-.Lfunc_begin0
.quad .Ltmp2-.Lfunc_begin0
.quad .Ltmp3-.Lfunc_begin0
.quad .Ltmp4-.Lfunc_begin0
.quad .Ltmp5-.Lfunc_begin0
.quad 0
.quad 0
.Ldebug_ranges1:
.quad .Ltmp1-.Lfunc_begin0
.quad .Ltmp2-.Lfunc_begin0
.quad .Ltmp3-.Lfunc_begin0
.quad .Ltmp4-.Lfunc_begin0
.quad .Ltmp5-.Lfunc_begin0
.quad .Ltmp6-.Lfunc_begin0
.quad 0
.quad 0
.Ldebug_ranges2:
.quad .Ltmp7-.Lfunc_begin0
.quad .Ltmp8-.Lfunc_begin0
.quad .Ltmp9-.Lfunc_begin0
.quad .Ltmp10-.Lfunc_begin0
.quad 0
.quad 0
# ---------------------------------------------------------------------------
# DWARF .debug_str: NUL-terminated string pool referenced by the
# DW_FORM_strp attributes above ("MS" = mergeable string section, entry
# size 1). The "string offset=" comments record each string's byte offset
# within the section.
# ---------------------------------------------------------------------------
.section .debug_str,"MS",@progbits,1
.Linfo_string0:
.asciz "triton" # string offset=0
.Linfo_string1:
.asciz "03-matrix-multiplication-cpu.py" # string offset=7
.Linfo_string2:
.asciz "/data/users/minjang/triton-oss/triton-cpu/python/tutorials" # string offset=39
.Linfo_string3:
.asciz "matmul_kernel" # string offset=98
# Empty .note.GNU-stack section: tells the GNU linker this object does not
# need an executable stack.
.section ".note.GNU-stack","",@progbits
# Anchor for DW_AT_stmt_list in .debug_info; the line-table content itself
# is generated by the assembler from the .loc directives in the function
# body above.
.section .debug_line,"",@progbits
.Lline_table_start0:
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment