vassvik · April 27, 2026 16:02
diff --git a/msvc_repro.c b/msvc_repro.c
 /* MSVC /O2 codegen bug — minimum confirmed repro (16 lines).
 * cl 19.39.33521 (VS 2022 17.9), x64.
 *
 * Build:  cl /O2 /nologo msvc_repro.c
 *         cl /O1 /nologo msvc_repro.c /Fe:O1.exe
 *
 * /O1 prints  0.000000  (correct: p=i and p=i+1 contributions cancel).
 * /O2 prints -2.000000  (bug: p=i half of the gather is discarded).
 *
 * Mechanism (see REPORT.md and msvc_repro_O2.asm for full disassembly):
 * /O2 splits the inner gather into two code paths via
 *     cmp esi, 1
 *     jne $LN21@main
 * where esi holds p and the immediate 1 is the constant-folded value of i.
 *   - $LN21 (taken when p != i): vectorized SSE path. Accumulates the
 *     gather contributions into xmm6, which packs both du and dv.
 *   - $LL44 (fall-through when p == i): scalar fallback. Accumulates into
 *     separate scalar registers xmm7 (dv) and xmm8 (du).
 * The final printf reads xmm6 only. xmm7/xmm8 are never folded back into
 * xmm6, so every iteration with p == i runs the scalar path and its work
 * is discarded. Both paths execute; this is NOT iteration-skipping — the
 * scalar work is computed and silently dropped.
 *
 * Triggers required (each one is necessary):
 *   - 3 nested outer loops (any iteration count; written as `for` loops
 *     they need a non-zero start value; do-while loops work too)
 *   - 3 nested inner loops of the form `for (v = outer; v <= outer+1; v++)`
 *   - the sign for `a` must be a NAMED LOCAL ternary; inlining the ternary
 *     makes the bug disappear
 *   - a SECOND accumulator (`du` here) using inner-var sign factors
 *   - the array index in dv must be one of the inner loop variables
 */
 #include <stdio.h>
 float s[3], du, dv;
 /* Repro driver. Sets s[2] = 1 (s[0] and s[1] keep their static zero
  * initial value), accumulates du and dv over the six-deep loop nest and
  * prints du + dv. Each outer loop (i, j, k) runs once; each inner loop
  * (p, q, r) runs twice (outer value, then outer value + 1), so the body
  * executes 8 times and r takes the values 1 and 2.
  * The statement shapes are deliberate: do not "simplify" them, the header
  * comment lists each of them as a required trigger. */
 int main(void) {
    s[2] = 1;
    for (int i = 1; i <= 1; i++)
    for (int j = 1; j <= 1; j++)
    for (int k = 1; k <= 1; k++)
    for (int p = i; p <= i+1; p++)
    for (int q = j; q <= j+1; q++)
    for (int r = k; r <= k+1; r++) {
        /* a is +1 on the first p iteration (p == i) and -1 on the second. */
        float a = (p == i) ? 1.f : -1.f;
        /* du: product of the q and r sign factors; independent of p.
         * Over one full q/r sweep the four terms are +1, -1, -1, +1. */
        du += ((q == j) ? 1.f : -1.f) * ((r == k) ? 1.f : -1.f);
        /* dv: a * s[r]; only r == 2 reads the non-zero element, so each p
         * value contributes +2 (a == +1) or -2 (a == -1) in total. */
        dv += a * s[r];
    }
    /* Correct result is 0.000000: du sums to 0 and the two p halves of dv cancel. */
    printf("%f\n", du + dv);
    return 0;
 }
diff --git a/msvc_repro_O1.asm b/msvc_repro_O1.asm
 ; Listing generated by Microsoft (R) Optimizing Compiler Version 19.39.33521.0 

 include listing.inc

 INCLUDELIB LIBCMT
 INCLUDELIB OLDNAMES

 _DATA	SEGMENT
 COMM	s:DWORD:03H
 COMM	du:DWORD
 COMM	dv:DWORD
 _DATA	ENDS
 PUBLIC	__local_stdio_printf_options
 PUBLIC	_vfprintf_l
 PUBLIC	printf
 PUBLIC	main
 PUBLIC	??_C@_03PPOCCAPH@?$CFf?6@			; `string'
 PUBLIC	__real@3f800000
 PUBLIC	__real@bf800000
 EXTRN	__acrt_iob_func:PROC
 EXTRN	__stdio_common_vfprintf:PROC
 EXTRN	_fltused:DWORD
 _DATA	SEGMENT
 COMM	?_OptionsStorage@?1??__local_stdio_printf_options@@9@9:QWORD							; `__local_stdio_printf_options'::`2'::_OptionsStorage
 _DATA	ENDS
 ;	COMDAT pdata
 pdata	SEGMENT
 $pdata$_vfprintf_l DD imagerel $LN4
 	DD	imagerel $LN4+80
 	DD	imagerel $unwind$_vfprintf_l
 pdata	ENDS
 ;	COMDAT pdata
 pdata	SEGMENT
 $pdata$printf DD imagerel $LN6
 	DD	imagerel $LN6+81
 	DD	imagerel $unwind$printf
 pdata	ENDS
 ;	COMDAT pdata
 pdata	SEGMENT
 $pdata$main DD	imagerel $LN64
 	DD	imagerel $LN64+286
 	DD	imagerel $unwind$main
 pdata	ENDS
 ;	COMDAT __real@bf800000
 CONST	SEGMENT
 __real@bf800000 DD 0bf800000r			; -1
 CONST	ENDS
 ;	COMDAT __real@3f800000
 CONST	SEGMENT
 __real@3f800000 DD 03f800000r			; 1
 CONST	ENDS
 ;	COMDAT ??_C@_03PPOCCAPH@?$CFf?6@
 CONST	SEGMENT
 ??_C@_03PPOCCAPH@?$CFf?6@ DB '%f', 0aH, 00H		; `string'
 CONST	ENDS
 ;	COMDAT xdata
 xdata	SEGMENT
 $unwind$main DD	060f01H
 	DD	07640fH
 	DD	06340fH
 	DD	0700b320fH
 xdata	ENDS
 ;	COMDAT xdata
 xdata	SEGMENT
 $unwind$printf DD 041a01H
 	DD	07016521aH
 	DD	030146015H
 xdata	ENDS
 ;	COMDAT xdata
 xdata	SEGMENT
 $unwind$_vfprintf_l DD 081401H
 	DD	0a6414H
 	DD	095414H
 	DD	083414H
 	DD	070105214H
 xdata	ENDS
 ; Function compile flags: /Ogspy
 ;	COMDAT main
 _TEXT	SEGMENT
 ; main (/O1 build, compile flags /Ogspy) -- scalar code for the msvc_repro.c loop nest.
 ; ABI: Microsoft x64. Register roles, as established by the instructions below:
 ;   esi/rsi  = constant 1 (loop increment, the folded value of i, and the
 ;              upper bound of the j and k loops)
 ;   ecx      = j            r8d / rdx = k (32-bit / 64-bit copies)
 ;   r9d      = p            r10d = q            rax = r
 ;   r11d     = j+1          rbx  = k+1 (sign-extended)
 ;   xmm5     = 1.0f         xmm4 = du           xmm0 = dv
 ; There is no counter for the i loop: p is compared against the constants
 ; 1 (p == i) and 2 (p <= i+1) instead.
 ; NOTE(review): the "; Line N" markers refer to the source revision the
 ; listing was generated from -- confirm against the file being read.
 main	PROC						; COMDAT
 ; File msvc_repro.c
 ; Line 27
 $LN64:
 ; Prologue: rbx and rsi go to the caller-provided home space, rdi is pushed,
 ; then 32 bytes of shadow space are reserved for the printf call.
 	mov	QWORD PTR [rsp+8], rbx
 	mov	QWORD PTR [rsp+16], rsi
 	push	rdi
 	sub	rsp, 32					; 00000020H
 ; Line 28
 	movss	xmm5, DWORD PTR __real@3f800000
 ; Line 30
 ; Load both accumulators into registers and perform s[2] = 1.0f.
 	mov	esi, 1
 	movss	xmm4, DWORD PTR du
 	mov	ecx, esi
 	movss	xmm0, DWORD PTR dv
 	mov	DWORD PTR s+8, 1065353216		; 3f800000H
 $LL7@main:
 ; Line 31
 ; k = 1 (kept both as r8d and as the 64-bit index base rdx).
 	mov	r8d, esi
 	mov	rdx, rsi
 $LL10@main:
 ; Line 32
 ; p = 1; r11d = j+1 is the upper bound of the q loop.
 	mov	r9d, esi
 	lea	r11d, DWORD PTR [rcx+1]
 $LL13@main:
 ; Line 33
 ; q = j; skip the q loop if j > j+1; rbx = k+1 is the upper bound of the r loop.
 	mov	r10d, ecx
 	cmp	ecx, r11d
 	jg	SHORT $LN11@main
 	lea	eax, DWORD PTR [r8+1]
 	movsxd	rbx, eax
 $LL16@main:
 ; Line 34
 ; r = k; skip the r loop if k > k+1.
 	mov	rax, rdx
 	cmp	rdx, rbx
 	jg	SHORT $LN14@main
 $LL39@main:
 ; Line 35
 ; xmm2 = a = (p == 1) ? 1.0f : -1.0f
 	cmp	r9d, esi
 	jne	SHORT $LN40@main
 	movaps	xmm2, xmm5
 	jmp	SHORT $LN41@main
 $LN40@main:
 	movss	xmm2, DWORD PTR __real@bf800000
 $LN41@main:
 ; Line 36
 ; xmm3 = (q == j) ? 1.0f : -1.0f
 	cmp	r10d, ecx
 	jne	SHORT $LN42@main
 	movaps	xmm3, xmm5
 	jmp	SHORT $LN43@main
 $LN42@main:
 	movss	xmm3, DWORD PTR __real@bf800000
 $LN43@main:
 ; xmm1 = (r == k) ? 1.0f : -1.0f
 	cmp	rax, rdx
 	jne	SHORT $LN44@main
 	movaps	xmm1, xmm5
 	jmp	SHORT $LN45@main
 $LN44@main:
 	movss	xmm1, DWORD PTR __real@bf800000
 $LN45@main:
 ; Line 37
 ; du += xmm1 * xmm3; dv += a * s[r]; r++ and repeat while r <= k+1.
 ; Both accumulators are updated in the same registers for every value of p.
 	lea	rdi, OFFSET FLAT:s
 	mulss	xmm1, xmm3
 	mulss	xmm2, DWORD PTR [rdi+rax*4]
 	add	rax, rsi
 	addss	xmm4, xmm1
 	addss	xmm0, xmm2
 	cmp	rax, rbx
 	jle	SHORT $LL39@main
 ; Write the running totals back to the globals after each r loop.
 	movss	DWORD PTR dv, xmm0
 	movss	DWORD PTR du, xmm4
 $LN14@main:
 ; Line 33
 ; q++; repeat while q <= j+1.
 	add	r10d, esi
 	cmp	r10d, r11d
 	jle	SHORT $LL16@main
 $LN11@main:
 ; Line 32
 ; p++; repeat while p <= 2 (i+1 with i folded to 1).
 	add	r9d, esi
 	cmp	r9d, 2
 	jle	$LL13@main
 ; Line 31
 ; k++; repeat while k <= 1.
 	add	r8d, esi
 	add	rdx, rsi
 	cmp	r8d, esi
 	jle	$LL10@main
 ; Line 30
 ; j++; repeat while j <= 1.
 	add	ecx, esi
 	cmp	ecx, esi
 	jle	$LL7@main
 ; Line 39
 ; printf("%f\n", du + dv): the sum is widened to double; because printf is
 ; variadic the value is placed in both xmm1 and rdx (argument slot 2).
 	addss	xmm0, xmm4
 	xorps	xmm1, xmm1
 	lea	rcx, OFFSET FLAT:??_C@_03PPOCCAPH@?$CFf?6@
 	cvtss2sd xmm1, xmm0
 	movq	rdx, xmm1
 	call	printf
 ; Line 41
 ; Epilogue: restore rbx/rsi from the home space, return 0 in eax.
 	mov	rbx, QWORD PTR [rsp+48]
 	xor	eax, eax
 	mov	rsi, QWORD PTR [rsp+56]
 	add	rsp, 32					; 00000020H
 	pop	rdi
 	ret	0
 main	ENDP
 _TEXT	ENDS
 ; Function compile flags: /Ogspy
 ;	COMDAT printf
 _TEXT	SEGMENT
 _Format$ = 80
 ; printf -- out-of-line copy of the inline printf wrapper from the UCRT's stdio.h.
 ; ABI: Microsoft x64.
 ; In:  rcx = format string; rdx, r8, r9 = first variadic argument slots.
 ; Out: rax = whatever __stdio_common_vfprintf returns (not modified afterwards).
 printf	PROC						; COMDAT
 ; File C:\Program Files (x86)\Windows Kits\10\include\10.0.22621.0\ucrt\stdio.h
 ; Line 956
 $LN6:
 ; Spill all four register arguments to the caller's home space so the
 ; variadic arguments are contiguous in memory (rax = rsp at entry).
 	mov	rax, rsp
 	mov	QWORD PTR [rax+8], rcx
 	mov	QWORD PTR [rax+16], rdx
 	mov	QWORD PTR [rax+24], r8
 	mov	QWORD PTR [rax+32], r9
 	push	rbx
 	push	rsi
 	push	rdi
 	sub	rsp, 48					; 00000030H
 	mov	rdi, rcx
 ; Line 959
 ; rsi = address of the first variadic argument (the home slot of rdx).
 	lea	rsi, QWORD PTR [rax+16]
 ; Line 960
 ; rbx = __acrt_iob_func(1); index 1 is presumably stdout (defined by the
 ; UCRT, not visible in this listing).
 	mov	ecx, 1
 	call	__acrt_iob_func
 	mov	rbx, rax
 ; Line 645
 ; __stdio_common_vfprintf(*options, stream, format, 0, arglist):
 ; rcx = options value, rdx = stream, r8 = format, r9 = 0, [rsp+32] = arglist.
 	call	__local_stdio_printf_options
 	xor	r9d, r9d
 	mov	QWORD PTR [rsp+32], rsi
 	mov	r8, rdi
 	mov	rdx, rbx
 	mov	rcx, QWORD PTR [rax]
 	call	__stdio_common_vfprintf
 ; Line 963
 	add	rsp, 48					; 00000030H
 	pop	rdi
 	pop	rsi
 	pop	rbx
 	ret	0
 printf	ENDP
 _TEXT	ENDS
 ; Function compile flags: /Ogspy
 ;	COMDAT _vfprintf_l
 _TEXT	SEGMENT
 _Stream$ = 64
 _Format$ = 72
 _Locale$ = 80
 _ArgList$ = 88
 ; _vfprintf_l -- out-of-line copy of the inline wrapper from the UCRT's stdio.h.
 ; ABI: Microsoft x64.
 ; In:  rcx = _Stream, rdx = _Format, r8 = _Locale, r9 = _ArgList
 ;      (names taken from the frame-offset symbols declared above this PROC).
 ; Out: rax = whatever __stdio_common_vfprintf returns (not modified afterwards).
 _vfprintf_l PROC					; COMDAT
 ; File C:\Program Files (x86)\Windows Kits\10\include\10.0.22621.0\ucrt\stdio.h
 ; Line 644
 $LN4:
 ; Save rbx/rbp/rsi in the caller's home space and rdi on the stack, then
 ; park the four arguments in those registers across the options call.
 	mov	QWORD PTR [rsp+8], rbx
 	mov	QWORD PTR [rsp+16], rbp
 	mov	QWORD PTR [rsp+24], rsi
 	push	rdi
 	sub	rsp, 48					; 00000030H
 	mov	rbx, r9
 	mov	rdi, r8
 	mov	rsi, rdx
 	mov	rbp, rcx
 ; Line 645
 ; __stdio_common_vfprintf(*options, _Stream, _Format, _Locale, _ArgList):
 ; the fifth argument goes on the stack at [rsp+32], just above the shadow space.
 	call	__local_stdio_printf_options
 	mov	r9, rdi
 	mov	QWORD PTR [rsp+32], rbx
 	mov	r8, rsi
 	mov	rdx, rbp
 	mov	rcx, QWORD PTR [rax]
 	call	__stdio_common_vfprintf
 ; Line 646
 ; Restore the saved registers from the home space and return.
 	mov	rbx, QWORD PTR [rsp+64]
 	mov	rbp, QWORD PTR [rsp+72]
 	mov	rsi, QWORD PTR [rsp+80]
 	add	rsp, 48					; 00000030H
 	pop	rdi
 	ret	0
 _vfprintf_l ENDP
 _TEXT	ENDS
 ; Function compile flags: /Ogspy
 ;	COMDAT __local_stdio_printf_options
 _TEXT	SEGMENT
 ; __local_stdio_printf_options -- returns in rax the address of the
 ; function-local static _OptionsStorage (a QWORD, see its COMM declaration).
 ; Leaf procedure: no arguments read, no stack use.
 __local_stdio_printf_options PROC			; COMDAT
 ; File C:\Program Files (x86)\Windows Kits\10\include\10.0.22621.0\ucrt\corecrt_stdio_config.h
 ; Line 92
 	lea	rax, OFFSET FLAT:?_OptionsStorage@?1??__local_stdio_printf_options@@9@9 ; `__local_stdio_printf_options'::`2'::_OptionsStorage
 ; Line 93
 	ret	0
 __local_stdio_printf_options ENDP
 _TEXT	ENDS
 END
diff --git a/msvc_repro_O2.asm b/msvc_repro_O2.asm
 ; Listing generated by Microsoft (R) Optimizing Compiler Version 19.39.33521.0 

 include listing.inc

 INCLUDELIB LIBCMT
 INCLUDELIB OLDNAMES

 _DATA	SEGMENT
 COMM	s:DWORD:03H
 COMM	du:DWORD
 COMM	dv:DWORD
 _DATA	ENDS
 PUBLIC	__local_stdio_printf_options
 PUBLIC	_vfprintf_l
 PUBLIC	printf
 PUBLIC	main
 PUBLIC	??_C@_03PPOCCAPH@?$CFf?6@			; `string'
 PUBLIC	__real@3f800000
 PUBLIC	__real@bf800000
 PUBLIC	__xmm@000000000000000000000000bf800000
 EXTRN	__acrt_iob_func:PROC
 EXTRN	__stdio_common_vfprintf:PROC
 EXTRN	_fltused:DWORD
 _DATA	SEGMENT
 COMM	?_OptionsStorage@?1??__local_stdio_printf_options@@9@9:QWORD							; `__local_stdio_printf_options'::`2'::_OptionsStorage
 _DATA	ENDS
 ;	COMDAT pdata
 pdata	SEGMENT
 $pdata$_vfprintf_l DD imagerel $LN4
 	DD	imagerel $LN4+80
 	DD	imagerel $unwind$_vfprintf_l
 pdata	ENDS
 ;	COMDAT pdata
 pdata	SEGMENT
 $pdata$printf DD imagerel $LN6
 	DD	imagerel $LN6+83
 	DD	imagerel $unwind$printf
 pdata	ENDS
 ;	COMDAT pdata
 pdata	SEGMENT
 $pdata$main DD	imagerel $LN119
 	DD	imagerel $LN119+775
 	DD	imagerel $unwind$main
 pdata	ENDS
 ;	COMDAT __xmm@000000000000000000000000bf800000
 CONST	SEGMENT
 __xmm@000000000000000000000000bf800000 DB 00H, 00H, 080H, 0bfH, 00H, 00H, 00H
 	DB	00H, 00H, 00H, 00H, 00H, 00H, 00H, 00H, 00H
 CONST	ENDS
 ;	COMDAT __real@bf800000
 CONST	SEGMENT
 __real@bf800000 DD 0bf800000r			; -1
 CONST	ENDS
 ;	COMDAT __real@3f800000
 CONST	SEGMENT
 __real@3f800000 DD 03f800000r			; 1
 CONST	ENDS
 ;	COMDAT ??_C@_03PPOCCAPH@?$CFf?6@
 CONST	SEGMENT
 ??_C@_03PPOCCAPH@?$CFf?6@ DB '%f', 0aH, 00H		; `string'
 CONST	ENDS
 ;	COMDAT xdata
 xdata	SEGMENT
 $unwind$main DD	0105001H
 	DD	038850H
 	DD	047842H
 	DD	056837H
 	DD	0126418H
 	DD	0115418H
 	DD	0103418H
 	DD	0f014b218H
 	DD	07010e012H
 xdata	ENDS
 ;	COMDAT xdata
 xdata	SEGMENT
 $unwind$printf DD 041b01H
 	DD	07017521bH
 	DD	030156016H
 xdata	ENDS
 ;	COMDAT xdata
 xdata	SEGMENT
 $unwind$_vfprintf_l DD 081401H
 	DD	0a6414H
 	DD	095414H
 	DD	083414H
 	DD	070105214H
 xdata	ENDS
 ; Function compile flags: /Ogtpy
 ;	COMDAT main
 _TEXT	SEGMENT
 tv213 = 32
 ; main (/O2 build, compile flags /Ogtpy) -- the miscompiled version of the
 ; msvc_repro.c loop nest. ABI: Microsoft x64.
 ; Register roles, as established by the instructions below:
 ;   r10d = j             ebp / rdx = k (32-bit / 64-bit copies)
 ;   esi  = p             r9d = q              rcx = r
 ;   r14d = j+1           edi = k+1            r11 = k+1 (sign-extended)
 ;   r15  = &s            xmm3 = 1.0f          xmm4 = -1.0f
 ;   xmm6 = packed accumulator {du, dv, 0, 0}
 ;   xmm7 = scalar dv accumulator             xmm8 = scalar du accumulator
 ; Defect visible in this listing: two independent sets of accumulators are
 ; live at once. The p == 1 path ($LL44) adds into xmm7/xmm8, the p != 1 path
 ; ($LN21 onwards) adds into xmm6, and after the loops only xmm6 is stored
 ; and printed. No instruction merges xmm7/xmm8 into xmm6, so everything the
 ; p == 1 iterations computed is discarded.
 main	PROC						; COMDAT
 ; File msvc_repro.c
 ; Line 27
 $LN119:
 ; Prologue: rbx/rbp/rsi go to the caller's home space, rdi/r14/r15 are
 ; pushed, and 96 bytes are reserved (shadow space plus xmm6-xmm8 save area).
 	mov	QWORD PTR [rsp+8], rbx
 	mov	QWORD PTR [rsp+16], rbp
 	mov	QWORD PTR [rsp+24], rsi
 	push	rdi
 	push	r14
 	push	r15
 	sub	rsp, 96					; 00000060H
 ; Line 28
 	movss	xmm3, DWORD PTR __real@3f800000
 ; Line 30
 ; Save callee-saved xmm6/xmm7/xmm8, then seed BOTH accumulator sets from the
 ; globals: xmm7 = dv, xmm8 = du, and xmm6 = {du, dv, 0, 0}. Also s[2] = 1.0f.
 	lea	r15, OFFSET FLAT:s
 	movss	xmm4, DWORD PTR __real@bf800000
 	xorps	xmm0, xmm0
 	movaps	XMMWORD PTR [rsp+80], xmm6
 	mov	r10d, 1
 	movaps	XMMWORD PTR [rsp+64], xmm7
 	movss	xmm7, DWORD PTR dv
 	movaps	XMMWORD PTR [rsp+48], xmm8
 	movss	xmm8, DWORD PTR du
 	movaps	xmm6, xmm8
 	mov	DWORD PTR s+8, 1065353216		; 3f800000H
 	unpcklps xmm6, xmm7
 	movlhps	xmm6, xmm0
 	npad	3
 $LL7@main:
 ; Line 31
 ; k = 1 (ebp, with rdx as the 64-bit index base).
 	mov	ebp, 1
 	mov	edx, ebp
 	npad	9
 $LL10@main:
 ; Line 32
 ; p = 1; r14d = j+1 is the upper bound of the q loop.
 	mov	esi, 1
 	lea	r14d, DWORD PTR [r10+1]
 	npad	7
 $LL13@main:
 ; Line 33
 ; q = j; skip the q loop if j > j+1; edi = k+1.
 	mov	r9d, r10d
 	cmp	r10d, r14d
 	jg	$LN11@main
 	lea	edi, DWORD PTR [rbp+1]
 	npad	1
 $LL16@main:
 ; Line 34
 ; r = k; skip the r loop if k > k+1.
 	mov	rcx, rdx
 	cmp	ebp, edi
 	jg	$LN14@main
 ; Line 35
 ; The r loop is versioned on (p == 1), i.e. on the value of `a`:
 ; p == 1 falls through to the scalar loop, p != 1 jumps to the packed loops.
 	movsxd	r11, edi
 	cmp	esi, 1
 	jne	SHORT $LN21@main
 $LL44@main:
 ; Line 36
 ; Scalar r loop for p == 1 (a == +1.0f).
 ; xmm1 = (q == j) ? 1.0f : -1.0f
 	cmp	r9d, r10d
 	jne	SHORT $LN45@main
 	movaps	xmm1, xmm3
 	jmp	SHORT $LN46@main
 $LN45@main:
 	movaps	xmm1, xmm4
 $LN46@main:
 ; xmm0 = (r == k) ? 1.0f : -1.0f
 	cmp	rcx, rdx
 	jne	SHORT $LN47@main
 	movaps	xmm0, xmm3
 	jmp	SHORT $LN48@main
 $LN47@main:
 	movaps	xmm0, xmm4
 $LN48@main:
 ; Line 37
 ; dv += s[r] goes into xmm7 and du += xmm0 * xmm1 goes into xmm8 --
 ; NOT into xmm6. These two registers are never read again before they are
 ; restored in the epilogue, so this work is lost.
 	addss	xmm7, DWORD PTR [r15+rcx*4]
 	mulss	xmm0, xmm1
 	inc	rcx
 	addss	xmm8, xmm0
 	cmp	rcx, r11
 	jle	SHORT $LL44@main
 ; Line 35
 	jmp	$LN14@main
 $LN21@main:
 ; Line 34
 ; Packed r loops for p != 1 (a == -1.0f). rax = trip count (k+1 - k + 1);
 ; fewer than 4 iterations go straight to the one-at-a-time loop $LL94.
 ; With r running from k to k+1 the count is 2, so $LL79 is not entered.
 	cmp	rdx, r11
 	jg	$LN14@main
 	mov	rax, r11
 	sub	rax, rdx
 	inc	rax
 	cmp	rax, 4
 	jl	$LL94@main
 	lea	rbx, QWORD PTR [r11-3]
 	lea	r8, QWORD PTR [rdx+2]
 	npad	3
 $LL79@main:
 ; Line 36
 ; 4x unrolled body; r8 tracks r+2. Each of the four steps builds
 ;   xmm2 = {(r' == k) ? 1 : -1, s[r'], 0, 0}
 ;   xmm1 = {(q  == j) ? 1 : -1, -1.0,  0, 0}
 ; and does xmm6 += xmm2 * xmm1, i.e. lane 0 is the du term and lane 1 is
 ; -s[r'] (the dv term with a == -1).
 ; Step 1: r' = r.
 	cmp	r9d, r10d
 	jne	SHORT $LN63@main
 	movaps	xmm5, xmm3
 	jmp	SHORT $LN64@main
 $LN63@main:
 	movaps	xmm5, xmm4
 $LN64@main:
 	cmp	rcx, rdx
 	jne	SHORT $LN65@main
 	movaps	xmm2, xmm3
 	jmp	SHORT $LN66@main
 $LN65@main:
 	movaps	xmm2, xmm4
 $LN66@main:
 	movss	xmm0, DWORD PTR [r15+rcx*4]
 	xorps	xmm1, xmm1
 	unpcklps xmm2, xmm0
 	xorps	xmm0, xmm0
 	movlhps	xmm2, xmm1
 	movaps	xmm1, xmm5
 	unpcklps xmm1, xmm0
 	unpcklps xmm1, XMMWORD PTR __xmm@000000000000000000000000bf800000
 	mulps	xmm2, xmm1
 	addps	xmm6, xmm2
 ; Step 2: r' = r+1 (compared via r8-1).
 	cmp	r9d, r10d
 	jne	SHORT $LN67@main
 	movaps	xmm5, xmm3
 	jmp	SHORT $LN68@main
 $LN67@main:
 	movaps	xmm5, xmm4
 $LN68@main:
 	lea	rax, QWORD PTR [r8-1]
 	cmp	rax, rdx
 	jne	SHORT $LN69@main
 	movaps	xmm2, xmm3
 	jmp	SHORT $LN70@main
 $LN69@main:
 	movaps	xmm2, xmm4
 $LN70@main:
 	movss	xmm0, DWORD PTR [r15+rcx*4+4]
 	xorps	xmm1, xmm1
 	unpcklps xmm2, xmm0
 	xorps	xmm0, xmm0
 	movlhps	xmm2, xmm1
 	movaps	xmm1, xmm5
 	unpcklps xmm1, xmm0
 	unpcklps xmm1, XMMWORD PTR __xmm@000000000000000000000000bf800000
 	mulps	xmm2, xmm1
 	addps	xmm6, xmm2
 ; Step 3: r' = r+2 (compared via r8).
 	cmp	r9d, r10d
 	jne	SHORT $LN71@main
 	movaps	xmm5, xmm3
 	jmp	SHORT $LN72@main
 $LN71@main:
 	movaps	xmm5, xmm4
 $LN72@main:
 	cmp	r8, rdx
 	jne	SHORT $LN73@main
 	movaps	xmm2, xmm3
 	jmp	SHORT $LN74@main
 $LN73@main:
 	movaps	xmm2, xmm4
 $LN74@main:
 	movss	xmm0, DWORD PTR [r15+rcx*4+8]
 	xorps	xmm1, xmm1
 	unpcklps xmm2, xmm0
 	xorps	xmm0, xmm0
 	movlhps	xmm2, xmm1
 	movaps	xmm1, xmm5
 	unpcklps xmm1, xmm0
 	unpcklps xmm1, XMMWORD PTR __xmm@000000000000000000000000bf800000
 	mulps	xmm2, xmm1
 	addps	xmm6, xmm2
 ; Step 4: r' = r+3 (compared via r8+1); r and r8 then advance by 4.
 	cmp	r9d, r10d
 	jne	SHORT $LN75@main
 	movaps	xmm5, xmm3
 	jmp	SHORT $LN76@main
 $LN75@main:
 	movaps	xmm5, xmm4
 $LN76@main:
 	lea	rax, QWORD PTR [r8+1]
 	cmp	rax, rdx
 	jne	SHORT $LN77@main
 	movaps	xmm2, xmm3
 	jmp	SHORT $LN78@main
 $LN77@main:
 	movaps	xmm2, xmm4
 $LN78@main:
 	movss	xmm0, DWORD PTR [r15+rcx*4+12]
 	xorps	xmm1, xmm1
 	unpcklps xmm2, xmm0
 	add	rcx, 4
 	movlhps	xmm2, xmm1
 	xorps	xmm0, xmm0
 	movaps	xmm1, xmm5
 	add	r8, 4
 	unpcklps xmm1, xmm0
 	unpcklps xmm1, XMMWORD PTR __xmm@000000000000000000000000bf800000
 	mulps	xmm2, xmm1
 	addps	xmm6, xmm2
 	cmp	rcx, rbx
 	jle	$LL79@main
 ; Line 34
 ; Leftover iterations (if any) are handled by $LL94.
 	cmp	rcx, r11
 	jg	SHORT $LN14@main
 	npad	3
 $LL94@main:
 ; Line 36
 ; One-iteration-at-a-time packed loop for p != 1:
 ;   xmm5 = (q == j) ? 1 : -1,  xmm2 = {(r == k) ? 1 : -1, s[r], 0, 0},
 ;   xmm1 = {xmm5, -1.0, 0, 0},  xmm6 += xmm2 * xmm1.
 	cmp	r9d, r10d
 	jne	SHORT $LN40@main
 	movaps	xmm5, xmm3
 	jmp	SHORT $LN41@main
 $LN40@main:
 	movaps	xmm5, xmm4
 $LN41@main:
 	cmp	rcx, rdx
 	jne	SHORT $LN42@main
 	movaps	xmm2, xmm3
 	jmp	SHORT $LN43@main
 $LN42@main:
 	movaps	xmm2, xmm4
 $LN43@main:
 	movss	xmm0, DWORD PTR [r15+rcx*4]
 	xorps	xmm1, xmm1
 	unpcklps xmm2, xmm0
 	inc	rcx
 	movlhps	xmm2, xmm1
 	movaps	xmm1, xmm5
 	xorps	xmm0, xmm0
 	unpcklps xmm1, xmm0
 	unpcklps xmm1, XMMWORD PTR __xmm@000000000000000000000000bf800000
 	mulps	xmm2, xmm1
 	addps	xmm6, xmm2
 	cmp	rcx, r11
 	jle	SHORT $LL94@main
 $LN14@main:
 ; Line 33
 ; q++; repeat while q <= j+1.
 	inc	r9d
 	cmp	r9d, r14d
 	jle	$LL16@main
 $LN11@main:
 ; Line 32
 ; p++; repeat while p <= 2 (i+1 with i folded to 1).
 	inc	esi
 	cmp	esi, 2
 	jle	$LL13@main
 ; Line 31
 ; k++; repeat while k <= 1.
 	inc	ebp
 	inc	rdx
 	cmp	ebp, 1
 	jle	$LL10@main
 ; Line 30
 ; j++; repeat while j <= 1.
 	inc	r10d
 	cmp	r10d, 1
 	jle	$LL7@main
 ; Line 28
 ; Final values are taken from xmm6 only: du = lane 0, dv = lane 1
 ; (shufps with 0x55 broadcasts lane 1 into xmm0). xmm7/xmm8 are ignored.
 	movaps	xmm0, xmm6
 	movss	DWORD PTR du, xmm6
 	shufps	xmm0, xmm6, 85				; 00000055H
 ; Line 39
 ; printf("%f\n", du + dv): the sum is widened to double; because printf is
 ; variadic the value is placed in both xmm1 and rdx (argument slot 2).
 	lea	rcx, OFFSET FLAT:??_C@_03PPOCCAPH@?$CFf?6@
 	movss	DWORD PTR dv, xmm0
 	xorps	xmm1, xmm1
 	addss	xmm0, xmm6
 	cvtss2sd xmm1, xmm0
 	movq	rdx, xmm1
 	call	printf
 ; Line 41
 ; Epilogue: restore xmm6-xmm8 and the saved integer registers, return 0.
 	movaps	xmm6, XMMWORD PTR [rsp+80]
 	lea	r11, QWORD PTR [rsp+96]
 	mov	rbx, QWORD PTR [r11+32]
 	xor	eax, eax
 	mov	rbp, QWORD PTR [r11+40]
 	mov	rsi, QWORD PTR [r11+48]
 	movaps	xmm8, XMMWORD PTR [r11-48]
 	movaps	xmm7, XMMWORD PTR [rsp+64]
 	mov	rsp, r11
 	pop	r15
 	pop	r14
 	pop	rdi
 	ret	0
 main	ENDP
 _TEXT	ENDS
 ; Function compile flags: /Ogtpy
 ;	COMDAT printf
 _TEXT	SEGMENT
 _Format$ = 80
 ; printf -- out-of-line copy of the inline printf wrapper from the UCRT's stdio.h.
 ; ABI: Microsoft x64.
 ; In:  rcx = format string; rdx, r8, r9 = first variadic argument slots.
 ; Out: rax = whatever __stdio_common_vfprintf returns (not modified afterwards).
 printf	PROC						; COMDAT
 ; File C:\Program Files (x86)\Windows Kits\10\include\10.0.22621.0\ucrt\stdio.h
 ; Line 956
 $LN6:
 ; Spill all four register arguments to the caller's home space so the
 ; variadic arguments are contiguous in memory.
 	mov	QWORD PTR [rsp+8], rcx
 	mov	QWORD PTR [rsp+16], rdx
 	mov	QWORD PTR [rsp+24], r8
 	mov	QWORD PTR [rsp+32], r9
 	push	rbx
 	push	rsi
 	push	rdi
 	sub	rsp, 48					; 00000030H
 	mov	rdi, rcx
 ; Line 959
 ; rsi = address of the slot just above _Format$, i.e. the home slot of rdx
 ; (the first variadic argument).
 	lea	rsi, QWORD PTR _Format$[rsp+8]
 ; Line 960
 ; rbx = __acrt_iob_func(1); index 1 is presumably stdout (defined by the
 ; UCRT, not visible in this listing).
 	mov	ecx, 1
 	call	__acrt_iob_func
 	mov	rbx, rax
 ; Line 645
 ; __stdio_common_vfprintf(*options, stream, format, 0, arglist):
 ; rcx = options value, rdx = stream, r8 = format, r9 = 0, [rsp+32] = arglist.
 	call	__local_stdio_printf_options
 	xor	r9d, r9d
 	mov	QWORD PTR [rsp+32], rsi
 	mov	r8, rdi
 	mov	rdx, rbx
 	mov	rcx, QWORD PTR [rax]
 	call	__stdio_common_vfprintf
 ; Line 963
 	add	rsp, 48					; 00000030H
 	pop	rdi
 	pop	rsi
 	pop	rbx
 	ret	0
 printf	ENDP
 _TEXT	ENDS
 ; Function compile flags: /Ogtpy
 ;	COMDAT _vfprintf_l
 _TEXT	SEGMENT
 _Stream$ = 64
 _Format$ = 72
 _Locale$ = 80
 _ArgList$ = 88
 ; _vfprintf_l -- out-of-line copy of the inline wrapper from the UCRT's stdio.h.
 ; ABI: Microsoft x64.
 ; In:  rcx = _Stream, rdx = _Format, r8 = _Locale, r9 = _ArgList
 ;      (names taken from the frame-offset symbols declared above this PROC).
 ; Out: rax = whatever __stdio_common_vfprintf returns (not modified afterwards).
 _vfprintf_l PROC					; COMDAT
 ; File C:\Program Files (x86)\Windows Kits\10\include\10.0.22621.0\ucrt\stdio.h
 ; Line 644
 $LN4:
 ; Save rbx/rbp/rsi in the caller's home space and rdi on the stack, then
 ; park the four arguments in those registers across the options call.
 	mov	QWORD PTR [rsp+8], rbx
 	mov	QWORD PTR [rsp+16], rbp
 	mov	QWORD PTR [rsp+24], rsi
 	push	rdi
 	sub	rsp, 48					; 00000030H
 	mov	rbx, r9
 	mov	rdi, r8
 	mov	rsi, rdx
 	mov	rbp, rcx
 ; Line 645
 ; __stdio_common_vfprintf(*options, _Stream, _Format, _Locale, _ArgList):
 ; the fifth argument goes on the stack at [rsp+32], just above the shadow space.
 	call	__local_stdio_printf_options
 	mov	r9, rdi
 	mov	QWORD PTR [rsp+32], rbx
 	mov	r8, rsi
 	mov	rdx, rbp
 	mov	rcx, QWORD PTR [rax]
 	call	__stdio_common_vfprintf
 ; Line 646
 ; Restore the saved registers from the home space and return.
 	mov	rbx, QWORD PTR [rsp+64]
 	mov	rbp, QWORD PTR [rsp+72]
 	mov	rsi, QWORD PTR [rsp+80]
 	add	rsp, 48					; 00000030H
 	pop	rdi
 	ret	0
 _vfprintf_l ENDP
 _TEXT	ENDS
 ; Function compile flags: /Ogtpy
 ;	COMDAT __local_stdio_printf_options
 _TEXT	SEGMENT
 ; __local_stdio_printf_options -- returns in rax the address of the
 ; function-local static _OptionsStorage (a QWORD, see its COMM declaration).
 ; Leaf procedure: no arguments read, no stack use.
 __local_stdio_printf_options PROC			; COMDAT
 ; File C:\Program Files (x86)\Windows Kits\10\include\10.0.22621.0\ucrt\corecrt_stdio_config.h
 ; Line 92
 	lea	rax, OFFSET FLAT:?_OptionsStorage@?1??__local_stdio_printf_options@@9@9 ; `__local_stdio_printf_options'::`2'::_OptionsStorage
 ; Line 93
 	ret	0
 __local_stdio_printf_options ENDP
 _TEXT	ENDS
 END
diff --git a/REPORT.md b/REPORT.md
	/* MSVC /O2 codegen bug — minimum confirmed repro (16 lines).
	* cl 19.39.33521 (VS 2022 17.9), x64.
	*
	* Build: cl /O2 /nologo msvc_repro.c
	* cl /O1 /nologo msvc_repro.c /Fe:O1.exe
	*
	* /O1 prints 0.000000 (correct: p=i and p=i+1 contributions cancel).
	* /O2 prints -2.000000 (bug: p=i half of the gather is discarded).
	*
	* Mechanism (see REPORT.md and msvc_repro_O2.asm for full disassembly):
	* /O2 splits the inner gather into two code paths via
	* cmp esi, 1
	* jne $LN21@main
	* where esi holds p and the immediate 1 is the constant-folded value of i.
	* - $LN21 (taken when p != i): vectorized SSE path. Accumulates the
	* gather contributions into xmm6, which packs both du and dv.
	* - $LL44 (fall-through when p == i): scalar fallback. Accumulates into
	* separate scalar registers xmm7 (dv) and xmm8 (du).
	* The final printf reads xmm6 only. xmm7/xmm8 are never folded back into
	* xmm6, so every iteration with p == i runs the scalar path and its work
	* is discarded. Both paths execute; this is NOT iteration-skipping — the
	* scalar work is computed and silently dropped.
	*
	* Triggers required (each one is necessary):
	* - 3 nested outer loops (any iteration count; written as `for` loops
	* they need a non-zero start value; do-while loops work too)
	* - 3 nested inner loops of the form `for (v = outer; v <= outer+1; v++)`
	* - the sign for `a` must be a NAMED LOCAL ternary; inlining the ternary
	* makes the bug disappear
	* - a SECOND accumulator (`du` here) using inner-var sign factors
	* - the array index in dv must be one of the inner loop variables
	*/
	#include <stdio.h>
	float s[3], du, dv;
	/* Repro driver. Sets s[2] = 1 (s[0] and s[1] keep their static zero
	 * initial value), accumulates du and dv over the six-deep loop nest and
	 * prints du + dv. Each outer loop (i, j, k) runs once; each inner loop
	 * (p, q, r) runs twice, so the body executes 8 times with r = 1 and 2. */
	int main(void) {
	s[2] = 1;
	for (int i = 1; i <= 1; i++)
	for (int j = 1; j <= 1; j++)
	for (int k = 1; k <= 1; k++)
	for (int p = i; p <= i+1; p++)
	for (int q = j; q <= j+1; q++)
	for (int r = k; r <= k+1; r++) {
	/* a is +1 on the first p iteration (p == i) and -1 on the second. */
	float a = (p == i) ? 1.f : -1.f;
	/* du: product of the q and r sign factors; independent of p. */
	du += ((q == j) ? 1.f : -1.f) * ((r == k) ? 1.f : -1.f);
	/* dv: a * s[r]; only r == 2 reads the non-zero element. */
	dv += a * s[r];
	}
	/* Correct result is 0.000000: du sums to 0 and the two p halves of dv cancel. */
	printf("%f\n", du + dv);
	return 0;
	}
	; Listing generated by Microsoft (R) Optimizing Compiler Version 19.39.33521.0

	include listing.inc

	INCLUDELIB LIBCMT
	INCLUDELIB OLDNAMES

	_DATA SEGMENT
	COMM s:DWORD:03H
	COMM du:DWORD
	COMM dv:DWORD
	_DATA ENDS
	PUBLIC __local_stdio_printf_options
	PUBLIC _vfprintf_l
	PUBLIC printf
	PUBLIC main
	PUBLIC ??_C@_03PPOCCAPH@?$CFf?6@ ; `string'
	PUBLIC __real@3f800000
	PUBLIC __real@bf800000
	EXTRN __acrt_iob_func:PROC
	EXTRN __stdio_common_vfprintf:PROC
	EXTRN _fltused:DWORD
	_DATA SEGMENT
	COMM ?_OptionsStorage@?1??__local_stdio_printf_options@@9@9:QWORD ; `__local_stdio_printf_options'::`2'::_OptionsStorage
	_DATA ENDS
	; COMDAT pdata
	pdata SEGMENT
	$pdata$_vfprintf_l DD imagerel $LN4
	DD imagerel $LN4+80
	DD imagerel $unwind$_vfprintf_l
	pdata ENDS
	; COMDAT pdata
	pdata SEGMENT
	$pdata$printf DD imagerel $LN6
	DD imagerel $LN6+81
	DD imagerel $unwind$printf
	pdata ENDS
	; COMDAT pdata
	pdata SEGMENT
	$pdata$main DD imagerel $LN64
	DD imagerel $LN64+286
	DD imagerel $unwind$main
	pdata ENDS
	; COMDAT __real@bf800000
	CONST SEGMENT
	__real@bf800000 DD 0bf800000r ; -1
	CONST ENDS
	; COMDAT __real@3f800000
	CONST SEGMENT
	__real@3f800000 DD 03f800000r ; 1
	CONST ENDS
	; COMDAT ??_C@_03PPOCCAPH@?$CFf?6@
	CONST SEGMENT
	??_C@_03PPOCCAPH@?$CFf?6@ DB '%f', 0aH, 00H ; `string'
	CONST ENDS
	; COMDAT xdata
	xdata SEGMENT
	$unwind$main DD 060f01H
	DD 07640fH
	DD 06340fH
	DD 0700b320fH
	xdata ENDS
	; COMDAT xdata
	xdata SEGMENT
	$unwind$printf DD 041a01H
	DD 07016521aH
	DD 030146015H
	xdata ENDS
	; COMDAT xdata
	xdata SEGMENT
	$unwind$_vfprintf_l DD 081401H
	DD 0a6414H
	DD 095414H
	DD 083414H
	DD 070105214H
	xdata ENDS
	; Function compile flags: /Ogspy
	; COMDAT main
	_TEXT SEGMENT
	; main (compile flags /Ogspy, i.e. the /O1 build) -- scalar code for the repro loop nest.
	; ABI: Microsoft x64. Register roles, as established by the instructions below:
	;   esi/rsi = constant 1 (increment, folded value of i, bound of the j and k loops)
	;   ecx = j, r8d/rdx = k, r9d = p, r10d = q, rax = r, r11d = j+1, rbx = k+1
	;   xmm5 = 1.0f, xmm4 = du, xmm0 = dv
	main PROC ; COMDAT
	; File msvc_repro.c
	; Line 27
	$LN64:
	; Prologue: save rbx/rsi in the home space, push rdi, reserve shadow space.
	mov QWORD PTR [rsp+8], rbx
	mov QWORD PTR [rsp+16], rsi
	push rdi
	sub rsp, 32 ; 00000020H
	; Line 28
	movss xmm5, DWORD PTR __real@3f800000
	; Line 30
	; Load both accumulators into registers and perform s[2] = 1.0f.
	mov esi, 1
	movss xmm4, DWORD PTR du
	mov ecx, esi
	movss xmm0, DWORD PTR dv
	mov DWORD PTR s+8, 1065353216 ; 3f800000H
	$LL7@main:
	; Line 31
	; k = 1
	mov r8d, esi
	mov rdx, rsi
	$LL10@main:
	; Line 32
	; p = 1; r11d = j+1
	mov r9d, esi
	lea r11d, DWORD PTR [rcx+1]
	$LL13@main:
	; Line 33
	; q = j; rbx = k+1
	mov r10d, ecx
	cmp ecx, r11d
	jg SHORT $LN11@main
	lea eax, DWORD PTR [r8+1]
	movsxd rbx, eax
	$LL16@main:
	; Line 34
	; r = k
	mov rax, rdx
	cmp rdx, rbx
	jg SHORT $LN14@main
	$LL39@main:
	; Line 35
	; xmm2 = a = (p == 1) ? 1.0f : -1.0f
	cmp r9d, esi
	jne SHORT $LN40@main
	movaps xmm2, xmm5
	jmp SHORT $LN41@main
	$LN40@main:
	movss xmm2, DWORD PTR __real@bf800000
	$LN41@main:
	; Line 36
	; xmm3 = (q == j) ? 1.0f : -1.0f
	cmp r10d, ecx
	jne SHORT $LN42@main
	movaps xmm3, xmm5
	jmp SHORT $LN43@main
	$LN42@main:
	movss xmm3, DWORD PTR __real@bf800000
	$LN43@main:
	; xmm1 = (r == k) ? 1.0f : -1.0f
	cmp rax, rdx
	jne SHORT $LN44@main
	movaps xmm1, xmm5
	jmp SHORT $LN45@main
	$LN44@main:
	movss xmm1, DWORD PTR __real@bf800000
	$LN45@main:
	; Line 37
	; du += xmm1 * xmm3; dv += a * s[r]; r++ while r <= k+1. The same two
	; accumulator registers are used for every value of p.
	lea rdi, OFFSET FLAT:s
	mulss xmm1, xmm3
	mulss xmm2, DWORD PTR [rdi+rax*4]
	add rax, rsi
	addss xmm4, xmm1
	addss xmm0, xmm2
	cmp rax, rbx
	jle SHORT $LL39@main
	; Write the running totals back to the globals after each r loop.
	movss DWORD PTR dv, xmm0
	movss DWORD PTR du, xmm4
	$LN14@main:
	; Line 33
	; q++ while q <= j+1
	add r10d, esi
	cmp r10d, r11d
	jle SHORT $LL16@main
	$LN11@main:
	; Line 32
	; p++ while p <= 2 (i+1 with i folded to 1)
	add r9d, esi
	cmp r9d, 2
	jle $LL13@main
	; Line 31
	; k++ while k <= 1
	add r8d, esi
	add rdx, rsi
	cmp r8d, esi
	jle $LL10@main
	; Line 30
	; j++ while j <= 1
	add ecx, esi
	cmp ecx, esi
	jle $LL7@main
	; Line 39
	; printf("%f\n", du + dv): the sum is widened to double and, because
	; printf is variadic, placed in both xmm1 and rdx (argument slot 2).
	addss xmm0, xmm4
	xorps xmm1, xmm1
	lea rcx, OFFSET FLAT:??_C@_03PPOCCAPH@?$CFf?6@
	cvtss2sd xmm1, xmm0
	movq rdx, xmm1
	call printf
	; Line 41
	; Epilogue: restore rbx/rsi, return 0 in eax.
	mov rbx, QWORD PTR [rsp+48]
	xor eax, eax
	mov rsi, QWORD PTR [rsp+56]
	add rsp, 32 ; 00000020H
	pop rdi
	ret 0
	main ENDP
	_TEXT ENDS
	; Function compile flags: /Ogspy
	; COMDAT printf
	_TEXT SEGMENT
	_Format$ = 80
	; printf -- out-of-line copy of the inline printf wrapper from the UCRT's stdio.h.
	; In:  rcx = format string; rdx, r8, r9 = first variadic argument slots.
	; Out: rax = whatever __stdio_common_vfprintf returns (not modified afterwards).
	printf PROC ; COMDAT
	; File C:\Program Files (x86)\Windows Kits\10\include\10.0.22621.0\ucrt\stdio.h
	; Line 956
	$LN6:
	; Spill the four register arguments to the caller's home space (rax = rsp at entry).
	mov rax, rsp
	mov QWORD PTR [rax+8], rcx
	mov QWORD PTR [rax+16], rdx
	mov QWORD PTR [rax+24], r8
	mov QWORD PTR [rax+32], r9
	push rbx
	push rsi
	push rdi
	sub rsp, 48 ; 00000030H
	mov rdi, rcx
	; Line 959
	; rsi = address of the first variadic argument (the home slot of rdx).
	lea rsi, QWORD PTR [rax+16]
	; Line 960
	; rbx = __acrt_iob_func(1); index 1 is presumably stdout (not visible here).
	mov ecx, 1
	call __acrt_iob_func
	mov rbx, rax
	; Line 645
	; __stdio_common_vfprintf(*options, stream, format, 0, arglist)
	call __local_stdio_printf_options
	xor r9d, r9d
	mov QWORD PTR [rsp+32], rsi
	mov r8, rdi
	mov rdx, rbx
	mov rcx, QWORD PTR [rax]
	call __stdio_common_vfprintf
	; Line 963
	add rsp, 48 ; 00000030H
	pop rdi
	pop rsi
	pop rbx
	ret 0
	printf ENDP
	_TEXT ENDS
	; Function compile flags: /Ogspy
	; COMDAT _vfprintf_l
	_TEXT SEGMENT
	_Stream$ = 64
	_Format$ = 72
	_Locale$ = 80
	_ArgList$ = 88
	; _vfprintf_l -- out-of-line copy of the inline wrapper from the UCRT's stdio.h.
	; In:  rcx = _Stream, rdx = _Format, r8 = _Locale, r9 = _ArgList
	;      (names taken from the frame-offset symbols declared above this PROC).
	; Out: rax = whatever __stdio_common_vfprintf returns (not modified afterwards).
	_vfprintf_l PROC ; COMDAT
	; File C:\Program Files (x86)\Windows Kits\10\include\10.0.22621.0\ucrt\stdio.h
	; Line 644
	$LN4:
	; Save rbx/rbp/rsi in the home space and rdi on the stack, then park the
	; four arguments in those registers across the options call.
	mov QWORD PTR [rsp+8], rbx
	mov QWORD PTR [rsp+16], rbp
	mov QWORD PTR [rsp+24], rsi
	push rdi
	sub rsp, 48 ; 00000030H
	mov rbx, r9
	mov rdi, r8
	mov rsi, rdx
	mov rbp, rcx
	; Line 645
	; __stdio_common_vfprintf(*options, _Stream, _Format, _Locale, _ArgList);
	; the fifth argument goes on the stack at [rsp+32].
	call __local_stdio_printf_options
	mov r9, rdi
	mov QWORD PTR [rsp+32], rbx
	mov r8, rsi
	mov rdx, rbp
	mov rcx, QWORD PTR [rax]
	call __stdio_common_vfprintf
	; Line 646
	; Restore the saved registers and return.
	mov rbx, QWORD PTR [rsp+64]
	mov rbp, QWORD PTR [rsp+72]
	mov rsi, QWORD PTR [rsp+80]
	add rsp, 48 ; 00000030H
	pop rdi
	ret 0
	_vfprintf_l ENDP
	_TEXT ENDS
	; Function compile flags: /Ogspy
	; COMDAT __local_stdio_printf_options
	_TEXT SEGMENT
	; __local_stdio_printf_options -- returns in rax the address of the
	; function-local static _OptionsStorage. Leaf procedure, no stack use.
	__local_stdio_printf_options PROC ; COMDAT
	; File C:\Program Files (x86)\Windows Kits\10\include\10.0.22621.0\ucrt\corecrt_stdio_config.h
	; Line 92
	lea rax, OFFSET FLAT:?_OptionsStorage@?1??__local_stdio_printf_options@@9@9 ; `__local_stdio_printf_options'::`2'::_OptionsStorage
	; Line 93
	ret 0
	__local_stdio_printf_options ENDP
	_TEXT ENDS
	END
| Toolset | VS version | /O1 | /O2 |
|---|---|---|---|
| 19.29 | VS 16.11 | ✓ | ✓ |
| 19.30 | VS 17.0 | ✓ | ✓ |
| 19.31 | VS 17.1 | ✓ | ✓ |
| 19.32 | VS 17.2 | ✓ | ✓ |
| 19.33 | VS 17.3 | ✓ | ✗ BUG introduced |
| 19.34–19.39 | VS 17.4 – 17.9 | ✓ | ✗ |
| 19.40 | VS 17.10 | ✓ | ✓ FIXED |
| 19.41–19.44 | VS 17.11 – 17.14 | ✓ | ✓ |
| 19.50 | VS 18.0 (preview) | ✓ | ✓ |
| Change | Result |
|---|---|
| 2 outer loops instead of 3 | bug gone |
| 2 inner loops instead of 3 | bug gone |
| Inner loop trip count other than 2 (e.g. `<= i+2`) | bug gone |
| Replace ternary with arithmetic `1 - 2*(p-i)` | bug gone |
| Inline the `a` ternary directly into `dv += ...` | bug gone |
| Remove the second accumulator (`du`) | bug gone |
| Index `s[]` with outer var (`s[i]`) or constant | bug gone |
| `/O1`, `/Od` | correct |
| `/O2` with the auto-vectorizer disabled (no `/Qvec-`-equivalent switch is available in MSVC) | — |