April 23, 2026 06:18 · April 13, 2026 21:58 · April 13, 2026 21:59 · April 13, 2026 21:59 · April 11, 2026 20:31 · March 17, 2026 19:46
 diff --git a/Include/internal/pycore_interp_structs.h b/Include/internal/pycore_interp_structs.h
 index 6b3d5711b92..aadc577306f 100644
 --- a/Include/internal/pycore_interp_structs.h
 +++ b/Include/internal/pycore_interp_structs.h
 @@ -229,7 +229,8 @@ struct _gc_runtime_state {
     PyObject *callbacks;

     Py_ssize_t heap_size;
 -    Py_ssize_t work_to_do;
 +    /* Total number of young objects since the last complete collection */
 import argparse
 import gc
 import sys
 import time


 def get_memory_usage():
    """Memory usage of the current process in KB."""
    result = {'peak': 0, 'rss': 0}
    with open('/proc/self/status') as status:
 Tested with https://github.com/nascheme/cpython/tree/gc-gen-3.14 (b4ea6d82324cf84f74731a88decd7577eccde2f9)
 Benchmark script: https://gist.github.com/nascheme/6d09885d3696055b9665d3b88c7aacaa

 inc_gc OFF (0) vs ON (1) comparison
   cycle    extra     live    t0(s)    t1(s)     Δt%     rss0     rss1   Δrss%    trash0    trash1
 --------------------------------------------------------------------------------------------------
      10        0      100     0.66     0.73   +10.0      16M      17M      +6        6k       28k
      10        0     1.0k     0.66     0.76   +14.2      17M      20M     +19       14k       82k
      10        0    10.0k     0.78     0.84    +7.8      20M      26M     +29       88k      224k
      10     1.0k      100     0.73     0.76    +4.9      16M      20M     +21        6k       28k
 This is 3.14 branch with https://github.com/python/cpython/pull/142001 applied
 Benchmark script: https://gist.github.com/nascheme/6d09885d3696055b9665d3b88c7aacaa


 inc_gc OFF (0) vs ON (1) comparison
   cycle    extra     live    t0(s)    t1(s)     Δt%     rss0     rss1   Δrss%    trash0    trash1
 --------------------------------------------------------------------------------------------------
      10        0      100     0.65     0.75   +16.3      16M      17M      +2        6k       11k
      10        0     1.0k     0.68     0.78   +14.3      17M      19M     +11       14k       54k
      10        0    10.0k     0.76     0.89   +17.8      20M      22M     +10       88k      138k
 base: d76c56e958c9a603ded42d27b39ab51c1e3794e4 (head of 3.14 branch)
 gen_gc: https://github.com/nascheme/cpython/tree/gc-gen-3.14 (b4ea6d82324cf84f74731a88decd7577eccde2f9)
 inc_gc: as above but with -Xinc_gc on

 Benchmark results produced by "fastbench" 2ab489f1f315582b8a537b537a7664898d90a0a4


 Benchmark                        base ms     gen_gc ms         ratio     inc_gc ms         ratio
 ------------------------------------------------------------------------------------------------
 2to3                                95.2          96.6         1.015          97.7         1.026
 # CUDA contention benchmark v2: realistic LLM-like workload.
 #
 # Simulates a transformer decode step more faithfully than v1:
 # - Multiple linear layers (like attention Q/K/V projections + MLP)
 # - Small element-wise ops (RMSNorm, activations, residual adds)
 # - Index/gather operations (like token embedding lookup, KV cache scatter)
 # - Tensor creation/destruction churn
 # - Periodic CPU readback (like reading sampled token IDs)
 #
 # This should stress the same code paths as a real LLM step — many CUDA
 # Dual-engine multi-GPU threaded vLLM throughput benchmark.
 #
 # Architecture: two independent LLMEngine instances (one per GPU) fed from
 # a shared tokenized-request queue, with a single tokenizer thread.
 #
 #   Tokenizer Thread (CPU)         Engine Thread 0 (cuda:0)   Engine Thread 1 (cuda:1)
 #     input_processor.process()    add_request (from queue)   add_request (from queue)
 #     tokenized_queue.put(ecr)     engine0.step()             engine1.step()
 #                                  (continuous streaming)      (continuous streaming)
 #
 # See: https://github.com/pytest-dev/pytest/issues/14077
 from concurrent.futures import ThreadPoolExecutor
 import numpy as np
 import pytest

 @pytest.fixture(scope="module")
 def pool():
    yield ThreadPoolExecutor(max_workers=1)

 def test_1(pool):
	diff --git a/Include/internal/pycore_interp_structs.h b/Include/internal/pycore_interp_structs.h
	index 6b3d5711b92..aadc577306f 100644
	--- a/Include/internal/pycore_interp_structs.h
	+++ b/Include/internal/pycore_interp_structs.h
	@@ -229,7 +229,8 @@ struct _gc_runtime_state {
	PyObject *callbacks;

	Py_ssize_t heap_size;
	- Py_ssize_t work_to_do;
	+ /* Total number of young objects since the last complete collection */
	import argparse
	import gc
	import sys
	import time


	def get_memory_usage():
	"""Memory usage of the current process in KB."""
	result = {'peak': 0, 'rss': 0}
	with open('/proc/self/status') as status:
	Tested with https://github.com/nascheme/cpython/tree/gc-gen-3.14 (b4ea6d82324cf84f74731a88decd7577eccde2f9)
	Benchmark script: https://gist.github.com/nascheme/6d09885d3696055b9665d3b88c7aacaa

	inc_gc OFF (0) vs ON (1) comparison
	cycle extra live t0(s) t1(s) Δt% rss0 rss1 Δrss% trash0 trash1
	--------------------------------------------------------------------------------------------------
	10 0 100 0.66 0.73 +10.0 16M 17M +6 6k 28k
	10 0 1.0k 0.66 0.76 +14.2 17M 20M +19 14k 82k
	10 0 10.0k 0.78 0.84 +7.8 20M 26M +29 88k 224k
	10 1.0k 100 0.73 0.76 +4.9 16M 20M +21 6k 28k
	This is 3.14 branch with https://github.com/python/cpython/pull/142001 applied
	Benchmark script: https://gist.github.com/nascheme/6d09885d3696055b9665d3b88c7aacaa


	inc_gc OFF (0) vs ON (1) comparison
	cycle extra live t0(s) t1(s) Δt% rss0 rss1 Δrss% trash0 trash1
	--------------------------------------------------------------------------------------------------
	10 0 100 0.65 0.75 +16.3 16M 17M +2 6k 11k
	10 0 1.0k 0.68 0.78 +14.3 17M 19M +11 14k 54k
	10 0 10.0k 0.76 0.89 +17.8 20M 22M +10 88k 138k
	base: d76c56e958c9a603ded42d27b39ab51c1e3794e4 (head of 3.14 branch)
	gen_gc: https://github.com/nascheme/cpython/tree/gc-gen-3.14 (b4ea6d82324cf84f74731a88decd7577eccde2f9)
	inc_gc: as above but with -Xinc_gc on

	Benchmark results produced by "fastbench" 2ab489f1f315582b8a537b537a7664898d90a0a4


	Benchmark base ms gen_gc ms ratio inc_gc ms ratio
	------------------------------------------------------------------------------------------------
	2to3 95.2 96.6 1.015 97.7 1.026
	# CUDA contention benchmark v2: realistic LLM-like workload.
	#
	# Simulates a transformer decode step more faithfully than v1:
	# - Multiple linear layers (like attention Q/K/V projections + MLP)
	# - Small element-wise ops (RMSNorm, activations, residual adds)
	# - Index/gather operations (like token embedding lookup, KV cache scatter)
	# - Tensor creation/destruction churn
	# - Periodic CPU readback (like reading sampled token IDs)
	#
	# This should stress the same code paths as a real LLM step — many CUDA
	# Dual-engine multi-GPU threaded vLLM throughput benchmark.
	#
	# Architecture: two independent LLMEngine instances (one per GPU) fed from
	# a shared tokenized-request queue, with a single tokenizer thread.
	#
	# Tokenizer Thread (CPU) Engine Thread 0 (cuda:0) Engine Thread 1 (cuda:1)
	# input_processor.process() add_request (from queue) add_request (from queue)
	# tokenized_queue.put(ecr) engine0.step() engine1.step()
	# (continuous streaming) (continuous streaming)
	#
	# See: https://github.com/pytest-dev/pytest/issues/14077
	from concurrent.futures import ThreadPoolExecutor
	import numpy as np
	import pytest

	@pytest.fixture(scope="module")
	def pool():
	yield ThreadPoolExecutor(max_workers=1)

	def test_1(pool):