Skip to content

Instantly share code, notes, and snippets.

@Helw150
Created April 10, 2026 01:29
Show Gist options
  • Select an option

  • Save Helw150/771c90b1ad424cda26085706a73b33d9 to your computer and use it in GitHub Desktop.

Select an option

Save Helw150/771c90b1ad424cda26085706a73b33d9 to your computer and use it in GitHub Desktop.
Python execution tracer prototype for SWE-bench-style Docker images
"""Benchmark end-to-end trace pipeline on multiple SWE-rebench-V2 Python images."""
import json
import os
import subprocess
import sys
import time
# Sampled SWE-rebench-V2 instances: one Docker image per repo, paired with the
# exact pytest command that instance's harness runs inside the container.
IMAGES = [
    {"instance_id": "wtforms__wtforms-614", "image_name": "docker.io/swerebenchv2/wtforms-wtforms:614-848d28d", "test_cmd": "pytest --no-header -rA --tb=line --color=no -p no:cacheprovider -W ignore::DeprecationWarning tests/test_fields.py tests/test_validators.py tests/test_widgets.py"},
    {"instance_id": "aio-libs__aiohttp-9047", "image_name": "docker.io/swerebenchv2/aio-libs-aiohttp:9047-aca99bc", "test_cmd": "pytest --no-header -rA --tb=line --color=no -p no:cacheprovider -W ignore::DeprecationWarning tests/test_connector.py tests/test_proxy.py tests/test_resolver.py"},
    {"instance_id": "bvanelli__actualpy-56", "image_name": "docker.io/swerebenchv2/bvanelli-actualpy:56-2ad5f63", "test_cmd": "pytest --no-header -rA --tb=line --color=no -p no:cacheprovider -W ignore::DeprecationWarning tests/test_rules.py"},
    {"instance_id": "keras-team__keras-19955", "image_name": "docker.io/swerebenchv2/keras-team-keras:19955-ca9519b", "test_cmd": "pytest --no-header -rA --tb=line --color=no -p no:cacheprovider -W ignore::DeprecationWarning keras/src/backend/common/variables_test.py"},
    {"instance_id": "aws-cloudformation__cfn-lint-3965", "image_name": "docker.io/swerebenchv2/aws-cloudformation-cfn-lint:3965-b4d790d", "test_cmd": "pytest --no-header -rA --tb=line --color=no -p no:cacheprovider -W ignore::DeprecationWarning test/unit/rules/functions/test_ref_format.py"},
    {"instance_id": "qiskit__qiskit-terra-5662", "image_name": "docker.io/swerebenchv2/qiskit-qiskit-terra:5662-3accb1b", "test_cmd": "pytest --no-header -rA --tb=line --color=no -p no:cacheprovider -W ignore::DeprecationWarning test/python/visualization/test_circuit_text_drawer.py test/python/visualization/timeline/test_core.py"},
    {"instance_id": "mozilla-services__cliquet-203", "image_name": "docker.io/swerebenchv2/mozilla-services-cliquet:203-41a48da", "test_cmd": "pytest --no-header -rA --tb=line --color=no -p no:cacheprovider -W ignore::DeprecationWarning cliquet/tests/test_initialization.py"},
    {"instance_id": "azure__walinuxagent-970", "image_name": "docker.io/swerebenchv2/azure-walinuxagent:970-fc2451f", "test_cmd": "pytest --no-header -rA --tb=line --color=no -p no:cacheprovider -W ignore::DeprecationWarning tests/common/test_errorstate.py"},
    {"instance_id": "tomwhite__cubed-211", "image_name": "docker.io/swerebenchv2/tomwhite-cubed:211-4305c85", "test_cmd": "pytest --no-header -rA --tb=line --color=no -p no:cacheprovider -W ignore::DeprecationWarning cubed/tests/runtime/test_lithops.py cubed/tests/runtime/test_modal_async.py cubed/tests/runtime/test_python_async.py cubed/tests/runtime/utils.py"},
    {"instance_id": "pandas-dev__pandas-59608", "image_name": "docker.io/swerebenchv2/pandas-dev-pandas:59608-360597c", "test_cmd": "pytest --no-header -rA --tb=line --color=no -p no:cacheprovider -W ignore::DeprecationWarning pandas/tests/reshape/merge/test_join.py pandas/tests/series/test_arrow_interface.py"},
    {"instance_id": "sissbruecker__linkding-984", "image_name": "docker.io/swerebenchv2/sissbruecker-linkding:984-c5a300a", "test_cmd": "pytest --no-header -rA --tb=line --color=no -p no:cacheprovider -W ignore::DeprecationWarning bookmarks/tests/test_bookmark_archived_view.py bookmarks/tests/test_bookmark_index_view.py bookmarks/tests/test_bookmark_shared_view.py"},
    {"instance_id": "databricks__dbt-databricks-935", "image_name": "docker.io/swerebenchv2/databricks-dbt-databricks:935-4b1d2d9", "test_cmd": "pytest --no-header -rA --tb=line --color=no -p no:cacheprovider -W ignore::DeprecationWarning tests/functional/adapter/liquid_clustering/fixtures.py tests/functional/adapter/liquid_clustering/test_liquid_clustering.py tests/unit/macros/relations/test_table_macros.py"},
    {"instance_id": "tox-dev__sphinx-autodoc-typehints-474", "image_name": "docker.io/swerebenchv2/tox-dev-sphinx-autodoc-typehints:474-c8be42f", "test_cmd": "pytest --no-header -rA --tb=line --color=no -p no:cacheprovider -W ignore::DeprecationWarning tests/test_sphinx_autodoc_typehints.py"},
    {"instance_id": "meltano__sdk-1881", "image_name": "docker.io/swerebenchv2/meltano-sdk:1881-c3a8f90", "test_cmd": "pytest --no-header -rA --tb=line --color=no -p no:cacheprovider -W ignore::DeprecationWarning tests/core/test_target_base.py"},
    {"instance_id": "fsspec__universal_pathlib-148", "image_name": "docker.io/swerebenchv2/fsspec-universal_pathlib:148-2a29aa4", "test_cmd": "pytest --no-header -rA --tb=line --color=no -p no:cacheprovider -W ignore::DeprecationWarning upath/tests/cases.py upath/tests/implementations/test_gcs.py upath/tests/implementations/test_hdfs.py upath/tests/implementations/test_webdav.py"},
    {"instance_id": "tefra__xsdata-310", "image_name": "docker.io/swerebenchv2/tefra-xsdata:310-ce88e71", "test_cmd": "pytest --no-header -rA --tb=line --color=no -p no:cacheprovider -W ignore::DeprecationWarning tests/formats/dataclass/parsers/test_nodes.py tests/formats/dataclass/parsers/test_utils.py tests/formats/dataclass/serializers/test_xml.py tests/formats/dataclass/test_context.py tests/formats/dataclass/test_elements.py tests/models/enums/test_datatype.py"},
    {"instance_id": "pyqtgraph__pyqtgraph-1845", "image_name": "docker.io/swerebenchv2/pyqtgraph-pyqtgraph:1845-ba517ab", "test_cmd": "pytest --no-header -rA --tb=line --color=no -p no:cacheprovider -W ignore::DeprecationWarning tests/test_colormap.py"},
    {"instance_id": "pallets__werkzeug-2583", "image_name": "docker.io/swerebenchv2/pallets-werkzeug:2583-1ce57f6", "test_cmd": "pytest --no-header -rA --tb=line --color=no -p no:cacheprovider -W ignore::DeprecationWarning tests/test_routing.py"},
    {"instance_id": "pytest-dev__pyfakefs-916", "image_name": "docker.io/swerebenchv2/pytest-dev-pyfakefs:916-95b2de3", "test_cmd": "pytest --no-header -rA --tb=line --color=no -p no:cacheprovider -W ignore::DeprecationWarning pyfakefs/tests/fake_pathlib_test.py"},
    {"instance_id": "cs-si__eodag-228", "image_name": "docker.io/swerebenchv2/cs-si-eodag:228-fec965f", "test_cmd": "pytest --no-header -rA --tb=line --color=no -p no:cacheprovider -W ignore::DeprecationWarning tests/test_end_to_end.py tests/units/test_core.py"},
]
# Host directory containing the tracer package; bind-mounted into each container.
TRACER_DIR = os.path.expanduser("~/marin/symbolic/pytracer")
# Host directory where per-instance JSONL traces land (also bind-mounted).
OUTPUT_DIR = "/tmp/tracer_bench"
def run(cmd, timeout=300):
    """Execute *cmd* through the shell, capturing stdout/stderr as text.

    Returns the CompletedProcess; raises subprocess.TimeoutExpired if the
    command does not finish within *timeout* seconds.
    """
    return subprocess.run(
        cmd,
        shell=True,
        capture_output=True,
        text=True,
        timeout=timeout,
    )
def bench_one(img):
    """Benchmark one instance end to end.

    Pulls the image, runs its test command under the tracer, measures the
    resulting trace file, and deletes the image. Returns a metrics dict on
    success, or None if the pull failed. Exceptions from the traced run
    (e.g. subprocess.TimeoutExpired) propagate to the caller, but the trace
    file and the pulled image are always cleaned up first — previously a
    timed-out run leaked both, eating disk across the remaining instances.
    """
    instance_id = img["instance_id"]
    image_name = img["image_name"]
    test_cmd = img["test_cmd"]
    output_file = f"{OUTPUT_DIR}/{instance_id}.jsonl"

    print(f"\n{'='*60}")
    print(f"Instance: {instance_id}")
    print(f"Image: {image_name}")
    print(f"Test cmd: {test_cmd}")
    print(f"{'='*60}")

    timings = {}

    # Pull: a failed pull is a soft failure — skip this instance.
    t0 = time.monotonic()
    r = run(f"sudo docker pull {image_name}", timeout=120)
    timings["pull"] = time.monotonic() - t0
    if r.returncode != 0:
        print(f" PULL FAILED: {r.stderr[-200:]}")
        return None

    events = 0
    chars = 0
    size_mb = 0
    try:
        # Run tests inside the container with the tracer injected via
        # PYTHONPATH; the trace is written into the bind-mounted OUTPUT_DIR.
        docker_cmd = (
            f"sudo docker run --rm "
            f"-v {TRACER_DIR}:/pytracer "
            f"-v {OUTPUT_DIR}:{OUTPUT_DIR} "
            f"-e PYTHONPATH=/pytracer "
            f"-e TRACER_OUTPUT={output_file} "
            f"-e TRACER_MAX_EVENTS=500000 "
            f"{image_name} {test_cmd}"
        )
        t0 = time.monotonic()
        r = run(docker_cmd, timeout=300)
        timings["test+trace"] = time.monotonic() - t0

        # Print test output tail as a quick sanity check.
        lines = (r.stdout + r.stderr).strip().split("\n")
        for line in lines[-5:]:
            print(f" {line}")

        # Analyze the trace file: event count, raw characters, on-disk size.
        if os.path.exists(output_file):
            size_mb = os.path.getsize(output_file) / 1024 / 1024
            with open(output_file) as f:
                for line in f:
                    events += 1
                    chars += len(line)
    finally:
        # Always clean up, even if the traced run raised/timed out, so a
        # failed instance leaves neither a partial trace nor a cached image.
        if os.path.exists(output_file):
            os.remove(output_file)
        t0 = time.monotonic()
        run(f"sudo docker rmi {image_name}")
        timings["rmi"] = time.monotonic() - t0

    total = sum(timings.values())
    est_tokens = chars // 4  # rough heuristic: ~4 characters per token
    print(f"\n Timings: pull={timings['pull']:.1f}s test+trace={timings['test+trace']:.1f}s rmi={timings['rmi']:.1f}s total={total:.1f}s")
    print(f" Events: {events:,} Size: {size_mb:.1f}MB Est tokens: {est_tokens:,}")
    return {
        "instance_id": instance_id,
        "timings": timings,
        "total_time": total,
        "events": events,
        "size_mb": size_mb,
        "est_tokens": est_tokens,
    }
def main():
    """Benchmark every configured image and print a per-instance summary."""
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    print(f"Benchmarking {len(IMAGES)} unique Python repos\n")

    results = []
    for img in IMAGES:
        try:
            outcome = bench_one(img)
        except Exception as e:
            # Best-effort: one broken instance must not abort the sweep.
            print(f" ERROR: {e}")
        else:
            if outcome:
                results.append(outcome)

    print(f"\n{'='*60}")
    print("SUMMARY")
    print(f"{'='*60}")
    for r in results:
        print(f" {r['instance_id']:45s} {r['total_time']:6.1f}s {r['events']:>8,} events {r['est_tokens']:>10,} tokens")

    if not results:
        return
    avg_time = sum(r["total_time"] for r in results) / len(results)
    avg_tokens = sum(r["est_tokens"] for r in results) / len(results)
    print(f"\n Average: {avg_time:.1f}s per image, {avg_tokens:,.0f} tokens per image")
    print(f" Projected for 7,243 images: {avg_time * 7243 / 3600:.1f} hours, {avg_tokens * 7243 / 1e9:.1f}B tokens")
if __name__ == "__main__":
    # Time the whole benchmark run, including pulls, traced tests and cleanup.
    t0 = time.monotonic()
    main()
    print(f"\nTotal wall time: {time.monotonic() - t0:.1f}s")
"""Show interesting trace events from a JSONL trace file."""
import json
import sys
path = sys.argv[1] if len(sys.argv) > 1 else "/tmp/tracer_out/trace.jsonl"
events = [json.loads(line) for line in open(path)]
print(f"Total events: {len(events)}")
# Show event type distribution
from collections import Counter
kinds = Counter(e["event"] for e in events)
print(f"Event types: {dict(kinds)}")
# Show unique functions traced
funcs = Counter(f"{e['file']}::{e['function']}" for e in events)
print(f"\nTop 20 functions by event count:")
for func, count in funcs.most_common(20):
print(f" {count:5d} {func}")
# Show a sample function trace (first non-module call with >3 events)
seen = {}
for e in events:
key = (e["file"], e["function"])
if e["function"] == "<module>":
continue
seen.setdefault(key, []).append(e)
print("\n--- Sample function traces (with locals) ---")
# Group into individual invocations (call->return sequences)
invocations = []
stack = {}
for e in events:
key = (e["file"], e["function"], e["depth"])
if e["event"] == "call":
stack[key] = [e]
elif key in stack:
stack[key].append(e)
if e["event"] == "return":
invocations.append(stack.pop(key))
# Find invocations with real locals (not class defs, not trivial)
shown = 0
for inv in invocations:
non_ellipsis_locals = sum(
1 for e in inv for v in e["locals"].values() if v != "..."
)
if (len(inv) >= 5 and non_ellipsis_locals >= 3
and inv[0]["function"] not in ("<module>", "<dictcomp>", "<listcomp>")
and not inv[0]["function"][0].isupper() # skip class defs
and shown < 3):
print(f"\nFunction: {inv[0]['function']} in {inv[0]['file']} ({len(inv)} events)")
for e in inv[:25]:
print(json.dumps(e))
if len(inv) > 25:
print(f" ... ({len(inv) - 25} more events)")
shown += 1
"""Auto-inject tracer when imported via PYTHONPATH.
Activated by setting PYTHONPATH to include the directory containing this file.
Configure via environment variables:
TRACER_OUTPUT - path to write JSONL output (default: /tmp/trace.jsonl)
TRACER_REPO_ROOT - repo root for relative paths (default: cwd)
TRACER_MAX_EVENTS - max events before stopping (default: 50000)
TRACER_MAX_REPR - max repr length per variable (default: 200)
"""
import atexit
import os
import sys
def _setup_tracer():
    """Start a Tracer for this process and register an atexit dump hook.

    Configuration comes from TRACER_REPO_ROOT / TRACER_OUTPUT environment
    variables, falling back to the current directory and /tmp/trace.jsonl.
    """
    from tracer import Tracer

    repo_root = os.environ.get("TRACER_REPO_ROOT", os.getcwd())
    output_path = os.environ.get("TRACER_OUTPUT", "/tmp/trace.jsonl")

    active_tracer = Tracer(repo_root=repo_root)
    active_tracer.start()

    def _on_exit():
        # Flush the accumulated trace once the host process finishes.
        active_tracer.stop()
        active_tracer.dump(output_path)
        count = len(active_tracer.events)
        print(f"\n[tracer] Wrote {count} events to {output_path}", file=sys.stderr)

    atexit.register(_on_exit)

_setup_tracer()
"""Execution tracer that captures Python program state at interpreter events.
Uses sys.settrace to record call/line/return/exception events with local
variable snapshots. Filters to repo code only (skips stdlib, site-packages).
Outputs JSONL with compressed unchanged variables.
"""
import json
import os
import sys
import threading
from pathlib import Path
# Paths that indicate non-repo code. These are plain substrings matched
# against frame filenames (stdlib, installed packages, the tracer itself,
# pytest/pluggy internals) so their frames are excluded from the trace.
_SKIP_MARKERS = (
    "site-packages",
    "lib/python",
    "/usr/lib",
    "importlib",
    "<frozen",
    "/pytracer/",
    "/.local/",
    "/usr/local/lib/",
    "_pytest/",
    "pluggy/",
    "conftest.py",
    "_distutils_hack",
    "pkg_resources",
)
# Max events before we stop tracing (safety valve). Read once at import time.
MAX_EVENTS = int(os.environ.get("TRACER_MAX_EVENTS", "50000"))
# Max repr length for a single variable value. Read once at import time.
MAX_REPR_LEN = int(os.environ.get("TRACER_MAX_REPR", "200"))
def _safe_repr(obj):
    """Best-effort repr(): never raises, and caps output at MAX_REPR_LEN."""
    try:
        text = repr(obj)
    except Exception:
        # A user __repr__ may raise anything; swallow it and mark the value.
        return "<repr-error>"
    if len(text) <= MAX_REPR_LEN:
        return text
    return text[:MAX_REPR_LEN] + "..."
def _should_skip(filename):
    """Return True when *filename* belongs to non-repo code (or is empty)."""
    if not filename:
        return True
    return any(marker in filename for marker in _SKIP_MARKERS)
class Tracer:
    """Captures execution traces as JSONL.

    Installs itself via sys.settrace/threading.settrace and records
    call/line/return/exception events for repo code only, snapshotting
    local variables at each event (unchanged values compressed to "...").

    Usage:
        tracer = Tracer(repo_root="/path/to/repo")
        tracer.start()
        # ... run code ...
        tracer.stop()
        tracer.dump("/path/to/output.jsonl")
    """

    def __init__(self, repo_root=None):
        # Resolve once so relpath() in _build_record is stable and cheap.
        self.repo_root = str(Path(repo_root).resolve()) if repo_root else None
        self.events = []  # accumulated event records (plain dicts)
        self.prev_locals = {}  # keyed by frame id; last repr-snapshot per frame
        self._lock = threading.Lock()  # guards self.events across threads
        self._stopped = False

    def start(self):
        """Begin tracing the current thread and threads started afterwards."""
        self._stopped = False
        sys.settrace(self._trace)
        # threading.settrace only applies to threads created after this call.
        threading.settrace(self._trace)

    def stop(self):
        """Stop tracing; frames already carrying a local trace fn see _stopped."""
        self._stopped = True
        sys.settrace(None)
        threading.settrace(None)

    def _trace(self, frame, event, arg):
        """sys.settrace callback.

        Returns itself to keep receiving line/return/exception events for
        repo frames, or None to disable tracing inside skipped frames.
        """
        try:
            # Hard stop: either explicitly stopped or the safety valve tripped.
            if self._stopped or len(self.events) >= MAX_EVENTS:
                return None
            filename = frame.f_code.co_filename
            if _should_skip(filename):
                return None
            record = self._build_record(frame, event, arg)
            if record:
                with self._lock:
                    self.events.append(record)
            return self._trace
        except Exception:
            # Never let a tracer bug break the traced program; drop the
            # event but keep tracing.
            return self._trace

    def _build_record(self, frame, event, arg):
        """Build one JSON-serializable event dict for *frame*/*event*/*arg*."""
        filename = frame.f_code.co_filename
        if self.repo_root:
            filename = os.path.relpath(filename, self.repo_root)
        frame_id = id(frame)
        # Snapshot locals, compress unchanged.
        current_locals = {}
        for k, v in frame.f_locals.items():
            # Skip dunders (e.g. __doc__ in module/class frames) — noise.
            if k.startswith("__") and k.endswith("__"):
                continue
            current_locals[k] = _safe_repr(v)
        prev = self.prev_locals.get(frame_id, {})
        compressed = {}
        for k, v in current_locals.items():
            if prev.get(k) == v:
                compressed[k] = "..."  # unchanged since the previous event
            else:
                compressed[k] = v
        self.prev_locals[frame_id] = current_locals
        record = {
            "event": event,
            "file": filename,
            "function": frame.f_code.co_name,
            "line": frame.f_lineno,
            "locals": compressed,
            "depth": _frame_depth(frame),
        }
        if event == "return":
            # For "return" events, arg is the value being returned.
            record["return_value"] = _safe_repr(arg)
            # Clean up prev_locals for this frame.
            # NOTE(review): frames that never emit "return" (e.g. suspended
            # generators at shutdown) leave an entry behind — confirm this is
            # acceptable for long runs.
            self.prev_locals.pop(frame_id, None)
        elif event == "exception":
            # For "exception" events, arg is the (type, value, traceback) triple.
            exc_type, exc_value, _ = arg
            record["exception"] = {
                "type": getattr(exc_type, "__name__", str(exc_type)),
                "value": _safe_repr(exc_value),
            }
        return record

    def dump(self, path):
        """Write events to a JSONL file."""
        with open(path, "w") as f:
            for event in self.events:
                f.write(json.dumps(event) + "\n")

    def to_jsonl(self):
        """Return events as a JSONL string."""
        return "\n".join(json.dumps(e) for e in self.events)
def _frame_depth(frame):
    """Return the number of frames above *frame* on the call stack."""
    count = 0
    current = frame.f_back
    while current is not None:
        count += 1
        current = current.f_back
    return count
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment