Created
April 10, 2026 01:29
-
-
Save Helw150/771c90b1ad424cda26085706a73b33d9 to your computer and use it in GitHub Desktop.
Python execution tracer prototype for SWE-bench-style Docker images
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """Benchmark end-to-end trace pipeline on multiple SWE-rebench-V2 Python images.""" | |
| import json | |
| import os | |
| import subprocess | |
| import sys | |
| import time | |
| IMAGES = [ | |
| {"instance_id": "wtforms__wtforms-614", "image_name": "docker.io/swerebenchv2/wtforms-wtforms:614-848d28d", "test_cmd": "pytest --no-header -rA --tb=line --color=no -p no:cacheprovider -W ignore::DeprecationWarning tests/test_fields.py tests/test_validators.py tests/test_widgets.py"}, | |
| {"instance_id": "aio-libs__aiohttp-9047", "image_name": "docker.io/swerebenchv2/aio-libs-aiohttp:9047-aca99bc", "test_cmd": "pytest --no-header -rA --tb=line --color=no -p no:cacheprovider -W ignore::DeprecationWarning tests/test_connector.py tests/test_proxy.py tests/test_resolver.py"}, | |
| {"instance_id": "bvanelli__actualpy-56", "image_name": "docker.io/swerebenchv2/bvanelli-actualpy:56-2ad5f63", "test_cmd": "pytest --no-header -rA --tb=line --color=no -p no:cacheprovider -W ignore::DeprecationWarning tests/test_rules.py"}, | |
| {"instance_id": "keras-team__keras-19955", "image_name": "docker.io/swerebenchv2/keras-team-keras:19955-ca9519b", "test_cmd": "pytest --no-header -rA --tb=line --color=no -p no:cacheprovider -W ignore::DeprecationWarning keras/src/backend/common/variables_test.py"}, | |
| {"instance_id": "aws-cloudformation__cfn-lint-3965", "image_name": "docker.io/swerebenchv2/aws-cloudformation-cfn-lint:3965-b4d790d", "test_cmd": "pytest --no-header -rA --tb=line --color=no -p no:cacheprovider -W ignore::DeprecationWarning test/unit/rules/functions/test_ref_format.py"}, | |
| {"instance_id": "qiskit__qiskit-terra-5662", "image_name": "docker.io/swerebenchv2/qiskit-qiskit-terra:5662-3accb1b", "test_cmd": "pytest --no-header -rA --tb=line --color=no -p no:cacheprovider -W ignore::DeprecationWarning test/python/visualization/test_circuit_text_drawer.py test/python/visualization/timeline/test_core.py"}, | |
| {"instance_id": "mozilla-services__cliquet-203", "image_name": "docker.io/swerebenchv2/mozilla-services-cliquet:203-41a48da", "test_cmd": "pytest --no-header -rA --tb=line --color=no -p no:cacheprovider -W ignore::DeprecationWarning cliquet/tests/test_initialization.py"}, | |
| {"instance_id": "azure__walinuxagent-970", "image_name": "docker.io/swerebenchv2/azure-walinuxagent:970-fc2451f", "test_cmd": "pytest --no-header -rA --tb=line --color=no -p no:cacheprovider -W ignore::DeprecationWarning tests/common/test_errorstate.py"}, | |
| {"instance_id": "tomwhite__cubed-211", "image_name": "docker.io/swerebenchv2/tomwhite-cubed:211-4305c85", "test_cmd": "pytest --no-header -rA --tb=line --color=no -p no:cacheprovider -W ignore::DeprecationWarning cubed/tests/runtime/test_lithops.py cubed/tests/runtime/test_modal_async.py cubed/tests/runtime/test_python_async.py cubed/tests/runtime/utils.py"}, | |
| {"instance_id": "pandas-dev__pandas-59608", "image_name": "docker.io/swerebenchv2/pandas-dev-pandas:59608-360597c", "test_cmd": "pytest --no-header -rA --tb=line --color=no -p no:cacheprovider -W ignore::DeprecationWarning pandas/tests/reshape/merge/test_join.py pandas/tests/series/test_arrow_interface.py"}, | |
| {"instance_id": "sissbruecker__linkding-984", "image_name": "docker.io/swerebenchv2/sissbruecker-linkding:984-c5a300a", "test_cmd": "pytest --no-header -rA --tb=line --color=no -p no:cacheprovider -W ignore::DeprecationWarning bookmarks/tests/test_bookmark_archived_view.py bookmarks/tests/test_bookmark_index_view.py bookmarks/tests/test_bookmark_shared_view.py"}, | |
| {"instance_id": "databricks__dbt-databricks-935", "image_name": "docker.io/swerebenchv2/databricks-dbt-databricks:935-4b1d2d9", "test_cmd": "pytest --no-header -rA --tb=line --color=no -p no:cacheprovider -W ignore::DeprecationWarning tests/functional/adapter/liquid_clustering/fixtures.py tests/functional/adapter/liquid_clustering/test_liquid_clustering.py tests/unit/macros/relations/test_table_macros.py"}, | |
| {"instance_id": "tox-dev__sphinx-autodoc-typehints-474", "image_name": "docker.io/swerebenchv2/tox-dev-sphinx-autodoc-typehints:474-c8be42f", "test_cmd": "pytest --no-header -rA --tb=line --color=no -p no:cacheprovider -W ignore::DeprecationWarning tests/test_sphinx_autodoc_typehints.py"}, | |
| {"instance_id": "meltano__sdk-1881", "image_name": "docker.io/swerebenchv2/meltano-sdk:1881-c3a8f90", "test_cmd": "pytest --no-header -rA --tb=line --color=no -p no:cacheprovider -W ignore::DeprecationWarning tests/core/test_target_base.py"}, | |
| {"instance_id": "fsspec__universal_pathlib-148", "image_name": "docker.io/swerebenchv2/fsspec-universal_pathlib:148-2a29aa4", "test_cmd": "pytest --no-header -rA --tb=line --color=no -p no:cacheprovider -W ignore::DeprecationWarning upath/tests/cases.py upath/tests/implementations/test_gcs.py upath/tests/implementations/test_hdfs.py upath/tests/implementations/test_webdav.py"}, | |
| {"instance_id": "tefra__xsdata-310", "image_name": "docker.io/swerebenchv2/tefra-xsdata:310-ce88e71", "test_cmd": "pytest --no-header -rA --tb=line --color=no -p no:cacheprovider -W ignore::DeprecationWarning tests/formats/dataclass/parsers/test_nodes.py tests/formats/dataclass/parsers/test_utils.py tests/formats/dataclass/serializers/test_xml.py tests/formats/dataclass/test_context.py tests/formats/dataclass/test_elements.py tests/models/enums/test_datatype.py"}, | |
| {"instance_id": "pyqtgraph__pyqtgraph-1845", "image_name": "docker.io/swerebenchv2/pyqtgraph-pyqtgraph:1845-ba517ab", "test_cmd": "pytest --no-header -rA --tb=line --color=no -p no:cacheprovider -W ignore::DeprecationWarning tests/test_colormap.py"}, | |
| {"instance_id": "pallets__werkzeug-2583", "image_name": "docker.io/swerebenchv2/pallets-werkzeug:2583-1ce57f6", "test_cmd": "pytest --no-header -rA --tb=line --color=no -p no:cacheprovider -W ignore::DeprecationWarning tests/test_routing.py"}, | |
| {"instance_id": "pytest-dev__pyfakefs-916", "image_name": "docker.io/swerebenchv2/pytest-dev-pyfakefs:916-95b2de3", "test_cmd": "pytest --no-header -rA --tb=line --color=no -p no:cacheprovider -W ignore::DeprecationWarning pyfakefs/tests/fake_pathlib_test.py"}, | |
| {"instance_id": "cs-si__eodag-228", "image_name": "docker.io/swerebenchv2/cs-si-eodag:228-fec965f", "test_cmd": "pytest --no-header -rA --tb=line --color=no -p no:cacheprovider -W ignore::DeprecationWarning tests/test_end_to_end.py tests/units/test_core.py"}, | |
| ] | |
# Host directory containing the tracer package; bind-mounted into each container
# and put on PYTHONPATH so sitecustomize auto-installs the tracer.
TRACER_DIR = os.path.expanduser("~/marin/symbolic/pytracer")
# Host directory where per-instance JSONL trace files are written (shared with
# the container via a bind mount).
OUTPUT_DIR = "/tmp/tracer_bench"
def run(cmd, timeout=300):
    """Execute *cmd* through the shell and capture its text output.

    Returns the subprocess.CompletedProcess; raises
    subprocess.TimeoutExpired when the command exceeds *timeout* seconds.
    """
    return subprocess.run(
        cmd,
        shell=True,
        capture_output=True,
        text=True,
        timeout=timeout,
    )
def bench_one(img):
    """Benchmark one image: pull, run its tests under the tracer, clean up.

    img: dict with "instance_id", "image_name", and "test_cmd" keys.

    Returns a result dict (timings, event count, trace size, token estimate),
    or None if the image pull failed. May raise subprocess.TimeoutExpired
    from the test run (handled by the caller); the image is still removed.
    """
    instance_id = img["instance_id"]
    image_name = img["image_name"]
    test_cmd = img["test_cmd"]
    output_file = f"{OUTPUT_DIR}/{instance_id}.jsonl"
    print(f"\n{'='*60}")
    print(f"Instance: {instance_id}")
    print(f"Image: {image_name}")
    print(f"Test cmd: {test_cmd}")
    print(f"{'='*60}")
    timings = {}
    # Pull
    t0 = time.monotonic()
    r = run(f"sudo docker pull {image_name}", timeout=120)
    timings["pull"] = time.monotonic() - t0
    if r.returncode != 0:
        print(f" PULL FAILED: {r.stderr[-200:]}")
        return None
    try:
        # Run tests with tracer. The tracer dir is mounted read-side and the
        # output dir is shared so the JSONL lands on the host.
        docker_cmd = (
            f"sudo docker run --rm "
            f"-v {TRACER_DIR}:/pytracer "
            f"-v {OUTPUT_DIR}:{OUTPUT_DIR} "
            f"-e PYTHONPATH=/pytracer "
            f"-e TRACER_OUTPUT={output_file} "
            f"-e TRACER_MAX_EVENTS=500000 "
            f"{image_name} {test_cmd}"
        )
        t0 = time.monotonic()
        r = run(docker_cmd, timeout=300)
        timings["test+trace"] = time.monotonic() - t0
        # Print test output tail
        lines = (r.stdout + r.stderr).strip().split("\n")
        for line in lines[-5:]:
            print(f" {line}")
        # Analyze output
        events = 0
        chars = 0
        size_mb = 0
        if os.path.exists(output_file):
            size_mb = os.path.getsize(output_file) / 1024 / 1024
            with open(output_file) as f:
                for line in f:
                    events += 1
                    chars += len(line)
            os.remove(output_file)
    finally:
        # Delete image. Bug fix: this runs in a finally block so a timeout or
        # crash during the test run no longer leaks a multi-GB image on disk
        # (previously the exception skipped the rmi entirely).
        t0 = time.monotonic()
        run(f"sudo docker rmi {image_name}")
        timings["rmi"] = time.monotonic() - t0
    total = sum(timings.values())
    est_tokens = chars // 4  # rough heuristic: ~4 characters per token
    print(f"\n Timings: pull={timings['pull']:.1f}s test+trace={timings['test+trace']:.1f}s rmi={timings['rmi']:.1f}s total={total:.1f}s")
    print(f" Events: {events:,} Size: {size_mb:.1f}MB Est tokens: {est_tokens:,}")
    return {
        "instance_id": instance_id,
        "timings": timings,
        "total_time": total,
        "events": events,
        "size_mb": size_mb,
        "est_tokens": est_tokens,
    }
def main():
    """Benchmark every image in IMAGES, then print per-instance and average stats."""
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    print(f"Benchmarking {len(IMAGES)} unique Python repos\n")

    results = []
    for entry in IMAGES:
        try:
            outcome = bench_one(entry)
        except Exception as e:
            # Keep going: one broken image must not abort the whole sweep.
            print(f" ERROR: {e}")
            continue
        if outcome:
            results.append(outcome)

    print(f"\n{'='*60}")
    print("SUMMARY")
    print(f"{'='*60}")
    for r in results:
        print(f" {r['instance_id']:45s} {r['total_time']:6.1f}s {r['events']:>8,} events {r['est_tokens']:>10,} tokens")

    if not results:
        return
    n = len(results)
    avg_time = sum(r["total_time"] for r in results) / n
    avg_tokens = sum(r["est_tokens"] for r in results) / n
    print(f"\n Average: {avg_time:.1f}s per image, {avg_tokens:,.0f} tokens per image")
    print(f" Projected for 7,243 images: {avg_time * 7243 / 3600:.1f} hours, {avg_tokens * 7243 / 1e9:.1f}B tokens")
| if __name__ == "__main__": | |
| t0 = time.monotonic() | |
| main() | |
| print(f"\nTotal wall time: {time.monotonic() - t0:.1f}s") |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """Show interesting trace events from a JSONL trace file.""" | |
| import json | |
| import sys | |
| path = sys.argv[1] if len(sys.argv) > 1 else "/tmp/tracer_out/trace.jsonl" | |
| events = [json.loads(line) for line in open(path)] | |
| print(f"Total events: {len(events)}") | |
| # Show event type distribution | |
| from collections import Counter | |
| kinds = Counter(e["event"] for e in events) | |
| print(f"Event types: {dict(kinds)}") | |
| # Show unique functions traced | |
| funcs = Counter(f"{e['file']}::{e['function']}" for e in events) | |
| print(f"\nTop 20 functions by event count:") | |
| for func, count in funcs.most_common(20): | |
| print(f" {count:5d} {func}") | |
| # Show a sample function trace (first non-module call with >3 events) | |
| seen = {} | |
| for e in events: | |
| key = (e["file"], e["function"]) | |
| if e["function"] == "<module>": | |
| continue | |
| seen.setdefault(key, []).append(e) | |
| print("\n--- Sample function traces (with locals) ---") | |
| # Group into individual invocations (call->return sequences) | |
| invocations = [] | |
| stack = {} | |
| for e in events: | |
| key = (e["file"], e["function"], e["depth"]) | |
| if e["event"] == "call": | |
| stack[key] = [e] | |
| elif key in stack: | |
| stack[key].append(e) | |
| if e["event"] == "return": | |
| invocations.append(stack.pop(key)) | |
| # Find invocations with real locals (not class defs, not trivial) | |
| shown = 0 | |
| for inv in invocations: | |
| non_ellipsis_locals = sum( | |
| 1 for e in inv for v in e["locals"].values() if v != "..." | |
| ) | |
| if (len(inv) >= 5 and non_ellipsis_locals >= 3 | |
| and inv[0]["function"] not in ("<module>", "<dictcomp>", "<listcomp>") | |
| and not inv[0]["function"][0].isupper() # skip class defs | |
| and shown < 3): | |
| print(f"\nFunction: {inv[0]['function']} in {inv[0]['file']} ({len(inv)} events)") | |
| for e in inv[:25]: | |
| print(json.dumps(e)) | |
| if len(inv) > 25: | |
| print(f" ... ({len(inv) - 25} more events)") | |
| shown += 1 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """Auto-inject tracer when imported via PYTHONPATH. | |
| Activated by setting PYTHONPATH to include the directory containing this file. | |
| Configure via environment variables: | |
| TRACER_OUTPUT - path to write JSONL output (default: /tmp/trace.jsonl) | |
| TRACER_REPO_ROOT - repo root for relative paths (default: cwd) | |
| TRACER_MAX_EVENTS - max events before stopping (default: 50000) | |
| TRACER_MAX_REPR - max repr length per variable (default: 200) | |
| """ | |
| import atexit | |
| import os | |
| import sys | |
def _setup_tracer():
    """Start a Tracer immediately and dump its events when the process exits."""
    from tracer import Tracer

    root = os.environ.get("TRACER_REPO_ROOT", os.getcwd())
    out_path = os.environ.get("TRACER_OUTPUT", "/tmp/trace.jsonl")

    active = Tracer(repo_root=root)
    active.start()

    def _flush_on_exit():
        # Stop the hook first so writing the dump is not itself traced.
        active.stop()
        active.dump(out_path)
        count = len(active.events)
        print(f"\n[tracer] Wrote {count} events to {out_path}", file=sys.stderr)

    atexit.register(_flush_on_exit)


_setup_tracer()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """Execution tracer that captures Python program state at interpreter events. | |
| Uses sys.settrace to record call/line/return/exception events with local | |
| variable snapshots. Filters to repo code only (skips stdlib, site-packages). | |
| Outputs JSONL with compressed unchanged variables. | |
| """ | |
| import json | |
| import os | |
| import sys | |
| import threading | |
| from pathlib import Path | |
# Paths that indicate non-repo code.
# These are substring-matched against frame filenames: interpreter internals,
# installed packages, pytest/pluggy machinery, and the tracer itself are all
# excluded so only repo code is recorded.
_SKIP_MARKERS = (
    "site-packages",
    "lib/python",
    "/usr/lib",
    "importlib",
    "<frozen",
    "/pytracer/",
    "/.local/",
    "/usr/local/lib/",
    "_pytest/",
    "pluggy/",
    "conftest.py",
    "_distutils_hack",
    "pkg_resources",
)
# Max events before we stop tracing (safety valve).
MAX_EVENTS = int(os.environ.get("TRACER_MAX_EVENTS", "50000"))
# Max repr length for a single variable value.
MAX_REPR_LEN = int(os.environ.get("TRACER_MAX_REPR", "200"))
def _safe_repr(obj):
    """Best-effort repr(): never raises, truncated to MAX_REPR_LEN characters."""
    try:
        text = repr(obj)
    except Exception:
        # Some objects have broken __repr__; record a placeholder instead.
        return "<repr-error>"
    if len(text) <= MAX_REPR_LEN:
        return text
    return text[:MAX_REPR_LEN] + "..."
def _should_skip(filename):
    """Return True when *filename* belongs to non-repo code (or is missing)."""
    if not filename:
        return True
    return any(marker in filename for marker in _SKIP_MARKERS)
class Tracer:
    """Captures execution traces as JSONL.
    Usage:
        tracer = Tracer(repo_root="/path/to/repo")
        tracer.start()
        # ... run code ...
        tracer.stop()
        tracer.dump("/path/to/output.jsonl")
    """
    def __init__(self, repo_root=None):
        # repo_root: directory used to relativize filenames in records;
        # when None, absolute paths are recorded.
        self.repo_root = str(Path(repo_root).resolve()) if repo_root else None
        # Collected event dicts, in arrival order.
        self.events = []
        self.prev_locals = {}  # keyed by frame id
        # Guards appends to self.events when multiple threads are traced.
        self._lock = threading.Lock()
        self._stopped = False
    def start(self):
        """Install the trace hook on this thread and on threads started later."""
        self._stopped = False
        sys.settrace(self._trace)
        threading.settrace(self._trace)
    def stop(self):
        """Remove the trace hook; events recorded so far are kept."""
        self._stopped = True
        sys.settrace(None)
        threading.settrace(None)
    def _trace(self, frame, event, arg):
        # sys.settrace callback. Returning self._trace keeps per-line tracing
        # enabled for this frame; returning None disables it for the frame.
        try:
            # Safety valve: stop recording once MAX_EVENTS is reached.
            if self._stopped or len(self.events) >= MAX_EVENTS:
                return None
            filename = frame.f_code.co_filename
            if _should_skip(filename):
                return None
            record = self._build_record(frame, event, arg)
            if record:
                with self._lock:
                    self.events.append(record)
            return self._trace
        except Exception:
            # A tracer bug must never crash the traced program; keep tracing.
            return self._trace
    def _build_record(self, frame, event, arg):
        """Build one JSON-serializable event dict for a single frame event."""
        filename = frame.f_code.co_filename
        if self.repo_root:
            filename = os.path.relpath(filename, self.repo_root)
        frame_id = id(frame)
        # Snapshot locals, compress unchanged.
        current_locals = {}
        for k, v in frame.f_locals.items():
            # Skip dunder names (e.g. __doc__/__name__ in module bodies).
            if k.startswith("__") and k.endswith("__"):
                continue
            current_locals[k] = _safe_repr(v)
        # Values identical to the previous snapshot of this same frame are
        # written as "..." to shrink the output.
        prev = self.prev_locals.get(frame_id, {})
        compressed = {}
        for k, v in current_locals.items():
            if prev.get(k) == v:
                compressed[k] = "..."
            else:
                compressed[k] = v
        self.prev_locals[frame_id] = current_locals
        record = {
            "event": event,
            "file": filename,
            "function": frame.f_code.co_name,
            "line": frame.f_lineno,
            "locals": compressed,
            "depth": _frame_depth(frame),
        }
        if event == "return":
            record["return_value"] = _safe_repr(arg)
            # Clean up prev_locals for this frame.
            # NOTE(review): CPython can reuse a freed frame's id; popping here
            # limits, but does not fully rule out, stale compression matches.
            self.prev_locals.pop(frame_id, None)
        elif event == "exception":
            # For "exception" events, arg is (exc_type, exc_value, traceback).
            exc_type, exc_value, _ = arg
            record["exception"] = {
                "type": getattr(exc_type, "__name__", str(exc_type)),
                "value": _safe_repr(exc_value),
            }
        return record
    def dump(self, path):
        """Write events to a JSONL file."""
        with open(path, "w") as f:
            for event in self.events:
                f.write(json.dumps(event) + "\n")
    def to_jsonl(self):
        """Return events as a JSONL string."""
        return "\n".join(json.dumps(e) for e in self.events)
| def _frame_depth(frame): | |
| depth = 0 | |
| f = frame.f_back | |
| while f is not None: | |
| depth += 1 | |
| f = f.f_back | |
| return depth |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment