Skip to content

Instantly share code, notes, and snippets.

@Helw150
Helw150 / bench_trace.py
Created April 10, 2026 01:29
Python execution tracer prototype for SWE-bench-style Docker images
"""Benchmark end-to-end trace pipeline on multiple SWE-rebench-V2 Python images."""
import json
import os
import subprocess
import sys
import time
IMAGES = [
{"instance_id": "wtforms__wtforms-614", "image_name": "docker.io/swerebenchv2/wtforms-wtforms:614-848d28d", "test_cmd": "pytest --no-header -rA --tb=line --color=no -p no:cacheprovider -W ignore::DeprecationWarning tests/test_fields.py tests/test_validators.py tests/test_widgets.py"},
@Helw150
Helw150 / stats.csv
Created March 30, 2026 16:52
Token Counts
dataset marin_tokens category
import gradio as gr
import math
import numpy as np
import time
import io
import wave
def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=24000):
# This will create a wave header then append the frame input
from time import sleep
from datasets import load_dataset
from huggingface_hub import InferenceClient
from ratelimit import limits, sleep_and_retry
from transformers import AutoTokenizer
# Load the human-spoken HeySQuAD question set (train split only).
dataset = load_dataset("yijingwu/HeySQuAD_human", split="train")
# Llama-3-8B-Instruct tokenizer; NOTE(review): gated HF repo — presumably
# requires prior `huggingface-cli login` / HF_TOKEN, verify in the run env.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
text = # Tokenized Text Corresponding to Recording Transcript
audio = # Mel Spectrogram of the Recording
# Only Train Connector and Projection
self.encoder.freeze()
self.llama.freeze()
# Convert Raw Audio Signal to 1500 Embeddings with Whisper Encoder (CNN+Transformer)
audio_features = self.encoder(audio)
def _push_parquet_shards_to_hub( [1071/1877]
self,
repo_id: str,
data_dir: str = "data",
split: Optional[str] = None,
token: Optional[str] = None,
revision: Optional[str] = None,
create_pr: Optional[bool] = False,
max_shard_size: Optional[Union[int, str]] = None,
num_shards: Optional[int] = None,
import ast
# To Delete After Debug
import code
import copyreg
import datetime
import functools
import json
import os
import re
@Helw150
Helw150 / ot_loss.py
Last active April 27, 2023 22:02
OT TADA Loss
from typing import List, Optional, Tuple, Union
from torchtyping import TensorType
from transformers.adapters.modeling import Adapter
from transformers.adapters import (
BartAdapterModel,
RobertaAdapterModel,
BertAdapterModel,
AdapterConfig,
)
@Helw150
Helw150 / parallel_t5.py
Last active May 10, 2023 14:52
Flan T5 Parallel Usage
from transformers import AutoTokenizer, T5ForConditionalGeneration
# Model Init
n_gpu = 8
tokenizer = AutoTokenizer.from_pretrained("google/flan-ul2")
model = T5ForConditionalGeneration.from_pretrained("google/flan-ul2")
heads_per_gpu = len(model.encoder.block) // n_gpu
device_map = {
gpu: list(
range(
@Helw150
Helw150 / upload_csv.py
Created September 16, 2022 15:50
Lab Meeting Dataset upload Code
# See https://huggingface.co/docs/datasets/upload_dataset for more details
from datasets import load_dataset

# Name used to build the destination repo id on the Hub.
dataset_name = "PUT_YOUR_NAME_HERE"

# Map split names to local CSV files; load_dataset creates one split per key.
data_files = {"train": "train.csv", "dev": "dev.csv", "test": "test.csv"}

# Local CSVs are loaded with the "csv" builder (per the upload_dataset docs),
# not a hub repo path.
dataset = load_dataset("csv", data_files=data_files)

# Fixed: push from the loaded DatasetDict (`dataset`); the original called
# `datasets.push_to_hub`, but `datasets` is a module name that is neither
# imported nor has that method — it raised NameError.
dataset.push_to_hub(f"SALT-NLP/{dataset_name}", private=True)