Skip to content

Instantly share code, notes, and snippets.

View YouJiacheng's full-sized avatar

You Jiacheng YouJiacheng

  • IIIS, Tsinghua University
  • Beijing, China
View GitHub Profile
@YouJiacheng
YouJiacheng / pyproject.toml
Created May 6, 2025 04:12
Code to reproduce the regression; data is available at: https://github.com/KellerJordan/modded-nanogpt/
[project]
name = "modded-nanogpt"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = "==3.12.*"
dependencies = [
"numpy>=2.1.3",
"torch",
"pytorch-triton",
from torch._logging._internal import trace_structured # noqa: E402
import torch._inductor.codecache # noqa: E402
import torch._inductor.graph # noqa: E402
def _patched_trace_structured(name, *args, **kwargs):
if name == "inductor_output_code":
match args, kwargs:
case (metadata_fn, *_), _:
filename = metadata_fn().get("filename", "Unknown")
case _, {"metadata_fn": metadata_fn}:
filename = metadata_fn().get("filename", "Unknown")
from functools import partial
import jax
import jax.numpy as jnp
import optax
def poly(x: jnp.ndarray, w: jnp.ndarray):
assert w.shape == (3,)
w = w.astype(jnp.float32)
import os
import sys
with open(sys.argv[0]) as f:
code = f.read() # read the code of this file ASAP, for logging
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
import time
import uuid
from dataclasses import dataclass
from functools import lru_cache, partial
import os
import sys
with open(sys.argv[0]) as f:
code = f.read() # read the code of this file ASAP, for logging
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
import time
import uuid
from dataclasses import dataclass
from functools import lru_cache, partial
import os
import sys
from typing import override
with open(sys.argv[0]) as f:
code = f.read() # read the code of this file ASAP, for logging
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
import contextlib
import time
import uuid
from dataclasses import dataclass
@dataclass
class Args:
vocab_size: int = 129280
dim: int = 7168
inter_dim: int = 18432
moe_inter_dim: int = 2048
n_layers: int = 61
def abs_cdf(t: Tensor, thresholds: list[float]):
    """Return the cumulative distribution of ``|t|`` over threshold buckets.

    Every element of ``t`` is assigned to a bucket according to how many
    thresholds lie strictly below its absolute value; the per-bucket counts
    are then accumulated and divided by the total number of elements.

    Args:
        t: Tensor of any shape; it is flattened before counting.
        thresholds: Bucket boundaries. NOTE(review): ``torch.bucketize``
            expects these in ascending order -- confirm callers comply.

    Returns:
        A 1-D tensor with ``len(thresholds) + 1`` entries. Entry ``i``
        (for ``i < len(thresholds)``) is the fraction of elements whose
        absolute value is ``<= thresholds[i]``; the final entry also
        includes everything above the largest threshold.
    """
    magnitudes = t.abs()
    # Boundaries are created with the same dtype/device as the magnitudes.
    boundaries = magnitudes.new_tensor(thresholds)
    # Bucket index == number of thresholds strictly smaller than the value,
    # i.e. sum(x > v for v in thresholds).
    bucket = torch.bucketize(magnitudes, boundaries, out_int32=True)
    num_buckets = len(thresholds) + 1
    counts = torch.bincount(bucket.flatten(), minlength=num_buckets)
    return counts.cumsum(0) / magnitudes.numel()
# reference: https://github.com/pytorch/pytorch/issues/69519#issuecomment-2500366519
def histogram(input: Tensor, bins: Tensor, *, weight: Optional[Tensor] = None, density: bool = False):
bucket_indices = torch.bucketize(input, bins)
counts = torch.bincount(bucket_indices, weights=weight, minlength=bins.size(0)+1)
counts = counts[1:-1]
@YouJiacheng
YouJiacheng / rope_shift.py
Created November 24, 2024 16:57
rope shift
import torch
import torch.nn as nn
import torch.nn.functional as F
class RoPE(nn.Module):
def __init__(
self,
dim,
max_seq_len: int = 4096,
import os
import sys
import torch._dynamo.compiled_autograd
with open(sys.argv[0]) as f:
code = f.read() # read the code of this file ASAP, for logging
import uuid
import glob
import time