Skip to content

Instantly share code, notes, and snippets.

import torch
# CUDA kernel with inline PTX
kernel_source = """
__global__ void vector_add(const float* a, const float* b, float* c, int n) {
int idx;
asm("mov.u32 %0, %%ctaid.x;" : "=r"(idx));
int tid;
asm("mov.u32 %0, %%tid.x;" : "=r"(tid));
int ntid;
import torch
from torch import nn
from torch.distributed.tensor.placement_types import Replicate, Shard
from torch.testing._internal.distributed.fake_pg import FakeStore
import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import DTensor, Replicate
world_size = 4
@Chillee
Chillee / mm_weird.py
Last active April 3, 2025 11:26
Strangely, Matrix Multiplications Run Faster When Given "Predictable" Data! https://www.thonking.ai/p/strangely-matrix-multiplications
import torch
torch.set_default_device('cuda')
from triton.testing import do_bench
from collections import defaultdict
from functools import partial
import random
random.seed(0)
def get_flops(A, B):
ms = do_bench(lambda: torch.mm(A, B))
"""
Augment kernel metadata generated by kernel_metadata metric table in inductor.
For each row in input, use NCU to profile the kernel. The corresponding output row
contains more metadata gathered by NCU.
It can be super slow to run NCU. e.g. for the 10K kernels gathered from Huggingface,
it took almost a whole day to run NCU for each unique kernels. The script thus cache
the ncu output in the file system. If the ncu output is cached, we don't run NCU again.
Example input: https://gist.github.com/shunting314/22995da0da8b66d4cf989cb7f0508399
@fishmingyu
fishmingyu / GAT breakdown
Last active December 8, 2023 10:43
GNN breakdown profiling [PT2.0 compiler]
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------
graph_0_cpp_fused_add_exp_index_select_mul_scatter_a... 29.02% 11.966ms 29.02% 11.966ms 11.966ms 1
graph_0_cpp_fused_add_clone_exp_index_select_mul_rel... 28.60% 11.794ms 28.60% 11.794ms 11.794ms 1
graph_0_cpp_fused_add_clone_exp_index_select_mul_new... 27.49% 11.335ms 27.49% 11.335ms 11.335ms 1
aten::scatter_ 5.92% 2.442ms 5.92% 2.442ms 814.000us 3
@fishmingyu
fishmingyu / gather_scatter_fusion.py
Last active December 8, 2023 10:31
Gather Scatter fusion in PyG by Inductor
import torch
import torch_geometric
from torch_geometric.profile import benchmark
from torch_geometric.testing import (
disableExtensions,
onlyFullTest,
onlyLinux,
withCUDA,
withPackage,
@Chillee
Chillee / 1-pw_op_fusion.py
Last active August 21, 2025 19:00
PT 2.0 Benchmarks
import torch
import torch._inductor.config
import time
torch._inductor.config.triton.cudagraphs = False
torch.set_float32_matmul_precision('high')
def bench(f, name=None, iters=100, warmup=5, display=True, profile=False):
for _ in range(warmup):
f()
@ivan-loh
ivan-loh / compile-install-emacs.sh
Last active March 31, 2024 05:06
Compile and install emacs 29.3 for Ubuntu
#!/bin/bash
# Build GNU Emacs 29.3 from source on Ubuntu.
# NOTE(review): no `sudo make install` step is visible here — the script may be
# truncated; confirm the install step exists in the full file.

# Install the compiler toolchain plus the X11/image/terminal/TLS development
# headers that Emacs' configure step checks for.
sudo apt install build-essential texinfo libx11-dev libxpm-dev libjpeg-dev libpng-dev libgif-dev libtiff-dev libgtk2.0-dev libncurses-dev libgnutls28-dev

# Fetch and unpack the 29.3 release tarball from the GNU mirror.
wget http://ftp.gnu.org/gnu/emacs/emacs-29.3.tar.gz
tar xvzf emacs-29.3.tar.gz
cd emacs-29.3

# --with-mailutils: rely on GNU Mailutils' movemail rather than Emacs' own.
./configure --with-mailutils
make
@Borda
Borda / changelog_parser_PR_authors.py
Last active December 20, 2021 16:36
use Fire CLI
"""
Parse all users from release notes according to PR reference.
Resources:
- https://developer.github.com/v3/pulls/
- http://zetcode.com/python/requests/
- https://developer.github.com/v3/#authentication
"""
import json
@kinu
kinu / how_i_got_into_google.md
Last active January 16, 2025 08:17
Google に入るまでの話

Google に入ったときの話 (Kinuko)

祭っぽいので私も書いてみることにした!お手軽に gist で。

コンテキスト:https://togetter.com/li/1331865

対策とか(特になし)

と書き出したはいいが、私が受けたときは本も情報もあまりなく、かつ競プロ的なものの存在も知らなかったので、とりあえず家にあったアルゴリズムの本を2回くらい読み直した。そして受かった。最初っから情報があまりない方のパターンで申し訳ない 😄