Skip to content

Instantly share code, notes, and snippets.

View antferdom's full-sized avatar

A.J antferdom

View GitHub Profile

Flux Dev (dynamic shapes) Benchmark

Setup

import os
import torch
from einops import rearrange
from transformers import pipeline
from flux.sampling import denoise, get_noise, get_schedule, unpack, prepare
@Chillee
Chillee / flex_attention_tutorial.py
Last active April 25, 2025 04:34
flex_attention_tutorial.py
import torch
from torch.nn.attention._flex_attention import _create_block_mask, _create_mask
from functools import partial
from torch.nn.attention._flex_attention import _flex_attention
from triton.testing import do_bench
import torch.nn.functional as F
from functools import lru_cache
# Module-level side effect: set torch's default device to 'cuda' for the rest of the script.
torch.set_default_device('cuda')
# Example usage

Ref: Exclusive Q&A: John Carmack’s ‘Different Path’ to Artificial General Intelligence

"So I asked Ilya Sutskever, OpenAI’s chief scientist, for a reading list. He gave me a list of like 40 research papers and said, ‘If you really learn all of these, you’ll know 90% of what matters today.’ And I did. I plowed through all those things and it all started sorting out in my head."

Ref: https://x.com/ID_AA_Carmack/status/1622673143469858816

I rather expected @ilyasut to have made a public post by now after all the discussion of the AI reading list he gave me. A canonical list of references from a leading figure would be appreciated by many. I would be curious myself about what he would add from the last three years.

Papers

@geohot
geohot / llm.c
Last active May 1, 2024 13:41
Outputted llm.c from tinygrad
#include <stdlib.h>
#include <stdbool.h>
#include <tgmath.h>
#define max(x,y) ((x>y)?x:y)
#define half __fp16
/* Increment the int stored at data0[0] by one, in place.
 * data0 must point to at least one readable and writable int. */
void E_(int* data0) {
  data0[0] += 1;
}
[55883.721977] amdgpu: map VA 0x702eae9d2000 - 0x702eae9d3000 in entry 0000000072d2b750
[55883.721996] amdgpu: INC mapping count 1
[55883.722133] kfd kfd: amdgpu: ioctl cmd 0xc0184b0c (#0xc), arg 0x7ffe16172bef
[55883.722238] gmc_v11_0_process_interrupt: 6 callbacks suppressed
[55883.722250] amdgpu 0000:c3:00.0: amdgpu: [gfxhub] page fault (src_id:0 ring:24 vmid:8 pasid:32774, for process python3 pid 356134 thread python3 pid 356134)
[55883.722343] amdgpu 0000:c3:00.0: amdgpu: in page starting at address 0x00000000aabbc000 from client 10
[55883.722391] amdgpu 0000:c3:00.0: amdgpu: GCVM_L2_PROTECTION_FAULT_STATUS:0x00800A30
[55883.722429] amdgpu 0000:c3:00.0: amdgpu: Faulty UTCL2 client ID: CPC (0x5)
[55883.722466] amdgpu 0000:c3:00.0: amdgpu: MORE_FAULTS: 0x0
[55883.722497] amdgpu 0000:c3:00.0: amdgpu: WALKER_ERROR: 0x0
@geohot
geohot / hip.py
Created November 25, 2023 23:28
Wrapper for HIP
# -*- coding: utf-8 -*-
#
# TARGET arch is: ['-D__HIP_PLATFORM_AMD__', '-I/opt/rocm/include']
# WORD_SIZE is: 8
# POINTER_SIZE is: 8
# LONGDOUBLE_SIZE is: 16
#
import ctypes
@geohot
geohot / memcpy.py
Created November 21, 2023 19:21
Fast memcpy using GPUs
# tiny@tiny9:~/tinygrad$ python3 examples/benchmark_copies.py
# CPU copy 6.18 ms, 16.28 GB/s
# GPU copy 4.38 ms, 23.00 GB/s
# GPU 6x 1.85 ms, 54.54 GB/s
import time
def timeit(fxn):
tms = []
for _ in range(10):
st = time.perf_counter()
@geohot
geohot / matmul.cl
Last active April 16, 2025 16:04
A 1024x1024x1024 matmul with a 2x2x2 core in OpenCL
__kernel void matmul(__global float* data0, const __global float* data1, const __global float* data2) {
int gidx0 = get_group_id(1); /* 512 */
int gidx1 = get_group_id(0); /* 512 */
float2 acc0 = (float2)(0.0f,0.0f);
float2 acc1 = (float2)(0.0f,0.0f);
for (int ridx0 = 0; ridx0 < 512; ++ridx0) {
float2 val0 = (float2)(*((__global float2*)(data1+(gidx0*2048)+(ridx0*2))));
float2 val1 = (float2)(*((__global float2*)(data1+(gidx0*2048)+(ridx0*2)+1024)));
float2 val2 = (float2)(*((__global float2*)(data2+(gidx1*2)+(ridx0*2048))));
float2 val3 = (float2)(*((__global float2*)(data2+(gidx1*2)+(ridx0*2048)+1024)));
@thecharlieblake
thecharlieblake / np2torch.py
Last active October 13, 2023 11:39
Given a numpy function, prints equivalent PyTorch code (as canonical ATen ops) and returns it as a new function.
from typing import Callable, List
import numpy as np
import torch
from torch._dynamo.backends.common import aot_autograd
from torch.fx.graph_module import GraphModule
# NOTE: requires torch >= 2.1.0
def np2torch(fn: Callable) -> Callable:
@thesephist
thesephist / gpt2_xl_perplexities.py
Created September 4, 2023 20:23
Code (most of it) for my GPT2 perplexities visualizer UI: https://twitter.com/thesephist/status/1617747154231259137
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import GPT2Tokenizer, GPT2LMHeadModel
# Select 'gpt2-xl' when `device` is 'cuda', otherwise 'gpt2', then load the
# matching tokenizer. `device` must already be defined before this point.
if device == 'cuda':
    ppl_model_name = 'gpt2-xl'
else:
    ppl_model_name = 'gpt2'
ppl_tokenizer = GPT2Tokenizer.from_pretrained(ppl_model_name)
load_opts = {
'device_map': 'auto',