mlazos’s gists

mlazos / results.md

Last active June 12, 2025 17:32

ckpt_id	gemm backend	batch_size	fuse	compile	compile_vae	quantization	sparsify	model_memory	inference_memory	time
black-forest-labs/FLUX.1-dev	N/A	1	False	False	False	fp8dqrow	False	20.367	31.817	9.578
black-forest-labs/FLUX.1-dev	triton,aten	1	False	True	False	fp8dqrow	False	20.367	31.817	4.165
black-forest-labs/FLUX.1-dev	cutlass,aten	16	False	True	False	fp8dqrow	False	20.367	50.471	60.734
black-forest-labs/FLUX.1-dev	cutlass_no_fast_accum, aten	16	False	True	False	fp8dq	False

mlazos / gist:43a2702d17ae3791d08628ef234308b3

Created June 2, 2025 22:20

	import torch
	from torch import nn
	from torch.nn import functional as F
	from triton.testing import do_bench
	import triton
	import triton.language as tl
	import torch._inductor.config as config
	from torch._inductor.utils import fresh_inductor_cache

	#torch._logging.set_logs(autotuning=True)

mlazos / bad.py

Created May 29, 2025 20:04

topological visitor bug standalone

	from typing import overload
	import torch
	from torch._inductor.codegen.cuda.cutlass_utils import try_import_cutlass
	try_import_cutlass() # comment this out if you have cutlass installed
	import cutlass
	import types
	import ast
	import textwrap
	import inspect
	from cutlass.epilogue import relu

mlazos / bad.txt

Last active May 29, 2025 19:57

topological visitor bug

	cutlass_fused_add_mm_relu_1dd8740c = async_compile.cuda(r'''
	#include <exception>
	#include <iostream>
	#include <memory>
	#include <random>
	#include <vector>

	#include "cute/tensor.hpp"
	#include "cutlass/cutlass.h"
	#include "cutlass/numeric_types.h"

mlazos / no_lca.py

Created May 16, 2025 22:36

	from typing import overload
	import torch
	from torch._inductor.codegen.cuda.cutlass_utils import try_import_cutlass
	try_import_cutlass() # comment this out if you have cutlass installed
	import cutlass
	import types
	import ast
	import textwrap
	import inspect
	from cutlass.epilogue import relu

mlazos / kernel1.cu

Last active May 15, 2025 03:59

	#include <exception>
	#include <iostream>
	#include <memory>
	#include <random>
	#include <vector>

	#include "cute/tensor.hpp"
	#include "cutlass/cutlass.h"
	#include "cutlass/numeric_types.h"
	#include "cutlass/tensor_ref.h"

mlazos / gist:f6a8bf58a82d6b948a2a976bf431562b

Created May 15, 2025 03:50

	`--> TORCH_LOGS="+inductor" python test/inductor/test_cutlass_backend.py -k test_evt_fusions_basic_mul_shape0
	[INFO]:compile_threads set to 32
	[INFO]:Creating 'subprocess' pool with 32 workers
	[INFO]:compile_threads set to 32
	> /data/users/mlazos/pytorch/test/inductor/test_cutlass_backend.py(129)run_evt_test()
	-> M, N = shape
	(Pdb) c
	[DEBUG]:TRACED GRAPH
	===== BEFORE PRE GRAD =====
	/data/users/mlazos/pytorch/torch/fx/_lazy_graph_module.py class GraphModule(torch.nn.Module):

mlazos / repro.py

Created May 12, 2025 21:53

	from typing import overload
	import torch
	from torch._inductor.codegen.cuda.cutlass_utils import try_import_cutlass
	try_import_cutlass() # comment this out if you have cutlass installed
	import cutlass
	import types
	import ast
	import textwrap
	import inspect
	from cutlass.epilogue import relu

mlazos / codegen.py

Created May 12, 2025 21:35

	from typing import overload
	import torch
	from torch._inductor.codegen.cuda.cutlass_utils import try_import_cutlass
	try_import_cutlass() # comment this out if you have cutlass installed
	import cutlass
	import types
	import ast
	import textwrap
	import inspect
	from cutlass.epilogue import relu

mlazos / code.cpp

Created May 12, 2025 21:34

	using Accum = cutlass::epilogue::fusion::Sm90AccFetch;

	using ElementD = cutlass::half_t;
	using StrideD = cute::Stride<int64_t, cute::Int<1>, cute::Int<0>>;

	using Bias = cutlass::epilogue::fusion::Sm90ColBroadcast<
	0 /*Stages*/, typename EpilogueDescriptor::TileShape, cutlass::half_t, cutlass::half_t,
	cute::Stride<cute::Int<1>, cute::Int<0>, cute::Int<0>>
	>;

Michael Lazos mlazos