Skip to content

Instantly share code, notes, and snippets.

ckpt_id gemm backend batch_size fuse compile compile_vae quantization sparsify model_memory inference_memory time
black-forest-labs/FLUX.1-dev N/A 1 False False False fp8dqrow False 20.367 31.817 9.578
black-forest-labs/FLUX.1-dev triton,aten 1 False True False fp8dqrow False 20.367 31.817 4.165
black-forest-labs/FLUX.1-dev cutlass,aten 16 False True False fp8dqrow False 20.367 50.471 60.734
black-forest-labs/FLUX.1-dev cutlass_no_fast_accum, aten 16 False True False fp8dq False
import torch
from torch import nn
from torch.nn import functional as F
from triton.testing import do_bench
import triton
import triton.language as tl
import torch._inductor.config as config
from torch._inductor.utils import fresh_inductor_cache
#torch._logging.set_logs(autotuning=True)
@mlazos
mlazos / bad.py
Created May 29, 2025 20:04
topological visitor bug standalone
from typing import overload
import torch
from torch._inductor.codegen.cuda.cutlass_utils import try_import_cutlass
try_import_cutlass() # comment this out if you have cutlass installed
import cutlass
import types
import ast
import textwrap
import inspect
from cutlass.epilogue import relu
@mlazos
mlazos / bad.txt
Last active May 29, 2025 19:57
topological visitor bug
cutlass_fused_add_mm_relu_1dd8740c = async_compile.cuda(r'''
#include <exception>
#include <iostream>
#include <memory>
#include <random>
#include <vector>
#include "cute/tensor.hpp"
#include "cutlass/cutlass.h"
#include "cutlass/numeric_types.h"
from typing import overload
import torch
from torch._inductor.codegen.cuda.cutlass_utils import try_import_cutlass
try_import_cutlass() # comment this out if you have cutlass installed
import cutlass
import types
import ast
import textwrap
import inspect
from cutlass.epilogue import relu
#include <exception>
#include <iostream>
#include <memory>
#include <random>
#include <vector>
#include "cute/tensor.hpp"
#include "cutlass/cutlass.h"
#include "cutlass/numeric_types.h"
#include "cutlass/tensor_ref.h"
`--> TORCH_LOGS="+inductor" python test/inductor/test_cutlass_backend.py -k test_evt_fusions_basic_mul_shape0
[INFO]:compile_threads set to 32
[INFO]:Creating 'subprocess' pool with 32 workers
[INFO]:compile_threads set to 32
> /data/users/mlazos/pytorch/test/inductor/test_cutlass_backend.py(129)run_evt_test()
-> M, N = shape
(Pdb) c
[DEBUG]:TRACED GRAPH
===== BEFORE PRE GRAD =====
/data/users/mlazos/pytorch/torch/fx/_lazy_graph_module.py class GraphModule(torch.nn.Module):
from typing import overload
import torch
from torch._inductor.codegen.cuda.cutlass_utils import try_import_cutlass
try_import_cutlass() # comment this out if you have cutlass installed
import cutlass
import types
import ast
import textwrap
import inspect
from cutlass.epilogue import relu
from typing import overload
import torch
from torch._inductor.codegen.cuda.cutlass_utils import try_import_cutlass
try_import_cutlass() # comment this out if you have cutlass installed
import cutlass
import types
import ast
import textwrap
import inspect
from cutlass.epilogue import relu
using Accum = cutlass::epilogue::fusion::Sm90AccFetch;
using ElementD = cutlass::half_t;
using StrideD = cute::Stride<int64_t, cute::Int<1>, cute::Int<0>>;
using Bias = cutlass::epilogue::fusion::Sm90ColBroadcast<
0 /*Stages*/, typename EpilogueDescriptor::TileShape, cutlass::half_t, cutlass::half_t,
cute::Stride<cute::Int<1>, cute::Int<0>, cute::Int<0>>
>;