
@sayakpaul
Last active February 2, 2025 17:54
Shows how to AoT compile the Flux.1 Dev Transformer with int8 quant and perform inference.
aot_compile_with_int8_quant.py

import torch
from diffusers import FluxTransformer2DModel
import torch.utils.benchmark as benchmark
from torchao.quantization import quantize_, int8_weight_only
from torchao.utils import unwrap_tensor_subclass
import torch._inductor

torch._inductor.config.mixed_mm_choice = "triton"


def get_example_inputs():
    # Load the serialized transformer inputs and move them to the GPU.
    example_inputs = torch.load("serialized_inputs.pt", weights_only=True)
    example_inputs = {k: v.to("cuda") for k, v in example_inputs.items()}
    example_inputs.update({"joint_attention_kwargs": None, "return_dict": False})
    return example_inputs


def benchmark_fn(f, *args, **kwargs):
    # Report the mean wall-clock time of `f` over several measured runs.
    t0 = benchmark.Timer(
        stmt="f(*args, **kwargs)",
        globals={"args": args, "kwargs": kwargs, "f": f},
        num_threads=torch.get_num_threads(),
    )
    return f"{(t0.blocked_autorange().mean):.3f}"


@torch.no_grad()
def load_model():
    model = FluxTransformer2DModel.from_pretrained(
        "black-forest-labs/FLUX.1-dev", subfolder="transformer", torch_dtype=torch.bfloat16
    ).to("cuda")
    return model


def aot_compile(name, model, **sample_kwargs):
    # Export the model and package the AoT Inductor binary at ./{name}.pt2.
    path = f"./{name}.pt2"
    options = {
        "max_autotune": True,
        "triton.cudagraphs": True,
    }
    return torch._inductor.aoti_compile_and_package(
        torch.export.export(model, (), sample_kwargs),
        (),
        sample_kwargs,
        package_path=path,
        inductor_configs=options,
    )


def aot_load(path):
    return torch._inductor.aoti_load_package(path)


@torch.no_grad()
def f(model, **kwargs):
    return model(**kwargs)


if __name__ == "__main__":
    model = load_model()
    # Apply int8 weight-only quantization in place, then unwrap the tensor
    # subclasses so the quantized model can be exported.
    quantize_(model, int8_weight_only())
    inputs1 = get_example_inputs()
    unwrap_tensor_subclass(model)

    path = aot_compile("bs_1_1024", model, **inputs1)
    print(f"AoT compiled path {path}")

    compiled_func = aot_load(path)
    print(f"{compiled_func(**inputs1)[0].shape=}")

    # Warm up before benchmarking.
    for _ in range(5):
        _ = compiled_func(**inputs1)[0]

    time = benchmark_fn(f, compiled_func, **inputs1)
    print(f"{time=} seconds.")
inference.py

import torch
from diffusers import DiffusionPipeline

# Load the pipeline without a transformer; the AoT-compiled binary is plugged in below.
pipeline = DiffusionPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    transformer=None,
    torch_dtype=torch.bfloat16,
).to("cuda")
pipeline.transformer = torch._inductor.aoti_load_package("./bs_1_1024.pt2")

image = pipeline("cute dog", guidance_scale=3.5, max_sequence_length=512, num_inference_steps=50).images[0]
image.save("aot_compiled.png")

inference.py produces:

[generated image: aot_compiled.png]

You're welcome to try out other quantization techniques from torchao and benefit from torch.compile(). diffusers-torchao provides a handy reference.
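For instance, here is a minimal sketch of trying a different scheme. It assumes torchao's int8_dynamic_activation_int8_weight config and uses the regular JiT torch.compile path rather than the AoT flow above; swap in whichever config you want to experiment with:

import torch
from diffusers import FluxTransformer2DModel
from torchao.quantization import quantize_, int8_dynamic_activation_int8_weight

model = FluxTransformer2DModel.from_pretrained(
    "black-forest-labs/FLUX.1-dev", subfolder="transformer", torch_dtype=torch.bfloat16
).to("cuda")

# Quantize weights to int8 and dynamically quantize activations, in place.
quantize_(model, int8_dynamic_activation_int8_weight())

# JiT compile; the first forward pass pays the compilation cost.
model = torch.compile(model, mode="max-autotune", fullgraph=True)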

Library versions:

  • diffusers: Installed from the main branch.
  • torchao: Installed from the main branch.
  • torch: 2.6.0.dev20241027+cu121

Tested on H100.

serialized_inputs.pt in aot_compile_with_int8_quant.py was obtained by serializing the inputs to self.transformer (from here). You can download it from here.
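For reference, one plausible way to capture such inputs (a sketch under my own assumptions, not the exact code used for this gist) is to wrap the transformer's forward, run the pipeline once, and save the captured tensor kwargs:

import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16
).to("cuda")

captured = {}
original_forward = pipe.transformer.forward

def capturing_forward(*args, **kwargs):
    # Keep only tensor kwargs so the file can later be loaded with weights_only=True.
    captured.update({k: v.detach().to("cpu") for k, v in kwargs.items() if torch.is_tensor(v)})
    return original_forward(*args, **kwargs)

pipe.transformer.forward = capturing_forward
_ = pipe("cute dog", guidance_scale=3.5, max_sequence_length=512, num_inference_steps=1)
torch.save(captured, "serialized_inputs.pt")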

Additionally, to perform inference with this AoT compiled binary with DiffusionPipeline as shown in inference.py, the following changes are needed to the pipeline_flux.py file:

diff --git a/src/diffusers/pipelines/flux/pipeline_flux.py b/src/diffusers/pipelines/flux/pipeline_flux.py
index 040d935f1..f24cd28c5 100644
--- a/src/diffusers/pipelines/flux/pipeline_flux.py
+++ b/src/diffusers/pipelines/flux/pipeline_flux.py
@@ -680,7 +680,7 @@ class FluxPipeline(
         )
 
         # 4. Prepare latent variables
-        num_channels_latents = self.transformer.config.in_channels // 4
+        num_channels_latents = self.transformer.config.in_channels // 4 if isinstance(self.transformer, torch.nn.Module) else 16
         latents, latent_image_ids = self.prepare_latents(
             batch_size * num_images_per_prompt,
             num_channels_latents,
@@ -714,7 +714,7 @@ class FluxPipeline(
         self._num_timesteps = len(timesteps)
 
         # handle guidance
-        if self.transformer.config.guidance_embeds:
+        if (isinstance(self.transformer, torch.nn.Module) and self.transformer.config.guidance_embeds) or isinstance(self.transformer, Callable):
             guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32)
             guidance = guidance.expand(latents.shape[0])
         else:

The compiled binary file ("bs_1_1024.pt2") used in inference.py can be found here.

Thanks to PyTorch folks (especially @jerryzh168) who provided guidance in this thread.

@Manojbhat09

Maybe this needs a note on using AoT:

Compiling with AoT reduces latency, but the resulting images are widely different from the ones generated without compilation. This is true for both the dev and schnell models. Hence the similarity of the output images to the originals might be around 0.5 (a rough way to measure this is sketched after the examples below).

When the FLUX.1-schnell transformer is AoT compiled:

prompt-"hydrosulphuryl, matterless, autecologist, sensory, supersyndicate, bestialism, Murph, unsimple"
original (non-compiled)
original_0
AOT compiled
compiled_0

prompt-"nonenclosure, Waiilatpuan, intertrinitarian, ecclesiast, moschatelline, heaver, metamorphose, alkaliferous"
original (non-compiled)
original_1

AOT compiled
compiled_1

prompt-"forayer, batterfang, diphead, Semaeostomata, backless"
original (non-compiled)
original_2

AOT compiled
compiled_2

prompt-"Guatoan, otalgic, crumenal, Protohydra, colporrhaphy, unhemmed, archiblastic, bosher"
original (non-compiled)
original_3

AOT compiled
compiled_3
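A rough sketch of how such a similarity number could be computed (my assumption; SSIM via scikit-image, with the placeholder filenames from the pairs above):

import numpy as np
from PIL import Image
from skimage.metrics import structural_similarity

baseline = np.asarray(Image.open("original_0.png").convert("RGB"))
compiled = np.asarray(Image.open("compiled_0.png").convert("RGB"))

# channel_axis=-1 treats the last dimension as the RGB channels.
score = structural_similarity(baseline, compiled, channel_axis=-1)
print(f"SSIM: {score:.3f}")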

@sayakpaul
Author

  • Did you use the same seed? (A minimal seeded-comparison sketch is below.)
  • What is your JiT compile code?
  • Can you present the results in a tabular format? They are hard to parse as posted.
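For concreteness, this is the kind of seeded comparison I mean (a sketch; `pipeline` and `aot_pipeline` are placeholders for the non-compiled pipeline and the one using the AoT-loaded transformer, as in inference.py):

import torch

prompt = "hydrosulphuryl, matterless, autecologist, sensory, supersyndicate, bestialism, Murph, unsimple"

# Same seed for both runs so any difference comes from quantization/compilation,
# not from the initial noise.
baseline_image = pipeline(
    prompt, guidance_scale=3.5, max_sequence_length=512, num_inference_steps=50,
    generator=torch.Generator("cuda").manual_seed(0),
).images[0]

aot_image = aot_pipeline(
    prompt, guidance_scale=3.5, max_sequence_length=512, num_inference_steps=50,
    generator=torch.Generator("cuda").manual_seed(0),
).images[0]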
