The FA3 attention processor comes from https://gist.github.com/sayakpaul/ff715f979793d4d44beb68e5e08ee067.

Results from an H100:

latency=36.606 seconds. (AoT regional compilation)
latency=36.555 seconds. (JiT regional compilation)

```python
import torch
from diffusers import DiffusionPipeline
import spaces
from spaces.zero.torch.aoti import ZeroGPUCompiledModel, ZeroGPUWeights
from time import perf_counter
import argparse

CKPT_ID = "black-forest-labs/Flux.1-Dev"
```
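As a rough sketch of what the JiT regional-compilation path can look like in `diffusers` (the `compile_repeated_blocks` call, prompt, and step count below are illustrative assumptions; the gist additionally wires in the FA3 attention processor and the ZeroGPU AoT artifacts):

```python
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "black-forest-labs/Flux.1-Dev", torch_dtype=torch.bfloat16
).to("cuda")

# Regional compilation: compile only the repeated transformer blocks
# rather than the whole model, which keeps compile times short.
pipe.transformer.compile_repeated_blocks(fullgraph=True)

image = pipe("a photo of a cat", num_inference_steps=28).images[0]
```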
```python
# Make sure you are using the latest `bitsandbytes` (at least 0.46.0) and PyTorch nightlies (at least 2.8).
# Put together by sayakpaul and anijain2305
from diffusers.quantizers import PipelineQuantizationConfig
from diffusers import FluxPipeline
import argparse
import json
import torch
import time
from functools import partial
```
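For reference, `PipelineQuantizationConfig` applies a quantization backend to selected pipeline components at load time. A minimal sketch, assuming the bitsandbytes 4-bit backend (the exact components and kwargs are illustrative):

```python
import torch
from diffusers import FluxPipeline
from diffusers.quantizers import PipelineQuantizationConfig

quant_config = PipelineQuantizationConfig(
    quant_backend="bitsandbytes_4bit",
    quant_kwargs={
        "load_in_4bit": True,
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_compute_dtype": torch.bfloat16,
    },
    components_to_quantize=["transformer", "text_encoder_2"],
)
pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    quantization_config=quant_config,
    torch_dtype=torch.bfloat16,
).to("cuda")
```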
```python
from diffusers import DiffusionPipeline
import torch.utils.benchmark as benchmark
import torch
import psutil
import os
import json
import argparse


def benchmark_fn(f, *args, **kwargs):
    # Body completed with the standard `torch.utils.benchmark` timing
    # pattern; returns the mean latency in seconds as a string.
    t0 = benchmark.Timer(
        stmt="f(*args, **kwargs)",
        globals={"args": args, "kwargs": kwargs, "f": f},
        num_threads=torch.get_num_threads(),
    )
    return f"{t0.blocked_autorange().mean:.3f}"
```
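Usage then wraps whatever callable you want to time; for example (pipeline id, prompt, and step count are illustrative):

```python
pipe = DiffusionPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16
).to("cuda")

# Positional args and kwargs are forwarded to the timed callable.
latency = benchmark_fn(pipe, "a photo of a cat", num_inference_steps=28)
print(f"latency={latency} seconds.")
```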
```python
from google import genai
from google.genai import types
import typing_extensions as typing
from PIL import Image
import requests
import io
import json
import os
```
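A minimal sketch of how these imports typically compose with the google-genai client, reusing the imports above (the model name, image URL, and prompt are assumptions):

```python
client = genai.Client(api_key=os.environ["GEMINI_API_KEY"])

# Fetch an image and pass it alongside a text prompt.
image = Image.open(io.BytesIO(requests.get("https://example.com/cat.png").content))
response = client.models.generate_content(
    model="gemini-2.0-flash",
    contents=[image, "Describe this image in one sentence."],
)
print(response.text)
```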
| """ | |
| Implementation of the label generation part in https://danielvanstrien.xyz/posts/2025/deepseek/distil-deepseek-modernbert.html | |
| using `transformers` and DeepSeek. | |
| """ | |
| from transformers import AutoModelForCausalLM, AutoTokenizer | |
| import torch | |
| import re | |
| import contextlib | |
| import math |
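A sketch of the labeling step the docstring refers to, reusing the imports above and assuming a DeepSeek-R1 distilled checkpoint (the model id, prompt, and generation settings are illustrative, not taken from the post):

```python
model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"  # assumed checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="auto"
)

messages = [{"role": "user", "content": "Label the topic of: 'The Fed raised rates again.'"}]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)
output_ids = model.generate(input_ids, max_new_tokens=512)
# Decode only the newly generated tokens (the reasoning trace plus the label).
print(tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True))
```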
```python
from moviepy.editor import VideoFileClip, clips_array
import glob


def create_video_collage(video_paths, output_path="collage.mp4"):
    """
    Combine four videos of the same resolution into a 2×2 collage.

    Args:
        video_paths (list[str]): List of paths to the four video files.
        output_path (str): Filename for the output collage video.
    """
    # Body completed: lay the four clips out on a 2×2 grid and render.
    clips = [VideoFileClip(path) for path in video_paths]
    collage = clips_array([[clips[0], clips[1]], [clips[2], clips[3]]])
    collage.write_videofile(output_path)
```
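Since `glob` is imported but unused in the truncated snippet, a typical call site presumably looks like this (the directory pattern is an assumption):

```python
# Pick up the first four generated clips and tile them 2×2.
video_paths = sorted(glob.glob("outputs/*.mp4"))[:4]
create_video_collage(video_paths, output_path="collage.mp4")
```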
```python
from diffusers import DiffusionPipeline
from diffusers import FluxTransformer2DModel, BitsAndBytesConfig
from transformers import T5EncoderModel, BitsAndBytesConfig as BnbConfig
from offloader import ModelOffloaderV2
import torch.utils.benchmark as benchmark
from pathlib import Path
import os
import sys
import torch
import json
```
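`ModelOffloaderV2` comes from the local `offloader` module, whose API isn't shown here. Given the other imports, the Flux transformer is presumably loaded in NF4 before being handed to the offloader; a minimal sketch of that loading step, reusing the imports above (repo id and flags are assumptions):

```python
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
transformer = FluxTransformer2DModel.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    subfolder="transformer",
    quantization_config=nf4_config,
    torch_dtype=torch.bfloat16,
)
```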
```python
import torch
from diffusers.utils import export_to_video
from diffusers import LTXPipeline, LTXVideoTransformer3DModel, GGUFQuantizationConfig

ckpt_path = (
    "https://huggingface.co/city96/LTX-Video-gguf/blob/main/ltx-video-2b-v0.9-Q3_K_S.gguf"
)
transformer = LTXVideoTransformer3DModel.from_single_file(
    ckpt_path,
    quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
    # Closing arguments completed to match the diffusers GGUF loading pattern.
    torch_dtype=torch.bfloat16,
)
```
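The quantized transformer then slots into the pipeline; the prompt and export settings below are illustrative:

```python
pipe = LTXPipeline.from_pretrained(
    "Lightricks/LTX-Video",
    transformer=transformer,
    torch_dtype=torch.bfloat16,
).to("cuda")

video = pipe(prompt="A serene lake at sunrise", num_frames=161).frames[0]
export_to_video(video, "output.mp4", fps=24)
```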
```python
import torch
from diffusers import FluxTransformer2DModel
import torch.utils.benchmark as benchmark
from torchao.quantization import quantize_, int8_weight_only
from torchao.utils import unwrap_tensor_subclass
import torch._inductor

torch._inductor.config.mixed_mm_choice = "triton"
```
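These imports point at int8 weight-only quantization followed by compilation; a minimal sketch of how they compose, reusing the imports above (repo id and compile flags are assumptions):

```python
transformer = FluxTransformer2DModel.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    subfolder="transformer",
    torch_dtype=torch.bfloat16,
).to("cuda")

# Swap linear weights for int8 weight-only tensor subclasses in place.
quantize_(transformer, int8_weight_only())
# On older torch/torchao combos, unwrap the subclasses before compiling.
unwrap_tensor_subclass(transformer)
transformer = torch.compile(transformer, mode="max-autotune", fullgraph=True)
```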