Run Stable Diffusion 3 from the command line with limited resources.
environment.yml
name: sb3
channels:
  - pytorch
  - nvidia
  - conda-forge
dependencies:
  - diffusers
  - transformers
  - pytorch
  - torchvision
  - pytorch-cuda=12.1  # Note: I am using 12.2, but pytorch-cuda doesn't have a version for that; seems to work anyway
  - accelerate
  - sentencepiece
  - protobuf
  - pip:
      # The conda-forge version wasn't registering my CUDA for some reason
      - https://github.com/TimDettmers/bitsandbytes/releases/download/0.43.1/bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl
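
Since the bitsandbytes wheel is pulled straight from GitHub (the conda-forge build wasn't registering CUDA), it's worth confirming that both PyTorch and bitsandbytes can actually see the GPU once the environment is created (setup steps are in the comments below); a quick sanity check is:

```sh
python -c "import torch; print(torch.cuda.is_available(), torch.version.cuda)"
python -m bitsandbytes   # should print a diagnostic of the detected CUDA setup
```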

run_sb3.py
# Based on: https://gist.github.com/sayakpaul/82acb5976509851f2db1a83456e504f1
# Adds an argument parser and splits out the VAE as a separate step for VRAM savings.
# Removes performance monitoring.
# This is meant as a script for actual use, rather than for measuring performance.
# Takes about 4 minutes per image with default settings on my GTX 1060.
import argparse
import gc

from diffusers import StableDiffusion3Pipeline
from transformers import T5EncoderModel, BitsAndBytesConfig
import torch

def flush():
    gc.collect()
    torch.cuda.empty_cache()


def parse_args():
    parser = argparse.ArgumentParser(
        description='Generate an SD3-medium image using T5 with 5GB VRAM and 12GB RAM'
    )
    parser.add_argument('prompt', type=str)
    parser.add_argument('--neg_prompt', type=str, default=None)
    parser.add_argument('--pooled_prompt', type=str, default=None)
    parser.add_argument('--model_id', type=str, default="stabilityai/stable-diffusion-3-medium-diffusers")
    parser.add_argument('--steps', type=int, default=28)
    parser.add_argument('--num_generate', '-n', type=int, default=1)
    parser.add_argument('--out_name', type=str, default='output')
    return parser.parse_args()

def main(args):
    for i in range(args.num_generate):
        filename = f'{args.out_name}_{i}.png'
        prompt_embeds = embed_prompt(args.model_id, args.prompt, args.neg_prompt, args.pooled_prompt)
        generate(args.model_id, prompt_embeds, args.steps, filename)

def embed_prompt(model_id, prompt, neg_prompt, pooled_prompt):
    quantization_config = BitsAndBytesConfig(load_in_8bit=True, llm_int8_enable_fp32_cpu_offload=True)
    # A little unnecessary, since the main cost is generating,
    # but we might as well use the GPU if we've got it.
    # Keep the embeddings, the first 9 encoder blocks, and the last block on the GPU (device 0);
    # offload the middle blocks (9-22) to the CPU.
    device_map = {
        'shared': 0,
        'encoder.embed_tokens': 0,
        **{f'encoder.block.{i}': 0 for i in range(9)},
        **{f'encoder.block.{i}': 'cpu' for i in range(9, 23)},
        'encoder.block.23': 0,
        'encoder.final_layer_norm': 0,
        'encoder.dropout': 0,
    }
    text_encoder = T5EncoderModel.from_pretrained(
        model_id,
        subfolder="text_encoder_3",
        quantization_config=quantization_config,
        device_map=device_map,
    )
    pipeline = StableDiffusion3Pipeline.from_pretrained(
        model_id,
        text_encoder_3=text_encoder,
        transformer=None,
        vae=None,
        device_map='balanced',
    )
    (
        prompt_embeds,
        negative_prompt_embeds,
        pooled_prompt_embeds,
        negative_pooled_prompt_embeds,
    ) = pipeline.encode_prompt(prompt=prompt, prompt_2=neg_prompt, prompt_3=pooled_prompt)
    del text_encoder
    del pipeline
    flush()
    return (
        prompt_embeds.cuda().half(),
        negative_prompt_embeds.cuda().half(),
        pooled_prompt_embeds.cuda().half(),
        negative_pooled_prompt_embeds.cuda().half(),
    )

def generate(model_id, prompt_embeds, steps, filename):
    pipeline = StableDiffusion3Pipeline.from_pretrained(
        model_id,
        text_encoder=None,
        text_encoder_2=None,
        text_encoder_3=None,
        tokenizer=None,
        tokenizer_2=None,
        tokenizer_3=None,
        torch_dtype=torch.float16,
    ).to("cuda")
    (
        prompt_embeds,
        negative_prompt_embeds,
        pooled_prompt_embeds,
        negative_pooled_prompt_embeds,
    ) = prompt_embeds
    latents = pipeline(
        prompt_embeds=prompt_embeds,
        negative_prompt_embeds=negative_prompt_embeds,
        pooled_prompt_embeds=pooled_prompt_embeds,
        negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
        num_inference_steps=steps,
        guidance_scale=5.0,
        output_type='latent',
    ).images
    # Not enough VRAM to also run the VAE at the end :'(
    # So, copy-paste the last few lines of the pipeline call (as discovered through inspect.getsource),
    # deleting the transformer first to clear space on the GPU before running the VAE.
    pipeline_vae = pipeline.vae
    pipeline_image_processor = pipeline.image_processor
    scale, shift = pipeline.vae.config.scaling_factor, pipeline.vae.config.shift_factor
    del pipeline
    flush()
    latents = (latents / scale) + shift
    images = pipeline_vae.decode(latents, return_dict=False)[0]
    images = pipeline_image_processor.postprocess(images, output_type='pil')
    images[0].save(filename)
    del pipeline_vae
    del pipeline_image_processor
    flush()


if __name__ == '__main__':
    with torch.no_grad():
        main(parse_args())
Works on Ubuntu 22.04, GTX 1060, CUDA 12.2, Nvidia driver 535.183.01, miniforge (mamba).

Here's how I got my environment set up:

[If you don't have conda/mamba] Install miniforge, then restart your terminal.
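
The installer command itself isn't shown above; the usual miniforge install, per the conda-forge/miniforge README, is roughly the following (an assumption about which install method was used):

```sh
curl -L -O "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh"
bash "Miniforge3-$(uname)-$(uname -m).sh"
```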

[If you don't have the latest CUDA/graphics card drivers] Run `sudo ubuntu-drivers install`.

Create a folder and copy the two files above into it.

Create the environment:
`mamba install -p envs/sb3 --file environment.yml`
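
Since the environment lives at a prefix path rather than under a name, it has to be activated before the remaining steps; assuming conda/mamba shell integration is set up, that looks something like:

```sh
conda activate ./envs/sb3   # or: mamba activate ./envs/sb3
```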

Pass the gate on HuggingFace so you can download the model: run `huggingface-cli login` and follow the prompts. I gave my token only one permission: "Repos/Read access to contents of all public gated repos you can access".

Run the script:
`python run_sb3.py "<enter your prompt here>"`
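
For reference, the optional flags defined in `parse_args` can be combined like this (the prompt and values here are only an example):

```sh
python run_sb3.py "a watercolor painting of a lighthouse at dusk" \
    --neg_prompt "blurry, low quality" \
    --steps 28 -n 2 --out_name lighthouse
```

With `-n 2` and `--out_name lighthouse`, the script writes `lighthouse_0.png` and `lighthouse_1.png`.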