This notebook demonstrates the basic usage of the Hugging Face Diffusers API for text-to-image synthesis using the Stable Diffusion XL 1.0 model.
Mostly, though, it's just me having some fun generating synthetic images.
import torch as th
import torchinfo
import diffusers
# Load the text-to-image pipeline for the SDXL base 1.0 checkpoint, requesting
# the fp16 weight variant in safetensors format with float16 as the torch dtype.
MODEL_ID = 'stabilityai/stable-diffusion-xl-base-1.0'
load_options = dict(
    torch_dtype=th.float16,
    variant='fp16',
    use_safetensors=True,
)
pipeline = diffusers.AutoPipelineForText2Image.from_pretrained(MODEL_ID, **load_options)
Loading pipeline components...: 0%| | 0/7 [00:00<?, ?it/s]
# Show a layer/parameter-count summary of the pipeline's UNet, nested to depth 3.
torchinfo.summary(model=pipeline.unet, depth=3)
========================================================================================================= Layer (type:depth-idx) Param # ========================================================================================================= UNet2DConditionModel -- ├─Conv2d: 1-1 11,840 ├─Timesteps: 1-2 -- ├─TimestepEmbedding: 1-3 -- │ └─LoRACompatibleLinear: 2-1 410,880 │ └─SiLU: 2-2 -- │ └─LoRACompatibleLinear: 2-3 1,639,680 ├─Timesteps: 1-4 -- ├─TimestepEmbedding: 1-5 -- │ └─LoRACompatibleLinear: 2-4 3,605,760 │ └─SiLU: 2-5 -- │ └─LoRACompatibleLinear: 2-6 1,639,680 ├─ModuleList: 1-6 -- │ └─DownBlock2D: 2-7 -- │ │ └─ModuleList: 3-1 4,510,080 │ │ └─ModuleList: 3-2 921,920 │ └─CrossAttnDownBlock2D: 2-8 -- │ │ └─ModuleList: 3-3 41,664,000 │ │ └─ModuleList: 3-4 14,754,560 │ │ └─ModuleList: 3-5 3,687,040 │ └─CrossAttnDownBlock2D: 2-9 -- │ │ └─ModuleList: 3-6 701,680,640 │ │ └─ModuleList: 3-7 55,723,520 ├─ModuleList: 1-7 -- │ └─CrossAttnUpBlock2D: 2-10 -- │ │ └─ModuleList: 3-8 1,052,520,960 │ │ └─ModuleList: 3-9 139,301,120 │ │ └─ModuleList: 3-10 14,746,880 │ └─CrossAttnUpBlock2D: 2-11 -- │ │ └─ModuleList: 3-11 62,496,000 │ │ └─ModuleList: 3-12 40,160,640 │ │ └─ModuleList: 3-13 3,687,040 │ └─UpBlock2D: 2-12 -- │ │ └─ModuleList: 3-14 11,171,840 ├─UNetMidBlock2DCrossAttn: 1-8 -- │ └─ModuleList: 2-13 -- │ │ └─Transformer2DModel: 3-15 350,840,320 │ └─ModuleList: 2-14 -- │ │ └─ResnetBlock2D: 3-16 31,138,560 │ │ └─ResnetBlock2D: 3-17 31,138,560 ├─GroupNorm: 1-9 640 ├─SiLU: 1-10 -- ├─Conv2d: 1-11 11,524 ========================================================================================================= Total params: 2,567,463,684 Trainable params: 2,567,463,684 Non-trainable params: 0 =========================================================================================================
# Move the pipeline onto a GPU. The device index 1 used originally only exists
# on machines with at least two CUDA devices, so keep it as the preferred
# choice but fall back to device 0 when that is the only GPU present.
PREFERRED_GPU_INDEX = 1
if th.cuda.device_count() > PREFERRED_GPU_INDEX:
    device = th.device('cuda', PREFERRED_GPU_INDEX)
elif th.cuda.is_available():
    device = th.device('cuda', 0)
else:
    # Fail early with a readable message instead of an opaque CUDA error.
    raise RuntimeError('No CUDA device available; this notebook expects a GPU.')
# Trailing semicolon suppresses the pipeline repr in the cell output.
pipeline.to(device);
# Run the pipeline on the prompt and show the first returned image as the cell output.
result = pipeline(prompt="Water droplet high speed photograph.")
result.images[0]
0%| | 0/50 [00:00<?, ?it/s]
# Run the pipeline on the prompt and show the first returned image as the cell output.
result = pipeline(prompt='A labradoodle puppy running through a sprinkler')
result.images[0]
0%| | 0/50 [00:00<?, ?it/s]
# Run the pipeline on the prompt and show the first returned image as the cell output.
result = pipeline(prompt='A hand with fingers.')
result.images[0]
0%| | 0/50 [00:00<?, ?it/s]
# Run the pipeline on the prompt and show the first returned image as the cell output.
result = pipeline(prompt='Computer code on a VT100 terminal.')
result.images[0]
0%| | 0/50 [00:00<?, ?it/s]
# Run the pipeline on the prompt and show the first returned image as the cell output.
result = pipeline(prompt='A LAN party in the eighties.')
result.images[0]
0%| | 0/50 [00:00<?, ?it/s]
# Run the pipeline on the prompt and show the first returned image as the cell output.
result = pipeline(prompt='If stable diffusion had a face.')
result.images[0]
0%| | 0/50 [00:00<?, ?it/s]
# Run the pipeline on the prompt and show the first returned image as the cell output.
result = pipeline(prompt='An evil dangerous skynet robot zombie terminator.')
result.images[0]
0%| | 0/50 [00:00<?, ?it/s]
# Run the pipeline on the prompt and show the first returned image as the cell output.
result = pipeline(prompt='A man rolling in the mud laughing.')
result.images[0]
0%| | 0/50 [00:00<?, ?it/s]
# Run the pipeline on the prompt and show the first returned image as the cell output.
result = pipeline(prompt="A cow wearing a viking helmet.")
result.images[0]
0%| | 0/50 [00:00<?, ?it/s]
# Run the pipeline on the prompt and show the first returned image as the cell output.
result = pipeline(prompt="An alien octopus with two eyes.")
result.images[0]
0%| | 0/50 [00:00<?, ?it/s]
# Run the pipeline on the prompt and show the first returned image as the cell output.
result = pipeline(prompt="A selfie in space.")
result.images[0]
0%| | 0/50 [00:00<?, ?it/s]
# Run the pipeline on the prompt and show the first returned image as the cell output.
# The prompt previously misspelled "coffee" as "cofee"; the prompt text is the
# model's input, so the spelling is corrected here.
result = pipeline("Drinking coffee at starbucks.")
result.images[0]
0%| | 0/50 [00:00<?, ?it/s]
# Run the pipeline on the prompt and show the first returned image as the cell output.
result = pipeline(prompt="Van Gogh Painting of a water buffalo.")
result.images[0]
0%| | 0/50 [00:00<?, ?it/s]
# Run the pipeline on the prompt and show the first returned image as the cell output.
result = pipeline(prompt="A psychedelic cat.")
result.images[0]
0%| | 0/50 [00:00<?, ?it/s]
# Run the pipeline with two different prompts (prompt and prompt_2) and show
# the first returned image as the cell output.
result = pipeline(
    prompt="A unicorn on the moon.",
    prompt_2="A unicorn with the earth in the sky.",
)
result.images[0]
0%| | 0/50 [00:00<?, ?it/s]
# Run the pipeline on the prompt and show the first returned image as the cell output.
result = pipeline(prompt="A mountain landscape with the sun rising.")
result.images[0]
0%| | 0/50 [00:00<?, ?it/s]
# Run the pipeline on the prompt and show the first returned image as the cell output.
result = pipeline(prompt="Sunrise on the cold planet Neptune.")
result.images[0]
0%| | 0/50 [00:00<?, ?it/s]
# Run the pipeline on the prompt and show the first returned image as the cell output.
result = pipeline(prompt='A tornado in the Nebraska Sandhills.')
result.images[0]
0%| | 0/50 [00:00<?, ?it/s]
# Run the pipeline on the prompt and show the first returned image as the cell output.
# The prompt previously misspelled "hamster" as "hampster"; the prompt text is
# the model's input, so the spelling is corrected here.
result = pipeline("A hamster shoveling snow.")
result.images[0]
0%| | 0/50 [00:00<?, ?it/s]
# Run the pipeline on the prompt and show the first returned image as the cell output.
result = pipeline(prompt="A cat as a soldier.")
result.images[0]
0%| | 0/50 [00:00<?, ?it/s]
# Run the pipeline on the prompt and show the first returned image as the cell output.
result = pipeline(prompt="An F16 fighter jet.")
result.images[0]
0%| | 0/50 [00:00<?, ?it/s]
# Run the pipeline on the prompt and show the first returned image as the cell output.
result = pipeline(prompt="A cutaway diagram of a Saturn V rocket.")
result.images[0]
0%| | 0/50 [00:00<?, ?it/s]
# Run the pipeline on the prompt and show the first returned image as the cell output.
result = pipeline(prompt="Schematic diagram of an electro-transcombobulator.")
result.images[0]
0%| | 0/50 [00:00<?, ?it/s]
# Run the pipeline on the prompt and show the first returned image as the cell output.
result = pipeline(prompt='Diagram of the human circulatory system.')
result.images[0]
0%| | 0/50 [00:00<?, ?it/s]
# Run the pipeline on the prompt and show the first returned image as the cell output.
result = pipeline(prompt="Superbowl half-time show.")
result.images[0]
0%| | 0/50 [00:00<?, ?it/s]
# Run the pipeline on the prompt and show the first returned image as the cell output.
result = pipeline(prompt='A huge firework show.')
result.images[0]
0%| | 0/50 [00:00<?, ?it/s]