#@title **Setup**
!pip install transformers diffusers accelerate | grep -v -e 'already satisfied' -e 'Downloading'
Successfully installed accelerate-0.18.0 diffusers-0.15.1 huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.1
A key component of Stable Diffusion is embedding alignment: its CLIP text encoder comes from a model trained to map related text and images to nearby points in a shared embedding space, which is what lets a text prompt steer image generation.
This demo is adapted from the Hugging Face Diffusers documentation.
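To make that concrete, here is a minimal sketch (not part of the demo; the CLIP checkpoint name and image path are illustrative assumptions) of how CLIP scores text-image alignment:
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

clip = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

texts = ["a squirrel in Monet style", "a fantasy landscape with castles"]
image = Image.open("my_image.png")  # any RGB image you have on disk (hypothetical path)

inputs = clip_processor(text=texts, images=image, return_tensors="pt", padding=True)
with torch.no_grad():
    out = clip(**inputs)

# One similarity score per caption; the best-matching caption gets the highest score.
print(out.logits_per_image)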
#@title Import libraries
import os
import torch
import requests
from diffusers import DiffusionPipeline, StableDiffusionImg2ImgPipeline
from io import BytesIO
from PIL import Image
device = "cuda"
generator = DiffusionPipeline.from_pretrained(
"stabilityai/stable-diffusion-2-1", torch_dtype=torch.float16
).to(device)
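If the Colab GPU runs out of memory during generation, one optional mitigation (an addition, not part of the original demo) is attention slicing, which trades a little speed for a smaller peak memory footprint:
# Optional: reduce peak GPU memory use at a small speed cost.
generator.enable_attention_slicing()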
prompt = "An image of a squirrel in Monet style"
prompt2 = "a black and white cartoon of a friendly monster eating ice cream in the style of Shel Silverstein"
Parameters to tweak:
- prompt text: let's get creative
- num_inference_steps: more steps give a cleaner final image, but the generation takes longer (a quick sweep over step counts is sketched after the first experiment below)
Experiments:
image = generator(prompt, num_inference_steps=50).images[0] # vary num_inference_steps between 2-100
# Save your image by uncommenting below
#image.save("image_of_squirrel_painting.png")
image.thumbnail((384, 384))
image
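As a quick way to see the effect of num_inference_steps, here is a sketch (an addition, not in the original notebook) that renders the same prompt at a few step counts and pastes the thumbnails side by side:
# Render the same prompt at increasing step counts and compare the results side by side.
steps_to_try = [5, 20, 50]
thumbs = []
for steps in steps_to_try:
    img = generator(prompt, num_inference_steps=steps).images[0]
    img.thumbnail((256, 256))
    thumbs.append(img)

contact_sheet = Image.new("RGB", (256 * len(thumbs), 256))
for i, img in enumerate(thumbs):
    contact_sheet.paste(img, (256 * i, 0))
contact_sheet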
image = generator(prompt2).images[0]
image.thumbnail((384, 384))
image
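The pipeline also accepts a negative_prompt and a seeded torch.Generator, which makes a run reproducible; a sketch (an addition, not in the original notebook):
# Fix the random seed and tell the model what to steer away from.
seed = torch.Generator(device=device).manual_seed(42)
image = generator(
    prompt2,
    negative_prompt="blurry, low quality, text, watermark",  # things to avoid
    num_inference_steps=50,
    generator=seed,
).images[0]
image.thumbnail((384, 384))
image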
The image-to-image demo below is adapted from the Hugging Face Diffusers documentation.
# Load the pipeline
pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1", torch_dtype=torch.float16
).to(device)
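Since generator from the text-to-image section holds the same checkpoint, you can optionally build the img2img pipeline from its components instead of keeping a second full copy of the weights on the GPU (a sketch, assuming generator is still loaded):
# Optional alternative: reuse the already-loaded weights instead of calling from_pretrained again.
pipe = StableDiffusionImg2ImgPipeline(**generator.components)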
# Let's download an initial image. Use the cell below if you have your own image
url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
response = requests.get(url)
init_image = Image.open(BytesIO(response.content)).convert("RGB")
If you have your own image that you would like to use, please upload it into the sample_data folder provided by Colab (or use the upload helper sketched below).
# Check that your image has been uploaded correctly
os.listdir('./sample_data')
# Uncomment below to read image
#init_image = Image.open('./sample_data/<your image name>')
#init_image.thumbnail((768, 768))
['README.md', 'anscombe.json', 'mnist_train_small.csv', 'mnist_test.csv', 'california_housing_train.csv', 'california_housing_test.csv']
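Alternatively, you can upload a file straight from a cell with Colab's upload helper (a sketch; Colab-only):
# Colab-only: opens a browser file picker and saves the chosen file into the working directory.
from google.colab import files
uploaded = files.upload()
# init_image = Image.open(next(iter(uploaded))).convert("RGB")  # first uploaded filename
# init_image.thumbnail((768, 768))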
init_image
prompt = "ghibli style, a fantasy landscape with castles"
Parameters to tweak:
- prompt text: try different styles as well as scenes
- num_inference_steps: more steps give a cleaner final image
- strength: how much noise we add to the initial image (0 reproduces the original image, 1 allows the most freedom; see the note on effective steps after the experiment below)
- guidance_scale: how strongly the text prompt steers the generation
Experiments:
images = pipe(
prompt=prompt, # try changing prompt text
image=init_image,
strength=0.7, # vary between 0.1-1.0
guidance_scale=7, # vary between 1-100
num_inference_steps=50
).images
#images[0].save("fantasy_landscape.png")
images[0]
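Note that with strength=0.7 and num_inference_steps=50 the pipeline actually runs only about 35 denoising steps: img2img skips the early steps and runs roughly int(strength * num_inference_steps) of them, so lowering strength also shortens the run. A quick illustration (plain arithmetic mirroring that scheduling, not a pipeline call):
# Effective denoising steps as a function of strength (illustrative arithmetic only).
for s in (0.3, 0.5, 0.7, 1.0):
    print(f"strength={s}: ~{int(s * 50)} of 50 steps actually run")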
What are some differences between the outputs of DALL-E 2 (or Midjourney, or online Stable Diffusion tools) and the Stable Diffusion output in our notebook? What are some similarities?