OpenAI GLIDE: Text-to-Image Generation Explained with Code 따라해보기 2 (inpaint.ipynb)
지난번에는
glide-text2im/text2im.ipynb at main · openai/glide-text2im · GitHub
GitHub - openai/glide-text2im: GLIDE: a diffusion-based text-conditional image synthesis model
GLIDE: a diffusion-based text-conditional image synthesis model - GitHub - openai/glide-text2im: GLIDE: a diffusion-based text-conditional image synthesis model
github.com
를 따라했고 이번에는
glide-text2im/inpaint.ipynb at main · openai/glide-text2im · GitHub
GitHub - openai/glide-text2im: GLIDE: a diffusion-based text-conditional image synthesis model
GLIDE: a diffusion-based text-conditional image synthesis model - GitHub - openai/glide-text2im: GLIDE: a diffusion-based text-conditional image synthesis model
github.com
클릭하면 주소 다름
notebooks 폴더에 text2im.ipynb와 inpaint.ipynb 이렇게 두개가 비슷하지만 다르게 분류되어 있길래
뭐가 다른지 분석하고 실행해봄
일단 내가 수정하고 성공한 코드를 전체 복붙하겠음
# Run this line in Colab to install the package if it is
# not already installed.
!pip install git+https://github.com/openai/glide-text2im
from typing import Tuple
from IPython.display import display
from PIL import Image
import numpy as np
import torch as th
import torch.nn.functional as F
from glide_text2im.download import load_checkpoint
from glide_text2im.model_creation import (
create_model_and_diffusion,
model_and_diffusion_defaults,
model_and_diffusion_defaults_upsampler
)
# This notebook supports both CPU and GPU.
# On CPU, generating one sample may take on the order of 20 minutes.
# On a GPU, it should be under a minute.
# Use the GPU when one is visible to PyTorch, otherwise stay on the CPU.
has_cuda = th.cuda.is_available()
device = th.device('cpu' if not has_cuda else 'cuda')

# Create base model.
options = model_and_diffusion_defaults()
options['inpaint'] = True
options['use_fp16'] = has_cuda  # half precision is only requested on CUDA
options['timestep_respacing'] = '100'  # use 100 diffusion steps for fast sampling
model, diffusion = create_model_and_diffusion(**options)
model.eval()
# Only the fp16 conversion is conditional; moving the model to `device`
# and loading the checkpoint happen on both CPU and GPU.
if has_cuda:
    model.convert_to_fp16()
model.to(device)
model.load_state_dict(load_checkpoint('base-inpaint', device))
print('total base parameters', sum(x.numel() for x in model.parameters()))
# Create upsampler model.
options_up = model_and_diffusion_defaults_upsampler()
options_up['inpaint'] = True
options_up['use_fp16'] = has_cuda  # half precision is only requested on CUDA
options_up['timestep_respacing'] = 'fast27'  # use 27 diffusion steps for very fast sampling
model_up, diffusion_up = create_model_and_diffusion(**options_up)
model_up.eval()
# Only the fp16 conversion is conditional; moving the model to `device`
# and loading the checkpoint happen on both CPU and GPU.
if has_cuda:
    model_up.convert_to_fp16()
model_up.to(device)
model_up.load_state_dict(load_checkpoint('upsample-inpaint', device))
print('total upsampler parameters', sum(x.numel() for x in model_up.parameters()))
def show_images(batch: th.Tensor) -> None:
    """Display a batch of images inline, side by side in a single strip.

    Args:
        batch: Image tensor indexed as (N, C, H, W) with three channels and
            values nominally in [-1, 1]; values outside that range are
            clamped after scaling.
    """
    # Map [-1, 1] onto 0..255 and convert to bytes on the CPU.
    scaled = ((batch + 1) * 127.5).round().clamp(0, 255).to(th.uint8).cpu()
    # (N, C, H, W) -> (H, N, W, C), then merge N and W so the images of the
    # batch end up next to each other in one (H, N*W, 3) array.
    reshaped = scaled.permute(2, 0, 3, 1).reshape([batch.shape[2], -1, 3])
    display(Image.fromarray(reshaped.numpy()))
def read_image(path: str, size: int = 256) -> th.Tensor:
    """Load an image file as a float tensor scaled to [-1, 1].

    This is a plain helper defined in this notebook (not a library
    function); it must be executed before any cell that calls it.

    Args:
        path: Location of the image file, passed straight to `Image.open`.
        size: Edge length in pixels; the image is resized to size x size.

    Returns:
        A single tensor of shape (1, 3, size, size).
    """
    pil_img = Image.open(path).convert('RGB')
    pil_img = pil_img.resize((size, size), resample=Image.BICUBIC)
    img = np.array(pil_img)
    # (H, W, 3) bytes -> add a batch axis -> (1, 3, H, W), then map
    # 0..255 onto [-1, 1].
    return th.from_numpy(img)[None].permute(0, 3, 1, 2).float() / 127.5 - 1
# Sampling parameters
prompt = "red jeans"
batch_size = 1
guidance_scale = 5.0

# Sharpness control for the 256x256 images: 1.0 is sharper, but
# sometimes results in grainy artifacts.
upsample_temp = 0.997

##############################
# Sample from the base model #
##############################

# Encode the prompt and pad it to the base model's text context length.
text_ctx = options['text_ctx']
tokens, mask = model.tokenizer.padded_tokens_and_mask(
    model.tokenizer.encode(prompt), text_ctx
)

# Classifier-free guidance needs a second, empty-prompt copy of the batch.
full_batch_size = batch_size * 2
uncond_tokens, uncond_mask = model.tokenizer.padded_tokens_and_mask([], text_ctx)

# Conditional rows come first, unconditional rows second.
token_rows = [tokens] * batch_size + [uncond_tokens] * batch_size
mask_rows = [mask] * batch_size + [uncond_mask] * batch_size

# Only text conditioning is packed here: no inpaint_image / inpaint_mask
# entries are passed to the model.
model_kwargs = dict(
    tokens=th.tensor(token_rows, device=device),
    mask=th.tensor(mask_rows, dtype=th.bool, device=device),
)
# Create a classifier-free guidance sampling function
def model_fn(x_t, ts, **kwargs):
    """Return the model output with classifier-free guidance applied.

    The batch is expected to hold the conditional rows followed by the
    unconditional rows. Only the first half of `x_t` is used: it is fed to
    the model twice (once per half), and the two noise predictions are
    blended using the module-level `guidance_scale`.

    Args:
        x_t: Current noisy batch; its length is assumed to be even.
        ts: Timesteps for the full batch, forwarded unchanged.
        **kwargs: Conditioning forwarded to `model`.

    Returns:
        A tensor shaped like the model output whose first three channels
        hold the guided prediction, duplicated across both halves.
    """
    half = x_t[: len(x_t) // 2]
    combined = th.cat([half, half], dim=0)
    model_out = model(combined, ts, **kwargs)
    # The first three channels are treated as the noise prediction; the
    # remaining channels are passed through untouched (presumably the
    # learned variance - confirm against glide_text2im).
    eps, rest = model_out[:, :3], model_out[:, 3:]
    cond_eps, uncond_eps = th.split(eps, len(eps) // 2, dim=0)
    # Push the unconditional prediction toward the conditional one.
    half_eps = uncond_eps + guidance_scale * (cond_eps - uncond_eps)
    eps = th.cat([half_eps, half_eps], dim=0)
    return th.cat([eps, rest], dim=1)
# Sample from the base model.
model.del_cache()
base_shape = (full_batch_size, 3, options["image_size"], options["image_size"])
samples = diffusion.p_sample_loop(
    model_fn,
    base_shape,
    device=device,
    clip_denoised=True,
    progress=True,
    model_kwargs=model_kwargs,
    cond_fn=None,
)
# Keep the first batch_size rows; the second half of the doubled batch
# only exists for classifier-free guidance.
samples = samples[:batch_size]
model.del_cache()

# Show the output
show_images(samples)
##############################
# Upsample the 64x64 samples #
##############################

# Encode the prompt again, this time padded for the upsampler.
up_text_ctx = options_up['text_ctx']
tokens, mask = model_up.tokenizer.padded_tokens_and_mask(
    model_up.tokenizer.encode(prompt), up_text_ctx
)

# Create the model conditioning dict.
# The low-res input is snapped to the values an 8-bit image would hold
# (scale to 0..255, round, scale back to [-1, 1]).
low_res = ((samples + 1) * 127.5).round() / 127.5 - 1
model_kwargs = dict(
    # Low-res image to upsample.
    low_res=low_res,
    # Text tokens
    tokens=th.tensor([tokens] * batch_size, device=device),
    mask=th.tensor([mask] * batch_size, dtype=th.bool, device=device),
)

# Sample from the upsampler model.
model_up.del_cache()
up_shape = (batch_size, 3, options_up["image_size"], options_up["image_size"])
# Starting noise is scaled by upsample_temp.
start_noise = th.randn(up_shape, device=device) * upsample_temp
up_samples = diffusion_up.ddim_sample_loop(
    model_up,
    up_shape,
    noise=start_noise,
    device=device,
    clip_denoised=True,
    progress=True,
    model_kwargs=model_kwargs,
    cond_fn=None,
)[:batch_size]
model_up.del_cache()

# Show the output
show_images(up_samples)
최종 실행 성공한 코드는 이렇게 됨
위에 코드는 내가 수정해서 성공한 코드인데 원래 코드에서 고친점을 보면
# Sampling parameters
prompt = "a corgi in a field"
batch_size = 1
guidance_scale = 5.0
# Tune this parameter to control the sharpness of 256x256 images.
# A value of 1.0 is sharper, but sometimes results in grainy artifacts.
upsample_temp = 0.997
# Source image we are inpainting
# read_image is the helper function defined earlier in this notebook, so
# that cell has to be run first. 'grass.png' is a relative path, so the
# file must be present in the current working directory.
# NOTE(review): installing the package with pip presumably does not put
# grass.png there - confirm, and upload or download the file beforehand.
source_image_256 = read_image('grass.png', size=256)
source_image_64 = read_image('grass.png', size=64)
# The mask should always be a boolean 64x64 mask, and then we
# can upsample it for the second stage.
# Start from all ones and keep a single channel of the image-shaped tensor.
source_mask_64 = th.ones_like(source_image_64)[:, :1]
# Zero the mask from index 20 onward along dim 2 (image rows): ones mark
# pixels kept from the source image, zeros mark the region left to the
# model (see how inpaint_mask is used in denoised_fn).
source_mask_64[:, :, 20:] = 0
source_mask_256 = F.interpolate(source_mask_64, (256, 256), mode='nearest')
# Visualize the image we are inpainting
show_images(source_image_256 * source_mask_256)
*원래 코드*
# Sampling parameters
# Text prompt that is tokenized and fed to both models.
prompt = "red jeans"
batch_size = 1
# Weight used by model_fn when blending conditional and unconditional predictions.
guidance_scale = 5.0
# Tune this parameter to control the sharpness of 256x256 images.
# A value of 1.0 is sharper, but sometimes results in grainy artifacts.
upsample_temp = 0.997
*내가 실행한 코드*
원래 코드 부분에서 upsample_temp 아래 부분(source_image, source_mask 관련 코드)을 다 지웠다
source_image 관련 코드가 전부 존재하지 않는다고 에러가 떴음
찾아보니 위에서 따로 지정해주지 않은듯 원래 존재하는 함수인가..? 찾아보니 아닌듯
grass.png는 notebooks 파일에 따로 있긴하던데 계속 이미지가 없다고 뜸
저 부분을 다 지워주고 실행했더니 잘됨
사실 이건 text2im.ipynb 에도 이렇게 돼있음
쓰다보니까... 대체 뭐가다른건지 모르겠어서 현타옴
다른 점을 분석하려고 했지만 분석하지 못함
ㅋㅋ
하지만 미래의 내가 할 수 있으니까!
다시 해보자 ㅎㅎ
##############################
# Sample from the base model #
##############################

# Encode the prompt and pad it to the base model's text context length.
text_ctx = options['text_ctx']
tokens, mask = model.tokenizer.padded_tokens_and_mask(
    model.tokenizer.encode(prompt), text_ctx
)

# Classifier-free guidance needs a second, empty-prompt copy of the batch.
full_batch_size = batch_size * 2
uncond_tokens, uncond_mask = model.tokenizer.padded_tokens_and_mask([], text_ctx)

# Conditional rows come first, unconditional rows second.
token_rows = [tokens] * batch_size + [uncond_tokens] * batch_size
mask_rows = [mask] * batch_size + [uncond_mask] * batch_size

# Masked inpainting image: source pixels where the mask is 1, zeros elsewhere.
masked_source = source_image_64 * source_mask_64

# Pack text and inpainting conditioning together; the image and mask are
# tiled to cover both halves of the doubled batch.
model_kwargs = dict(
    tokens=th.tensor(token_rows, device=device),
    mask=th.tensor(mask_rows, dtype=th.bool, device=device),
    inpaint_image=masked_source.repeat(full_batch_size, 1, 1, 1).to(device),
    inpaint_mask=source_mask_64.repeat(full_batch_size, 1, 1, 1).to(device),
)
# Create a classifier-free guidance sampling function
def model_fn(x_t, ts, **kwargs):
    """Return the model output with classifier-free guidance applied.

    The batch is expected to hold the conditional rows followed by the
    unconditional rows. Only the first half of `x_t` is used: it is fed to
    the model twice (once per half), and the two noise predictions are
    blended using the module-level `guidance_scale`.

    Args:
        x_t: Current noisy batch; its length is assumed to be even.
        ts: Timesteps for the full batch, forwarded unchanged.
        **kwargs: Conditioning forwarded to `model` (here including the
            inpainting image and mask).

    Returns:
        A tensor shaped like the model output whose first three channels
        hold the guided prediction, duplicated across both halves.
    """
    half = x_t[: len(x_t) // 2]
    combined = th.cat([half, half], dim=0)
    model_out = model(combined, ts, **kwargs)
    # The first three channels are treated as the noise prediction; the
    # remaining channels are passed through untouched (presumably the
    # learned variance - confirm against glide_text2im).
    eps, rest = model_out[:, :3], model_out[:, 3:]
    cond_eps, uncond_eps = th.split(eps, len(eps) // 2, dim=0)
    # Push the unconditional prediction toward the conditional one.
    half_eps = uncond_eps + guidance_scale * (cond_eps - uncond_eps)
    eps = th.cat([half_eps, half_eps], dim=0)
    return th.cat([eps, rest], dim=1)
def denoised_fn(x_start):
    """Overwrite the known region of a predicted image with the source pixels.

    Reads `inpaint_image` and `inpaint_mask` from the module-level
    `model_kwargs`. Where the mask is 1 the prediction is replaced by the
    masked source image; where it is 0 the prediction is kept.

    Args:
        x_start: Predicted clean image batch, shaped like `inpaint_image`.

    Returns:
        The prediction with its known region pinned to the source image.
    """
    # Force the model to have the exact right x_start predictions
    # for the part of the image which is known.
    return (
        x_start * (1 - model_kwargs['inpaint_mask'])
        + model_kwargs['inpaint_image'] * model_kwargs['inpaint_mask']
    )
# Sample from the base model.
model.del_cache()
base_shape = (full_batch_size, 3, options["image_size"], options["image_size"])
samples = diffusion.p_sample_loop(
    model_fn,
    base_shape,
    device=device,
    clip_denoised=True,
    progress=True,
    model_kwargs=model_kwargs,
    cond_fn=None,
    # Re-imposes the known source pixels on each predicted image.
    denoised_fn=denoised_fn,
)
# Keep the first batch_size rows; the second half of the doubled batch
# only exists for classifier-free guidance.
samples = samples[:batch_size]
model.del_cache()

# Show the output
show_images(samples)
*원래 코드*
##############################
# Sample from the base model #
##############################

# Encode the prompt and pad it to the base model's text context length.
text_ctx = options['text_ctx']
tokens, mask = model.tokenizer.padded_tokens_and_mask(
    model.tokenizer.encode(prompt), text_ctx
)

# Classifier-free guidance needs a second, empty-prompt copy of the batch.
full_batch_size = batch_size * 2
uncond_tokens, uncond_mask = model.tokenizer.padded_tokens_and_mask([], text_ctx)

# Conditional rows come first, unconditional rows second.
token_rows = [tokens] * batch_size + [uncond_tokens] * batch_size
mask_rows = [mask] * batch_size + [uncond_mask] * batch_size

# Only text conditioning is packed here: no inpaint_image / inpaint_mask
# entries are passed to the model.
model_kwargs = dict(
    tokens=th.tensor(token_rows, device=device),
    mask=th.tensor(mask_rows, dtype=th.bool, device=device),
)
# Create a classifier-free guidance sampling function
def model_fn(x_t, ts, **kwargs):
    """Return the model output with classifier-free guidance applied.

    The batch is expected to hold the conditional rows followed by the
    unconditional rows. Only the first half of `x_t` is used: it is fed to
    the model twice (once per half), and the two noise predictions are
    blended using the module-level `guidance_scale`.

    Args:
        x_t: Current noisy batch; its length is assumed to be even.
        ts: Timesteps for the full batch, forwarded unchanged.
        **kwargs: Conditioning forwarded to `model`.

    Returns:
        A tensor shaped like the model output whose first three channels
        hold the guided prediction, duplicated across both halves.
    """
    half = x_t[: len(x_t) // 2]
    combined = th.cat([half, half], dim=0)
    model_out = model(combined, ts, **kwargs)
    # The first three channels are treated as the noise prediction; the
    # remaining channels are passed through untouched (presumably the
    # learned variance - confirm against glide_text2im).
    eps, rest = model_out[:, :3], model_out[:, 3:]
    cond_eps, uncond_eps = th.split(eps, len(eps) // 2, dim=0)
    # Push the unconditional prediction toward the conditional one.
    half_eps = uncond_eps + guidance_scale * (cond_eps - uncond_eps)
    eps = th.cat([half_eps, half_eps], dim=0)
    return th.cat([eps, rest], dim=1)
# Sample from the base model.
model.del_cache()
base_shape = (full_batch_size, 3, options["image_size"], options["image_size"])
samples = diffusion.p_sample_loop(
    model_fn,
    base_shape,
    device=device,
    clip_denoised=True,
    progress=True,
    model_kwargs=model_kwargs,
    cond_fn=None,
)
# Keep the first batch_size rows; the second half of the doubled batch
# only exists for classifier-free guidance.
samples = samples[:batch_size]
model.del_cache()

# Show the output
show_images(samples)
*내가 실행한 코드*
눈치채셨나요?
네 text2im에서 쌔벼온 코드입니다
원래 기록된 코드를 따라하면
def model_fn(x_t, ts, **kwargs):
이부분에서 invalid하다고 나옴
그래서 그냥 text2im코드 쌔벼옴
같은 이유로 아래 64*64 코드도 쌔벼옴
대체 뭐가 다를까..?
둘의 다른점은
# Masked inpainting image
inpaint_image=(source_image_64 * source_mask_64).repeat(full_batch_size, 1, 1, 1).to(device),
inpaint_mask=source_mask_64.repeat(full_batch_size, 1, 1, 1).to(device),
이게 있냐 없냐임
결국에는 source_image source_mask 코드가 문젠거 같은데
위에 sampling parameter 부분에서 코드가 성공했다면 이것도 성공했을까요~?
모릅니다
안해봤으니까요
+ inpaint와 그냥 text2im과의 차이를 찾다가
OpenAI의 GLIDE를 사용하여 텍스트 프롬프트에서 사실적인 이미지 생성 및 편집 (paperspace.com)
Generating and editing photorealistic images from text-prompts using OpenAI's GLIDE
Follow this tutorial to learn how GLIDE works and see how to implement it in a Gradient Notebook
blog.paperspace.com
ㅋㅋ 걍 설명만 잘돼있고
read_image가 무슨 함수길래 계속 실패하는거지
정해진 함수도 아닌 듯. cv 관련 내용도 있던데 뭔 말인지 잘 모르겠음 돌겠네
read_image를 불러오는 또 다른 라이브러리를 임포트 해야 되는 건가..? 잘 안 나와 있어 하