Sora底层使用了DIT架构，也就是Diffusion Transformer

本文主要介绍Sora底层使用的DiT架构（即Diffusion Transformer），希望能为大家解决编程问题提供一定的参考，需要的开发者们跟着小编一起来学习吧！

Sora底层使用了DiT架构，也就是Diffusion Transformer。该架构将扩散模型与Transformer相结合，由Facebook开源。本视频是对论文、源码和项目的解析。

一、预测的总体架构

"""
Sample new images from a pre-trained DiT.
"""
import torch
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
from torchvision.utils import save_image
from diffusion import create_diffusion
from diffusers.models import AutoencoderKL
from download import find_model
from models import DiT_models
import argparse


def main(args):
    """Sample a fixed batch of class-conditional images from a DiT and save them.

    Args:
        args: Parsed options object. The attributes read here are ``seed``,
            ``ckpt``, ``model``, ``image_size``, ``num_classes``,
            ``num_sampling_steps``, ``vae`` and ``cfg_scale``.

    Side effects:
        Disables autograd globally, seeds the PyTorch RNG, and writes the
        decoded samples as an image grid to ``sample.png``.
    """
    # Setup PyTorch:
    torch.manual_seed(args.seed)
    torch.set_grad_enabled(False)
    device = "cuda" if torch.cuda.is_available() else "cpu"

    if args.ckpt is None:
        # Without an explicit checkpoint only the auto-downloadable
        # configuration is accepted.
        assert args.model == "DiT-XL/2", "Only DiT-XL/2 models are available for auto-download."
        assert args.image_size in [256, 512]
        assert args.num_classes == 1000

    # Load model:
    latent_size = args.image_size // 8
    model = DiT_models[args.model](
        input_size=latent_size,
        num_classes=args.num_classes,
    ).to(device)
    # Auto-download a pre-trained model or load a custom DiT checkpoint from train.py:
    ckpt_path = args.ckpt or f"DiT-XL-2-{args.image_size}x{args.image_size}.pt"
    state_dict = find_model(ckpt_path)
    model.load_state_dict(state_dict)
    model.eval()  # important!
    diffusion = create_diffusion(str(args.num_sampling_steps))
    vae = AutoencoderKL.from_pretrained(f"stabilityai/sd-vae-ft-{args.vae}").to(device)

    # Labels to condition the model with (feel free to change):
    class_labels = [207, 360, 387, 974, 88, 979, 417, 279]

    # Create sampling noise:
    n = len(class_labels)
    z = torch.randn(n, 4, latent_size, latent_size, device=device)
    y = torch.tensor(class_labels, device=device)

    # Setup classifier-free guidance: the batch is doubled so that the second
    # half carries the "null" (unconditional) label.
    z = torch.cat([z, z], 0)
    # NOTE(review): the null label is assumed to be index ``num_classes`` (one
    # past the last real class). This equals the previously hard-coded 1000
    # for the default setup and keeps custom ``num_classes`` checkpoints in
    # range -- confirm against the label embedder.
    y_null = torch.tensor([args.num_classes] * n, device=device)
    y = torch.cat([y, y_null], 0)
    model_kwargs = dict(y=y, cfg_scale=args.cfg_scale)

    # Sample images:
    samples = diffusion.p_sample_loop(
        model.forward_with_cfg,
        z.shape,
        z,
        clip_denoised=False,
        model_kwargs=model_kwargs,
        progress=True,
        device=device,
    )
    samples, _ = samples.chunk(2, dim=0)  # Remove null class samples
    # presumably 0.18215 undoes the latent scaling used by the VAE -- TODO confirm
    samples = vae.decode(samples / 0.18215).sample

    # Save and display images:
    save_image(samples, "sample.png", nrow=4, normalize=True, value_range=(-1, 1))

二、DIT的总体架构


class DiT(nn.Module):
    """
    Diffusion model with a Transformer backbone.

    The input is split into patches and embedded, a stack of ``DiTBlock``
    layers is applied with a conditioning vector built from the timestep and
    class-label embeddings, and the final layer's output is folded back into
    an image-shaped tensor by ``unpatchify``.
    """

    def __init__(
        self,
        input_size=32,
        patch_size=2,
        in_channels=4,
        hidden_size=1152,
        depth=28,
        num_heads=16,
        mlp_ratio=4.0,
        class_dropout_prob=0.1,
        num_classes=1000,
        learn_sigma=True,
    ):
        """Build the embedders, transformer blocks and output layer.

        Args:
            input_size: Spatial size passed to the patch embedder.
            patch_size: Patch side length passed to the patch embedder and
                the final layer.
            in_channels: Number of input channels.
            hidden_size: Token embedding width used throughout the model.
            depth: Number of ``DiTBlock`` layers.
            num_heads: Attention head count passed to every block.
            mlp_ratio: MLP ratio passed to every block.
            class_dropout_prob: Dropout probability passed to the label
                embedder.
            num_classes: Number of classes passed to the label embedder.
            learn_sigma: When true the model emits ``2 * in_channels`` output
                channels instead of ``in_channels``.
        """
        super().__init__()
        self.learn_sigma = learn_sigma
        self.in_channels = in_channels
        self.out_channels = in_channels * 2 if learn_sigma else in_channels
        self.patch_size = patch_size
        self.num_heads = num_heads

        self.x_embedder = PatchEmbed(input_size, patch_size, in_channels, hidden_size, bias=True)
        self.t_embedder = TimestepEmbedder(hidden_size)
        self.y_embedder = LabelEmbedder(num_classes, hidden_size, class_dropout_prob)
        num_patches = self.x_embedder.num_patches
        # Will use fixed sin-cos embedding:
        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, hidden_size), requires_grad=False)

        self.blocks = nn.ModuleList([
            DiTBlock(hidden_size, num_heads, mlp_ratio=mlp_ratio) for _ in range(depth)
        ])
        self.final_layer = FinalLayer(hidden_size, patch_size, self.out_channels)
        self.initialize_weights()

    def initialize_weights(self):
        """Apply the model's weight-initialization scheme in place."""
        # Initialize transformer layers:
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)
        self.apply(_basic_init)

        # Initialize (and freeze) pos_embed by sin-cos embedding:
        pos_embed = get_2d_sincos_pos_embed(self.pos_embed.shape[-1], int(self.x_embedder.num_patches ** 0.5))
        self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0))

        # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
        w = self.x_embedder.proj.weight.data
        nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
        nn.init.constant_(self.x_embedder.proj.bias, 0)

        # Initialize label embedding table:
        nn.init.normal_(self.y_embedder.embedding_table.weight, std=0.02)

        # Initialize timestep embedding MLP:
        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)

        # Zero-out adaLN modulation layers in DiT blocks:
        for block in self.blocks:
            nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
            nn.init.constant_(block.adaLN_modulation[-1].bias, 0)

        # Zero-out output layers:
        nn.init.constant_(self.final_layer.adaLN_modulation[-1].weight, 0)
        nn.init.constant_(self.final_layer.adaLN_modulation[-1].bias, 0)
        nn.init.constant_(self.final_layer.linear.weight, 0)
        nn.init.constant_(self.final_layer.linear.bias, 0)

    def unpatchify(self, x):
        """
        Fold a sequence of flattened patches back into an image tensor.

        x: (N, T, patch_size**2 * C), where T must be a perfect square
        imgs: (N, C, H, W) with H = W = sqrt(T) * patch_size
        """
        c = self.out_channels
        p = self.x_embedder.patch_size[0]
        h = w = int(x.shape[1] ** 0.5)
        assert h * w == x.shape[1]

        # (N, h, w, p, p, C): patch-grid position, then in-patch position.
        x = x.reshape(shape=(x.shape[0], h, w, p, p, c))
        # Move channels forward and interleave grid/in-patch axes so that the
        # final reshape merges (h, p) into rows and (w, q) into columns.
        x = torch.einsum('nhwpqc->nchpwq', x)
        imgs = x.reshape(shape=(x.shape[0], c, h * p, w * p))
        return imgs

    def forward(self, x, t, y):
        """
        Forward pass of DiT.
        x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
        t: (N,) tensor of diffusion timesteps
        y: (N,) tensor of class labels
        """
        x = self.x_embedder(x) + self.pos_embed  # (N, T, D), where T = H * W / patch_size ** 2
        t = self.t_embedder(t)                   # (N, D)
        y = self.y_embedder(y, self.training)    # (N, D)
        c = t + y                                # (N, D)
        for block in self.blocks:
            x = block(x, c)                      # (N, T, D)
        x = self.final_layer(x, c)               # (N, T, patch_size ** 2 * out_channels)
        x = self.unpatchify(x)                   # (N, out_channels, H, W)
        return x

三、DDIM快速采样的预测代码

    def ddim_sample(
        self,
        model,
        x,
        t,
        clip_denoised=True,
        denoised_fn=None,
        cond_fn=None,
        model_kwargs=None,
        eta=0.0,
    ):
        """
        Sample x_{t-1} from the model using DDIM.
        Same usage as p_sample().

        :param model: the model forwarded to ``p_mean_variance``.
        :param x: the current tensor at timestep ``t``.
        :param t: tensor of timestep indices; entries equal to 0 receive no
                  added noise.
        :param clip_denoised: forwarded to ``p_mean_variance``.
        :param denoised_fn: forwarded to ``p_mean_variance``.
        :param cond_fn: if not None, the prediction is passed through
                        ``condition_score`` with this function.
        :param model_kwargs: extra keyword arguments forwarded to
                             ``p_mean_variance`` and ``condition_score``.
        :param eta: multiplier on the noise scale ``sigma``; with the default
                    0.0, ``sigma`` is zero and the noise term vanishes.
        :return: a dict with keys "sample" (the new tensor) and
                 "pred_xstart" (the x_0 prediction from ``p_mean_variance``).
        """
        out = self.p_mean_variance(
            model,
            x,
            t,
            clip_denoised=clip_denoised,
            denoised_fn=denoised_fn,
            model_kwargs=model_kwargs,
        )
        if cond_fn is not None:
            out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs)

        # Usually our model outputs epsilon, but we re-derive it
        # in case we used x_start or x_prev prediction.
        eps = self._predict_eps_from_xstart(x, t, out["pred_xstart"])

        alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)
        alpha_bar_prev = _extract_into_tensor(self.alphas_cumprod_prev, t, x.shape)
        sigma = (
            eta
            * th.sqrt((1 - alpha_bar_prev) / (1 - alpha_bar))
            * th.sqrt(1 - alpha_bar / alpha_bar_prev)
        )
        # Equation 12.
        noise = th.randn_like(x)
        mean_pred = (
            out["pred_xstart"] * th.sqrt(alpha_bar_prev)
            + th.sqrt(1 - alpha_bar_prev - sigma ** 2) * eps
        )
        # Shape (N, 1, ..., 1) so the mask broadcasts over the non-batch axes.
        nonzero_mask = (
            (t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))
        )  # no noise when t == 0
        sample = mean_pred + nonzero_mask * sigma * noise
        return {"sample": sample, "pred_xstart": out["pred_xstart"]}

以上就是DiT的核心代码和整体逻辑架构。

这篇关于Sora底层使用的DiT架构（即Diffusion Transformer）的文章就介绍到这儿，希望我们推荐的文章对程序员们有所帮助！