Hand-Written LLM Components

2024-09-04 00:52
Tags: llm

This post collects hand-written PyTorch implementations of common LLM building blocks (normalization layers, dropout, positional encodings, attention, and loss functions) as a reference for from-scratch coding exercises.

LayerNorm

import torch
from torch import nn

class LayerNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        super().__init__()
        self.hidden_size = hidden_size  # size of the hidden dimension
        self.eps = eps                  # small value for numerical stability
        # Learnable scale and shift parameters
        self.gamma = nn.Parameter(torch.ones(hidden_size))   # scale, initialized to ones
        self.beta = nn.Parameter(torch.zeros(hidden_size))   # shift, initialized to zeros

    def forward(self, x):
        # x shape: (batch_size, seq_len, hidden_size)
        # Mean and variance over the last dimension
        mean = x.mean(dim=-1, keepdim=True)                     # (batch_size, seq_len, 1)
        variance = x.var(dim=-1, keepdim=True, unbiased=False)  # (batch_size, seq_len, 1)
        # Normalize
        x_normalized = (x - mean) / torch.sqrt(variance + self.eps)  # (batch_size, seq_len, hidden_size)
        # Apply scale and shift
        output = self.gamma * x_normalized + self.beta               # (batch_size, seq_len, hidden_size)
        return output

def test_layer_norm():
    batch_size = 2
    seq_len = 4
    hidden_size = 8
    # Random input
    x = torch.randn(batch_size, seq_len, hidden_size)  # (batch_size, seq_len, hidden_size)
    # Create the LayerNorm module
    layer_norm = LayerNorm(hidden_size)
    # Apply LayerNorm
    output = layer_norm(x)
    print("Input shape:", x.shape)
    print("Output shape:", output.shape)

if __name__ == "__main__":
    test_layer_norm()
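
As a quick sanity check, the output can be compared against the built-in torch.nn.LayerNorm with the same eps. The comparison below is my own minimal sketch, not part of the original snippet; it relies on both implementations using the biased variance and initializing the affine parameters to ones and zeros.

import torch
from torch import nn

x = torch.randn(2, 4, 8)
custom = LayerNorm(8, eps=1e-6)          # the hand-written module above
reference = nn.LayerNorm(8, eps=1e-6)    # PyTorch built-in
print(torch.allclose(custom(x), reference(x), atol=1e-5))  # expected: True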

BatchNorm

import torch
from torch import nn

class BatchNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-5, momentum=0.1):
        super().__init__()
        self.hidden_size = hidden_size  # size of the hidden dimension
        self.eps = eps                  # small value for numerical stability
        self.momentum = momentum        # momentum for the running mean/variance
        # Learnable scale and shift parameters
        self.gamma = nn.Parameter(torch.ones(hidden_size))   # scale, initialized to ones
        self.beta = nn.Parameter(torch.zeros(hidden_size))   # shift, initialized to zeros
        # Running statistics, registered as buffers so they follow the module's device but are not trained
        self.register_buffer("running_mean", torch.zeros(hidden_size))  # running mean, starts at zero
        self.register_buffer("running_var", torch.ones(hidden_size))    # running variance, starts at one

    def forward(self, x):
        # x shape: (batch_size, seq_len, hidden_size)
        if self.training:
            # Batch statistics over the batch and sequence dimensions
            batch_mean = x.mean(dim=(0, 1))                # (hidden_size,)
            batch_var = x.var(dim=(0, 1), unbiased=False)  # (hidden_size,)
            # Update the running statistics; no gradient should flow through them
            with torch.no_grad():
                self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * batch_mean
                self.running_var = (1 - self.momentum) * self.running_var + self.momentum * batch_var
            mean = batch_mean
            variance = batch_var
        else:
            # Use the running statistics at inference time
            mean = self.running_mean
            variance = self.running_var
        # Normalize
        x_normalized = (x - mean) / torch.sqrt(variance + self.eps)  # (batch_size, seq_len, hidden_size)
        # Apply scale and shift
        output = self.gamma * x_normalized + self.beta               # (batch_size, seq_len, hidden_size)
        return output

def test_batch_norm():
    batch_size = 2
    seq_len = 4
    hidden_size = 8
    # Random input
    x = torch.randn(batch_size, seq_len, hidden_size)  # (batch_size, seq_len, hidden_size)
    # Create the BatchNorm module
    batch_norm = BatchNorm(hidden_size)
    # Apply BatchNorm
    output = batch_norm(x)
    print("Input shape:", x.shape)
    print("Output shape:", output.shape)

if __name__ == "__main__":
    test_batch_norm()
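
In training mode this should agree with torch.nn.BatchNorm1d applied to the (batch_size, hidden_size, seq_len) layout, since both normalize each feature over the batch and sequence positions with the same default eps. The comparison below is my own sketch, not part of the original post.

import torch
from torch import nn

x = torch.randn(2, 4, 8)
custom = BatchNorm(8).train()            # the hand-written module above
reference = nn.BatchNorm1d(8).train()    # PyTorch built-in, expects (N, C, L)
out_custom = custom(x)
out_ref = reference(x.transpose(1, 2)).transpose(1, 2)
print(torch.allclose(out_custom, out_ref, atol=1e-5))  # expected: True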

Dropout

import torch
from torch import nn

class Dropout(nn.Module):
    def __init__(self, dropout_prob=0.1):
        super().__init__()
        self.dropout_prob = dropout_prob  # probability of zeroing an element

    def forward(self, x):
        if self.training:
            # Mask with the same shape as x: each element is kept with probability 1 - dropout_prob
            mask = (torch.rand_like(x) > self.dropout_prob).float()
            # Rescale so the expected activation matches between training and inference (inverted dropout)
            output = mask * x / (1.0 - self.dropout_prob)
        else:
            # No dropout at inference time
            output = x
        return output

def test_dropout():
    batch_size = 2
    seq_len = 4
    hidden_size = 8
    # Random input
    x = torch.randn(batch_size, seq_len, hidden_size)  # (batch_size, seq_len, hidden_size)
    # Create the Dropout module
    dropout = Dropout(dropout_prob=0.1)
    # Training mode
    dropout.train()
    output_train = dropout(x)
    # Evaluation mode
    dropout.eval()
    output_eval = dropout(x)
    print("Input shape:", x.shape)
    print("Output shape during training:", output_train.shape)
    print("Output shape during evaluation:", output_eval.shape)

if __name__ == "__main__":
    test_dropout()
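
Two properties worth checking: in eval mode the module is the identity, and in training mode roughly dropout_prob of the entries are zeroed while the survivors are scaled up by 1 / (1 - dropout_prob). A minimal sketch of my own:

import torch

x = torch.randn(1000, 100)
drop = Dropout(dropout_prob=0.5)

drop.eval()
print(torch.equal(drop(x), x))  # True: no-op at inference time

drop.train()
out = drop(x)
print((out == 0).float().mean().item())  # roughly 0.5, i.e. about dropout_prob of the entries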

Transformer Positional Encoding

import torch

def sinusoidal_position_embedding(batch_size, nums_head, max_len, output_dim, device):
    # Position indices: (max_len, 1)
    position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(-1)
    # Frequency indices i in [0, d/2): (output_dim // 2,)
    ids = torch.arange(0, output_dim // 2, dtype=torch.float)
    theta = torch.pow(10000, -2 * ids / output_dim)
    # Arguments of sin/cos, i.e. pos / 10000^(2i/d): (max_len, output_dim // 2)
    embeddings = position * theta
    # Stack sin and cos: (max_len, output_dim // 2, 2)
    embeddings = torch.stack([torch.sin(embeddings), torch.cos(embeddings)], dim=-1)
    # Repeat over the batch and head dimensions only: (batch_size, nums_head, max_len, output_dim // 2, 2)
    embeddings = embeddings.repeat((batch_size, nums_head, *([1] * len(embeddings.shape))))
    # Reshape to (batch_size, nums_head, max_len, output_dim):
    # even indices hold sin, odd indices hold cos
    embeddings = torch.reshape(embeddings, (batch_size, nums_head, max_len, output_dim))
    embeddings = embeddings.to(device)
    return embeddings
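
A quick usage sketch (the example values are my own): at position 0 every even index holds sin(0) = 0 and every odd index holds cos(0) = 1.

import torch

pos_emb = sinusoidal_position_embedding(batch_size=2, nums_head=8, max_len=16, output_dim=64, device="cpu")
print(pos_emb.shape)           # torch.Size([2, 8, 16, 64])
print(pos_emb[0, 0, 0, ::2])   # all zeros (sin at position 0)
print(pos_emb[0, 0, 0, 1::2])  # all ones  (cos at position 0)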

RoPE
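
This section is empty in the original, so here is a minimal RoPE sketch that reuses the sinusoidal_position_embedding helper above. The function name RoPE and the (batch_size, nums_head, max_len, output_dim) layout for q and k are my assumptions; the rotation itself is the standard pairwise one, where channels (2i, 2i+1) are rotated by pos * theta_i.

import torch

def RoPE(q, k):
    # q, k: (batch_size, nums_head, max_len, output_dim)
    batch_size, nums_head, max_len, output_dim = q.shape

    # (batch_size, nums_head, max_len, output_dim); even indices hold sin, odd indices hold cos
    pos_emb = sinusoidal_position_embedding(batch_size, nums_head, max_len, output_dim, q.device)

    # Expand cos/sin so each angle covers its (2i, 2i+1) channel pair
    cos_pos = pos_emb[..., 1::2].repeat_interleave(2, dim=-1)  # (..., output_dim)
    sin_pos = pos_emb[..., ::2].repeat_interleave(2, dim=-1)   # (..., output_dim)

    # Rotated copy of q: (-q1, q0, -q3, q2, ...)
    q2 = torch.stack([-q[..., 1::2], q[..., ::2]], dim=-1).reshape(q.shape)
    q = q * cos_pos + q2 * sin_pos

    # Same rotation for k
    k2 = torch.stack([-k[..., 1::2], k[..., ::2]], dim=-1).reshape(k.shape)
    k = k * cos_pos + k2 * sin_pos

    return q, k

# Usage: rotate per-head queries and keys before computing attention scores
q = torch.randn(2, 8, 16, 64)
k = torch.randn(2, 8, 16, 64)
q_rot, k_rot = RoPE(q, k)
print(q_rot.shape, k_rot.shape)  # torch.Size([2, 8, 16, 64]) twice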

Self-attention

from math import sqrt
import torch
import torch.nn as nn

class Self_Attention(nn.Module):
    def __init__(self, input_dim, dim_k, dim_v):
        super(Self_Attention, self).__init__()
        self.q = nn.Linear(input_dim, dim_k)
        self.k = nn.Linear(input_dim, dim_k)
        self.v = nn.Linear(input_dim, dim_v)
        self._norm_fact = 1 / sqrt(dim_k)

    def forward(self, x):
        Q = self.q(x)  # Q: batch_size * seq_len * dim_k
        K = self.k(x)  # K: batch_size * seq_len * dim_k
        V = self.v(x)  # V: batch_size * seq_len * dim_v
        # Q * K^T / sqrt(dim_k)
        atten = torch.bmm(Q, K.permute(0, 2, 1)) * self._norm_fact  # batch_size * seq_len * seq_len
        # Softmax over the last dimension
        atten = torch.softmax(atten, dim=-1)
        # Weighted sum of the values
        output = torch.bmm(atten, V)  # batch_size * seq_len * dim_v
        return output

# Create a Self_Attention instance
input_dim = 64
dim_k = 32
dim_v = 32
self_attention = Self_Attention(input_dim, dim_k, dim_v)

# Example input of shape batch_size * seq_len * input_dim
batch_size = 2
seq_len = 10
x = torch.randn(batch_size, seq_len, input_dim)

# Forward pass
output = self_attention(x)
print("Input shape:", x.shape)
print("Output shape:", output.shape)

Scaled Dot-Product Attention

import torch
from torch import nn

class ScaledDotProductAttention(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, query, key, value, attention_mask=None):
        # query, key, value shape: (batch_size, seq_len, hidden_size)
        # Attention scores: queries dotted with keys, scaled by sqrt(d_k)
        # key.transpose(-1, -2) swaps the last two dimensions for the matrix product
        # attention_scores shape: (batch_size, seq_len, seq_len)
        d_k = query.size(-1)  # hidden_size
        attention_scores = torch.matmul(query, key.transpose(-1, -2)) / torch.sqrt(
            torch.tensor(d_k, dtype=torch.float32))
        # Apply the attention mask (seq_len, seq_len): masked positions (value 1) get a large negative score
        if attention_mask is not None:
            attention_scores += attention_mask * -1e9
        # Normalize the scores into attention probabilities
        attention_probs = torch.softmax(attention_scores, dim=-1)  # (batch_size, seq_len, seq_len)
        # Weighted sum of the values
        attention_output = torch.matmul(attention_probs, value)    # (batch_size, seq_len, hidden_size)
        return attention_output

def test_attn():
    batch_size = 128
    seq_len = 512
    hidden_size = 1024
    query = torch.randn(batch_size, seq_len, hidden_size)  # (batch_size, seq_len, hidden_size)
    key = torch.randn(batch_size, seq_len, hidden_size)    # (batch_size, seq_len, hidden_size)
    value = torch.randn(batch_size, seq_len, hidden_size)  # (batch_size, seq_len, hidden_size)
    sdpa = ScaledDotProductAttention()
    output = sdpa(query, key, value)
    print("Query shape:", query.shape)
    print("Key shape:", key.shape)
    print("Value shape:", value.shape)
    print("Output shape:", output.shape)

if __name__ == "__main__":
    test_attn()
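
For a sanity check (my own addition, assuming PyTorch 2.0 or later), the unmasked output can be compared against the built-in torch.nn.functional.scaled_dot_product_attention, which applies the same 1 / sqrt(d_k) scaling by default.

import torch
import torch.nn.functional as F

q = torch.randn(2, 16, 64)
k = torch.randn(2, 16, 64)
v = torch.randn(2, 16, 64)

ours = ScaledDotProductAttention()(q, k, v)        # the hand-written module above
builtin = F.scaled_dot_product_attention(q, k, v)
print(torch.allclose(ours, builtin, atol=1e-5))    # expected: True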

MHA

import torch
from torch import nn

class MultiHeadAttention(nn.Module):
    def __init__(self, hidden_size, num_heads):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = hidden_size // num_heads  # per-head dimension; hidden_size must be divisible by num_heads
        # Projection matrices for Q, K, V; dimensions stay equal to hidden_size
        self.q_linear = nn.Linear(hidden_size, hidden_size)
        self.k_linear = nn.Linear(hidden_size, hidden_size)
        self.v_linear = nn.Linear(hidden_size, hidden_size)
        # Output projection applied to the concatenated heads
        self.o_linear = nn.Linear(hidden_size, hidden_size)

    def forward(self, hidden_state, attention_mask=None):
        # hidden_state shape: (batch_size, seq_len, hidden_size)
        batch_size = hidden_state.size(0)
        # Linear projections for Q, K, V
        query = self.q_linear(hidden_state)  # (batch_size, seq_len, hidden_size)
        key = self.k_linear(hidden_state)    # (batch_size, seq_len, hidden_size)
        value = self.v_linear(hidden_state)  # (batch_size, seq_len, hidden_size)
        # Split into heads
        query = self.split_head(query)  # (batch_size, num_heads, seq_len, head_dim)
        key = self.split_head(key)      # (batch_size, num_heads, seq_len, head_dim)
        value = self.split_head(value)  # (batch_size, num_heads, seq_len, head_dim)
        # Scaled dot-product attention scores
        # attention_scores shape: (batch_size, num_heads, seq_len, seq_len)
        attention_scores = torch.matmul(query, key.transpose(-1, -2)) / torch.sqrt(
            torch.tensor(self.head_dim, dtype=torch.float32))
        # Apply the attention mask (seq_len, seq_len): masked positions (value 1) get a large negative score
        if attention_mask is not None:
            attention_scores += attention_mask * -1e9
        # Normalize the scores into attention probabilities
        attention_probs = torch.softmax(attention_scores, dim=-1)  # (batch_size, num_heads, seq_len, seq_len)
        # Weighted sum of the values
        output = torch.matmul(attention_probs, value)  # (batch_size, num_heads, seq_len, head_dim)
        # Concatenate the heads:
        # transpose(1, 2) swaps num_heads and seq_len, then reshape to (batch_size, seq_len, hidden_size)
        output = output.transpose(1, 2).reshape(batch_size, -1, self.head_dim * self.num_heads)
        # Final output projection
        output = self.o_linear(output)  # (batch_size, seq_len, hidden_size)
        return output

    def split_head(self, x):
        batch_size = x.size(0)
        # x shape: (batch_size, seq_len, hidden_size)
        # Split hidden_size into num_heads and head_dim
        return x.reshape(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        # Returned shape: (batch_size, num_heads, seq_len, head_dim)

def test_MHA():
    batch_size = 128
    seq_len = 512
    hidden_size = 1024
    num_heads = 8
    # Random input
    hidden_state = torch.randn(batch_size, seq_len, hidden_size)  # (batch_size, seq_len, hidden_size)
    # Create the multi-head attention module
    mha = MultiHeadAttention(hidden_size, num_heads)
    # Apply multi-head attention
    output = mha(hidden_state)
    print("Input shape:", hidden_state.shape)
    print("Output shape:", output.shape)

if __name__ == "__main__":
    test_MHA()
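
One usage note: in this implementation the mask marks blocked positions with 1 (those scores then receive a -1e9 penalty), so a causal mask for autoregressive decoding can be built with torch.triu. A minimal sketch of my own under that convention:

import torch

hidden_size, num_heads, seq_len = 64, 8, 10
mha = MultiHeadAttention(hidden_size, num_heads)  # the hand-written module above
x = torch.randn(2, seq_len, hidden_size)

# 1 above the diagonal: future positions are masked; broadcasts over the batch and head dimensions
causal_mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1)
out = mha(x, attention_mask=causal_mask)
print(out.shape)  # torch.Size([2, 10, 64])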

Softmax

import torch

def softmax(x):
    # Subtract the row maximum for numerical stability (does not change the result)
    x_max = x.max(dim=-1, keepdim=True).values
    # Exponentiate
    exp_x = torch.exp(x - x_max)
    # Sum of exponentials over the last dimension
    sum_exp_x = torch.sum(exp_x, dim=-1, keepdim=True)
    # Divide each exponential by the sum
    softmax_x = exp_x / sum_exp_x
    return softmax_x

# Example tensor
x = torch.tensor([1.0, 2.0, 3.0])
# Apply the hand-written softmax
softmax_x = softmax(x)
print(softmax_x)
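
A quick check against the built-in (my own sketch):

x = torch.randn(2, 5)
print(torch.allclose(softmax(x), torch.softmax(x, dim=-1)))  # expected: True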

MSE

import torch

def mse_loss(y_true, y_pred):
    # Squared differences
    squared_diff = (y_true - y_pred) ** 2
    # Mean of the squared differences
    return torch.mean(squared_diff)

# Test the MSE loss
y_true = torch.tensor([3.0, -0.5, 2.0, 7.0])
y_pred = torch.tensor([2.5, 0.0, 2.0, 8.0])

loss = mse_loss(y_true, y_pred)
print(f"Mean Squared Error: {loss.item()}")

Cross entropy

import torch

def cross_entropy_loss(y_true, y_pred):
    # Clamp predictions to avoid log(0)
    epsilon = 1e-12
    y_pred = torch.clamp(y_pred, epsilon, 1. - epsilon)
    # Cross entropy per sample
    ce_loss = -torch.sum(y_true * torch.log(y_pred), dim=-1)
    # Average over the batch
    return torch.mean(ce_loss)

# One-hot targets and predicted probabilities
y_true = torch.tensor([[1, 0, 0], [0, 1, 0]], dtype=torch.float32)
y_pred = torch.tensor([[0.8, 0.1, 0.1], [0.2, 0.7, 0.1]], dtype=torch.float32)

loss = cross_entropy_loss(y_true, y_pred)
print(f"Cross-Entropy Loss: {loss.item()}")

That wraps up this post on hand-written LLM components; hopefully it serves as a useful reference for developers.



