本文主要是介绍用 C 语言进行大模型推理:探索 llama2.c 仓库(二),希望对大家解决编程问题提供一定的参考价值,需要的开发者们随着小编来一起学习吧!
- 前提
- 如何构建一个Transformer Model
- 模型定义
- 模型初始化
- 如何构建 tokenizer 和 sampler
- 如何进行推理
- 总结
上一节我们介绍了 llama2.c 中如何对 Hugging Face 的权重进行处理,得到了 llama2.c 所需格式的权重文件和 tokenizer.bin。这一节我们分析一下 llama2.c 是如何解析这两个 .bin 文件的。
用 C 语言进行大模型推理:探索 llama2.c 仓库(一)
如何构建一个Transformer Model
按照最简单的理解,我们可以使用 C 语言构建一个 Transformer Model,然后将两个 .bin 文件按照格式填进去即可。那这个 Transformer Model 应该是一个什么样的数据结构,或者说是一个什么样的组织架构呢?在 C 语言中是没有 class 这个概念的,我们最常见的也就是结构体了,而且结构体里只能定义变量,不能定义函数。所以操作 Transformer Model 的那些算子又该如何实现呢?带着这些问题(或者你还有其他的问题),我们一步一步来看 llama2.c 中是如何实现的。
/* Hyperparameters of the architecture (the blueprint of the model). */
typedef struct {
    int dim;        // transformer dimension
    int hidden_dim; // for ffn layers
    int n_layers;   // number of layers
    int n_heads;    // number of query heads
    int n_kv_heads; // number of key/value heads (can be < query heads because
                    // of multiquery)
    int vocab_size; // vocabulary size, usually 256 (byte-level)
    int seq_len;    // max sequence length
} Config;

/*
 * Model weights. Every field is a pointer into one contiguous float buffer;
 * memory_map_weights() assigns them, nothing here is allocated per field.
 */
typedef struct {
    // token embedding table
    float *token_embedding_table; // (vocab_size, dim)
    // weights for rmsnorms
    float *rms_att_weight; // (layer, dim) rmsnorm weights
    float *rms_ffn_weight; // (layer, dim)
    // weights for matmuls. note dim == n_heads * head_size
    float *wq; // (layer, dim, n_heads * head_size)
    float *wk; // (layer, dim, n_kv_heads * head_size)
    float *wv; // (layer, dim, n_kv_heads * head_size)
    float *wo; // (layer, n_heads * head_size, dim)
    // weights for ffn
    float *w1; // (layer, hidden_dim, dim)
    float *w2; // (layer, dim, hidden_dim)
    float *w3; // (layer, hidden_dim, dim)
    // final rmsnorm
    float *rms_final_weight; // (dim,)
    // (optional) classifier weights for the logits, on the last layer;
    // aliases token_embedding_table when the weights are shared
    float *wcls;
} TransformerWeights;

/*
 * Scratch buffers for the "wave" of activations in the forward pass.
 * All buffers except k and v are allocated by malloc_run_state().
 */
typedef struct {
    // current wave of activations
    float *x;      // activation at current time stamp (dim,)
    float *xb;     // same, but inside a residual branch (dim,)
    float *xb2;    // an additional buffer just for convenience (dim,)
    float *hb;     // buffer for hidden dimension in the ffn (hidden_dim,)
    float *hb2;    // buffer for hidden dimension in the ffn (hidden_dim,)
    float *q;      // query (dim,)
    float *k;      // key (kv_dim,): not owned, points into key_cache
    float *v;      // value (kv_dim,): not owned, points into value_cache
    float *att;    // buffer for scores/attention values (n_heads, seq_len)
    float *logits; // output logits (vocab_size,)
    // kv cache, where kv_dim = dim * n_kv_heads / n_heads
    float *key_cache;   // (layer, seq_len, kv_dim)
    float *value_cache; // (layer, seq_len, kv_dim)
} RunState;

/* Everything needed to run the model: blueprint, weights and run state. */
typedef struct {
    Config config;              // the hyperparameters of the architecture (the blueprint)
    TransformerWeights weights; // the weights of the model
    RunState state;             // buffers for the "wave" of activations in the forward pass
    // some more state needed to properly clean up the memory mapping (sigh)
    int fd;           // file descriptor for memory mapping
    float *data;      // memory mapped data pointer
    size_t file_size; // size of the checkpoint file in bytes
} Transformer;
/*
 * Point every field of `w` at its tensor inside the contiguous float buffer
 * that starts at `ptr`. Nothing is copied or allocated.
 *
 * The tensors are consumed in this fixed order:
 *   token_embedding_table, rms_att_weight, wq, wk, wv, wo, rms_ffn_weight,
 *   w1, w2, w3, rms_final_weight, two skipped RoPE tables, then wcls.
 *
 * w              - weight struct to fill in
 * p              - hyperparameters that determine each tensor's size
 * ptr            - first float of the weight data
 * shared_weights - non-zero: wcls aliases token_embedding_table;
 *                  zero: wcls is the tensor that follows the skipped tables
 */
void memory_map_weights(TransformerWeights *w, Config *p, float *ptr,
                        int shared_weights) {
    int head_size = p->dim / p->n_heads;
    // make sure the multiplications below are done in 64bit to fit the
    // parameter counts of 13B+ models
    unsigned long long n_layers = p->n_layers;

    w->token_embedding_table = ptr;
    // widen before multiplying so this product is 64-bit like the ones below
    ptr += (long long)p->vocab_size * p->dim;
    w->rms_att_weight = ptr;
    ptr += n_layers * p->dim;
    w->wq = ptr;
    ptr += n_layers * p->dim * (p->n_heads * head_size);
    w->wk = ptr;
    ptr += n_layers * p->dim * (p->n_kv_heads * head_size);
    w->wv = ptr;
    ptr += n_layers * p->dim * (p->n_kv_heads * head_size);
    w->wo = ptr;
    ptr += n_layers * (p->n_heads * head_size) * p->dim;
    w->rms_ffn_weight = ptr;
    ptr += n_layers * p->dim;
    w->w1 = ptr;
    ptr += n_layers * p->dim * p->hidden_dim;
    w->w2 = ptr;
    ptr += n_layers * p->hidden_dim * p->dim;
    w->w3 = ptr;
    ptr += n_layers * p->dim * p->hidden_dim;
    w->rms_final_weight = ptr;
    ptr += p->dim;
    ptr += p->seq_len * head_size / 2; // skip what used to be freq_cis_real (for RoPE)
    ptr += p->seq_len * head_size / 2; // skip what used to be freq_cis_imag (for RoPE)
    w->wcls = shared_weights ? w->token_embedding_table : ptr;
}
/*
 * Allocate the zero-initialised activation buffers and the kv cache in `s`,
 * sized from the hyperparameters in `p`.
 *
 * s->k and s->v are not touched here.
 * If any allocation fails, prints "malloc failed!" to stderr and exits with
 * EXIT_FAILURE; the function never returns with a NULL buffer.
 */
void malloc_run_state(RunState *s, Config *p) {
    // we calloc instead of malloc to keep valgrind happy
    int kv_dim = (p->dim * p->n_kv_heads) / p->n_heads;
    // compute element counts in size_t so large models cannot overflow int
    size_t kv_cache_len = (size_t)p->n_layers * p->seq_len * kv_dim;
    size_t att_len = (size_t)p->n_heads * p->seq_len;

    s->x = calloc(p->dim, sizeof(float));
    s->xb = calloc(p->dim, sizeof(float));
    s->xb2 = calloc(p->dim, sizeof(float));
    s->hb = calloc(p->hidden_dim, sizeof(float));
    s->hb2 = calloc(p->hidden_dim, sizeof(float));
    s->q = calloc(p->dim, sizeof(float));
    s->key_cache = calloc(kv_cache_len, sizeof(float));
    s->value_cache = calloc(kv_cache_len, sizeof(float));
    s->att = calloc(att_len, sizeof(float));
    s->logits = calloc(p->vocab_size, sizeof(float));

    // ensure all mallocs went fine
    if (!s->x || !s->xb || !s->xb2 || !s->hb || !s->hb2 || !s->q ||
        !s->key_cache || !s->value_cache || !s->att || !s->logits) {
        fprintf(stderr, "malloc failed!\n");
        exit(EXIT_FAILURE);
    }
}
如何构建 tokenizer 和 sampler
// forward all the layersfor (unsigned long long l = 0; l < p->n_layers; l++) {// attention rmsnormrmsnorm(s->xb, x, w->rms_att_weight + l * dim, dim);// key and value point to the kv cacheint loff = l * p->seq_len * kv_dim; // kv cache layer offset for conveniences->k = s->key_cache + loff + pos * kv_dim;s->v = s->value_cache + loff + pos * kv_dim;// qkv matmuls for this positionmatmul(s->q, s->xb, w->wq + l * dim * dim, dim, dim);matmul(s->k, s->xb, w->wk + l * dim * kv_dim, dim, kv_dim);matmul(s->v, s->xb, w->wv + l * dim * kv_dim, dim, kv_dim);// RoPE relative positional encoding: complex-valued rotate q and k in each// headfor (int i = 0; i < dim; i += 2) {int head_dim = i % head_size;float freq = 1.0f / powf(10000.0f, head_dim / (float)head_size);float val = pos * freq;float fcr = cosf(val);float fci = sinf(val);int rotn = i < kv_dim ? 2 : 1; // how many vectors? 2 = q & k, 1 = q onlyfor (int v = 0; v < rotn; v++) {float *vec =v == 0 ? s->q : s->k; // the vector to rotate (query or key)float v0 = vec[i];float v1 = vec[i + 1];vec[i] = v0 * fcr - v1 * fci;vec[i + 1] = v0 * fci + v1 * fcr;}}// multihead attention. iterate over all headsint h;
#pragma omp parallel for private(h)for (h = 0; h < p->n_heads; h++) {// get the query vector for this headfloat *q = s->q + h * head_size;// attention scores for this headfloat *att = s->att + h * p->seq_len;// iterate over all timesteps, including the current onefor (int t = 0; t <= pos; t++) {// get the key vector for this head and at this timestepfloat *k = s->key_cache + loff + t * kv_dim + (h / kv_mul) * head_size;// calculate the attention score as the dot product of q and kfloat score = 0.0f;for (int i = 0; i < head_size; i++) {score += q[i] * k[i];}score /= sqrtf(head_size);// save the score to the attention bufferatt[t] = score;}// softmax the scores to get attention weights, from 0..pos inclusivelysoftmax(att, pos + 1);// weighted sum of the values, store back into xbfloat *xb = s->xb + h * head_size;memset(xb, 0, head_size * sizeof(float));for (int t = 0; t <= pos; t++) {// get the value vector for this head and at this timestepfloat *v =s->value_cache + loff + t * kv_dim + (h / kv_mul) * head_size;// get the attention weight for this timestepfloat a = att[t];// accumulate the weighted value into xbfor (int i = 0; i < head_size; i++) {xb[i] += a * v[i];}}}// final matmul to get the output of the attentionmatmul(s->xb2, s->xb, w->wo + l * dim * dim, dim, dim);// residual connection back into xfor (int i = 0; i < dim; i++) {x[i] += s->xb2[i];}// ffn rmsnormrmsnorm(s->xb, x, w->rms_ffn_weight + l * dim, dim);// Now for FFN in PyTorch we have: self.w2(F.silu(self.w1(x)) * self.w3(x))// first calculate self.w1(x) and self.w3(x)matmul(s->hb, s->xb, w->w1 + l * dim * hidden_dim, dim, hidden_dim);matmul(s->hb2, s->xb, w->w3 + l * dim * hidden_dim, dim, hidden_dim);// SwiGLU non-linearityfor (int i = 0; i < hidden_dim; i++) {float val = s->hb[i];// silu(x)=x*σ(x), where σ(x) is the logistic sigmoidval *= (1.0f / (1.0f + expf(-val)));// elementwise multiply with w3(x)val *= s->hb2[i];s->hb[i] = val;}// final matmul to get the output of the ffnmatmul(s->xb, s->hb, 
w->w2 + l * dim * hidden_dim, hidden_dim, dim);// residual connectionfor (int i = 0; i < dim; i++) {x[i] += s->xb[i];}}
这篇关于用 C 语言进行大模型推理:探索 llama2.c 仓库(二)的文章就介绍到这儿,希望我们推荐的文章对编程师们有所帮助!