NVME Doorbell 寄存器 数据请求时doorbell 处理

2024-05-08 12:12

本文主要是介绍NVME Doorbell 寄存器 数据请求时doorbell 处理,希望对大家解决编程问题提供一定的参考价值,需要的开发者们随着小编来一起学习吧!

3.NVMe寄存器配置
3.1 寄存器定义
NVMe寄存器主要分为两部分,一部分定义了Controller整体属性,一部分用来存放每组队列的头尾DB寄存器。

CAP——控制器能力,定义了内存页大小的最大最小值、支持的I/O指令集、DB寄存器步长、等待时间界限、仲裁机制、队列是否物理上连续、队列大小;
VS——版本号,定义了控制器实现NVMe协议的版本号;
INTMS——中断掩码,每个bit对应一个中断向量,使用MSI-X中断时,此寄存器无效;
INTMC——中断有效,每个bit对应一个中断向量,使用MSI-X中断时,此寄存器无效;
CC——控制器配置,定义了I/O SQ和CQ队列元素大小、关机状态提醒、仲裁机制、内存页大小、支持的I/O指令集、使能;
CSTS——控制器状态,包括关机状态、控制器致命错误、就绪状态;
AQA——Admin 队列属性,包括SQ大小和CQ大小;
ASQ——Admin SQ基地址;
ACQ——Admin CQ基地址;
1000h之后的寄存器定义了队列的头、尾DB寄存器。
3.2寄存器理解
CAP寄存器标识的是Controller具有多少能力,而CC寄存器则是指当前Controller选择了哪些能力,可以理解为CC是CAP的一个子集;如果重启(reset)的话,可以更换CC配置;
CC.EN置一,表示Controller已经可以开始处理NVM命令,从1到0表示Controller重启;
CC.EN与CSTS.RDY关系密切,CSTS.RDY总是在CC.EN之后由Controller改变,其他不符合执行顺序的操作都将产生未定义的行为;
Admin队列由host直接创建,AQA、ASQ、ACQ三个寄存器标识了Admin队列,而其他I/O队列则由Admin命令创建(eg,创建I/O CQ命令);
Admin队列的头、尾DB寄存器标识为0,其他I/O队列标识由host按照一定规则分配;只有16bit的有效位,是因为队列深度最大64K。
实际的物理设备CAP.DSTRD值为0,dev->db_stride为1,之后分析中默认db_stride为1
                        
原文链接:https://blog.csdn.net/qq_39021670/article/details/114896973

由dev->dbs的使用方式可知,每一个DB寄存器对中,前4个字节为SQ Tail DB,后4个字节为CQ Head DB。

/*
 * Write the SQ tail doorbell if asked to, or if the next command would wrap.
 *
 * Deferring the doorbell write (write_sq == false) batches MMIO writes; we
 * only ring early when the tail is about to wrap past the last value we
 * actually wrote (last_sq_tail).
 */
static inline void nvme_write_sq_db(struct nvme_queue *nvmeq, bool write_sq)
{
	if (!write_sq) {
		u16 next_tail = nvmeq->sq_tail + 1;

		/* SQ is a ring: wrap the tail at the queue depth. */
		if (next_tail == nvmeq->q_depth)
			next_tail = 0;
		if (next_tail != nvmeq->last_sq_tail)
			return;
	}

	if (nvme_dbbuf_update_and_check_event(nvmeq->sq_tail,
			nvmeq->dbbuf_sq_db, nvmeq->dbbuf_sq_ei))
		/* First 4 bytes of the doorbell pair hold the SQ tail. */
		writel(nvmeq->sq_tail, nvmeq->q_db);
	nvmeq->last_sq_tail = nvmeq->sq_tail;
}
/* Tell the controller how far we have consumed the completion queue. */
static inline void nvme_ring_cq_doorbell(struct nvme_queue *nvmeq)
{
	u16 head = nvmeq->cq_head;

	if (nvme_dbbuf_update_and_check_event(head, nvmeq->dbbuf_cq_db,
					      nvmeq->dbbuf_cq_ei))
		/*
		 * Second half of the doorbell pair holds the CQ head;
		 * q_db + db_stride skips past the SQ tail doorbell.
		 */
		writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
}

/*
 * Interrupt handler for an NVMe queue: reap new completion entries and
 * hand the finished requests back to the block layer.
 */
static irqreturn_t nvme_irq(int irq, void *data)
{
	struct nvme_queue *nvmeq = data;
	irqreturn_t ret = IRQ_NONE;
	u16 start, end;

	/*
	 * The rmb/wmb pair ensures we see all updates from a previous run of
	 * the irq handler, even if that was on another CPU.
	 */
	rmb();
	if (nvmeq->cq_head != nvmeq->last_cq_head)
		ret = IRQ_HANDLED;
	/* Walk the CQ up to the current tail, advancing cq_head. */
	nvme_process_cq(nvmeq, &start, &end, -1);
	nvmeq->last_cq_head = nvmeq->cq_head;
	wmb();

	if (start != end) {
		/* Complete every CQ entry reaped above. */
		nvme_complete_cqes(nvmeq, start, end);
		return IRQ_HANDLED;
	}

	return ret;
}

依次取出 SSD 中已经返回的完成项,然后将 CQ 的 head 写入 Doorbell 寄存器

/*
 * Consume all pending completion entries, returning the [*start, *end)
 * index window that was reaped and ringing the CQ head doorbell if any
 * progress was made.  @tag == -1U counts every entry; otherwise only
 * entries whose command_id matches @tag are counted in the return value.
 */
static inline int nvme_process_cq(struct nvme_queue *nvmeq, u16 *start,
		u16 *end, unsigned int tag)
{
	int found = 0;

	*start = nvmeq->cq_head;
	while (nvme_cqe_pending(nvmeq)) {
		if (tag == -1U || nvmeq->cqes[nvmeq->cq_head].command_id == tag)
			found++;
		nvme_update_cq_head(nvmeq);
	}
	*end = nvmeq->cq_head;

	/* Only touch the doorbell when the head actually moved. */
	if (*start != *end)
		nvme_ring_cq_doorbell(nvmeq);
	return found;
}

/*
 * Publish the updated CQ head to the controller via the CQ head doorbell
 * (the second register of the queue's doorbell pair).
 */
static inline void nvme_ring_cq_doorbell(struct nvme_queue *nvmeq)
{
	u16 head = nvmeq->cq_head;

	if (nvme_dbbuf_update_and_check_event(head, nvmeq->dbbuf_cq_db,
					      nvmeq->dbbuf_cq_ei))
		writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
}

依次处理cq 中的数据返回给block 层

/* Record the NVMe status/result on the request and complete it upward. */
static inline void nvme_end_request(struct request *req, __le16 status,
		union nvme_result result)
{
	struct nvme_request *rq = nvme_req(req);

	/* Bit 0 of the CQE status is the phase tag; shift it out. */
	rq->status = le16_to_cpu(status) >> 1;
	rq->result = result;
	/* inject error when permitted by fault injection framework */
	nvme_should_fail(req);
	/* Hand the request back to the block layer. */
	blk_mq_complete_request(req);
}

/* Process one completion queue entry at index @idx. */
static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
{
	volatile struct nvme_completion *cqe = &nvmeq->cqes[idx];
	struct request *req;

	/*
	 * AEN requests are special as they don't time out and can
	 * survive any kind of queue freeze and often don't respond to
	 * aborts.  We don't even bother to allocate a struct request
	 * for them but rather special case them here.
	 */
	if (unlikely(nvmeq->qid == 0 &&
			cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) {
		nvme_complete_async_event(&nvmeq->dev->ctrl,
				cqe->status, &cqe->result);
		return;
	}

	/* Map the CQE's command_id (the blk-mq tag) back to its request. */
	req = blk_mq_tag_to_rq(nvme_queue_tagset(nvmeq), cqe->command_id);
	if (unlikely(!req)) {
		dev_warn(nvmeq->dev->ctrl.device,
			"invalid id %d completed on queue %d\n",
			cqe->command_id, le16_to_cpu(cqe->sq_id));
		return;
	}

	trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail);
	nvme_end_request(req, cqe->status, cqe->result);
}

/* Complete every CQ entry in [start, end), wrapping at the queue depth. */
static void nvme_complete_cqes(struct nvme_queue *nvmeq, u16 start, u16 end)
{
	while (start != end) {
		nvme_handle_cqe(nvmeq, start);
		if (++start == nvmeq->q_depth)
			start = 0;
	}
}

static const struct blk_mq_ops nvme_mq_admin_ops = {.queue_rq       = nvme_queue_rq,.complete       = nvme_pci_complete_rq,.init_hctx      = nvme_admin_init_hctx,.init_request   = nvme_init_request,.timeout        = nvme_timeout,
};static const struct blk_mq_ops nvme_mq_ops = {.queue_rq       = nvme_queue_rq,.complete       = nvme_pci_complete_rq,.commit_rqs     = nvme_commit_rqs,.init_hctx      = nvme_init_hctx,.init_request   = nvme_init_request,.map_queues     = nvme_pci_map_queues,.timeout        = nvme_timeout,.poll           = nvme_poll,
};
 

admin  queue

nvme_queue_rq

io queue 

nvme_queue_rq

nvme_commit_rqs

/*
 * Issue a single request straight to the driver's ->queue_rq(), bypassing
 * the dispatch list, and classify the result for the caller.
 */
static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
					    struct request *rq,
					    blk_qc_t *cookie, bool last)
{
	struct request_queue *q = rq->q;
	struct blk_mq_queue_data bd = {
		.rq = rq,
		.last = last,
	};
	blk_qc_t new_cookie;
	blk_status_t ret;

	new_cookie = request_to_qc_t(hctx, rq);

	/*
	 * For OK queue, we are done. For error, caller may kill it.
	 * Any other error (busy), just add it to our list as we
	 * previously would have done.
	 */
	ret = q->mq_ops->queue_rq(hctx, &bd);
	switch (ret) {
	case BLK_STS_OK:
		blk_mq_update_dispatch_busy(hctx, false);
		*cookie = new_cookie;
		break;
	case BLK_STS_RESOURCE:
	case BLK_STS_DEV_RESOURCE:
		/* Driver is busy: mark it so and put the request back. */
		blk_mq_update_dispatch_busy(hctx, true);
		__blk_mq_requeue_request(rq);
		break;
	default:
		blk_mq_update_dispatch_busy(hctx, false);
		*cookie = BLK_QC_T_NONE;
		break;
	}

	return ret;
}

*/
bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,bool got_budget)
{struct blk_mq_hw_ctx *hctx;struct request *rq, *nxt;bool no_tag = false;int errors, queued;blk_status_t ret = BLK_STS_OK;bool no_budget_avail = false;if (list_empty(list))return false;WARN_ON(!list_is_singular(list) && got_budget);/** Now process all the entries, sending them to the driver.*/errors = queued = 0;do {struct blk_mq_queue_data bd;rq = list_first_entry(list, struct request, queuelist);hctx = rq->mq_hctx;if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) {blk_mq_put_driver_tag(rq);no_budget_avail = true;break;}if (!blk_mq_get_driver_tag(rq)) {/** The initial allocation attempt failed, so we need to* rerun the hardware queue when a tag is freed. The* waitqueue takes care of that. If the queue is run* before we add this entry back on the dispatch list,* we'll re-run it below.*/if (!blk_mq_mark_tag_wait(hctx, rq)) {blk_mq_put_dispatch_budget(hctx);/** For non-shared tags, the RESTART check* will suffice.*/if (hctx->flags & BLK_MQ_F_TAG_SHARED)no_tag = true;break;}}list_del_init(&rq->queuelist);bd.rq = rq;/** Flag last if we have no more requests, or if we have more* but can't assign a driver tag to it.*/if (list_empty(list))bd.last = true;else {nxt = list_first_entry(list, struct request, queuelist);bd.last = !blk_mq_get_driver_tag(nxt);}//下发ioret = q->mq_ops->queue_rq(hctx, &bd);if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {blk_mq_handle_dev_resource(rq, list);break;}if (unlikely(ret != BLK_STS_OK)) {errors++;blk_mq_end_request(rq, BLK_STS_IOERR);continue;}queued++;} while (!list_empty(list));hctx->dispatched[queued_to_index(queued)]++;/** Any items that need requeuing? 
Stuff them into hctx->dispatch,* that is where we will continue on next queue run.*/if (!list_empty(list)) {bool needs_restart;/** If we didn't flush the entire list, we could have told* the driver there was more coming, but that turned out to* be a lie.*/if (q->mq_ops->commit_rqs)//nvme io commitq->mq_ops->commit_rqs(hctx);spin_lock(&hctx->lock);list_splice_tail_init(list, &hctx->dispatch);spin_unlock(&hctx->lock);/** Order adding requests to hctx->dispatch and checking* SCHED_RESTART flag. The pair of this smp_mb() is the one* in blk_mq_sched_restart(). Avoid restart code path to* miss the new added requests to hctx->dispatch, meantime* SCHED_RESTART is observed here.*/smp_mb();/** If SCHED_RESTART was set by the caller of this function and* it is no longer set that means that it was cleared by another* thread and hence that a queue rerun is needed.** If 'no_tag' is set, that means that we failed getting* a driver tag with an I/O scheduler attached. If our dispatch* waitqueue is no longer active, ensure that we run the queue* AFTER adding our entries back to the list.** If no I/O scheduler has been configured it is possible that* the hardware queue got stopped and restarted before requests* were pushed back onto the dispatch list. Rerun the queue to* avoid starvation. Notes:* - blk_mq_run_hw_queue() checks whether or not a queue has*   been stopped before rerunning a queue.* - Some but not all block drivers stop a queue before*   returning BLK_STS_RESOURCE. Two exceptions are scsi-mq*   and dm-rq.** If driver returns BLK_STS_RESOURCE and SCHED_RESTART* bit is set, run queue after a delay to avoid IO stalls* that could otherwise occur if the queue is idle.  
We'll do* similar if we couldn't get budget and SCHED_RESTART is set.*/needs_restart = blk_mq_sched_needs_restart(hctx);if (!needs_restart ||(no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))blk_mq_run_hw_queue(hctx, true);else if (needs_restart && (ret == BLK_STS_RESOURCE ||no_budget_avail))blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);blk_mq_update_dispatch_busy(hctx, true);return false;} elseblk_mq_update_dispatch_busy(hctx, false);/** If the host/device is unable to accept more work, inform the* caller of that.*/if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)return false;return (queued + errors) != 0;
}

这篇关于NVME Doorbell 寄存器 数据请求时doorbell 处理的文章就介绍到这儿,希望我们推荐的文章对编程师们有所帮助!



http://www.chinasem.cn/article/970278

相关文章

大模型研发全揭秘:客服工单数据标注的完整攻略

在人工智能(AI)领域,数据标注是模型训练过程中至关重要的一步。无论你是新手还是有经验的从业者,掌握数据标注的技术细节和常见问题的解决方案都能为你的AI项目增添不少价值。在电信运营商的客服系统中,工单数据是客户问题和解决方案的重要记录。通过对这些工单数据进行有效标注,不仅能够帮助提升客服自动化系统的智能化水平,还能优化客户服务流程,提高客户满意度。本文将详细介绍如何在电信运营商客服工单的背景下进行

基于MySQL Binlog的Elasticsearch数据同步实践

一、为什么要做 随着马蜂窝的逐渐发展,我们的业务数据越来越多,单纯使用 MySQL 已经不能满足我们的数据查询需求,例如对于商品、订单等数据的多维度检索。 使用 Elasticsearch 存储业务数据可以很好的解决我们业务中的搜索需求。而数据进行异构存储后,随之而来的就是数据同步的问题。 二、现有方法及问题 对于数据同步,我们目前的解决方案是建立数据中间表。把需要检索的业务数据,统一放到一张M

关于数据埋点,你需要了解这些基本知识

产品汪每天都在和数据打交道,你知道数据来自哪里吗? 移动app端内的用户行为数据大多来自埋点,了解一些埋点知识,能和数据分析师、技术侃大山,参与到前期的数据采集,更重要是让最终的埋点数据能为我所用,否则可怜巴巴等上几个月是常有的事。   埋点类型 根据埋点方式,可以区分为: 手动埋点半自动埋点全自动埋点 秉承“任何事物都有两面性”的道理:自动程度高的,能解决通用统计,便于统一化管理,但个性化定

无人叉车3d激光slam多房间建图定位异常处理方案-墙体画线地图切分方案

墙体画线地图切分方案 针对问题:墙体两侧特征混淆误匹配,导致建图和定位偏差,表现为过门跳变、外月台走歪等 ·解决思路:预期的根治方案IGICP需要较长时间完成上线,先使用切分地图的工程化方案,即墙体两侧切分为不同地图,在某一侧只使用该侧地图进行定位 方案思路 切分原理:切分地图基于关键帧位置,而非点云。 理论基础:光照是直线的,一帧点云必定只能照射到墙的一侧,无法同时照到两侧实践考虑:关

使用SecondaryNameNode恢复NameNode的数据

1)需求: NameNode进程挂了并且存储的数据也丢失了,如何恢复NameNode 此种方式恢复的数据可能存在小部分数据的丢失。 2)故障模拟 (1)kill -9 NameNode进程 [lytfly@hadoop102 current]$ kill -9 19886 (2)删除NameNode存储的数据(/opt/module/hadoop-3.1.4/data/tmp/dfs/na

异构存储(冷热数据分离)

异构存储主要解决不同的数据,存储在不同类型的硬盘中,达到最佳性能的问题。 异构存储Shell操作 (1)查看当前有哪些存储策略可以用 [lytfly@hadoop102 hadoop-3.1.4]$ hdfs storagepolicies -listPolicies (2)为指定路径(数据存储目录)设置指定的存储策略 hdfs storagepolicies -setStoragePo

Hadoop集群数据均衡之磁盘间数据均衡

生产环境,由于硬盘空间不足,往往需要增加一块硬盘。刚加载的硬盘没有数据时,可以执行磁盘数据均衡命令。(Hadoop3.x新特性) plan后面带的节点的名字必须是已经存在的,并且是需要均衡的节点。 如果节点不存在,会报如下错误: 如果节点只有一个硬盘的话,不会创建均衡计划: (1)生成均衡计划 hdfs diskbalancer -plan hadoop102 (2)执行均衡计划 hd

【Prometheus】PromQL向量匹配实现不同标签的向量数据进行运算

✨✨ 欢迎大家来到景天科技苑✨✨ 🎈🎈 养成好习惯,先赞后看哦~🎈🎈 🏆 作者简介:景天科技苑 🏆《头衔》:大厂架构师,华为云开发者社区专家博主,阿里云开发者社区专家博主,CSDN全栈领域优质创作者,掘金优秀博主,51CTO博客专家等。 🏆《博客》:Python全栈,前后端开发,小程序开发,人工智能,js逆向,App逆向,网络系统安全,数据分析,Django,fastapi

烟火目标检测数据集 7800张 烟火检测 带标注 voc yolo

一个包含7800张带标注图像的数据集,专门用于烟火目标检测,是一个非常有价值的资源,尤其对于那些致力于公共安全、事件管理和烟花表演监控等领域的人士而言。下面是对此数据集的一个详细介绍: 数据集名称:烟火目标检测数据集 数据集规模: 图片数量:7800张类别:主要包含烟火类目标,可能还包括其他相关类别,如烟火发射装置、背景等。格式:图像文件通常为JPEG或PNG格式;标注文件可能为X

【生成模型系列(初级)】嵌入(Embedding)方程——自然语言处理的数学灵魂【通俗理解】

【通俗理解】嵌入(Embedding)方程——自然语言处理的数学灵魂 关键词提炼 #嵌入方程 #自然语言处理 #词向量 #机器学习 #神经网络 #向量空间模型 #Siri #Google翻译 #AlexNet 第一节:嵌入方程的类比与核心概念【尽可能通俗】 嵌入方程可以被看作是自然语言处理中的“翻译机”,它将文本中的单词或短语转换成计算机能够理解的数学形式,即向量。 正如翻译机将一种语言