本文主要是介绍RDMA驱动学习(一)- 用户态到内核态的过程,希望对大家解决编程问题提供一定的参考价值,需要的开发者们随着小编来一起学习吧!
最近梳理了一下rdma用户态到内核态传参的流程,会基于ibv_create_cq接口介绍一下ioctl版本的流程,代码基于mlnx-ofa_kernel-5.4。
用户态
用户态和内核态传的参数包含两部分,用户执行create_cq会传一些标准的参数,比如队列长度cqe,ibv_comp_channel channel等,还有另外厂商自己的参数,比如mlx5会传cq buffer的地址等。
用户态中首先是通过cmd记录厂商特有的参数,但是用户态和内核态实际进行传参是通过ibv_command_buffer,会通过attr记录每个参数,因此cmd之后会被转成ibv_command_buffer传给内核。
ibv_command_buffer
create_cq首先初始化了一个ibv_command_buffer driver_attrs
/*
 * Excerpt of create_cq(): only the declaration of the driver attribute
 * buffer is shown here, the rest of the body is elided ("...").
 */
static struct ibv_cq_ex *create_cq(struct ibv_context *context,
				   const struct ibv_cq_init_attr_ex *cq_attr,
				   int cq_alloc_flags,
				   struct mlx5dv_cq_init_attr *mlx5cq_attr)
{
	/*
	 * Declares the on-stack command buffer "driver_attrs" for the
	 * CQ object / CQ_CREATE method, sized for 1 attribute and not
	 * linked to any other buffer (NULL).
	 */
	DECLARE_COMMAND_BUFFER_LINK(driver_attrs, UVERBS_OBJECT_CQ,
				    UVERBS_METHOD_CQ_CREATE, 1,
				    NULL);
	...
}
ibv_command_buffer用于存放各种属性,会被组织成为一个链表,next指向下一个ibv_command_buffer,ib_uverbs_ioctl_hdr中记录了所有的attr
/*
 * A command buffer: bookkeeping fields followed by the ioctl header, whose
 * trailing flexible array holds the attributes. Buffers can be chained
 * through ->next.
 */
struct ibv_command_buffer {
	struct ibv_command_buffer *next;
	struct ib_uverbs_attr *next_attr;
	struct ib_uverbs_attr *last_attr;
	/*
	 * Used by the legacy write interface to keep track of where the UHW
	 * buffer is located and the 'headroom' space that the common code
	 * uses to construct the command header and common command struct
	 * directly before the drivers' UHW.
	 */
	uint8_t uhw_in_idx;
	uint8_t uhw_out_idx;
	uint8_t uhw_in_headroom_dwords;
	uint8_t uhw_out_headroom_dwords;

	uint8_t buffer_error:1;
	/*
	 * These flags control what execute_ioctl_fallback does if the kernel
	 * does not support ioctl
	 */
	uint8_t fallback_require_ex:1;
	uint8_t fallback_ioctl_only:1;
	struct ib_uverbs_ioctl_hdr hdr;
};

/* Header handed to the kernel by the ioctl; attrs[] follows it directly. */
struct ib_uverbs_ioctl_hdr {
	__u16 length;
	__u16 object_id;
	__u16 method_id;
	__u16 num_attrs;
	__aligned_u64 reserved1;
	__u32 driver_id;
	__u32 reserved2;
	struct ib_uverbs_attr attrs[];
};

/* One attribute (one parameter) of a method call. */
struct ib_uverbs_attr {
	__u16 attr_id;		/* command specific type attribute */
	__u16 len;		/* only for pointers and IDRs array */
	__u16 flags;		/* combination of UVERBS_ATTR_F_XXXX */
	union {
		struct {
			__u8 elem_id;
			__u8 reserved;
		} enum_data;
		__u16 reserved;
	} attr_data;
	union {
		/*
		 * ptr to command, inline data, idr/fd or
		 * ptr to __u32 array of IDRs
		 */
		__aligned_u64 data;
		/* Used by FD_IN and FD_OUT */
		__s64 data_s64;
	};
};
然后看下driver_attrs是怎么创建的
/*
 * Declare an on-stack command buffer array named _name.
 *
 * Expands to three declarations:
 *  - __<name>total: final attribute count, i.e. _num_attrs plus whatever
 *    _ioctl_final_num_attrs() adds for the buffers reachable through _link;
 *  - _name: an array of struct ibv_command_buffer large enough to hold that
 *    many attributes (see _IOCTL_NUM_CMDB);
 *  - __<name>dummy: an unused int whose initializer runs _ioctl_init_cmdb()
 *    so the buffer is initialized at the point of declaration.
 *
 * Each continuation line must end in a backslash immediately followed by a
 * newline; the one-line form is not a valid macro body.
 */
#define DECLARE_COMMAND_BUFFER_LINK(_name, _object_id, _method_id, _num_attrs, \
				    _link)                                     \
	const unsigned int __##_name##total =                                  \
		_ioctl_final_num_attrs(_num_attrs, _link);                     \
	struct ibv_command_buffer _name[_IOCTL_NUM_CMDB(__##_name##total)];    \
	int __attribute__((unused)) __##_name##dummy = _ioctl_init_cmdb(       \
		_name, _object_id, _method_id, __##_name##total, _link)
_ioctl_final_num_attrs就是遍历ibv_command_buffer链表,累计所有已使用的attr个数;不过此时传入的_link为NULL,没有可遍历的链表,因此结果就是_num_attrs,即1。
#define _ioctl_final_num_attrs(_num_attrs, _link) \((__builtin_constant_p(!(_link)) && !(_link)) \? (_num_attrs) \: __ioctl_final_num_attrs(_num_attrs, _link))unsigned int __ioctl_final_num_attrs(unsigned int num_attrs,struct ibv_command_buffer *link)
{for (; link; link = link->next)num_attrs += link->next_attr - link->hdr.attrs;return num_attrs;
}
hdr里的attrs为变长的柔性数组,所以_IOCTL_NUM_CMDB会计算有几个ibv_command_buffer才能足够存的下_num_attrs个attr。
/*
 * Number of struct ibv_command_buffer elements an array needs so that one
 * command buffer plus _num_attrs trailing struct ib_uverbs_attr entries fit
 * in it: (sizeof(buffer) + _num_attrs * sizeof(attr)) divided by
 * sizeof(buffer), rounded up.
 */
#define _IOCTL_NUM_CMDB(_num_attrs)                                            \
	((sizeof(struct ibv_command_buffer) +                                  \
	  sizeof(struct ib_uverbs_attr) * (_num_attrs) +                       \
	  sizeof(struct ibv_command_buffer) - 1) /                             \
	 sizeof(struct ibv_command_buffer))
然后申请数组并初始化,设置hdr的object_id和method_id,将当前ibv_command_buffer通过next链接link,next_attr指向attrs的第一个,last_attr指向attrs的第num_attrs个
#define _COMMAND_BUFFER_INIT(_hdr, _object_id, _method_id, _num_attrs, _link) \((struct ibv_command_buffer){ \.hdr = \{ \.object_id = (_object_id), \.method_id = (_method_id), \}, \.next = _link, \.uhw_in_idx = _UHW_NO_INDEX, \.uhw_out_idx = _UHW_NO_INDEX, \.next_attr = (_hdr).attrs, \.last_attr = (_hdr).attrs + _num_attrs})static inline int _ioctl_init_cmdb(struct ibv_command_buffer *cmd,uint16_t object_id, uint16_t method_id,size_t num_attrs,struct ibv_command_buffer *link)
{ *cmd = _COMMAND_BUFFER_INIT(cmd->hdr, object_id, method_id, num_attrs,link);return 0;
}
cmd
创建一个mlx5_create_cq_ex cmd_ex,然后开始设置cmd_ex的mlx5_ib_create_cq部分,记录厂商特有的参数,比如cq buffer地址等。
/*
 * Excerpt of create_cq(): shows how the vendor-specific request (cmd_ex) is
 * filled in and handed to ibv_cmd_create_cq_ex2(). Unrelated parts of the
 * body are elided ("...").
 */
static struct ibv_cq_ex *create_cq(struct ibv_context *context,
				   const struct ibv_cq_init_attr_ex *cq_attr,
				   int cq_alloc_flags,
				   struct mlx5dv_cq_init_attr *mlx5cq_attr)
{
	...
	struct mlx5_create_cq_ex cmd_ex = {};
	struct mlx5_create_cq_ex_resp resp_ex = {};
	struct mlx5_ib_create_cq *cmd_drv;
	struct mlx5_ib_create_cq_resp *resp_drv;
	...
	/* The driver-specific parts live in the drv_payload members. */
	cmd_drv = &cmd_ex.drv_payload;
	resp_drv = &resp_ex.drv_payload;
	...
	/* User-space addresses of the CQ buffer and doorbell record. */
	cmd_drv->buf_addr = (uintptr_t) cq->buf_a.buf;
	cmd_drv->db_addr = (uintptr_t) cq->dbrec;
	cmd_drv->cqe_size = cqe_sz;
	...
	{
		/* Work on a copy so the caller's cq_attr is left untouched. */
		struct ibv_cq_init_attr_ex cq_attr_ex = *cq_attr;

		cq_attr_ex.cqe = ncqe - 1;
		ret = ibv_cmd_create_cq_ex2(context, &cq_attr_ex, &cq->verbs_cq,
					    &cmd_ex.ibv_cmd, sizeof(cmd_ex),
					    &resp_ex.ibv_resp, sizeof(resp_ex),
					    CREATE_CQ_CMD_FLAGS_TS_IGNORED_EX,
					    driver_attrs);
	}
	...
}
然后看下mlx5_create_cq_ex和mlx5_create_cq_ex_resp是怎么来的,以mlx5_create_cq_ex为例,drv_payload为mlx5_ib_create_cq
DECLARE_DRV_CMD(mlx5_create_cq_ex, IB_USER_VERBS_EX_CMD_CREATE_CQ,mlx5_ib_create_cq, mlx5_ib_create_cq_resp);#define DECLARE_DRV_CMD(_name, _enum, _kabi_req, _kabi_resp) \struct _name { \IBV_ABI_REQ(_enum) ibv_cmd; \union { \_STRUCT_##_kabi_req; \struct _kabi_req drv_payload; \}; \}; \
struct mlx5_ib_create_cq {__aligned_u64 buf_addr;__aligned_u64 db_addr;__u32 cqe_size;__u8 cqe_comp_en;__u8 cqe_comp_res_format;__u16 flags;__u16 uar_page_index;__u16 reserved0;__u32 reserved1;
};
接着看下ibv_cmd是怎么来的,定义如下,包含ex_hdr和_kabi,_kabi就是ib_uverbs_ex_create_cq。
然后对ibv_create_cq_ex进行typedef,因此ibv_cmd就是ibv_create_cq_ex 。
DECLARE_CMD_EX(IB_USER_VERBS_EX_CMD_CREATE_CQ, ibv_create_cq_ex, ib_uverbs_ex_create_cq);#define DECLARE_CMD_EX(_enum, _name, _kabi) \DECLARE_CMD_EXX(_enum, _name, _kabi, _kabi##_resp)#define DECLARE_CMD_EXX(_enum, _name, _kabi, _kabi_resp) \struct _name { \struct ex_hdr hdr; \union { \_STRUCT_##_kabi; \struct _kabi core_payload; \}; \}; \typedef struct _name IBV_ABI_REQ(_enum); \typedef struct _kabi IBV_KABI_REQ(_enum); \typedef struct _kabi_resp IBV_KABI_RESP(_enum); \
struct ib_uverbs_ex_create_cq {__aligned_u64 user_handle;__u32 cqe; __u32 comp_vector;__s32 comp_channel;__u32 comp_mask;__u32 flags; /* bitmask of ib_uverbs_ex_create_cq_flags */__u32 reserved;
};
设置完drv_payload后执行ibv_cmd_create_cq_ex2
/*
 * Create a CQ from a legacy-layout request/response pair.
 *
 * Declares a command buffer "cmdb" linked in front of the caller's driver
 * buffer, lets DECLARE_CMD_BUFFER_LINK_COMPAT register cmd/resp with it, and
 * forwards everything to ibv_icmd_create_cq_ex().
 *
 * cmd_size / resp_size are the full sizes of the caller's structures (the
 * caller in this article passes sizeof(cmd_ex) / sizeof(resp_ex)).
 *
 * Returns whatever ibv_icmd_create_cq_ex() returns.
 */
int ibv_cmd_create_cq_ex2(struct ibv_context *context,
			  const struct ibv_cq_init_attr_ex *cq_attr,
			  struct verbs_cq *cq,
			  struct ibv_create_cq_ex *cmd,
			  size_t cmd_size,
			  struct ib_uverbs_ex_create_cq_resp *resp,
			  size_t resp_size,
			  uint32_t cmd_flags,
			  struct ibv_command_buffer *driver)
{
	DECLARE_CMD_BUFFER_LINK_COMPAT(cmdb, UVERBS_OBJECT_CQ,
				       UVERBS_METHOD_CQ_CREATE,
				       driver, cmd, cmd_size, resp, resp_size);

	return ibv_icmd_create_cq_ex(context, cq_attr, cq, cmdb, cmd_flags);
}
初始化一个ibv_command_buffer cmdb,链接到driver_attrs的前边,然后执行_write_set_uhw
/*
 * Declare a command buffer _name with room for 2 attributes, linked to
 * _link, and then call _write_set_uhw() on it with the request/response
 * pointers. sizeof(*cmd) / sizeof(*resp) are passed as the "core" sizes and
 * cmd_size / resp_size as the full sizes.
 *
 * Note that this expands to a declaration followed by a statement. Each
 * continuation backslash must be followed by a newline.
 */
#define DECLARE_CMD_BUFFER_LINK_COMPAT(_name, _object_id, _method_id,          \
				       _link, cmd, cmd_size,                   \
				       resp, resp_size)                        \
	DECLARE_COMMAND_BUFFER_LINK(_name, _object_id, _method_id, 2, _link);  \
	_write_set_uhw(_name, cmd, sizeof(*cmd), cmd_size, resp,               \
		       sizeof(*resp), resp_size)
前边说到需要将cmd转成ibv_command_buffer,就是通过_write_set_uhw做的,core_req_size为ibv_create_cq_ex的大小,req_size为mlx5_create_cq_ex的大小,因此这里的fill_attr_in就是将mlx5_create_cq_ex中drv_payload的地址记录到cmdb的attrs中,uhw_in_idx表示记录到attrs的第几个,这样就将cmd的地址作为一个attr记录到了ibv_command_buffer里。
/*
 * Excerpt: register the driver-specific tail of a request as the UHW_IN
 * attribute of cmdb. The remainder of the function is elided ("...").
 *
 * The tail starts core_req_size bytes into req and is
 * req_size - core_req_size bytes long; nothing is added when req is NULL or
 * there is no tail.
 */
void _write_set_uhw(struct ibv_command_buffer *cmdb, const void *req,
		    size_t core_req_size, size_t req_size, void *resp,
		    size_t core_resp_size, size_t resp_size)
{
	if (req && core_req_size < req_size) {
		/* uhw_in_idx = index of the new attribute inside hdr.attrs */
		if (VERBS_IOCTL_ONLY)
			cmdb->uhw_in_idx =
				fill_attr_in(cmdb, UVERBS_ATTR_UHW_IN,
					     (uint8_t *)req + core_req_size,
					     req_size - core_req_size) -
				cmdb->hdr.attrs;
		else
			cmdb->uhw_in_idx =
				_fill_attr_in_uhw(cmdb, UVERBS_ATTR_UHW_IN,
						  (uint8_t *)req +
							  core_req_size,
						  req_size - core_req_size) -
				cmdb->hdr.attrs;
		cmdb->uhw_in_headroom_dwords = __check_divide(core_req_size, 4);
	}
	...
}

/*
 * Take the next attribute slot of cmd for attr_id and store the length and
 * the address of data in it. The data is always stored as a pointer here.
 * A len that does not fit the 16-bit len field marks the buffer as
 * erroneous.
 *
 * Returns the attribute that was filled in.
 */
static inline struct ib_uverbs_attr *
_fill_attr_in_uhw(struct ibv_command_buffer *cmd, uint16_t attr_id,
		  const void *data, size_t len)
{
	struct ib_uverbs_attr *attr = _ioctl_next_attr(cmd, attr_id);

	if (unlikely(len > UINT16_MAX))
		cmd->buffer_error = 1;

	attr->len = len;
	attr->data = ioctl_ptr_to_u64(data);

	return attr;
}
ibv_icmd_create_cq_ex内部会调用到下面的ibv_icmd_create_cq。这里又创建了一个新的cmdb,链接到之前的cmdb前,设置各种参数到cmdb的attrs中,然后执行execute_ioctl_fallback,在内核支持ioctl的情况下最终会走到execute_ioctl。
/*
 * Excerpt: build the attribute list for CQ creation and execute it. The
 * handling of the execute_ioctl_fallback() result is elided ("...").
 *
 * cmdb is declared with room for 8 attributes and is linked in front of
 * link, so the attributes already stored in link are sent as well.
 */
static int ibv_icmd_create_cq(struct ibv_context *context, int cqe,
			      struct ibv_comp_channel *channel, int comp_vector,
			      uint32_t flags, struct ibv_cq *cq,
			      struct ibv_command_buffer *link,
			      uint32_t cmd_flags)
{
	DECLARE_FBCMD_BUFFER(cmdb, UVERBS_OBJECT_CQ, UVERBS_METHOD_CQ_CREATE, 8, link);
	struct verbs_ex_private *priv = get_priv(context);
	struct ib_uverbs_attr *handle;
	struct ib_uverbs_attr *async_fd_attr;
	uint32_t resp_cqe;
	int ret;

	cq->context = context;

	/* Output attributes: the new object handle and the resulting cqe. */
	handle = fill_attr_out_obj(cmdb, UVERBS_ATTR_CREATE_CQ_HANDLE);
	fill_attr_out_ptr(cmdb, UVERBS_ATTR_CREATE_CQ_RESP_CQE, &resp_cqe);

	/* Input attributes. */
	fill_attr_in_uint32(cmdb, UVERBS_ATTR_CREATE_CQ_CQE, cqe);
	fill_attr_in_uint64(cmdb, UVERBS_ATTR_CREATE_CQ_USER_HANDLE, (uintptr_t)cq);
	if (channel)
		fill_attr_in_fd(cmdb, UVERBS_ATTR_CREATE_CQ_COMP_CHANNEL, channel->fd);
	fill_attr_in_uint32(cmdb, UVERBS_ATTR_CREATE_CQ_COMP_VECTOR, comp_vector);
	async_fd_attr = fill_attr_in_fd(cmdb, UVERBS_ATTR_CREATE_CQ_EVENT_FD, context->async_fd);

	if (priv->imported)
		fallback_require_ioctl(cmdb);
	else
		/* Prevent fallback to the 'write' mode if kernel doesn't support it */
		attr_optional(async_fd_attr);

	if (flags) {
		/*
		 * Any flag other than TIMESTAMP_COMPLETION, or a caller that
		 * did not pass TS_IGNORED_EX, requires the extended command.
		 */
		if ((flags & ~IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION) ||
		    (!(cmd_flags & CREATE_CQ_CMD_FLAGS_TS_IGNORED_EX)))
			fallback_require_ex(cmdb);
		fill_attr_in_uint32(cmdb, UVERBS_ATTR_CREATE_CQ_FLAGS, flags);
	}

	switch (execute_ioctl_fallback(cq->context, create_cq, cmdb, &ret)) {
	...
}
/*
 * Send the command buffer chain headed by cmd to the kernel.
 *
 * prepare_attrs() first gathers the attributes of all linked buffers into
 * cmd and sets hdr.num_attrs; the header length is then computed from that
 * count, the reserved fields are cleared and the driver id is filled in
 * before RDMA_VERBS_IOCTL is issued on the context's command fd.
 *
 * Returns 0 on success, or errno when the ioctl fails (finalize_attrs() is
 * skipped in that case).
 */
int execute_ioctl(struct ibv_context *context, struct ibv_command_buffer *cmd)
{
	struct verbs_context *vctx = verbs_get_ctx(context);

	prepare_attrs(cmd);
	/* Header plus one struct ib_uverbs_attr per attribute. */
	cmd->hdr.length = sizeof(cmd->hdr) +
			  sizeof(cmd->hdr.attrs[0]) * cmd->hdr.num_attrs;
	cmd->hdr.reserved1 = 0;
	cmd->hdr.reserved2 = 0;
	cmd->hdr.driver_id = vctx->priv->driver_id;

	if (ioctl(context->cmd_fd, RDMA_VERBS_IOCTL, &cmd->hdr))
		return errno;

	finalize_attrs(cmd);

	return 0;
}
prepare_attrs遍历所有的ibv_command_buffer,将所有的attrs打平到第一个buffer里,然后设置hdr中的长度,最后执行ioctl,用户态的逻辑就完成了。
/*
 * Flatten a chain of command buffers into its head.
 *
 * The filled attributes of every buffer reachable through cmd->next are
 * appended after cmd's own attributes, and cmd->hdr.num_attrs is set to the
 * resulting total. All buffers must target the same object and method
 * (asserted), and the result must fit before cmd->last_attr (asserted).
 */
static void prepare_attrs(struct ibv_command_buffer *cmd)
{
	/* Write cursor: first free slot in cmd's attribute array. */
	struct ib_uverbs_attr *end = cmd->next_attr;
	struct ibv_command_buffer *link;

	for (link = cmd->next; link; link = link->next) {
		struct ib_uverbs_attr *cur;

		assert(cmd->hdr.object_id == link->hdr.object_id);
		assert(cmd->hdr.method_id == link->hdr.method_id);

		/*
		 * Keep track of where the uhw_in lands in the final array if
		 * we copy it from a link
		 */
		if (!VERBS_IOCTL_ONLY && link->uhw_in_idx != _UHW_NO_INDEX) {
			assert(cmd->uhw_in_idx == _UHW_NO_INDEX);
			cmd->uhw_in_idx =
				link->uhw_in_idx + (end - cmd->hdr.attrs);
		}

		for (cur = link->hdr.attrs; cur != link->next_attr; cur++)
			*end++ = *cur;

		assert(end <= cmd->last_attr);
	}

	cmd->hdr.num_attrs = end - cmd->hdr.attrs;

	/*
	 * A UHW_IN payload small enough to fit in the 'data' field itself is
	 * copied into it, replacing the pointer that was stored there.
	 */
	if (!VERBS_IOCTL_ONLY && cmd->uhw_in_idx != _UHW_NO_INDEX) {
		struct ib_uverbs_attr *uhw = &cmd->hdr.attrs[cmd->uhw_in_idx];

		assert(uhw->attr_id == UVERBS_ATTR_UHW_IN);

		if (uhw->len <= sizeof(uhw->data))
			memcpy(&uhw->data, (void *)(uintptr_t)uhw->data,
			       uhw->len);
	}
}
内核态
初始化
内核有object,method,attr三个概念,cq就对应一个object,其他比如qp,mr都对应不同的object,通过object_id区分;cq这个object有多个method,比如create_cq,destroy_cq,通过method_id区分;create_cq需要多个参数,比如cqe,这里每个参数就是一个attr,通过attr_id区分。
首先定义create_cq的method,method_id为UVERBS_METHOD_CQ_CREATE,这个method的handler为UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE),即实际会执行的函数。
/*
 * Declaration of the CQ create method and its attributes. Part of the
 * attribute list is elided in this excerpt ("......").
 */
DECLARE_UVERBS_NAMED_METHOD(
	UVERBS_METHOD_CQ_CREATE,
	UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_CQ_HANDLE,
			UVERBS_OBJECT_CQ,
			UVERBS_ACCESS_NEW,
			UA_MANDATORY),
	UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_CQE,
			   UVERBS_ATTR_TYPE(u32),
			   UA_MANDATORY),
	UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_USER_HANDLE,
			   UVERBS_ATTR_TYPE(u64),
			   UA_MANDATORY),
	......
	UVERBS_ATTR_UHW());

/* Static description of one method: its id, attributes and handler. */
struct uverbs_method_def {
	u16 id;
	/* Combination of bits from enum UVERBS_ACTION_FLAG_XXXX */
	u32 flags;
	size_t num_attrs;
	/* Pointer to an array of attribute definition pointers. */
	const struct uverbs_attr_def * const (*attrs)[];
	int (*handler)(struct uverbs_attr_bundle *attrs);
};

/*
 * Emits two static objects for _method_id: the array of attribute
 * definitions (from the variadic arguments) and the uverbs_method_def that
 * points at it, with UVERBS_HANDLER(_method_id) as handler.
 *
 * The #define must start its own line and every continuation backslash must
 * be followed by a newline.
 */
#define DECLARE_UVERBS_NAMED_METHOD(_method_id, ...)                           \
	static const struct uverbs_attr_def *const UVERBS_METHOD_ATTRS(        \
		_method_id)[] = { __VA_ARGS__ };                               \
	static const struct uverbs_method_def UVERBS_METHOD(_method_id) = {    \
		.id = _method_id,                                              \
		.handler = UVERBS_HANDLER(_method_id),                         \
		.num_attrs = ARRAY_SIZE(UVERBS_METHOD_ATTRS(_method_id)),      \
		.attrs = &UVERBS_METHOD_ATTRS(_method_id),                     \
	}
然后看下method的attr,对于用户态传进来的一个attr,内核应该如何去解析这块内存,就是通过uverbs_attr_spec,指示内核去解析用户attr的什么字段。
/* An attribute id together with the spec that says how to parse it. */
struct uverbs_attr_def {
	u16 id;
	struct uverbs_attr_spec attr;
};

/* Describes how the kernel must interpret one user-supplied attribute. */
struct uverbs_attr_spec {
	u8 type;

	/*
	 * Support extending attributes by length. Allow the user to provide
	 * more bytes than ptr.len, but check that everything after is zero'd
	 * by the user.
	 */
	u8 zero_trailing:1;
	/*
	 * Valid only for PTR_IN. Allocate and copy the data inside
	 * the parser
	 */
	u8 alloc_and_copy:1;
	u8 mandatory:1;
	/* True if this is from UVERBS_ATTR_UHW */
	u8 is_udata:1;

	union {
		struct {
			/* Current known size to kernel */
			u16 len;
			/* User isn't allowed to provide something < min_len */
			u16 min_len;
		} ptr;

		struct {
			/*
			 * higher bits mean the namespace and lower bits mean
			 * the type id within the namespace.
			 */
			u16 obj_type;
			u8 access;
		} obj;

		struct {
			u8 num_elems;
		} enum_def;
	} u;

	/* This weird split lets us remove some padding */
	union {
		struct {
			/*
			 * The enum attribute can select one of the attributes
			 * contained in the ids array. Currently only PTR_IN
			 * attributes are supported in the ids array.
			 */
			const struct uverbs_attr_spec *ids;
		} enum_def;

		struct {
			/*
			 * higher bits mean the namespace and lower bits mean
			 * the type id within the namespace.
			 */
			u16 obj_type;
			u16 min_len;
			u16 max_len;
			u8 access;
		} objs_arr;
	} u2;
};
以上述定义method时uhw attr为例,就是定义了一个uverbs_attr_def,其中spec的type为UVERBS_ATTR_TYPE_PTR_IN,is_udata为1。
/*
 * Optional PTR_IN attribute UVERBS_ATTR_UHW_IN with a minimum size of 0 and
 * is_udata set.
 *
 * NOTE(review): the expansion ends in a comma, which suggests the excerpt was
 * cut before a further entry (presumably the UVERBS_ATTR_UHW_OUT counterpart)
 * - confirm against the original source.
 */
#define UVERBS_ATTR_UHW()                                                      \
	UVERBS_ATTR_PTR_IN(UVERBS_ATTR_UHW_IN,                                 \
			   UVERBS_ATTR_MIN_SIZE(0),                            \
			   UA_OPTIONAL,                                        \
			   .is_udata = 1),

/*
 * Address of an anonymous constant uverbs_attr_def for _attr_id whose spec
 * has type UVERBS_ATTR_TYPE_PTR_IN; _type and the variadic arguments are
 * pasted in as further initializers of the spec.
 *
 * Each #define must start its own line and every continuation backslash must
 * be followed by a newline.
 */
#define UVERBS_ATTR_PTR_IN(_attr_id, _type, ...)                               \
	(&(const struct uverbs_attr_def){                                      \
		.id = _attr_id,                                                \
		.attr = { .type = UVERBS_ATTR_TYPE_PTR_IN,                     \
			  _type,                                               \
			  __VA_ARGS__ } })
然后开始定义cq的object,创建uverbs_object_def,其中methods指向传入的数组,即create_cq和destroy_cq。
/* Declaration of the CQ object with its create and destroy methods. */
DECLARE_UVERBS_NAMED_OBJECT(
	UVERBS_OBJECT_CQ,
	UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_ucq_object), uverbs_free_cq),
	&UVERBS_METHOD(UVERBS_METHOD_CQ_CREATE),
	&UVERBS_METHOD(UVERBS_METHOD_CQ_DESTROY)
);

/* Static description of one object: its id, type and methods. */
struct uverbs_object_def {
	u16 id;
	const struct uverbs_obj_type *type_attrs;
	size_t num_methods;
	/* Pointer to an array of method definition pointers. */
	const struct uverbs_method_def * const (*methods)[];
};

/*
 * Emits two static objects for _object_id: the array of method definition
 * pointers (from the variadic arguments) and the uverbs_object_def that
 * points at it.
 *
 * The #define must start its own line and every continuation backslash must
 * be followed by a newline.
 */
#define DECLARE_UVERBS_NAMED_OBJECT(_object_id, _type_attrs, ...)              \
	static const struct uverbs_method_def *const UVERBS_OBJECT_METHODS(    \
		_object_id)[] = { __VA_ARGS__ };                               \
	static const struct uverbs_object_def UVERBS_OBJECT(_object_id) = {    \
		.id = _object_id,                                              \
		.type_attrs = &_type_attrs,                                    \
		.num_methods = ARRAY_SIZE(UVERBS_OBJECT_METHODS(_object_id)),  \
		.methods = &UVERBS_OBJECT_METHODS(_object_id)                  \
	}
然后定义cq相关的uapi_definition,就是创建了一个uapi_definition,其中chain_obj_tree指向前边创建的cq object。
/*
 * uapi definition list for the CQ object; the empty element terminates the
 * list.
 */
const struct uapi_definition uverbs_def_obj_cq[] = {
	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_CQ,
				      UAPI_DEF_OBJ_NEEDS_FN(destroy_cq)),
	{}
};

/*
 * Same as UAPI_DEF_CHAIN_OBJ_TREE with the object pointer derived from the
 * object enum via UVERBS_OBJECT().
 */
#define UAPI_DEF_CHAIN_OBJ_TREE_NAMED(_object_enum, ...)                       \
	UAPI_DEF_CHAIN_OBJ_TREE(_object_enum, &UVERBS_OBJECT(_object_enum),    \
				##__VA_ARGS__)

/*
 * One uapi_definition element of kind UAPI_DEF_CHAIN_OBJ_TREE that refers to
 * _object_ptr, followed by any extra elements passed as variadic arguments.
 *
 * Each #define must start its own line and every continuation backslash must
 * be followed by a newline.
 */
#define UAPI_DEF_CHAIN_OBJ_TREE(_object_enum, _object_ptr, ...)                \
	{                                                                      \
		.kind = UAPI_DEF_CHAIN_OBJ_TREE,                               \
		.object_start = { .object_id = _object_enum },                 \
		.chain_obj_tree = _object_ptr,                                 \
	},                                                                     \
	##__VA_ARGS__
类似的,将其他的object相关的uapi_definition添加到数组uverbs_core_api中,UAPI_DEF_CHAIN就是新建了一个uapi_definition,然后将chain指向下一个uapi_definition。
static const struct uapi_definition uverbs_core_api[] = { UAPI_DEF_CHAIN(uverbs_def_obj_async_fd),UAPI_DEF_CHAIN(uverbs_def_obj_counters),UAPI_DEF_CHAIN(uverbs_def_obj_cq),UAPI_DEF_CHAIN(uverbs_def_obj_device),UAPI_DEF_CHAIN(uverbs_def_obj_dm),UAPI_DEF_CHAIN(uverbs_def_obj_flow_action),UAPI_DEF_CHAIN(uverbs_def_obj_intf),UAPI_DEF_CHAIN(uverbs_def_obj_mr),UAPI_DEF_CHAIN(uverbs_def_obj_qp),UAPI_DEF_CHAIN(uverbs_def_obj_srq),UAPI_DEF_CHAIN(uverbs_def_obj_wq),UAPI_DEF_CHAIN(uverbs_def_write_intf),{},
};
#define UAPI_DEF_CHAIN(_def_var) \{ \.kind = UAPI_DEF_CHAIN, .chain = _def_var, \}
到这里就完成了cq相关object,method等的创建,然后开始添加到radix tree。
创建uverbs_api *uapi,uverbs_api用于保存所有的api,内部的radix为radix tree,所有的api会被加入到radix tree中。
然后通过uapi_merge_def添加uverbs_core_api。
/*
 * Excerpt: allocate the per-device uverbs_api and merge the core
 * definitions into its radix tree. The rest of the function is elided
 * ("...").
 *
 * Returns ERR_PTR(-ENOMEM) when the allocation fails.
 */
struct uverbs_api *uverbs_alloc_api(struct ib_device *ibdev)
{
	struct uverbs_api *uapi;
	int rc;

	uapi = kzalloc(sizeof(*uapi), GFP_KERNEL);
	if (!uapi)
		return ERR_PTR(-ENOMEM);

	INIT_RADIX_TREE(&uapi->radix, GFP_KERNEL);
	uapi->driver_id = ibdev->ops.driver_id;

	/* is_driver = false: these are the core definitions. */
	rc = uapi_merge_def(uapi, ibdev, uverbs_core_api, false);
	...
}

/* Container for every API element known for a device. */
struct uverbs_api {
	/* radix tree contains struct uverbs_api_* pointers */
	struct radix_tree_root radix;
	enum rdma_driver_id driver_id;

	unsigned int num_write;
	unsigned int num_write_ex;
	struct uverbs_api_write_method notsupp_method;
	const struct uverbs_api_write_method **write_methods;
	const struct uverbs_api_write_method **write_ex_methods;
};
uapi_merge_def中会遍历数组uverbs_core_api,假设遍历到uverbs_def_obj_cq对应的元素,由于此时的kind为CHAIN,因此递归对chain执行uapi_merge_def。
chain指向uverbs_def_obj_cq,由于kind为UAPI_DEF_CHAIN_OBJ_TREE,于是对chain_obj_tree执行uapi_merge_obj_tree,chain_obj_tree指向的就是cq对应的object。
/*
 * Excerpt: walk a uapi_definition list and merge each element into uapi.
 * Only the CHAIN, CHAIN_OBJ_TREE and END cases are shown; the remaining
 * cases (and the code that uses cur_method_key / exists) are elided ("...").
 *
 * Returns 0 when the END element is reached or def_list is NULL, a negative
 * errno otherwise.
 */
static int uapi_merge_def(struct uverbs_api *uapi, struct ib_device *ibdev,
			  const struct uapi_definition *def_list,
			  bool is_driver)
{
	const struct uapi_definition *def = def_list;
	u32 cur_obj_key = UVERBS_API_KEY_ERR;
	u32 cur_method_key = UVERBS_API_KEY_ERR;
	bool exists;
	int rc;

	if (!def_list)
		return 0;

	for (;; def++) {
		switch ((enum uapi_definition_kind)def->kind) {
		case UAPI_DEF_CHAIN:
			/* Recurse into the list this element points at. */
			rc = uapi_merge_def(uapi, ibdev, def->chain, is_driver);
			if (rc)
				return rc;
			continue;

		case UAPI_DEF_CHAIN_OBJ_TREE:
			/* The element's object id must match the object's. */
			if (WARN_ON(def->object_start.object_id !=
				    def->chain_obj_tree->id))
				return -EINVAL;

			cur_obj_key = uapi_key_obj(def->object_start.object_id);
			rc = uapi_merge_obj_tree(uapi, def->chain_obj_tree,
						 is_driver);
			if (rc)
				return rc;
			continue;

		case UAPI_DEF_END:
			return 0;
		...
		WARN_ON(true);
		return -EINVAL;
	}
}
uapi_merge_obj_tree会将object,method,attr分别插入到radix tree。
这里说下插入radix tree的key如何计算,key一共为16位,key的最低6位为attr_id,中间5位为method_id,高5位为object_id。
然后通过uapi_key_obj获取object的key obj_key,就是将object_id左移到高5位,然后通过uapi_add_get_elm插入到radix tree中,slot为obj_elm,然后对obj中的每一个method循环执行uapi_merge_method。
/*
 * Merge one object definition into the radix tree: add (or look up) the
 * element for the object itself, record its type information, then merge
 * each of its methods.
 *
 * Returns 0 on success or a negative errno.
 */
static int uapi_merge_obj_tree(struct uverbs_api *uapi,
			       const struct uverbs_object_def *obj,
			       bool is_driver)
{
	struct uverbs_api_object *obj_elm;
	unsigned int i;
	u32 obj_key;
	bool exists;
	int rc;

	obj_key = uapi_key_obj(obj->id);
	obj_elm = uapi_add_get_elm(uapi, obj_key, sizeof(*obj_elm), &exists);
	if (IS_ERR(obj_elm))
		return PTR_ERR(obj_elm);

	if (obj->type_attrs) {
		/* The type of an object may only be set once. */
		if (WARN_ON(obj_elm->type_attrs))
			return -EINVAL;

		obj_elm->id = obj->id;
		obj_elm->type_attrs = obj->type_attrs;
		obj_elm->type_class = obj->type_attrs->type_class;
		/* Driver objects must use the idr or the fd type class. */
		if (WARN_ON(is_driver &&
			    obj->type_attrs->type_class != &uverbs_idr_class &&
			    obj->type_attrs->type_class != &uverbs_fd_class))
			return -EINVAL;
	}

	if (!obj->methods)
		return 0;

	for (i = 0; i != obj->num_methods; i++) {
		const struct uverbs_method_def *method = (*obj->methods)[i];

		/* NULL entries in the method array are skipped. */
		if (!method)
			continue;

		rc = uapi_merge_method(uapi, obj_elm, obj_key, method,
				       is_driver);
		if (rc)
			return rc;
	}

	return 0;
}
然后看下uapi_merge_method,先通过obj_key和method->id拼出来method_key,然后继续通过uapi_add_get_elm将method_key插入radix tree,slot为method_elm,然后将handler设置到method_elm中。
然后对于method的所有attr,将index设置为method_key | attr_key,然后也插入radix tree。
/*
 * Merge one method definition into the radix tree: add (or look up) the
 * element for the method, install its handler when the element is new, then
 * add one element per attribute under key method_key | attr key.
 *
 * Returns 0 on success (also when the method has no attrs array) or a
 * negative errno.
 */
static int uapi_merge_method(struct uverbs_api *uapi,
			     struct uverbs_api_object *obj_elm, u32 obj_key,
			     const struct uverbs_method_def *method,
			     bool is_driver)
{
	u32 method_key = obj_key | uapi_key_ioctl_method(method->id);
	struct uverbs_api_ioctl_method *method_elm;
	unsigned int i;
	bool exists;

	if (!method->attrs)
		return 0;

	method_elm = uapi_add_get_elm(uapi, method_key, sizeof(*method_elm),
				      &exists);
	if (IS_ERR(method_elm))
		return PTR_ERR(method_elm);

	if (exists) {
		/*
		 * This occurs when a driver uses ADD_UVERBS_ATTRIBUTES_SIMPLE
		 */
		if (WARN_ON(method->handler))
			return -EINVAL;
	} else {
		WARN_ON(!method->handler);
		rcu_assign_pointer(method_elm->handler, method->handler);
		if (method->handler != uverbs_destroy_def_handler)
			method_elm->driver_method = is_driver;
	}

	for (i = 0; i != method->num_attrs; i++) {
		const struct uverbs_attr_def *attr = (*method->attrs)[i];
		struct uverbs_api_attr *attr_slot;

		/* NULL entries in the attribute array are skipped. */
		if (!attr)
			continue;

		/*
		 * ENUM_IN contains the 'ids' pointer to the driver's .rodata,
		 * so if it is specified by a driver then it always makes this
		 * into a driver method.
		 */
		if (attr->attr.type == UVERBS_ATTR_TYPE_ENUM_IN)
			method_elm->driver_method |= is_driver;

		/*
		 * Like other uobject based things we only support a single
		 * uobject being NEW'd or DESTROY'd
		 */
		if (attr->attr.type == UVERBS_ATTR_TYPE_IDRS_ARRAY) {
			u8 access = attr->attr.u2.objs_arr.access;

			if (WARN_ON(access == UVERBS_ACCESS_NEW ||
				    access == UVERBS_ACCESS_DESTROY))
				return -EINVAL;
		}

		attr_slot =
			uapi_add_elm(uapi, method_key | uapi_key_attr(attr->id),
				     sizeof(*attr_slot));
		/* Attributes are not allowed to be modified by drivers */
		if (IS_ERR(attr_slot))
			return PTR_ERR(attr_slot);

		/* Store a copy of the spec in the tree element. */
		attr_slot->spec = attr->attr;
	}

	return 0;
}
运行
前边用户态已经看到执行了ioctl,将cmd和参数传到了内核态,现在看下内核态如何执行。
通过copy_from_user将参数拷贝到内核态的hdr,注意这时候attr还没拷贝进来,然后执行ib_uverbs_cmd_verbs
/*
 * Excerpt of the ioctl entry point: copy the fixed-size header from user
 * space and dispatch to ib_uverbs_cmd_verbs(). Validation between the copy
 * and the dispatch is elided ("...").
 *
 * Only the header is copied here; the attributes are passed on as a user
 * pointer (user_hdr->attrs).
 */
long ib_uverbs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	struct ib_uverbs_file *file = filp->private_data;
	struct ib_uverbs_ioctl_hdr __user *user_hdr =
		(struct ib_uverbs_ioctl_hdr __user *)arg;
	struct ib_uverbs_ioctl_hdr hdr;
	int srcu_key;
	int err;

	err = copy_from_user(&hdr, user_hdr, sizeof(hdr));
	...
	/* The call runs inside the device's disassociate_srcu read section. */
	srcu_key = srcu_read_lock(&file->device->disassociate_srcu);
	err = ib_uverbs_cmd_verbs(file, &hdr, user_hdr->attrs);
	srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
	return err;
}
通过object id和method id查找radix tree获得到method_elm。
/*
 * Excerpt: look up the method element for the (object_id, method_id) pair
 * from the ioctl header. The rest of the function is elided ("...").
 *
 * Returns -EINVAL when the header's driver_id does not match the device's
 * uapi, -EPROTONOSUPPORT when the method is not in the radix tree.
 *
 * The preprocessor directives must each start their own line; fused forms
 * such as "#elseradix_tree_iter_init" or "#endifif" are not valid.
 */
static int ib_uverbs_cmd_verbs(struct ib_uverbs_file *ufile,
			       struct ib_uverbs_ioctl_hdr *hdr,
			       struct ib_uverbs_attr __user *user_attrs)
{
	const struct uverbs_api_ioctl_method *method_elm;
	struct uverbs_api *uapi = ufile->device->uapi;
	struct radix_tree_iter attrs_iter;
	struct bundle_priv *pbundle;
	struct bundle_priv onstack;
	void __rcu **slot;
	int ret;

	if (unlikely(hdr->driver_id != uapi->driver_id))
		return -EINVAL;

	/* Radix tree key of the method: object key | method key. */
#ifdef HAVE_RADIX_TREE_ITER_LOOKUP
	slot = radix_tree_iter_lookup(
		&uapi->radix, &attrs_iter,
		uapi_key_obj(hdr->object_id) |
			uapi_key_ioctl_method(hdr->method_id));
#else
	radix_tree_iter_init(&attrs_iter,
			     uapi_key_obj(hdr->object_id) |
			     uapi_key_ioctl_method(hdr->method_id));
	slot = radix_tree_next_chunk(&uapi->radix, &attrs_iter,
				     RADIX_TREE_ITER_CONTIG);
#endif
	if (unlikely(!slot))
		return -EPROTONOSUPPORT;

	method_elm = rcu_dereference_protected(*slot, true);
	...
}
bundle_priv用于存储所有用户传进来的参数,由于不同method的attrs个数不一样,因此需要动态分配内存存储attr,为了优化小的分配,bundle_priv内部预留了栈上的internal_buffer,后续假设internal_buffer是足够的,即use_stack为1。
然后设置pbundle的各个参数,比如method_elm,method_key,其中user_attrs指向了用户传入的attr。
/*
 * Excerpt: set up the bundle for a method call and run it. The method
 * lookup before this part and some initialization are elided ("...").
 *
 * The bundle lives on the stack when method_elm->use_stack is set and is
 * kmalloc'ed with method_elm->bundle_size bytes otherwise.
 *
 * Returns -ENOMEM when that allocation fails, otherwise the result of
 * ib_uverbs_run_method().
 */
static int ib_uverbs_cmd_verbs(struct ib_uverbs_file *ufile,
			       struct ib_uverbs_ioctl_hdr *hdr,
			       struct ib_uverbs_attr __user *user_attrs)
{
	struct bundle_priv *pbundle;
	struct bundle_priv onstack;
	...
	if (!method_elm->use_stack) {
		pbundle = kmalloc(method_elm->bundle_size, GFP_KERNEL);
		if (!pbundle)
			return -ENOMEM;
		/* Bytes available starting at internal_buffer. */
		pbundle->internal_avail =
			method_elm->bundle_size -
			offsetof(struct bundle_priv, internal_buffer);
		pbundle->alloc_head.next = NULL;
		pbundle->allocated_mem = &pbundle->alloc_head;
	} else {
		pbundle = &onstack;
		pbundle->internal_avail = sizeof(pbundle->internal_buffer);
		pbundle->allocated_mem = NULL;
	}

	/* Space for the pbundle->bundle.attrs flex array */
	pbundle->method_elm = method_elm;
	pbundle->method_key = attrs_iter.index;
	pbundle->bundle.ufile = ufile;
	pbundle->bundle.context = NULL; /* only valid if bundle has uobject */
	pbundle->radix = &uapi->radix;
	pbundle->radix_slots = slot;
	pbundle->radix_slots_len = radix_tree_chunk_size(&attrs_iter);
	/* Still a user pointer; copied in by ib_uverbs_run_method(). */
	pbundle->user_attrs = user_attrs;

	pbundle->internal_used = ALIGN(pbundle->method_elm->key_bitmap_len *
					       sizeof(*pbundle->bundle.attrs),
				       sizeof(*pbundle->internal_buffer));
	...
	ret = ib_uverbs_run_method(pbundle, hdr->num_attrs);
	/* Second argument tells bundle_destroy() whether the call succeeded. */
	bundle_destroy(pbundle, ret == 0);
	return ret;
}
然后执行ib_uverbs_run_method,将method的handler保存到handler,通过用户的num_attrs可以知道需要的内存大小uattrs_size,然后执行uverbs_alloc分配内存到uattrs,最后通过copy_from_user将用户的attr拷贝到uattrs。
/*
 * Excerpt (part 1): fetch the method handler and copy the user's attribute
 * array into kernel memory. The rest of the function is elided ("...").
 *
 * Returns -EIO when the method has no handler, the uverbs_alloc() error when
 * the allocation fails, and -EFAULT when the copy from user space fails.
 */
static int ib_uverbs_run_method(struct bundle_priv *pbundle,
				unsigned int num_attrs)
{
	int (*handler)(struct uverbs_attr_bundle *attrs);
	/* Bytes needed for num_attrs user attributes. */
	size_t uattrs_size = array_size(sizeof(*pbundle->uattrs), num_attrs);
	unsigned int destroy_bkey = pbundle->method_elm->destroy_bkey;
	unsigned int i;
	int ret;

	/* See uverbs_disassociate_api() */
	handler = srcu_dereference(
		pbundle->method_elm->handler,
		&pbundle->bundle.ufile->device->disassociate_srcu);
	if (!handler)
		return -EIO;

	pbundle->uattrs = uverbs_alloc(&pbundle->bundle, uattrs_size);
	if (IS_ERR(pbundle->uattrs))
		return PTR_ERR(pbundle->uattrs);
	if (copy_from_user(pbundle->uattrs, pbundle->user_attrs, uattrs_size))
		return -EFAULT;
	...
}
然后通过uverbs_set_attr解析用户的attr。
/*
 * Excerpt (part 2): parse every user attribute with uverbs_set_attr() and
 * stop at the first one that fails. Everything else is elided ("...").
 */
static int ib_uverbs_run_method(struct bundle_priv *pbundle,
				unsigned int num_attrs)
{
	...
	for (i = 0; i != num_attrs; i++) {
		ret = uverbs_set_attr(pbundle, &pbundle->uattrs[i]);
		if (unlikely(ret))
			return ret;
	}
	...
}
首先通过uapi_get_attr_for_method查找radix tree中的attr,然后执行uverbs_process_attr
/*
 * Parse one user attribute into the bundle.
 *
 * Looks up the attribute's spec for the current method, rejects duplicates,
 * hands the attribute to uverbs_process_attr() and finally marks it as
 * present in the bundle's attr_present bitmap.
 *
 * Returns 0 on success and also when the attribute is unknown to the kernel
 * but not flagged mandatory (it is then ignored); -EPROTONOSUPPORT for an
 * unknown mandatory attribute; -EINVAL for a duplicate; otherwise the error
 * from uverbs_process_attr().
 */
static int uverbs_set_attr(struct bundle_priv *pbundle,
			   struct ib_uverbs_attr *uattr)
{
	u32 attr_key = uapi_key_attr(uattr->attr_id);
	u32 attr_bkey = uapi_bkey_attr(attr_key);
	const struct uverbs_api_attr *attr;
	void __rcu **slot;
	int ret;

	slot = uapi_get_attr_for_method(pbundle, attr_key);
	if (!slot) {
		/*
		 * Kernel does not support the attribute but user-space says it
		 * is mandatory
		 */
		if (uattr->flags & UVERBS_ATTR_F_MANDATORY)
			return -EPROTONOSUPPORT;
		return 0;
	}
	attr = rcu_dereference_protected(*slot, true);

	/* Reject duplicate attributes from user-space */
	if (test_bit(attr_bkey, pbundle->bundle.attr_present))
		return -EINVAL;

	ret = uverbs_process_attr(pbundle, attr, uattr, attr_bkey);
	if (ret)
		return ret;

	__set_bit(attr_bkey, pbundle->bundle.attr_present);

	return 0;
}
uverbs_process_attr就是根据spec中的type解析对应的字段到pbundle的attrs,下边展示了type为UVERBS_ATTR_TYPE_PTR_OUT的场景。
/*
 * Excerpt: validate one user attribute against its spec and store the
 * parsed form in the bundle slot selected by attr_bkey. Only the
 * UVERBS_ATTR_TYPE_PTR_OUT case is shown; other cases and parts of this one
 * are elided ("...").
 */
static int uverbs_process_attr(struct bundle_priv *pbundle,
			       const struct uverbs_api_attr *attr_uapi,
			       struct ib_uverbs_attr *uattr, u32 attr_bkey)
{
	const struct uverbs_attr_spec *spec = &attr_uapi->spec;
	struct uverbs_attr *e = &pbundle->bundle.attrs[attr_bkey];
	const struct uverbs_attr_spec *val_spec = spec;
	struct uverbs_obj_attr *o_attr;

	switch (spec->type) {
	...
	case UVERBS_ATTR_TYPE_PTR_OUT:
		/*
		 * Length must be at least min_len, and may exceed the known
		 * len only when the spec allows zero_trailing.
		 */
		if (uattr->len < val_spec->u.ptr.min_len ||
		    (!val_spec->zero_trailing &&
		     uattr->len > val_spec->u.ptr.len))
			return -EINVAL;

		/* The reserved field must be zero for non-enum attributes. */
		if (spec->type != UVERBS_ATTR_TYPE_ENUM_IN &&
		    uattr->attr_data.reserved)
			return -EINVAL;

		/* Index of this attribute in the copied user array. */
		e->ptr_attr.uattr_idx = uattr - pbundle->uattrs;
		e->ptr_attr.len = uattr->len;
		...
		e->ptr_attr.data = uattr->data;
		...
		break;
	...
	}
}
然后回到ib_uverbs_run_method,前边说到uhw数据会作为一个attr传进来,这里会通过uverbs_fill_udata将uhw的指针记录到driver_udata,最后执行handler就到了真正create_cq的逻辑。
static int ib_uverbs_run_method(struct bundle_priv *pbundle,unsigned int num_attrs)
{...if (pbundle->method_elm->has_udata)uverbs_fill_udata(&pbundle->bundle,&pbundle->bundle.driver_udata,UVERBS_ATTR_UHW_IN, UVERBS_ATTR_UHW_OUT);elsepbundle->bundle.driver_udata = (struct ib_udata){};if (destroy_bkey != UVERBS_API_ATTR_BKEY_LEN) {struct uverbs_obj_attr *destroy_attr =&pbundle->bundle.attrs[destroy_bkey].obj_attr;ret = uobj_destroy(destroy_attr->uobject, &pbundle->bundle);if (ret)return ret;__clear_bit(destroy_bkey, pbundle->uobj_finalize);ret = handler(&pbundle->bundle);uobj_put_destroy(destroy_attr->uobject);} else {ret = handler(&pbundle->bundle);}...
}
最后感谢一下学习rdma过程中几位大佬的答疑(字典序)
Santiago0826,zhigang124以及一位不想透露姓名的大佬
这篇关于RDMA驱动学习(一)- 用户态到内核态的过程的文章就介绍到这儿,希望我们推荐的文章对编程师们有所帮助!