本文主要是介绍Linux block_device gendisk和hd_struct到底是个啥关系,希望对大家解决编程问题提供一定的参考价值,需要的开发者们随着小编来一起学习吧!
本文的源码版本是Linux 5.15版本,有图有真相:
1.先从块设备驱动说起
安卓平台有一个非常典型和重要的块设备驱动:zram,我们来看一下zram这个块设备驱动加载初始化和swapon的逻辑,完整梳理完这个逻辑将对Linux块设备驱动模型有深入的理解。
zram驱动加载的时候会调用zram_add函数,源码如下:
1887/*
1888 * Allocate and initialize new zram device. the function returns
1889 * '>= 0' device_id upon success, and negative value otherwise.
1890 */
1891static int zram_add(void)
1892{
1893 struct zram *zram;
1894 int ret, device_id;
1895
1896 zram = kzalloc(sizeof(struct zram), GFP_KERNEL);
1909 ...
1910 /* gendisk structure */
1911 zram->disk = blk_alloc_disk(NUMA_NO_NODE);
1
1918
1919 zram->disk->major = zram_major;
1920 zram->disk->first_minor = device_id;
1921 zram->disk->minors = 1;
1922 zram->disk->fops = &zram_devops;
1923 zram->disk->private_data = zram;
1924 snprintf(zram->disk->disk_name, 16, "zram%d", device_id);
1925 ...
1957 device_add_disk(NULL, zram->disk, zram_disk_attr_groups);
1961 zram_debugfs_register(zram);
1962 pr_info("Added device: %s\n", zram->disk->disk_name);
1963 return device_id;...
1970}
zram_add中有两个非常重要的函数:
- blk_alloc_disk
- device_add_disk
上面两个函数描述了块设备驱动的两个步骤:1)创建gendisk对象,代表的是一个“磁盘” 2)注册和激活磁盘,激活磁盘之后就可以正式使用了。
2. gendisk和hd_struct是啥
blk_alloc_disk函数创建了一个gendisk对象,这就出现了本文要讲述的非常重要的对象。怎么理解gendisk呢?Linux用gendisk代表一个“磁盘”,这里的磁盘可以是一个真实的硬盘,也可以是一个虚拟设备。
我们接触windows系统比较多,硬盘都会划分分区,在Linux是不是也有同样的概念呢?确实如此,这就要去struct gendisk数据结构来一探究竟了:
121struct gendisk {
122 /* major, first_minor and minors are input parameters only,
123 * don't use directly. Use disk_devt() and disk_max_parts().
124 */
125 int major; /* major number of driver */
126 int first_minor;
127 int minors; /* maximum number of minors, =1 for
128 * disks that can't be partitioned. */
129
130 char disk_name[DISK_NAME_LEN]; /* name of major driver */
131
132 unsigned short events; /* supported events */
133 unsigned short event_flags; /* flags related to event processing */
134
135 struct xarray part_tbl;
136 struct block_device *part0;
137
138 const struct block_device_operations *fops;
139 struct request_queue *queue;...}
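gendisk中的part_tbl就代表该磁盘的分区表。那么每个分区用什么结构体表示?在较老的内核中是struct hd_struct;但从上面5.15的定义可以看到,part_tbl是一个xarray,part0的类型是struct block_device *,也就是说在5.15中hd_struct已经被合并进了struct block_device,每个分区(包括代表整个磁盘的0号分区part0)都用一个block_device来表示。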
注意gendisk结构体中还有一个重要的fops字段(struct block_device_operations),它是操作该块设备的函数表,具体本文后面详细讲述。
3. gendisk是怎么创建的
前面知道gendisk是通过blk_alloc_disk函数创建的:
275#define blk_alloc_disk(node_id) \
276({ \
277 static struct lock_class_key __key; \
278 \
279 __blk_alloc_disk(node_id, &__key); \
280})1333struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass)
1334{
1335 struct request_queue *q;
1336 struct gendisk *disk;
1337
1338 q = blk_alloc_queue(node);
1339 if (!q)
1340 return NULL;
1341
1342 disk = __alloc_disk_node(q, node, lkclass);
1343 if (!disk) {
1344 blk_cleanup_queue(q);
1345 return NULL;
1346 }
1347 return disk;
1348}1279
1280struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
1281 struct lock_class_key *lkclass)
1282{
1283 struct gendisk *disk;
1284
1285 if (!blk_get_queue(q))
1286 return NULL;
1287
1288 disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id);
1289 if (!disk)
1290 goto out_put_queue;
1291
1292 disk->bdi = bdi_alloc(node_id);
1293 if (!disk->bdi)
1294 goto out_free_disk;
1295
1296 disk->part0 = bdev_alloc(disk, 0);
1297 if (!disk->part0)
1298 goto out_free_bdi;
1299
1300 disk->node_id = node_id;
1301 mutex_init(&disk->open_mutex);
1302 xa_init(&disk->part_tbl);
1303 if (xa_insert(&disk->part_tbl, 0, disk->part0, GFP_KERNEL))
1304 goto out_destroy_part_tbl;
1305
总结一下调用关系:blk_alloc_disk->__blk_alloc_disk->__alloc_disk_node,最终__alloc_disk_node创建了gendisk对象。
创建了gendisk之后,要给gendisk做一些初始化赋值,其中很重要的part0是一个block_device,通过调用bdev_alloc(disk, 0)创建。这里出现了本文最后一个要介绍的对象:struct block_device。
4 block_device是啥
block_device既可以对应一个磁盘,也可以对应磁盘里面的一个分区,也就是说磁盘和分区都可以用block_device表示。block_device可以想象成磁盘(或分区)的描述信息,比如设备号、分区号、是否只读等等,具体定义如下:
4struct block_device {
25 sector_t bd_start_sect;
26 struct disk_stats __percpu *bd_stats;
27 unsigned long bd_stamp;
28 bool bd_read_only; /* read-only policy */
29 dev_t bd_dev;
30 int bd_openers;
31 struct inode * bd_inode; /* will die */
32 struct super_block * bd_super;
33 void * bd_claiming;
34 struct device bd_device;
35 void * bd_holder;
36 int bd_holders;
37 bool bd_write_holder;
38 struct kobject *bd_holder_dir;
39 u8 bd_partno;
40 spinlock_t bd_size_lock; /* for bd_inode->i_size updates */
41 struct gendisk * bd_disk;
42
43 /* The counter of freeze processes */
44 int bd_fsfreeze_count;
45 /* Mutex for freeze */
46 struct mutex bd_fsfreeze_mutex;
47 struct super_block *bd_fsfreeze_sb;
48
49 struct partition_meta_info *bd_meta_info;
50#ifdef CONFIG_FAIL_MAKE_REQUEST
51 bool bd_make_it_fail;
52#endif
53
54 ANDROID_KABI_RESERVE(1);
55 ANDROID_KABI_RESERVE(2);
56 ANDROID_KABI_RESERVE(3);
57 ANDROID_KABI_RESERVE(4);
58} __randomize_layout;
5.block_device怎么创建的
bdev_alloc创建了disk->part0这个block_device对象,我们来看下非常重要的bdev_alloc函数:
478struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
479{
480 struct block_device *bdev;
481 struct inode *inode;
482
483 inode = new_inode(blockdev_superblock);
484 if (!inode)
485 return NULL;//块设备文件对应inode设置为块设备
486 inode->i_mode = S_IFBLK;
487 inode->i_rdev = 0;
488 inode->i_data.a_ops = &def_blk_aops;
489 mapping_set_gfp_mask(&inode->i_data, GFP_USER);
490//new_inode创建的本质上是bdev_inode,I_BDEV获取bdev_inode结构体的字段bdev
491 bdev = I_BDEV(inode);
492 mutex_init(&bdev->bd_fsfreeze_mutex);
493 spin_lock_init(&bdev->bd_size_lock);//初始化block_device,设置分区号,inode,gendisk对象
494 bdev->bd_partno = partno;
495 bdev->bd_inode = inode;
496 bdev->bd_stats = alloc_percpu(struct disk_stats);
497 if (!bdev->bd_stats) {
498 iput(inode);
499 return NULL;
500 }
501 bdev->bd_disk = disk;
502 return bdev;
503}
上面new_inode函数调用可以参考:zram压缩机制看swapon系统调用_swapon设置为zram-CSDN博客
总结来讲new_inode返回的本质上是一个bdev_inode对象,其定义如下:
32struct bdev_inode {
33 struct block_device bdev;
34 struct inode vfs_inode;
35};
36
也就是说new_inode创建bdev_inode的同时,本质上也创建了一个block_device对象。这里的bdev_inode是bdev伪文件系统(blockdev_superblock)中的inode,代表这个块设备本身;以zram驱动为例,它通过设备号与/dev/block/zram0这个块设备文件关联起来。
6. 激活磁盘
激活磁盘使用的是device_add_disk函数:
/*** device_add_disk - add disk information to kernel list* @parent: parent device for the disk* @disk: per-device partitioning information* @groups: Additional per-device sysfs groups** This function registers the partitioning information in @disk* with the kernel.*/
int device_add_disk(struct device *parent, struct gendisk *disk,const struct attribute_group **groups){struct device *ddev = disk_to_dev(disk);int ret;/** The disk queue should now be all set with enough information about* the device for the elevator code to pick an adequate default* elevator if one is needed, that is, for devices requesting queue* registration.*/elevator_init_mq(disk->queue);/** If the driver provides an explicit major number it also must provide* the number of minors numbers supported, and those will be used to* setup the gendisk.* Otherwise just allocate the device numbers for both the whole device* and all partitions from the extended dev_t space.*/if (disk->major) {if (WARN_ON(!disk->minors))return -EINVAL;if (disk->minors > DISK_MAX_PARTS) {pr_err("block: can't allocate more than %d partitions\n",DISK_MAX_PARTS);disk->minors = DISK_MAX_PARTS;}if (disk->first_minor > MINORMASK ||disk->minors > MINORMASK + 1 ||disk->first_minor + disk->minors > MINORMASK + 1)return -EINVAL;} else {if (WARN_ON(disk->minors))return -EINVAL;ret = blk_alloc_ext_minor();if (ret < 0)return ret;disk->major = BLOCK_EXT_MAJOR;disk->first_minor = ret;disk->flags |= GENHD_FL_EXT_DEVT;}/* delay uevents, until we scanned partition table */dev_set_uevent_suppress(ddev, 1);ddev->parent = parent;ddev->groups = groups;dev_set_name(ddev, "%s", disk->disk_name);if (!(disk->flags & GENHD_FL_HIDDEN))ddev->devt = MKDEV(disk->major, disk->first_minor);//非常重要的函数ret = device_add(ddev);if (ret)goto out_free_ext_minor;ret = disk_alloc_events(disk);if (ret)goto out_device_del;if (!sysfs_deprecated) {ret = sysfs_create_link(block_depr, &ddev->kobj,kobject_name(&ddev->kobj));if (ret)goto out_device_del;}/** avoid probable deadlock caused by allocating memory with* GFP_KERNEL in runtime_resume callback of its all ancestor* devices*/pm_runtime_set_memalloc_noio(ddev, true);ret = blk_integrity_add(disk);if (ret)goto out_del_block_link;disk->part0->bd_holder_dir 
=kobject_create_and_add("holders", &ddev->kobj);if (!disk->part0->bd_holder_dir) {ret = -ENOMEM;goto out_del_integrity;}disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);if (!disk->slave_dir) {ret = -ENOMEM;goto out_put_holder_dir;}ret = bd_register_pending_holders(disk);if (ret < 0)goto out_put_slave_dir;ret = blk_register_queue(disk);if (ret)goto out_put_slave_dir;if (disk->flags & GENHD_FL_HIDDEN) {/** Don't let hidden disks show up in /proc/partitions,* and don't bother scanning for partitions either.*/disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO;disk->flags |= GENHD_FL_NO_PART;} else {ret = bdi_register(disk->bdi, "%u:%u",disk->major, disk->first_minor);if (ret)goto out_unregister_queue;bdi_set_owner(disk->bdi, ddev);ret = sysfs_create_link(&ddev->kobj,&disk->bdi->dev->kobj, "bdi");if (ret)goto out_unregister_bdi;//非常重要bdev_add(disk->part0, ddev->devt);disk_scan_partitions(disk);/** Announce the disk and partitions after all partitions are* created. (for hidden disks uevents remain suppressed forever)*/dev_set_uevent_suppress(ddev, 0);disk_uevent(disk, KOBJ_ADD);}disk_update_readahead(disk);disk_add_events(disk);return 0;...
}
总结起来device_add_disk调用了两个非常重要的函数:
- device_add
- bdev_add
device_add函数
device_add--->devtmpfs_create_node
devtmpfs_create_node会给devtmpfs文件系统的内核线程发送创建块设备文件的请求,效果类似mknod,然后在/dev/目录下创建出对应的块设备文件。
bdev_add函数:
void bdev_add(struct block_device *bdev, dev_t dev)
{bdev->bd_dev = dev;bdev->bd_inode->i_rdev = dev;bdev->bd_inode->i_ino = dev;insert_inode_hash(bdev->bd_inode);
}
bdev_add设置block_device的bd_dev为块设备号,同时把block_device->bd_inode的i_rdev和i_ino都设置为块设备号,然后通过insert_inode_hash函数将block_device的bd_inode添加到superblock的inode hash表中(以i_ino即设备号作为查找的key)。这里的逻辑非常重要,对理解swapon尤其关键,我们知道swapon系统调用有如下一段代码:
swapon系统调用:swap_file = file_open_name(name, O_RDWR|O_LARGEFILE, 0);if (IS_ERR(swap_file)) {error = PTR_ERR(swap_file);swap_file = NULL;goto bad_swap;}p->swap_file = swap_file;mapping = swap_file->f_mapping;dentry = swap_file->f_path.dentry;inode = mapping->host;static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
{int error;if (S_ISBLK(inode->i_mode)) {p->bdev = blkdev_get_by_dev(inode->i_rdev,FMODE_READ | FMODE_WRITE | FMODE_EXCL, p);if (IS_ERR(p->bdev)) {error = PTR_ERR(p->bdev);p->bdev = NULL;return error;}p->old_block_size = block_size(p->bdev);error = set_blocksize(p->bdev, PAGE_SIZE);if (error < 0)return error;/** Zoned block devices contain zones that have a sequential* write only restriction. Hence zoned block devices are not* suitable for swapping. Disallow them here.*/if (blk_queue_is_zoned(p->bdev->bd_disk->queue))return -EINVAL;p->flags |= SWP_BLKDEV;} else if (S_ISREG(inode->i_mode)) {p->bdev = inode->i_sb->s_bdev;}return 0;
}
blkdev_get_by_dev函数返回了一个block_device,这个block_device跟前面zram块驱动blk_alloc_disk 生成的block_device有啥关系?就是同一个,我们还是要从源码视角看懂这一切:
struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
{bool unblock_events = true;struct block_device *bdev;struct gendisk *disk;int ret;ret = devcgroup_check_permission(DEVCG_DEV_BLOCK,MAJOR(dev), MINOR(dev),((mode & FMODE_READ) ? DEVCG_ACC_READ : 0) |((mode & FMODE_WRITE) ? DEVCG_ACC_WRITE : 0));if (ret)return ERR_PTR(ret);bdev = blkdev_get_no_open(dev);if (!bdev)return ERR_PTR(-ENXIO);disk = bdev->bd_disk;...return bdev;
}
struct block_device *blkdev_get_no_open(dev_t dev)
{struct block_device *bdev;struct inode *inode;inode = ilookup(blockdev_superblock, dev);if (!inode) {blk_request_module(dev);inode = ilookup(blockdev_superblock, dev);if (!inode)return NULL;}/* switch from the inode reference to a device mode one: */bdev = &BDEV_I(inode)->bdev;if (!kobject_get_unless_zero(&bdev->bd_device.kobj))bdev = NULL;iput(inode);if (!bdev)return NULL;if ((bdev->bd_disk->flags & GENHD_FL_HIDDEN) ||!try_module_get(bdev->bd_disk->fops->owner)) {put_device(&bdev->bd_device);return NULL;}return bdev;
}
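inode = ilookup(blockdev_superblock, dev);根据块设备号dev,从blockdev_superblock拿到inode节点。为什么这里能拿到块设备(/dev/block/zram0)对应的inode?就是因为bdev_add的时候把inode的i_ino设置成了设备号dev,并将inode对象加入到blockdev_superblock的inode hash表中,所以这里用dev作为inode号就能查到;再通过BDEV_I(inode)->bdev取到的block_device,正是驱动初始化时bdev_alloc创建的那一个。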
参考文章:
块设备剖析之关键数据结构分析 - block_device/gendisk/hd_struct-下雨夜-ChinaUnix博客
这篇关于Linux block_device gendisk和hd_struct到底是个啥关系的文章就介绍到这儿,希望我们推荐的文章对编程师们有所帮助!