f2fs write_checkpoint 过程分析

本文主要是介绍f2fs write_checkpoint 过程分析，希望对大家解决编程问题提供一定的参考价值，需要的开发者们随着小编来一起学习吧！

write_checkpoint 主要负责把 cache中dirty的数据写回到磁盘中，在gc, trim, discard或者recovery的时候都会调用到。

int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);unsigned long long ckpt_ver;int err = 0;mutex_lock(&sbi->cp_mutex);if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) &&((cpc->reason & CP_FASTBOOT) || (cpc->reason & CP_SYNC) ||((cpc->reason & CP_DISCARD) && !sbi->discard_blks)))goto out;if (unlikely(f2fs_cp_error(sbi))) {err = -EIO;goto out;}if (f2fs_readonly(sbi->sb)) {err = -EROFS;goto out;}

先看一下，传入函数的参数有两个， f2fs_sb_info *sbi，与cp_control * cpc:

1) f2fs_sb_info * sbi: f2fs super block

2) struct cp_control *cpc: check point 控制结构体, 里面有对check point操作的参数，其中reason字段（即代码中的cpc->reason）的值可以为CP_RECOVERY, CP_DISCARD, CP_TRIMMED, CP_SYNC, CP_UMOUNT等，表示在何种场景进行的check point操作。

上段代码，首先判断，如果sbi没有被标记为dirty（SBI_IS_DIRTY未置位），并且reason为CP_FASTBOOT，或者为CP_SYNC，或者reason为CP_DISCARD但discard blocks个数为0，则直接退出，不做任何操作。

接下来判断，check point是否有错误，如果有直接退出，f2fs是否为只读的，如果是直接退出。

	err = block_operations(sbi);if (err)goto out;

block_operations函数的作用是将所有FS操作都冻结住，以便做checkpoint（Freeze all the FS-operations for checkpoint），我们看看具体是怎样冻结住的。

static int block_operations(struct f2fs_sb_info *sbi)
{struct writeback_control wbc = {.sync_mode = WB_SYNC_ALL,.nr_to_write = LONG_MAX,.for_reclaim = 0,};struct blk_plug plug;int err = 0;blk_start_plug(&plug);retry_flush_dents:f2fs_lock_all(sbi);/* write all the dirty dentry pages */if (get_pages(sbi, F2FS_DIRTY_DENTS)) {f2fs_unlock_all(sbi);err = sync_dirty_inodes(sbi, DIR_INODE);if (err)goto out;cond_resched();goto retry_flush_dents;}

首先将所有dentry相关的dirty pages同步写回，这个写回过程要先进行f2fs_lock_all(sbi)操作，我们发现，此过程结束的条件是无F2FS_DIRTY_DENTS, 但是结束时并没有释放锁，即没有f2fs_unlock_all(sbi).

/** POR: we should ensure that there are no dirty node pages* until finishing nat/sit flush. inode->i_blocks can be updated.*/down_write(&sbi->node_change);if (get_pages(sbi, F2FS_DIRTY_IMETA)) {up_write(&sbi->node_change);f2fs_unlock_all(sbi);err = f2fs_sync_inode_meta(sbi);if (err)goto out;cond_resched();goto retry_flush_dents;}

接下来，又对所有dirty的inode元数据（F2FS_DIRTY_IMETA）进行sync写回操作，同样的过程，最后退出时也没有进行f2fs_unlock_all(sbi)，执行到这里，还占据着f2fs_lock_all锁，同时还持有node_change信号量。

retry_flush_nodes:down_write(&sbi->node_write);if (get_pages(sbi, F2FS_DIRTY_NODES)) {up_write(&sbi->node_write);err = sync_node_pages(sbi, &wbc, false, FS_CP_NODE_IO);if (err) {up_write(&sbi->node_change);f2fs_unlock_all(sbi);goto out;}cond_resched();goto retry_flush_nodes;}

最后，对所有的dirty node pages做sync操作，执行到最后，占据着两个锁, 一个是f2fs_lock_all锁，一个是node_write锁。

/*
 * f2fs_lock_all - acquire sbi->cp_rwsem in write mode.
 * @sbi: f2fs superblock info whose cp_rwsem is taken
 *
 * Calls down_write() on the semaphore; this helper does not release it.
 */
static inline void f2fs_lock_all(struct f2fs_sb_info *sbi)
{
	down_write(&sbi->cp_rwsem);
}

f2fs_lock_all操作的是sbi->cp_rwsem，所有fs相关的操作，都需要先获得这个信号量，对node block的操作也要获得node_write信号量，如果这两个在此时没有被释放，则其它的路径无法进行相关的操作，这就实现了block的功能。

	/* this is the case of multiple fstrims without any changes */if (cpc->reason & CP_DISCARD) {if (!exist_trim_candidates(sbi, cpc)) {unblock_operations(sbi);goto out;}if (NM_I(sbi)->dirty_nat_cnt == 0 &&SIT_I(sbi)->dirty_sentries == 0 &&prefree_segments(sbi) == 0) {flush_sit_entries(sbi, cpc);clear_prefree_segments(sbi, cpc);unblock_operations(sbi);goto out;}}

回到f2fs write_checkpoint过程，block_operations之后，判断cpc->reason是否包含CP_DISCARD（是否执行trim操作），如果是的话，判断是否有trim candidates，如果没有，则调用unblock_operations，即把f2fs_lock_all以及node_write信号量释放，退出。如果有trim candidates, 则判断如果dirty_nat_cnt，dirty_sentries, prefree_segments都为0的话，执行flush_sit_entries和clear_prefree_segments，然后释放信号量并退出，后面详细描述flush_sit_entries.

	 * update checkpoint pack index* Increase the version number so that* SIT entries and seg summaries are written at correct place*/ckpt_ver = cur_cp_version(ckpt);ckpt->checkpoint_ver = cpu_to_le64(++ckpt_ver);

check point version ++

	/* write cached NAT/SIT entries to NAT/SIT area */flush_nat_entries(sbi, cpc);

接下来是一个重要的函数，flush_nat_entries，将cache中的所有nat entries写入f2fs的nat area（或journal）中，我们看一下具体流程。

void flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{struct f2fs_nm_info *nm_i = NM_I(sbi);struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);struct f2fs_journal *journal = curseg->journal;struct nat_entry_set *setvec[SETVEC_SIZE];struct nat_entry_set *set, *tmp;unsigned int found;nid_t set_idx = 0;LIST_HEAD(sets);if (!nm_i->dirty_nat_cnt)return;down_write(&nm_i->nat_tree_lock);/** if there are no enough space in journal to store dirty nat* entries, remove all entries from journal and merge them* into nat entry set.*/if (enabled_nat_bits(sbi, cpc) ||!__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL))remove_nats_in_journal(sbi);

flush_nat_entries首先判断，如果journal中没有足够的space来存储dirty nat entries，则将journal中所有的entries删除并将他们merge到nat entry set中。

/*
 * __has_cursum_space - check whether @size entries fit in a journal.
 * @journal: journal handed to the MAX_*_JENTRIES() capacity macros
 * @size:    number of entries the caller wants to place in the journal
 * @type:    NAT_JOURNAL selects the NAT limit; any other value selects
 *           the SIT limit
 *
 * Returns true when @size does not exceed the selected limit.
 */
static inline bool __has_cursum_space(struct f2fs_journal *journal,
					int size, int type)
{
	/* everything that is not a NAT journal is checked as a SIT journal */
	if (type != NAT_JOURNAL)
		return size <= MAX_SIT_JENTRIES(journal);

	return size <= MAX_NAT_JENTRIES(journal);
}

__has_cursum_space函数判断journal能容纳的entry个数是否不小于dirty_nat_cnt（即 size <= MAX_NAT_JENTRIES(journal)），如果空间不够, 则调用remove_nats_in_journal，将journal中的所有nat entries删除。看下remove_nats_in_journal函数。

static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
{struct f2fs_nm_info *nm_i = NM_I(sbi);struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);struct f2fs_journal *journal = curseg->journal;int i;down_write(&curseg->journal_rwsem);//遍历journal中所有的nat entriesfor (i = 0; i < nats_in_cursum(journal); i++) {struct nat_entry *ne;struct f2fs_nat_entry raw_ne;nid_t nid = le32_to_cpu(nid_in_journal(journal, i));//得到journal中的f2fs_nat_entry结构raw_ne = nat_in_journal(journal, i);//判断nat cache中是否包含此nid nat entry数据ne = __lookup_nat_cache(nm_i, nid);if (!ne) {//如果nat cache中不包含此nid相关数据 ， 则新申请nat entry结构ne = __alloc_nat_entry(nid, true);//将新申请的nat entry结构加入nat_root缓存中//并将新申请的nat entry中入nat_entries__init_nat_entry(nm_i, ne, &raw_ne, true);}/** if a free nat in journal has not been used after last* checkpoint, we should remove it from available nids,* since later we will add it again.*/if (!get_nat_flag(ne, IS_DIRTY) &&le32_to_cpu(raw_ne.block_addr) == NULL_ADDR) {spin_lock(&nm_i->nid_list_lock);nm_i->available_nids--;spin_unlock(&nm_i->nid_list_lock);}__set_nat_cache_dirty(nm_i, ne);}update_nats_in_cursum(journal, -i);up_write(&curseg->journal_rwsem);
}

remove_nats_in_journal()进行删除journal中的nat entries操作，它遍历journal中的每一个nat entry, 对每一个nat entry执行__set_nat_cache_dirty(nm_i, ne)，把它挂到dirty的nat entry set上；遍历结束后再通过update_nats_in_cursum(journal, -i)把journal中的nat entry计数减去已处理的个数。下面看__set_nat_cache_dirty做了哪些事情。

static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,struct nat_entry *ne)
{nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid);struct nat_entry_set *head;//首先在nat_set_root缓存中查找是否包含此sethead = radix_tree_lookup(&nm_i->nat_set_root, set);if (!head) {//如果不包含，则新申请一个nat_entry_set结构head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_NOFS);//初始化新申请的nat_entry_setINIT_LIST_HEAD(&head->entry_list);INIT_LIST_HEAD(&head->set_list);head->set = set;head->entry_cnt = 0;//将新申请的nat_entry_set插入radix树缓存中f2fs_radix_tree_insert(&nm_i->nat_set_root, set, head);}if (get_nat_flag(ne, IS_DIRTY))goto refresh_list;nm_i->dirty_nat_cnt++;head->entry_cnt++;set_nat_flag(ne, IS_DIRTY, true);
refresh_list://if (nat_get_blkaddr(ne) == NEW_ADDR)list_del_init(&ne->list);elselist_move_tail(&ne->list, &head->entry_list);
}

__set_nat_cache_dirty主要做的事情，在nat_set_root tree中查找，是否包含相应的set，如果不包含，则新申请一个nat_entry_set，初始化并加入nat_set_root tree中。最后将此entry从原来的链表中删除，并移动到对应nat_entry_set的entry_list链表中。经过这个操作后，journal 中所有的nat entries都挂到了nat_set_root 树下的各个set中，并且属于同一个NAT block（即NAT_BLOCK_OFFSET(nid)相同）的nat entry，链接到相同的nat_entry_set中（这里面有一处，如果nat entry的 block address 地址为NEW_ADDR，则只是将其从原来的list中删除，说明此nat entry没有有效的磁盘存储空间，也就不需要进行后续的flush操作？）。

从journal中删除所有的nat entries后，所有的nat entry都移到了nat set中，接下来有一个排序的过程，按照每个nat set中包含的nat entry数量的多少进行排序，最后都存储到sets中。

接下来回到flush_nat_entries中，此函数最后的操作，就是遍历所有的entry set, 把每个entry set中的dirty nat entries flush, 写回磁盘中，具体看一下操作步骤。


static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,struct nat_entry_set *set, struct cp_control *cpc)
{struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);struct f2fs_journal *journal = curseg->journal;nid_t start_nid = set->set * NAT_ENTRY_PER_BLOCK;bool to_journal = true;struct f2fs_nat_block *nat_blk;struct nat_entry *ne, *cur;struct page *page = NULL;/** there are two steps to flush nat entries:* #1, flush nat entries to journal in current hot data summary block.* #2, flush nat entries to nat page.*/if (enabled_nat_bits(sbi, cpc) ||!__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL))to_journal = false;

注释中描述，flush nat entries有两个步骤:

1) 将nat entries flush 到当前的hot data summary block journal中

2) 将nat entries flush到nat page中。

判断journal中是否有足够的free space，如果有，to_journal=true, 否则，to_journal=false。

	if (to_journal) {down_write(&curseg->journal_rwsem);} else {page = get_next_nat_page(sbi, start_nid);nat_blk = page_address(page);f2fs_bug_on(sbi, !nat_blk);}

如果to_journal=true，则后面会将nat set entries写到journal中，所以此时获取journal_rwsem锁，如果to_journal=false，则需要得到nat cache中的空间，将nat set entries写入到nat cache page中。得到nat cache 中的Page是通过get_next_nat_page得到的，看一下这个函数：


/*
 * get_next_nat_page - prepare the next NAT page for the block covering @nid.
 * @sbi: f2fs superblock info
 * @nid: node id whose NAT block is about to be rewritten
 *
 * Copies the current NAT page for @nid into the page at the offset
 * returned by next_nat_addr(), marks that destination page dirty, drops
 * the source page, and calls set_to_next_nat() for @nid.
 *
 * Returns the destination page.
 */
static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
{
	struct page *src_page;
	struct page *dst_page;
	pgoff_t src_off;
	pgoff_t dst_off;
	void *src_addr;
	void *dst_addr;
	struct f2fs_nm_info *nm_i = NM_I(sbi);

	/* offset of the NAT page that currently covers this nid */
	src_off = current_nat_addr(sbi, nid);
	/* offset of the NAT page that will be written instead */
	dst_off = next_nat_addr(sbi, src_off);

	/* get current nat block page with lock */
	src_page = get_meta_page(sbi, src_off);
	/* get the page that will receive the new contents */
	dst_page = grab_meta_page(sbi, dst_off);
	f2fs_bug_on(sbi, PageDirty(src_page));

	src_addr = page_address(src_page);
	dst_addr = page_address(dst_page);

	/*
	 * Copy the current page into the destination and mark the
	 * destination dirty; then put the source page (original note: it
	 * can be released once nothing references it any more).
	 */
	memcpy(dst_addr, src_addr, PAGE_SIZE);
	set_page_dirty(dst_page);
	f2fs_put_page(src_page, 1);

	/*
	 * Original note: switch the nat_bitmap to the destination page so
	 * that later lookups return it.
	 */
	set_to_next_nat(nm_i, nid);

	return dst_page;
}

f2fs为了防止元数据丢失，SIT area及NAT area的数据都包含两份，从f2fs format过程可以看到，两份数据中，一个保存的数据是最新的，get_next_nat_page目的就是得到另一个副本中相应的nat page，做为下一步写入的page，同时会更新nat bitmap。

/* flush dirty nats in nat entry set */list_for_each_entry_safe(ne, cur, &set->entry_list, list) {struct f2fs_nat_entry *raw_ne;nid_t nid = nat_get_nid(ne);int offset;f2fs_bug_on(sbi, nat_get_blkaddr(ne) == NEW_ADDR);if (to_journal) {offset = lookup_journal_in_cursum(journal,NAT_JOURNAL, nid, 1);f2fs_bug_on(sbi, offset < 0);raw_ne = &nat_in_journal(journal, offset);nid_in_journal(journal, offset) = cpu_to_le32(nid);} else {raw_ne = &nat_blk->entries[nid - start_nid];}raw_nat_from_node_info(raw_ne, &ne->ni);nat_reset_flag(ne);__clear_nat_cache_dirty(NM_I(sbi), set, ne);if (nat_get_blkaddr(ne) == NULL_ADDR) {add_free_nid(sbi, nid, false, true);} else {spin_lock(&NM_I(sbi)->nid_list_lock);update_free_nid_bitmap(sbi, nid, false, false);spin_unlock(&NM_I(sbi)->nid_list_lock);}}

上面这一段代码，如果to_journal=true，则将nat_entry内容写入到journal中，如果to_journal=false，则将nat_entry内容写入已得到的nat cache page中。并设置相应的flag。

	if (to_journal) {up_write(&curseg->journal_rwsem);} else {__update_nat_bits(sbi, start_nid, page);f2fs_put_page(page, 1);}

最后，如果to_journal=true，释放journal_rwsem，说明已写完，如果to_journal=false, 则先调用__update_nat_bits，再执行f2fs_put_page(page, 1)释放对该page的引用；该page在get_next_nat_page中已经被标记为dirty，之后可以被真正写回到磁盘。

到这里，f2fs flush_nat_entries流程就结束了，它的主要作用就是将nat_set_root中所有的nat_set中的entries执行flush写回操作。

这篇关于f2fs write_checkpoint 过程分析的文章就介绍到这儿，希望我们推荐的文章对编程师们有所帮助！

f2fs write_checkpoint 过程分析

相关文章

MySQL中的LENGTH()函数用法详解与实例分析

Android kotlin中 Channel 和 Flow 的区别和选择使用场景分析

怎样通过分析GC日志来定位Java进程的内存问题

Java进程异常故障定位及排查过程

SpringBoot整合liteflow的详细过程

Java中调用数据库存储过程的示例代码

MySQL中的InnoDB单表访问过程

MySQL中的表连接原理分析

浏览器插件cursor实现自动注册、续杯的详细过程

Navicat数据表的数据添加,删除及使用sql完成数据的添加过程