Linux 3.8 Writeback机制源码分析

本文主要是介绍Linux 3.8 Writeback机制源码分析，希望对大家解决编程问题提供一定的参考价值，需要的开发者们随着小编来一起学习吧！

writeback相关数据结构

与writeback相关的数据结构主要有：

backing_dev_info，该数据结构描述了backing_dev的所有信息，通常块设备的request queue中会包含backing_dev对象。
bdi_writeback，该数据结构封装了writeback的内核线程以及需要操作的inode队列。
wb_writeback_work，该数据结构封装了writeback的工作任务。

它们的结构体分别如下：

struct backing_dev_info {struct list_head bdi_list;unsigned long ra_pages;	/* max readahead in PAGE_CACHE_SIZE units */unsigned long state;	/* Always use atomic bitops on this */unsigned int capabilities; /* Device capabilities */congested_fn *congested_fn; /* Function pointer if device is md/dm */void *congested_data;	/* Pointer to aux data for congested func */char *name;struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS];unsigned long bw_time_stamp;	/* last time write bw is updated */unsigned long dirtied_stamp;unsigned long written_stamp;	/* pages written at bw_time_stamp */unsigned long write_bandwidth;	/* the estimated write bandwidth */unsigned long avg_write_bandwidth; /* further smoothed write bw *//** The base dirty throttle rate, re-calculated on every 200ms.* All the bdi tasks' dirty rate will be curbed under it.* @dirty_ratelimit tracks the estimated @balanced_dirty_ratelimit* in small steps and is much more smooth/stable than the latter.*/unsigned long dirty_ratelimit;unsigned long balanced_dirty_ratelimit;struct fprop_local_percpu completions;int dirty_exceeded;unsigned int min_ratio;unsigned int max_ratio, max_prop_frac;struct bdi_writeback wb;  /* default writeback info for this bdi */spinlock_t wb_lock;	  /* protects work_list */struct list_head work_list;struct device *dev;struct timer_list laptop_mode_wb_timer;#ifdef CONFIG_DEBUG_FSstruct dentry *debug_dir;struct dentry *debug_stats;
#endif
};

/*
 * Writeback context of a backing device: the writeback thread together
 * with the inode lists that thread works through.
 */
struct bdi_writeback {
	/* Backing device this context belongs to. */
	struct backing_dev_info *bdi;
	unsigned int nr;

	/* Time of the last flush of old data. */
	unsigned long last_old_flush;
	/* Time the bdi thread was last active. */
	unsigned long last_active;

	/* The writeback thread itself. */
	struct task_struct *task;
	/* Timer used to wake the bdi thread up after a delay. */
	struct timer_list wakeup_timer;

	/* Inode lists; all three are protected by list_lock. */
	struct list_head b_dirty;	/* inodes that are dirty */
	struct list_head b_io;		/* inodes parked for writeback */
	struct list_head b_more_io;	/* inodes parked for more writeback */
	spinlock_t list_lock;
};

/*
 * A single writeback request as handed to wb_writeback(); essentially a
 * subset of writeback_control.
 */
struct wb_writeback_work {
	long nr_pages;
	struct super_block *sb;
	unsigned long *older_than_this;
	enum writeback_sync_modes sync_mode;

	/* Single-bit flags describing the request. */
	unsigned int tagged_writepages:1;
	unsigned int for_kupdate:1;
	unsigned int range_cyclic:1;
	unsigned int for_background:1;

	/* Why this writeback was initiated. */
	enum wb_reason reason;

	/* Entry in the pending work list. */
	struct list_head list;
	/* Set when the caller waits for the work to complete. */
	struct completion *done;
};

BDI数据结构是对块设备的一个描述。bdi对象在块设备添加的时候需要注册到系统的bdi队列中。对于ext3而言，在mount的时候需要将底层块设备的bdi对象关联到ext3的root_inode中。在bdi数据结构中有一条work_list，该队列维护了writeback内核线程需要处理的任务。如果该队列上没有work可以处理，那么writeback内核线程将会睡眠等待。
writeback对象封装了内核线程task以及需要处理的inode队列。当page cache/buffer cache需要刷新radix tree上的inode时，可以将该inode挂载到writeback对象的b_dirty队列上，然后唤醒writeback线程。在处理过程中，inode会被移到b_io队列上进行处理。采用多条链表的方式可以减少多线程之间的资源竞争。
wb_writeback_work数据结构是对writeback任务的封装，不同的任务可以采用不同的刷新策略。writeback线程的处理对象就是writeback_work。如果writeback_work队列为空，那么内核线程就可以睡眠了。

writeback主要函数分析

writeback机制的主要函数包括如下两个方面：

管理bdi对象并且fork相应的writeback内核线程处理cache数据的刷新工作。
writeback内核线程处理函数，实现dirty page的刷新操作。

writeback线程管理

Linux中有一个内核守护线程，该线程用来管理系统bdi队列，并且负责为block device创建writeback thread。当bdi中有dirty page并且还没有为bdi分配内核线程的时候，bdi_forker_thread程序会为其分配线程资源；当一个writeback线程长时间（默认为5min）处于空闲状态时，bdi_forker_thread程序会释放该线程资源。

static int bdi_forker_thread(void *ptr)
{struct bdi_writeback *me = ptr;current->flags |= PF_SWAPWRITE;set_freezable();/** Our parent may run at a different priority, just set us to normal*/set_user_nice(current, 0);for (;;) {struct task_struct *task = NULL;struct backing_dev_info *bdi;enum {NO_ACTION,   /* Nothing to do */FORK_THREAD, /* Fork bdi thread */KILL_THREAD, /* Kill inactive bdi thread */} action = NO_ACTION;/** Temporary measure, we want to make sure we don't see* dirty data on the default backing_dev_info*/if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) {del_timer(&me->wakeup_timer);wb_do_writeback(me, 0);}spin_lock_bh(&bdi_lock);/** In the following loop we are going to check whether we have* some work to do without any synchronization with tasks* waking us up to do work for them. Set the task state here* so that we don't miss wakeups after verifying conditions.*/set_current_state(TASK_INTERRUPTIBLE);list_for_each_entry(bdi, &bdi_list, bdi_list) {bool have_dirty_io;if (!bdi_cap_writeback_dirty(bdi) ||bdi_cap_flush_forker(bdi))continue;WARN(!test_bit(BDI_registered, &bdi->state),"bdi %p/%s is not registered!\n", bdi, bdi->name);have_dirty_io = !list_empty(&bdi->work_list) ||wb_has_dirty_io(&bdi->wb);/** If the bdi has work to do, but the thread does not* exist - create it.*/if (!bdi->wb.task && have_dirty_io) {/** Set the pending bit - if someone will try to* unregister this bdi - it'll wait on this bit.*/set_bit(BDI_pending, &bdi->state);action = FORK_THREAD;break;}spin_lock(&bdi->wb_lock);/** If there is no work to do and the bdi thread was* inactive long enough - kill it. 
The wb_lock is taken* to make sure no-one adds more work to this bdi and* wakes the bdi thread up.*/if (bdi->wb.task && !have_dirty_io &&time_after(jiffies, bdi->wb.last_active +bdi_longest_inactive())) {task = bdi->wb.task;bdi->wb.task = NULL;spin_unlock(&bdi->wb_lock);set_bit(BDI_pending, &bdi->state);action = KILL_THREAD;break;}spin_unlock(&bdi->wb_lock);}spin_unlock_bh(&bdi_lock);/* Keep working if default bdi still has things to do */if (!list_empty(&me->bdi->work_list))__set_current_state(TASK_RUNNING);switch (action) {case FORK_THREAD:__set_current_state(TASK_RUNNING);task = kthread_create(bdi_writeback_thread, &bdi->wb,"flush-%s", dev_name(bdi->dev));if (IS_ERR(task)) {/** If thread creation fails, force writeout of* the bdi from the thread. Hopefully 1024 is* large enough for efficient IO.*/writeback_inodes_wb(&bdi->wb, 1024,WB_REASON_FORKER_THREAD);} else {/** The spinlock makes sure we do not lose* wake-ups when racing with 'bdi_queue_work()'.* And as soon as the bdi thread is visible, we* can start it.*/spin_lock_bh(&bdi->wb_lock);bdi->wb.task = task;spin_unlock_bh(&bdi->wb_lock);wake_up_process(task);}bdi_clear_pending(bdi);break;case KILL_THREAD:__set_current_state(TASK_RUNNING);kthread_stop(task);bdi_clear_pending(bdi);break;case NO_ACTION:if (!wb_has_dirty_io(me) || !dirty_writeback_interval)/** There are no dirty data. The only thing we* should now care about is checking for* inactive bdi threads and killing them. Thus,* let's sleep for longer time, save energy and* be friendly for battery-driven devices.*/schedule_timeout(bdi_longest_inactive());elseschedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));try_to_freeze();break;}}return 0;
}

Writeback工作线程

writeback线程是bdi_forker_thread 创建的，该线程的任务就是处理等待的数据回刷任务。线程处理函数为bdi_writeback_thread，该函数的实现如下：

/** Handle writeback of dirty data for the device backed by this bdi. Also* wakes up periodically and does kupdated style flushing.*/
int bdi_writeback_thread(void *data)
{struct bdi_writeback *wb = data;struct backing_dev_info *bdi = wb->bdi;long pages_written;current->flags |= PF_SWAPWRITE;set_freezable();wb->last_active = jiffies;/** Our parent may run at a different priority, just set us to normal*/set_user_nice(current, 0);trace_writeback_thread_start(bdi);while (!kthread_freezable_should_stop(NULL)) {/** Remove own delayed wake-up timer, since we are already awake* and we'll take care of the periodic write-back.*/del_timer(&wb->wakeup_timer);pages_written = wb_do_writeback(wb, 0);trace_writeback_pages_written(pages_written);if (pages_written)wb->last_active = jiffies;set_current_state(TASK_INTERRUPTIBLE);if (!list_empty(&bdi->work_list) || kthread_should_stop()) {__set_current_state(TASK_RUNNING);continue;}if (wb_has_dirty_io(wb) && dirty_writeback_interval)schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));else {/** We have nothing to do, so can go sleep without any* timeout and save power. When a work is queued or* something is made dirty - we will be woken up.*/schedule();}}/* Flush any work that raced with us exiting */if (!list_empty(&bdi->work_list))wb_do_writeback(wb, 1);trace_writeback_thread_stop(bdi);return 0;
}

bdi_writeback_thread函数主要是调用wb_do_writeback()函数。

/** Retrieve work items and do the writeback they describe*/
long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
{struct backing_dev_info *bdi = wb->bdi;struct wb_writeback_work *work;long wrote = 0;set_bit(BDI_writeback_running, &wb->bdi->state);while ((work = get_next_work_item(bdi)) != NULL) {/** Override sync mode, in case we must wait for completion* because this thread is exiting now.*/if (force_wait)work->sync_mode = WB_SYNC_ALL;trace_writeback_exec(bdi, work);wrote += wb_writeback(wb, work);/** Notify the caller of completion if this is a synchronous* work item, otherwise just free it.*/if (work->done)complete(work->done);elsekfree(work);}/** Check for periodic writeback, kupdated() style*/wrote += wb_check_old_data_flush(wb);wrote += wb_check_background_flush(wb);clear_bit(BDI_writeback_running, &wb->bdi->state);return wrote;
}

wb_check_old_data_flush函数的主要功能是周期性的检查脏页并写回，它默认写回30s之前写入的脏页，每隔5s扫描一次。

static long wb_check_old_data_flush(struct bdi_writeback *wb)
{unsigned long expired;long nr_pages;/** When set to zero, disable periodic writeback*/if (!dirty_writeback_interval)return 0;expired = wb->last_old_flush +msecs_to_jiffies(dirty_writeback_interval * 10);if (time_before(jiffies, expired))return 0;wb->last_old_flush = jiffies;nr_pages = get_nr_dirty_pages();if (nr_pages) {struct wb_writeback_work work = {.nr_pages	= nr_pages,.sync_mode	= WB_SYNC_NONE,.for_kupdate	= 1,.range_cyclic	= 1,.reason		= WB_REASON_PERIODIC,};return wb_writeback(wb, &work);}return 0;
}

wb_check_background_flush的功能是在脏页超过后台回写阈值时不断写回脏页（nr_pages设为LONG_MAX），直到脏页的比例降到阈值以下。

/*
 * Background flush: when over_bground_thresh() reports that the bdi is
 * over the background threshold, run a WB_SYNC_NONE / for_background work
 * with no page limit (nr_pages = LONG_MAX).
 *
 * Returns the result of wb_writeback(), or 0 when below the threshold.
 */
static long wb_check_background_flush(struct bdi_writeback *wb)
{
	long wrote = 0;

	if (over_bground_thresh(wb->bdi)) {
		struct wb_writeback_work work = {
			.reason		= WB_REASON_BACKGROUND,
			.sync_mode	= WB_SYNC_NONE,
			.nr_pages	= LONG_MAX,
			.for_background	= 1,
			.range_cyclic	= 1,
		};

		wrote = wb_writeback(wb, &work);
	}

	return wrote;
}

wb_check_background_flush和wb_check_old_data_flush这两个函数只是设置wb_writeback_work的各项参数，然后调用wb_writeback函数，该函数是Writeback机制中真正执行写回的函数。Writeback机制中的写回磁盘操作都是通过wb_writeback函数实现的，wb_writeback调用与文件系统有关的write函数，执行写回磁盘的操作。

/** Explicit flushing or periodic writeback of "old" data.** Define "old": the first time one of an inode's pages is dirtied, we mark the* dirtying-time in the inode's address_space.  So this periodic writeback code* just walks the superblock inode list, writing back any inodes which are* older than a specific point in time.** Try to run once per dirty_writeback_interval.  But if a writeback event* takes longer than a dirty_writeback_interval interval, then leave a* one-second gap.** older_than_this takes precedence over nr_to_write.  So we'll only write back* all dirty pages if they are all attached to "old" mappings.*/
static long wb_writeback(struct bdi_writeback *wb,struct wb_writeback_work *work)
{unsigned long wb_start = jiffies;long nr_pages = work->nr_pages;unsigned long oldest_jif;struct inode *inode;long progress;oldest_jif = jiffies;work->older_than_this = &oldest_jif;spin_lock(&wb->list_lock);for (;;) {/** Stop writeback when nr_pages has been consumed*/if (work->nr_pages <= 0)break;/** Background writeout and kupdate-style writeback may* run forever. Stop them if there is other work to do* so that e.g. sync can proceed. They'll be restarted* after the other works are all done.*/if ((work->for_background || work->for_kupdate) &&!list_empty(&wb->bdi->work_list))break;/** For background writeout, stop when we are below the* background dirty threshold*/if (work->for_background && !over_bground_thresh(wb->bdi))break;/** Kupdate and background works are special and we want to* include all inodes that need writing. Livelock avoidance is* handled by these works yielding to any other work so we are* safe.*/if (work->for_kupdate) {oldest_jif = jiffies -msecs_to_jiffies(dirty_expire_interval * 10);} else if (work->for_background)oldest_jif = jiffies;trace_writeback_start(wb->bdi, work);if (list_empty(&wb->b_io))queue_io(wb, work);if (work->sb)progress = writeback_sb_inodes(work->sb, wb, work);elseprogress = __writeback_inodes_wb(wb, work);trace_writeback_written(wb->bdi, work);wb_update_bandwidth(wb, wb_start);/** Did we write something? Try for more** Dirty inodes are moved to b_io for writeback in batches.* The completion of the current batch does not necessarily* mean the overall work is done. So we keep looping as long* as made some progress on cleaning pages or inodes.*/if (progress)continue;/** No more inodes for IO, bail*/if (list_empty(&wb->b_more_io))break;/** Nothing written. Wait for some inode to* become available for writeback. 
Otherwise* we'll just busyloop.*/if (!list_empty(&wb->b_more_io))  {trace_writeback_wait(wb->bdi, work);inode = wb_inode(wb->b_more_io.prev);spin_lock(&inode->i_lock);spin_unlock(&wb->list_lock);/* This function drops i_lock... */inode_sleep_on_writeback(inode);spin_lock(&wb->list_lock);}}spin_unlock(&wb->list_lock);return nr_pages - work->nr_pages;
}