writeback, cgroup: support switching multiple inodes at once
Currently only a single inode can be switched to another writeback structure at once. That means that to switch an inode, a separate inode_switch_wbs_context structure must be allocated, and a separate rcu callback and work must be scheduled. It's fine for the existing ad-hoc switching, which is not happening that often, but sub-optimal for the massive switching required in order to release a writeback structure. To prepare for it, let's add support for switching multiple inodes at once. Instead of containing a single inode pointer, inode_switch_wbs_context will contain a NULL-terminated array of inode pointers. inode_do_switch_wbs() will be called for each inode. To optimize the locking, bdi->wb_switch_rwsem and old_wb's and new_wb's list_locks will be acquired and released only once for all inodes. wb_wakeup() will also be called only once. Instead of calling wb_put(old_wb) after each successful switch, wb_put_many() is introduced and used. Link: https://lkml.kernel.org/r/20210608230225.2078447-8-guro@fb.com Signed-off-by: Roman Gushchin <guro@fb.com> Acked-by: Tejun Heo <tj@kernel.org> Reviewed-by: Jan Kara <jack@suse.cz> Acked-by: Dennis Zhou <dennis@kernel.org> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Cc: Dave Chinner <dchinner@redhat.com> Cc: Jan Kara <jack@suse.com> Cc: Jens Axboe <axboe@kernel.dk> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
committed by
Linus Torvalds
parent
72d4512e9c
commit
f5fbe6b7ad
@@ -335,10 +335,18 @@ static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
|
|||||||
}
|
}
|
||||||
|
|
||||||
struct inode_switch_wbs_context {
|
struct inode_switch_wbs_context {
|
||||||
struct inode *inode;
|
|
||||||
struct bdi_writeback *new_wb;
|
|
||||||
|
|
||||||
struct rcu_work work;
|
struct rcu_work work;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Multiple inodes can be switched at once. The switching procedure
|
||||||
|
* consists of two parts, separated by a RCU grace period. To make
|
||||||
|
* sure that the second part is executed for each inode gone through
|
||||||
|
* the first part, all inode pointers are placed into a NULL-terminated
|
||||||
|
* array embedded into struct inode_switch_wbs_context. Otherwise
|
||||||
|
* an inode could be left in a non-consistent state.
|
||||||
|
*/
|
||||||
|
struct bdi_writeback *new_wb;
|
||||||
|
struct inode *inodes[];
|
||||||
};
|
};
|
||||||
|
|
||||||
static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi)
|
static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi)
|
||||||
@@ -351,39 +359,15 @@ static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi)
|
|||||||
up_write(&bdi->wb_switch_rwsem);
|
up_write(&bdi->wb_switch_rwsem);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void inode_do_switch_wbs(struct inode *inode,
|
static bool inode_do_switch_wbs(struct inode *inode,
|
||||||
|
struct bdi_writeback *old_wb,
|
||||||
struct bdi_writeback *new_wb)
|
struct bdi_writeback *new_wb)
|
||||||
{
|
{
|
||||||
struct backing_dev_info *bdi = inode_to_bdi(inode);
|
|
||||||
struct address_space *mapping = inode->i_mapping;
|
struct address_space *mapping = inode->i_mapping;
|
||||||
struct bdi_writeback *old_wb = inode->i_wb;
|
|
||||||
XA_STATE(xas, &mapping->i_pages, 0);
|
XA_STATE(xas, &mapping->i_pages, 0);
|
||||||
struct page *page;
|
struct page *page;
|
||||||
bool switched = false;
|
bool switched = false;
|
||||||
|
|
||||||
/*
|
|
||||||
* If @inode switches cgwb membership while sync_inodes_sb() is
|
|
||||||
* being issued, sync_inodes_sb() might miss it. Synchronize.
|
|
||||||
*/
|
|
||||||
down_read(&bdi->wb_switch_rwsem);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* By the time control reaches here, RCU grace period has passed
|
|
||||||
* since I_WB_SWITCH assertion and all wb stat update transactions
|
|
||||||
* between unlocked_inode_to_wb_begin/end() are guaranteed to be
|
|
||||||
* synchronizing against the i_pages lock.
|
|
||||||
*
|
|
||||||
* Grabbing old_wb->list_lock, inode->i_lock and the i_pages lock
|
|
||||||
* gives us exclusion against all wb related operations on @inode
|
|
||||||
* including IO list manipulations and stat updates.
|
|
||||||
*/
|
|
||||||
if (old_wb < new_wb) {
|
|
||||||
spin_lock(&old_wb->list_lock);
|
|
||||||
spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
|
|
||||||
} else {
|
|
||||||
spin_lock(&new_wb->list_lock);
|
|
||||||
spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
|
|
||||||
}
|
|
||||||
spin_lock(&inode->i_lock);
|
spin_lock(&inode->i_lock);
|
||||||
xa_lock_irq(&mapping->i_pages);
|
xa_lock_irq(&mapping->i_pages);
|
||||||
|
|
||||||
@@ -458,25 +442,63 @@ skip_switch:
|
|||||||
|
|
||||||
xa_unlock_irq(&mapping->i_pages);
|
xa_unlock_irq(&mapping->i_pages);
|
||||||
spin_unlock(&inode->i_lock);
|
spin_unlock(&inode->i_lock);
|
||||||
spin_unlock(&new_wb->list_lock);
|
|
||||||
spin_unlock(&old_wb->list_lock);
|
|
||||||
|
|
||||||
up_read(&bdi->wb_switch_rwsem);
|
return switched;
|
||||||
|
|
||||||
if (switched) {
|
|
||||||
wb_wakeup(new_wb);
|
|
||||||
wb_put(old_wb);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void inode_switch_wbs_work_fn(struct work_struct *work)
|
static void inode_switch_wbs_work_fn(struct work_struct *work)
|
||||||
{
|
{
|
||||||
struct inode_switch_wbs_context *isw =
|
struct inode_switch_wbs_context *isw =
|
||||||
container_of(to_rcu_work(work), struct inode_switch_wbs_context, work);
|
container_of(to_rcu_work(work), struct inode_switch_wbs_context, work);
|
||||||
|
struct backing_dev_info *bdi = inode_to_bdi(isw->inodes[0]);
|
||||||
|
struct bdi_writeback *old_wb = isw->inodes[0]->i_wb;
|
||||||
|
struct bdi_writeback *new_wb = isw->new_wb;
|
||||||
|
unsigned long nr_switched = 0;
|
||||||
|
struct inode **inodep;
|
||||||
|
|
||||||
inode_do_switch_wbs(isw->inode, isw->new_wb);
|
/*
|
||||||
wb_put(isw->new_wb);
|
* If @inode switches cgwb membership while sync_inodes_sb() is
|
||||||
iput(isw->inode);
|
* being issued, sync_inodes_sb() might miss it. Synchronize.
|
||||||
|
*/
|
||||||
|
down_read(&bdi->wb_switch_rwsem);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* By the time control reaches here, RCU grace period has passed
|
||||||
|
* since I_WB_SWITCH assertion and all wb stat update transactions
|
||||||
|
* between unlocked_inode_to_wb_begin/end() are guaranteed to be
|
||||||
|
* synchronizing against the i_pages lock.
|
||||||
|
*
|
||||||
|
* Grabbing old_wb->list_lock, inode->i_lock and the i_pages lock
|
||||||
|
* gives us exclusion against all wb related operations on @inode
|
||||||
|
* including IO list manipulations and stat updates.
|
||||||
|
*/
|
||||||
|
if (old_wb < new_wb) {
|
||||||
|
spin_lock(&old_wb->list_lock);
|
||||||
|
spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
|
||||||
|
} else {
|
||||||
|
spin_lock(&new_wb->list_lock);
|
||||||
|
spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (inodep = isw->inodes; *inodep; inodep++) {
|
||||||
|
WARN_ON_ONCE((*inodep)->i_wb != old_wb);
|
||||||
|
if (inode_do_switch_wbs(*inodep, old_wb, new_wb))
|
||||||
|
nr_switched++;
|
||||||
|
}
|
||||||
|
|
||||||
|
spin_unlock(&new_wb->list_lock);
|
||||||
|
spin_unlock(&old_wb->list_lock);
|
||||||
|
|
||||||
|
up_read(&bdi->wb_switch_rwsem);
|
||||||
|
|
||||||
|
if (nr_switched) {
|
||||||
|
wb_wakeup(new_wb);
|
||||||
|
wb_put_many(old_wb, nr_switched);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (inodep = isw->inodes; *inodep; inodep++)
|
||||||
|
iput(*inodep);
|
||||||
|
wb_put(new_wb);
|
||||||
kfree(isw);
|
kfree(isw);
|
||||||
atomic_dec(&isw_nr_in_flight);
|
atomic_dec(&isw_nr_in_flight);
|
||||||
}
|
}
|
||||||
@@ -503,7 +525,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
|
|||||||
if (atomic_read(&isw_nr_in_flight) > WB_FRN_MAX_IN_FLIGHT)
|
if (atomic_read(&isw_nr_in_flight) > WB_FRN_MAX_IN_FLIGHT)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
isw = kzalloc(sizeof(*isw), GFP_ATOMIC);
|
isw = kzalloc(sizeof(*isw) + 2 * sizeof(struct inode *), GFP_ATOMIC);
|
||||||
if (!isw)
|
if (!isw)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
@@ -530,7 +552,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
|
|||||||
__iget(inode);
|
__iget(inode);
|
||||||
spin_unlock(&inode->i_lock);
|
spin_unlock(&inode->i_lock);
|
||||||
|
|
||||||
isw->inode = inode;
|
isw->inodes[0] = inode;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* In addition to synchronizing among switchers, I_WB_SWITCH tells
|
* In addition to synchronizing among switchers, I_WB_SWITCH tells
|
||||||
|
|||||||
@@ -240,8 +240,9 @@ static inline void wb_get(struct bdi_writeback *wb)
|
|||||||
/**
|
/**
|
||||||
* wb_put - decrement a wb's refcount
|
* wb_put - decrement a wb's refcount
|
||||||
* @wb: bdi_writeback to put
|
* @wb: bdi_writeback to put
|
||||||
|
* @nr: number of references to put
|
||||||
*/
|
*/
|
||||||
static inline void wb_put(struct bdi_writeback *wb)
|
static inline void wb_put_many(struct bdi_writeback *wb, unsigned long nr)
|
||||||
{
|
{
|
||||||
if (WARN_ON_ONCE(!wb->bdi)) {
|
if (WARN_ON_ONCE(!wb->bdi)) {
|
||||||
/*
|
/*
|
||||||
@@ -252,7 +253,16 @@ static inline void wb_put(struct bdi_writeback *wb)
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (wb != &wb->bdi->wb)
|
if (wb != &wb->bdi->wb)
|
||||||
percpu_ref_put(&wb->refcnt);
|
percpu_ref_put_many(&wb->refcnt, nr);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* wb_put - decrement a wb's refcount
|
||||||
|
* @wb: bdi_writeback to put
|
||||||
|
*/
|
||||||
|
static inline void wb_put(struct bdi_writeback *wb)
|
||||||
|
{
|
||||||
|
wb_put_many(wb, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -281,6 +291,10 @@ static inline void wb_put(struct bdi_writeback *wb)
|
|||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline void wb_put_many(struct bdi_writeback *wb, unsigned long nr)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
static inline bool wb_dying(struct bdi_writeback *wb)
|
static inline bool wb_dying(struct bdi_writeback *wb)
|
||||||
{
|
{
|
||||||
return false;
|
return false;
|
||||||
|
|||||||
Reference in New Issue
Block a user