[PATCH 3/5] dm: relax ordering of bio-based flush implementation

on 30.08.2010 11:58:14 by Tejun Heo

Unlike REQ_HARDBARRIER, REQ_FLUSH/FUA doesn't mandate any ordering
against other bio's. This patch relaxes ordering around flushes.

* A flush bio is no longer deferred to workqueue directly. It's
processed like other bio's but __split_and_process_bio() uses
md->flush_bio as the clone source. md->flush_bio is initialized to
empty flush during md initialization and shared for all flushes.

* When dec_pending() detects that a flush has completed, it checks
whether the original bio has data. If so, the bio is queued to the
deferred list w/ REQ_FLUSH cleared; otherwise, it's completed.

* As flush sequencing is handled in the usual issue/completion path,
dm_wq_work() no longer needs to handle flushes differently. Now its
only responsibility is re-issuing deferred bio's the same way as
_dm_request() would. REQ_FLUSH handling logic including
process_flush() is dropped.

* There's no reason for queue_io() and dm_wq_work() to write-lock
md->io_lock. queue_io() now only uses md->deferred_lock and
dm_wq_work() read-locks md->io_lock.

* bio's no longer need to be queued on the deferred list while a flush
is in progress, making DMF_QUEUE_IO_TO_THREAD unnecessary. Drop it.

This avoids stalling the device during flushes and simplifies the
implementation.
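
In short, the completion branch this adds to dec_pending() reduces to
the following (condensed from the diff below):

	if (!(bio->bi_rw & REQ_FLUSH) || !bio->bi_size) {
		/* normal bio or empty flush: complete it */
		trace_block_bio_complete(md->queue, bio);
		bio_endio(bio, io_error);
	} else {
		/* preflush done for a flush with data: reissue without REQ_FLUSH */
		bio->bi_rw &= ~REQ_FLUSH;
		queue_io(md, bio);	/* picked up again by dm_wq_work() */
	}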

Signed-off-by: Tejun Heo
---
drivers/md/dm.c | 157 ++++++++++++++++---------------------------------------
1 files changed, 45 insertions(+), 112 deletions(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 32e6622..e67c519 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -110,7 +110,6 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
#define DMF_FREEING 3
#define DMF_DELETING 4
#define DMF_NOFLUSH_SUSPENDING 5
-#define DMF_QUEUE_IO_TO_THREAD 6

/*
* Work processed by per-device workqueue.
@@ -144,11 +143,6 @@ struct mapped_device {
spinlock_t deferred_lock;

/*
- * An error from the flush request currently being processed.
- */
- int flush_error;
-
- /*
* Protect barrier_error from concurrent endio processing
* in request-based dm.
*/
@@ -529,16 +523,10 @@ static void end_io_acct(struct dm_io *io)
*/
static void queue_io(struct mapped_device *md, struct bio *bio)
{
- down_write(&md->io_lock);
-
spin_lock_irq(&md->deferred_lock);
bio_list_add(&md->deferred, bio);
spin_unlock_irq(&md->deferred_lock);
-
- if (!test_and_set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags))
- queue_work(md->wq, &md->work);
-
- up_write(&md->io_lock);
+ queue_work(md->wq, &md->work);
}

/*
@@ -626,11 +614,9 @@ static void dec_pending(struct dm_io *io, int error)
* Target requested pushing back the I/O.
*/
spin_lock_irqsave(&md->deferred_lock, flags);
- if (__noflush_suspending(md)) {
- if (!(io->bio->bi_rw & REQ_FLUSH))
- bio_list_add_head(&md->deferred,
- io->bio);
- } else
+ if (__noflush_suspending(md))
+ bio_list_add_head(&md->deferred, io->bio);
+ else
/* noflush suspend was interrupted. */
io->error = -EIO;
spin_unlock_irqrestore(&md->deferred_lock, flags);
@@ -638,26 +624,22 @@ static void dec_pending(struct dm_io *io, int error)

io_error = io->error;
bio = io->bio;
+ end_io_acct(io);
+ free_io(md, io);
+
+ if (io_error == DM_ENDIO_REQUEUE)
+ return;

- if (bio->bi_rw & REQ_FLUSH) {
+ if (!(bio->bi_rw & REQ_FLUSH) || !bio->bi_size) {
+ trace_block_bio_complete(md->queue, bio);
+ bio_endio(bio, io_error);
+ } else {
/*
- * There can be just one flush request so we use
- * a per-device variable for error reporting.
- * Note that you can't touch the bio after end_io_acct
+ * Preflush done for flush with data, reissue
+ * without REQ_FLUSH.
*/
- if (!md->flush_error)
- md->flush_error = io_error;
- end_io_acct(io);
- free_io(md, io);
- } else {
- end_io_acct(io);
- free_io(md, io);
-
- if (io_error != DM_ENDIO_REQUEUE) {
- trace_block_bio_complete(md->queue, bio);
-
- bio_endio(bio, io_error);
- }
+ bio->bi_rw &= ~REQ_FLUSH;
+ queue_io(md, bio);
}
}
}
@@ -1369,21 +1351,17 @@ static int __clone_and_map(struct clone_info *ci)
*/
static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
{
+ bool is_flush = bio->bi_rw & REQ_FLUSH;
struct clone_info ci;
int error = 0;

ci.map = dm_get_live_table(md);
if (unlikely(!ci.map)) {
- if (!(bio->bi_rw & REQ_FLUSH))
- bio_io_error(bio);
- else
- if (!md->flush_error)
- md->flush_error = -EIO;
+ bio_io_error(bio);
return;
}

ci.md = md;
- ci.bio = bio;
ci.io = alloc_io(md);
ci.io->error = 0;
atomic_set(&ci.io->io_count, 1);
@@ -1391,18 +1369,19 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
ci.io->md = md;
spin_lock_init(&ci.io->endio_lock);
ci.sector = bio->bi_sector;
- if (!(bio->bi_rw & REQ_FLUSH))
+ ci.idx = bio->bi_idx;
+
+ if (!is_flush) {
+ ci.bio = bio;
ci.sector_count = bio_sectors(bio);
- else {
- /* all FLUSH bio's reaching here should be empty */
- WARN_ON_ONCE(bio_has_data(bio));
+ } else {
+ ci.bio = &ci.md->flush_bio;
ci.sector_count = 1;
}
- ci.idx = bio->bi_idx;

start_io_acct(ci.io);
while (ci.sector_count && !error) {
- if (!(bio->bi_rw & REQ_FLUSH))
+ if (!is_flush)
error = __clone_and_map(&ci);
else
error = __clone_and_map_flush(&ci);
@@ -1490,22 +1469,14 @@ static int _dm_request(struct request_queue *q, struct bio *bio)
part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio));
part_stat_unlock();

- /*
- * If we're suspended or the thread is processing flushes
- * we have to queue this io for later.
- */
- if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) ||
- (bio->bi_rw & REQ_FLUSH)) {
+ /* if we're suspended, we have to queue this io for later */
+ if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
up_read(&md->io_lock);

- if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) &&
- bio_rw(bio) == READA) {
+ if (bio_rw(bio) != READA)
+ queue_io(md, bio);
+ else
bio_io_error(bio);
- return 0;
- }
-
- queue_io(md, bio);
-
return 0;
}

@@ -2015,6 +1986,10 @@ static struct mapped_device *alloc_dev(int minor)
if (!md->bdev)
goto bad_bdev;

+ bio_init(&md->flush_bio);
+ md->flush_bio.bi_bdev = md->bdev;
+ md->flush_bio.bi_rw = WRITE_FLUSH;
+
/* Populate the mapping, nobody knows we exist yet */
spin_lock(&_minor_lock);
old_md = idr_replace(&_minor_idr, md, minor);
@@ -2407,37 +2382,6 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
return r;
}

-static void process_flush(struct mapped_device *md, struct bio *bio)
-{
- md->flush_error = 0;
-
- /* handle REQ_FLUSH */
- dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
-
- bio_init(&md->flush_bio);
- md->flush_bio.bi_bdev = md->bdev;
- md->flush_bio.bi_rw = WRITE_FLUSH;
- __split_and_process_bio(md, &md->flush_bio);
-
- dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
-
- /* if it's an empty flush or the preflush failed, we're done */
- if (!bio_has_data(bio) || md->flush_error) {
- if (md->flush_error != DM_ENDIO_REQUEUE)
- bio_endio(bio, md->flush_error);
- else {
- spin_lock_irq(&md->deferred_lock);
- bio_list_add_head(&md->deferred, bio);
- spin_unlock_irq(&md->deferred_lock);
- }
- return;
- }
-
- /* issue data + REQ_FUA */
- bio->bi_rw &= ~REQ_FLUSH;
- __split_and_process_bio(md, bio);
-}
-
/*
* Process the deferred bios
*/
@@ -2447,33 +2391,27 @@ static void dm_wq_work(struct work_struct *work)
work);
struct bio *c;

- down_write(&md->io_lock);
+ down_read(&md->io_lock);

while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
spin_lock_irq(&md->deferred_lock);
c = bio_list_pop(&md->deferred);
spin_unlock_irq(&md->deferred_lock);

- if (!c) {
- clear_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
+ if (!c)
break;
- }

- up_write(&md->io_lock);
+ up_read(&md->io_lock);

if (dm_request_based(md))
generic_make_request(c);
- else {
- if (c->bi_rw & REQ_FLUSH)
- process_flush(md, c);
- else
- __split_and_process_bio(md, c);
- }
+ else
+ __split_and_process_bio(md, c);

- down_write(&md->io_lock);
+ down_read(&md->io_lock);
}

- up_write(&md->io_lock);
+ up_read(&md->io_lock);
}

static void dm_queue_flush(struct mapped_device *md)
@@ -2672,17 +2610,12 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
*
* To get all processes out of __split_and_process_bio in dm_request,
* we take the write lock. To prevent any process from reentering
- * __split_and_process_bio from dm_request, we set
- * DMF_QUEUE_IO_TO_THREAD.
- *
- * To quiesce the thread (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND
- * and call flush_workqueue(md->wq). flush_workqueue will wait until
- * dm_wq_work exits and DMF_BLOCK_IO_FOR_SUSPEND will prevent any
- * further calls to __split_and_process_bio from dm_wq_work.
+ * __split_and_process_bio from dm_request and quiesce the thread
+ * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call
+ * flush_workqueue(md->wq).
*/
down_write(&md->io_lock);
set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
- set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
up_write(&md->io_lock);

/*
--
1.7.1


Re: [PATCH 3/5] dm: relax ordering of bio-based flush implementation

on 01.09.2010 15:51:21 by Mike Snitzer

On Mon, Aug 30 2010 at 5:58am -0400,
Tejun Heo wrote:

> Unlike REQ_HARDBARRIER, REQ_FLUSH/FUA doesn't mandate any ordering
> against other bio's. This patch relaxes ordering around flushes.
>
> * A flush bio is no longer deferred to workqueue directly. It's
> processed like other bio's but __split_and_process_bio() uses
> md->flush_bio as the clone source. md->flush_bio is initialized to
> empty flush during md initialization and shared for all flushes.
>
> * When dec_pending() detects that a flush has completed, it checks
> whether the original bio has data. If so, the bio is queued to the
> deferred list w/ REQ_FLUSH cleared; otherwise, it's completed.
>
> * As flush sequencing is handled in the usual issue/completion path,
> dm_wq_work() no longer needs to handle flushes differently. Now its
> only responsibility is re-issuing deferred bio's the same way as
> _dm_request() would. REQ_FLUSH handling logic including
> process_flush() is dropped.
>
> * There's no reason for queue_io() and dm_wq_work() to write-lock
> md->io_lock. queue_io() now only uses md->deferred_lock and
> dm_wq_work() read-locks md->io_lock.
>
> * bio's no longer need to be queued on the deferred list while a flush
> is in progress, making DMF_QUEUE_IO_TO_THREAD unnecessary. Drop it.
>
> This avoids stalling the device during flushes and simplifies the
> implementation.
>
> Signed-off-by: Tejun Heo

Looks good overall.

> @@ -144,11 +143,6 @@ struct mapped_device {
> spinlock_t deferred_lock;
>
> /*
> - * An error from the flush request currently being processed.
> - */
> - int flush_error;
> -
> - /*
> * Protect barrier_error from concurrent endio processing
> * in request-based dm.
> */

Could you please document why it is OK to remove 'flush_error' in the
patch header? The -EOPNOTSUPP handling removal (done in patch 2)
obviously helps enable this but it is not clear how the
'num_flush_requests' flushes that __clone_and_map_flush() generates do
not need explicit DM error handling.

Other than that:

Acked-by: Mike Snitzer

Re: [PATCH 3/5] dm: relax ordering of bio-based flush implementation

on 01.09.2010 15:56:57 by Tejun Heo

On 09/01/2010 03:51 PM, Mike Snitzer wrote:
> Could you please document why it is OK to remove 'flush_error' in the
> patch header? The -EOPNOTSUPP handling removal (done in patch 2)
> obviously helps enable this but it is not clear how the
> 'num_flush_requests' flushes that __clone_and_map_flush() generates do
> not need explicit DM error handling.

Sure, I will. It's because flushes now use the same error handling path
in dec_pending() that all other bio's use. The flush_error thing was
there because flushes got executed/completed in a separate code path to
begin with. With that special path gone, there's no need for the
flush_error path either.
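
For reference, the shared path is roughly the following (abridged from
dec_pending(); locking details omitted):

	/* errors from any clone, flush or not, land in the per-io error field */
	if (error && !(io->error > 0 && __noflush_suspending(md)))
		io->error = error;

	if (atomic_dec_and_test(&io->io_count)) {
		/* complete or reissue the original bio as described in the
		 * patch; no md->flush_error needed */
	}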

Thanks.

--
tejun

Re: [PATCH 3/5] dm: relax ordering of bio-based flush implementation

on 03.09.2010 08:04:36 by Kiyoshi Ueda

Hi Tejun,

On 08/30/2010 06:58 PM +0900, Tejun Heo wrote:
> Unlike REQ_HARDBARRIER, REQ_FLUSH/FUA doesn't mandate any ordering
> against other bio's. This patch relaxes ordering around flushes.
....
> * When dec_pending() detects that a flush has completed, it checks
> whether the original bio has data. If so, the bio is queued to the
> deferred list w/ REQ_FLUSH cleared; otherwise, it's completed.
....
> @@ -529,16 +523,10 @@ static void end_io_acct(struct dm_io *io)
> */
> static void queue_io(struct mapped_device *md, struct bio *bio)
> {
> - down_write(&md->io_lock);
> -
> spin_lock_irq(&md->deferred_lock);
> bio_list_add(&md->deferred, bio);
> spin_unlock_irq(&md->deferred_lock);
> -
> - if (!test_and_set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags))
> - queue_work(md->wq, &md->work);
> -
> - up_write(&md->io_lock);
> + queue_work(md->wq, &md->work);
....
> @@ -638,26 +624,22 @@ static void dec_pending(struct dm_io *io, int error)
....
> - } else {
> - end_io_acct(io);
> - free_io(md, io);
> -
> - if (io_error != DM_ENDIO_REQUEUE) {
> - trace_block_bio_complete(md->queue, bio);
> -
> - bio_endio(bio, io_error);
> - }
> + bio->bi_rw &= ~REQ_FLUSH;
> + queue_io(md, bio);

dec_pending() is called during I/O completion, where the caller may
have interrupts disabled.
So if queue_io() is used inside dec_pending(), the spinlock must be
taken/released with irqsave/irqrestore, as in the patch below.
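
As a reminder of the primitives involved (generic spinlock API
semantics, not code from the patch):

	spin_lock_irq(&lock);			/* disables local interrupts */
	spin_unlock_irq(&lock);			/* unconditionally re-enables them, even if
						 * the caller had interrupts disabled */

	spin_lock_irqsave(&lock, flags);	/* saves the current interrupt state */
	spin_unlock_irqrestore(&lock, flags);	/* restores exactly that state */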

BTW, lockdep detects the issue and prints a warning like the one below.
It may break the underlying drivers.

=================================
[ INFO: inconsistent lock state ]
2.6.36-rc2+ #2
---------------------------------
inconsistent {IN-HARDIRQ-W} -> {HARDIRQ-ON-W} usage.
kworker/0:1/0 [HC0[0]:SC1[1]:HE1:SE0] takes:
(&(&q->__queue_lock)->rlock){?.-...}, at: [] blk_end_bidi_request+0x44/0x80
{IN-HARDIRQ-W} state was registered at:
[] __lock_acquire+0x8c6/0xb30
[] lock_acquire+0xa0/0x120
[] _raw_spin_lock_irqsave+0x4e/0x70
[] ata_qc_schedule_eh+0x5a/0xa0 [libata]
[] ata_qc_complete+0x147/0x1f0 [libata]
[] ata_hsm_qc_complete+0xc2/0x140 [libata]
[] ata_sff_hsm_move+0x1d5/0x700 [libata]
[] __ata_sff_port_intr+0xb3/0x100 [libata]
[] ata_bmdma_port_intr+0x3f/0x120 [libata]
[] ata_bmdma_interrupt+0x195/0x1e0 [libata]
[] handle_IRQ_event+0x54/0x170
[] handle_edge_irq+0xc8/0x170
[] handle_irq+0x4b/0xa0
[] do_IRQ+0x6f/0xf0
[] ret_from_intr+0x0/0x16
[] _raw_spin_unlock+0x23/0x40
[] sys_dup3+0x122/0x1a0
[] sys_dup2+0x23/0xb0
[] system_call_fastpath+0x16/0x1b
irq event stamp: 14660913
hardirqs last enabled at (14660912): [] _raw_spin_unlock_irqrestore+0x65/0x80
hardirqs last disabled at (14660913): [] _raw_spin_lock_irqsave+0x2e/0x70
softirqs last enabled at (14660874): [] __do_softirq+0x14e/0x210
softirqs last disabled at (14660879): [] call_softirq+0x1c/0x50

other info that might help us debug this:
1 lock held by kworker/0:1/0:
#0: (&(&q->__queue_lock)->rlock){?.-...}, at: [] blk_end_bidi_request+0x44/0x80

stack backtrace:
Pid: 0, comm: kworker/0:1 Not tainted 2.6.36-rc2+ #2
Call Trace:
[] print_usage_bug+0x1a6/0x1f0
[] mark_lock+0x661/0x690
[] ? check_usage_backwards+0x0/0xf0
[] mark_held_locks+0x60/0x80
[] ? _raw_spin_unlock_irq+0x30/0x40
[] trace_hardirqs_on_caller+0x83/0x1a0
[] trace_hardirqs_on+0xd/0x10
[] _raw_spin_unlock_irq+0x30/0x40
[] ? queue_io+0x2e/0x90 [dm_mod]
[] queue_io+0x57/0x90 [dm_mod]
[] dec_pending+0x22a/0x320 [dm_mod]
[] ? dec_pending+0x55/0x320 [dm_mod]
[] clone_endio+0xad/0xc0 [dm_mod]
[] bio_endio+0x1d/0x40
[] req_bio_endio+0x81/0xf0
[] blk_update_request+0x23d/0x460
[] ? blk_update_request+0x116/0x460
[] blk_update_bidi_request+0x27/0x80
[] __blk_end_bidi_request+0x20/0x50
[] __blk_end_request_all+0x1f/0x40
[] blk_flush_complete_seq+0x140/0x1a0
[] pre_flush_end_io+0x39/0x50
[] blk_finish_request+0x85/0x290
[] blk_end_bidi_request+0x52/0x80
[] blk_end_request_all+0x1f/0x40
[] dm_softirq_done+0xad/0x120 [dm_mod]
[] blk_done_softirq+0x86/0xa0
[] __do_softirq+0xd6/0x210
[] call_softirq+0x1c/0x50
[] do_softirq+0x95/0xd0
[] irq_exit+0x4d/0x60
[] do_IRQ+0x78/0xf0
[] ret_from_intr+0x0/0x16
[] ? mwait_idle+0x79/0xe0
[] ? mwait_idle+0x70/0xe0
[] cpu_idle+0x66/0xe0
[] ? start_secondary+0x181/0x1f0
[] start_secondary+0x18f/0x1f0

Thanks,
Kiyoshi Ueda


Now queue_io() is called from dec_pending(), which may be called with
interrupts disabled.
So queue_io() must not enable interrupts unconditionally and must
save/restore the current interrupt status.

Signed-off-by: Kiyoshi Ueda
Signed-off-by: Jun'ichi Nomura
---
drivers/md/dm.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)

Index: misc/drivers/md/dm.c
===================================================================
--- misc.orig/drivers/md/dm.c
+++ misc/drivers/md/dm.c
@@ -512,9 +512,11 @@ static void end_io_acct(struct dm_io *io
*/
static void queue_io(struct mapped_device *md, struct bio *bio)
{
- spin_lock_irq(&md->deferred_lock);
+ unsigned long flags;
+
+ spin_lock_irqsave(&md->deferred_lock, flags);
bio_list_add(&md->deferred, bio);
- spin_unlock_irq(&md->deferred_lock);
+ spin_unlock_irqrestore(&md->deferred_lock, flags);
queue_work(md->wq, &md->work);
}


Re: [PATCH 3/5] dm: relax ordering of bio-based flush implementation

on 03.09.2010 11:42:53 by Tejun Heo

Hello,

On 09/03/2010 08:04 AM, Kiyoshi Ueda wrote:
> Now queue_io() is called from dec_pending(), which may be called with
> interrupts disabled.
> So queue_io() must not enable interrupts unconditionally and must
> save/restore the current interrupts status.
>
> Signed-off-by: Kiyoshi Ueda
> Signed-off-by: Jun'ichi Nomura

Patch included in the series. Thanks a lot!

--
tejun