[PATCH/RFC] md/multipath: implement I/O balancing

[PATCH/RFC] md/multipath: implement I/O balancing

am 10.06.2011 13:32:11 von Namhyung Kim

Implement basic I/O balancing code (for read/write) for the multipath
personality. The code is based on the RAID1 implementation.

Signed-off-by: Namhyung Kim
---
drivers/md/multipath.c | 70 ++++++++++++++++++++++++++++++++++++++---------
drivers/md/multipath.h | 1 +
2 files changed, 57 insertions(+), 14 deletions(-)

diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 3535c23af288..83c4f5105705 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -30,29 +30,58 @@

#define NR_RESERVED_BUFS 32

-
-static int multipath_map (multipath_conf_t *conf)
+/*
+ * This routine returns the disk from which the requested read should
+ * be done. There is a per-array 'next expected sequential IO' sector
+ * number - if this matches on the next IO then we use the last disk.
+ * There is also a per-disk 'last known head position' sector that is
+ * maintained from IRQ contexts, IO completion handlers update this
+ * position correctly. We pick the disk whose head is closest.
+ *
+ * Note that 'sector' argument is for original bio whereas 'head_position'
+ * is maintained for each rdev so we should take it into account when
+ * calculating the distance.
+ */
+static int multipath_map(multipath_conf_t *conf, sector_t sector)
{
int i, disks = conf->raid_disks;
-
- /*
- * Later we do read balancing on the read side
- * now we use the first available disk.
- */
+ int best_disk;
+ sector_t best_dist;

rcu_read_lock();
+retry:
+ best_disk = -1;
+ best_dist = MaxSector;
+
for (i = 0; i < disks; i++) {
+ int dist;
mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev);
+ sector_t this_sector = sector;
+
if (rdev && test_bit(In_sync, &rdev->flags)) {
- atomic_inc(&rdev->nr_pending);
- rcu_read_unlock();
- return i;
+ this_sector += rdev->data_offset;
+ dist = abs(this_sector - conf->multipaths[i].head_position);
+ if (dist < best_dist) {
+ best_dist = dist;
+ best_disk = i;
+ }
}
}
+
+ if (best_disk == -1) {
+ printk(KERN_ERR "multipath_map(): no more operational IO paths?\n");
+ } else {
+ mdk_rdev_t *rdev;
+
+ rdev = rcu_dereference(conf->multipaths[best_disk].rdev);
+ if (!rdev || !test_bit(In_sync, &rdev->flags))
+ goto retry;
+
+ atomic_inc(&rdev->nr_pending);
+ }
rcu_read_unlock();

- printk(KERN_ERR "multipath_map(): no more operational IO paths?\n");
- return (-1);
+ return best_disk;
}

static void multipath_reschedule_retry (struct multipath_bh *mp_bh)
@@ -82,6 +111,17 @@ static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err)
mempool_free(mp_bh, conf->pool);
}

+/*
+ * Update disk head position estimator based on IRQ completion info.
+ */
+static inline void update_head_pos(int disk, struct multipath_bh *mp_bh)
+{
+ multipath_conf_t *conf = mp_bh->mddev->private;
+
+ conf->multipaths[disk].head_position =
+ mp_bh->bio.bi_sector + (mp_bh->bio.bi_size >> 9);
+}
+
static void multipath_end_request(struct bio *bio, int error)
{
int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -89,6 +129,8 @@ static void multipath_end_request(struct bio *bio, int error)
multipath_conf_t *conf = mp_bh->mddev->private;
mdk_rdev_t *rdev = conf->multipaths[mp_bh->path].rdev;

+ update_head_pos(mp_bh->path, mp_bh);
+
if (uptodate)
multipath_end_bh_io(mp_bh, 0);
else if (!(bio->bi_rw & REQ_RAHEAD)) {
@@ -122,7 +164,7 @@ static int multipath_make_request(mddev_t *mddev, struct bio * bio)
mp_bh->master_bio = bio;
mp_bh->mddev = mddev;

- mp_bh->path = multipath_map(conf);
+ mp_bh->path = multipath_map(conf, bio->bi_sector);
if (mp_bh->path < 0) {
bio_endio(bio, -EIO);
mempool_free(mp_bh, conf->pool);
@@ -356,7 +398,7 @@ static void multipathd (mddev_t *mddev)
bio = &mp_bh->bio;
bio->bi_sector = mp_bh->master_bio->bi_sector;

- if ((mp_bh->path = multipath_map (conf))<0) {
+ if ((mp_bh->path = multipath_map(conf, bio->bi_sector)) < 0) {
printk(KERN_ALERT "multipath: %s: unrecoverable IO read"
" error for block %llu\n",
bdevname(bio->bi_bdev,b),
diff --git a/drivers/md/multipath.h b/drivers/md/multipath.h
index 3c5a45eb5f8a..060fe2aabd97 100644
--- a/drivers/md/multipath.h
+++ b/drivers/md/multipath.h
@@ -3,6 +3,7 @@

struct multipath_info {
mdk_rdev_t *rdev;
+ sector_t head_position;
};

struct multipath_private_data {
--
1.7.5.2

--
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html

Re: [PATCH/RFC] md/multipath: implement I/O balancing

am 14.06.2011 05:59:02 von NeilBrown

On Fri, 10 Jun 2011 20:32:11 +0900 Namhyung Kim wrote:

> Implement basic I/O balancing code (for read/write) for the multipath
> personality. The code is based on the RAID1 implementation.

Thanks, but no thanks.

As far as I am concerned, the md/multipath implementation is deprecated. The
dm-multipath implementation is much more mature and is more widely used and
actually has a sensible design - unlike md/multipath which has always had a
bad design.

I would rip it out and throw it away if I could, but I believe there are
people who use it so doing that is too difficult.

But I will not be adding features to it at all.

Thanks,
NeilBrown

>
> Signed-off-by: Namhyung Kim
> ---
> drivers/md/multipath.c | 70 ++++++++++++++++++++++++++++++++++++++---------
> drivers/md/multipath.h | 1 +
> 2 files changed, 57 insertions(+), 14 deletions(-)
>
> diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
> index 3535c23af288..83c4f5105705 100644
> --- a/drivers/md/multipath.c
> +++ b/drivers/md/multipath.c
> @@ -30,29 +30,58 @@
>
> #define NR_RESERVED_BUFS 32
>
> -
> -static int multipath_map (multipath_conf_t *conf)
> +/*
> + * This routine returns the disk from which the requested read should
> + * be done. There is a per-array 'next expected sequential IO' sector
> + * number - if this matches on the next IO then we use the last disk.
> + * There is also a per-disk 'last known head position' sector that is
> + * maintained from IRQ contexts, IO completion handlers update this
> + * position correctly. We pick the disk whose head is closest.
> + *
> + * Note that 'sector' argument is for original bio whereas 'head_position'
> + * is maintained for each rdev so we should take it into account when
> + * calculating the distance.
> + */
> +static int multipath_map(multipath_conf_t *conf, sector_t sector)
> {
> int i, disks = conf->raid_disks;
> -
> - /*
> - * Later we do read balancing on the read side
> - * now we use the first available disk.
> - */
> + int best_disk;
> + sector_t best_dist;
>
> rcu_read_lock();
> +retry:
> + best_disk = -1;
> + best_dist = MaxSector;
> +
> for (i = 0; i < disks; i++) {
> + int dist;
> mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev);
> + sector_t this_sector = sector;
> +
> if (rdev && test_bit(In_sync, &rdev->flags)) {
> - atomic_inc(&rdev->nr_pending);
> - rcu_read_unlock();
> - return i;
> + this_sector += rdev->data_offset;
> + dist = abs(this_sector - conf->multipaths[i].head_position);
> + if (dist < best_dist) {
> + best_dist = dist;
> + best_disk = i;
> + }
> }
> }
> +
> + if (best_disk == -1) {
> + printk(KERN_ERR "multipath_map(): no more operational IO paths?\n");
> + } else {
> + mdk_rdev_t *rdev;
> +
> + rdev = rcu_dereference(conf->multipaths[best_disk].rdev);
> + if (!rdev || !test_bit(In_sync, &rdev->flags))
> + goto retry;
> +
> + atomic_inc(&rdev->nr_pending);
> + }
> rcu_read_unlock();
>
> - printk(KERN_ERR "multipath_map(): no more operational IO paths?\n");
> - return (-1);
> + return best_disk;
> }
>
> static void multipath_reschedule_retry (struct multipath_bh *mp_bh)
> @@ -82,6 +111,17 @@ static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err)
> mempool_free(mp_bh, conf->pool);
> }
>
> +/*
> + * Update disk head position estimator based on IRQ completion info.
> + */
> +static inline void update_head_pos(int disk, struct multipath_bh *mp_bh)
> +{
> + multipath_conf_t *conf = mp_bh->mddev->private;
> +
> + conf->multipaths[disk].head_position =
> + mp_bh->bio.bi_sector + (mp_bh->bio.bi_size >> 9);
> +}
> +
> static void multipath_end_request(struct bio *bio, int error)
> {
> int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
> @@ -89,6 +129,8 @@ static void multipath_end_request(struct bio *bio, int error)
> multipath_conf_t *conf = mp_bh->mddev->private;
> mdk_rdev_t *rdev = conf->multipaths[mp_bh->path].rdev;
>
> + update_head_pos(mp_bh->path, mp_bh);
> +
> if (uptodate)
> multipath_end_bh_io(mp_bh, 0);
> else if (!(bio->bi_rw & REQ_RAHEAD)) {
> @@ -122,7 +164,7 @@ static int multipath_make_request(mddev_t *mddev, struct bio * bio)
> mp_bh->master_bio = bio;
> mp_bh->mddev = mddev;
>
> - mp_bh->path = multipath_map(conf);
> + mp_bh->path = multipath_map(conf, bio->bi_sector);
> if (mp_bh->path < 0) {
> bio_endio(bio, -EIO);
> mempool_free(mp_bh, conf->pool);
> @@ -356,7 +398,7 @@ static void multipathd (mddev_t *mddev)
> bio = &mp_bh->bio;
> bio->bi_sector = mp_bh->master_bio->bi_sector;
>
> - if ((mp_bh->path = multipath_map (conf))<0) {
> + if ((mp_bh->path = multipath_map(conf, bio->bi_sector)) < 0) {
> printk(KERN_ALERT "multipath: %s: unrecoverable IO read"
> " error for block %llu\n",
> bdevname(bio->bi_bdev,b),
> diff --git a/drivers/md/multipath.h b/drivers/md/multipath.h
> index 3c5a45eb5f8a..060fe2aabd97 100644
> --- a/drivers/md/multipath.h
> +++ b/drivers/md/multipath.h
> @@ -3,6 +3,7 @@
>
> struct multipath_info {
> mdk_rdev_t *rdev;
> + sector_t head_position;
> };
>
> struct multipath_private_data {

--
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html