[md PATCH 00/13] md patches for the current merge window

[md PATCH 00/13] md patches for the current merge window

am 11.01.2011 23:12:46 von NeilBrown

The following assortment of patches for md are planned to be submitted
for the current merge window, probably in a couple of days. Any
comments most welcome.

Some are bug fixes, some of minor changes to prepare for future work,
nothing is really significant

They are all currently in my for-next branch
git://neil.brown.name/md for-next
and so should appear in -next today or tomorrow.

Thanks,
NeilBrown

---

Joe Perches (1):
md: Fix single printks with multiple KERN_s

Jonathan Brassow (4):
md: separate meta and data devs
md-new-param-to_sync_page_io
md-new-param-to-calc_dev_sboffset
md/raid5: use sysfs_notify_dirent_safe to avoid NULL pointer

NeilBrown (7):
md: allow suspend_lo and suspend_hi to decrease as well as increa=
se.
md: Don't let implementation detail of curr_resync leak out throu=
gh sysfs.
md: Be more careful about clearing flags bit in ->recovery
md: md_stop_writes requires mddev_lock.
md: Ensure no IO request to get md device before it is properly i=
nitialised.
md: fix regression resulting in delays in clearing bits in a bitm=
ap
md: fix regression with re-adding devices to arrays with no metad=
ata

Rémi Rérolle (1):
md: fix sync_completed reporting for very large drives (>2TB)


drivers/md/bitmap.c | 12 ++-
drivers/md/md.c | 187 +++++++++++++++++++++++++++++++------------=
--------
drivers/md/md.h | 13 +++-
drivers/md/raid1.c | 33 ++++-----
drivers/md/raid10.c | 17 ++---
drivers/md/raid5.c | 7 +-
6 files changed, 157 insertions(+), 112 deletions(-)

--=20
Signature

--
To unsubscribe from this list: send the line "unsubscribe linux-raid" i=
n
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html

[md PATCH 07/13] md: Be more careful about clearing flags bit in

am 11.01.2011 23:12:46 von NeilBrown

Setting ->recovery to 0 is generally not a good idea as it could clear
bits that shouldn't be cleared. In particular, MD_RECOVERY_FROZEN
should only be cleared on explicit request from user-space.

So when we need to clear things, just clear the bits that need
clearing.

As there are a few different places which reap a resync process - and
some do an incomplte job - factor out the code for doing the from
md_check_recovery and call that function instead of open coding part
of it.

Signed-off-by: NeilBrown
Reported-by: Jonathan Brassow
---

drivers/md/md.c | 86 +++++++++++++++++++++++++++++++------------------------
1 files changed, 49 insertions(+), 37 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index f43ff29..a91dc59 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3746,6 +3746,8 @@ action_show(mddev_t *mddev, char *page)
return sprintf(page, "%s\n", type);
}

+static void reap_sync_thread(mddev_t *mddev);
+
static ssize_t
action_store(mddev_t *mddev, const char *page, size_t len)
{
@@ -3760,9 +3762,7 @@ action_store(mddev_t *mddev, const char *page, size_t len)
if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
if (mddev->sync_thread) {
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
- md_unregister_thread(mddev->sync_thread);
- mddev->sync_thread = NULL;
- mddev->recovery = 0;
+ reap_sync_thread(mddev);
}
} else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
@@ -4709,8 +4709,7 @@ static void __md_stop_writes(mddev_t *mddev)
if (mddev->sync_thread) {
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
- md_unregister_thread(mddev->sync_thread);
- mddev->sync_thread = NULL;
+ reap_sync_thread(mddev);
}

del_timer_sync(&mddev->safemode_timer);
@@ -7044,6 +7043,45 @@ static int remove_and_add_spares(mddev_t *mddev)
}
return spares;
}
+
+static void reap_sync_thread(mddev_t *mddev)
+{
+ mdk_rdev_t *rdev;
+
+ /* resync has finished, collect result */
+ md_unregister_thread(mddev->sync_thread);
+ mddev->sync_thread = NULL;
+ if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
+ !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
+ /* success...*/
+ /* activate any spares */
+ if (mddev->pers->spare_active(mddev))
+ sysfs_notify(&mddev->kobj, NULL,
+ "degraded");
+ }
+ if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
+ mddev->pers->finish_reshape)
+ mddev->pers->finish_reshape(mddev);
+ md_update_sb(mddev, 1);
+
+ /* if array is no-longer degraded, then any saved_raid_disk
+ * information must be scrapped
+ */
+ if (!mddev->degraded)
+ list_for_each_entry(rdev, &mddev->disks, same_set)
+ rdev->saved_raid_disk = -1;
+
+ clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+ clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
+ clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
+ clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+ /* flag recovery needed just to double check */
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ sysfs_notify_dirent_safe(mddev->sysfs_action);
+ md_new_event(mddev);
+}
+
/*
* This routine is regularly called by all per-raid-array threads to
* deal with generic issues like resync and super-block update.
@@ -7068,9 +7106,6 @@ static int remove_and_add_spares(mddev_t *mddev)
*/
void md_check_recovery(mddev_t *mddev)
{
- mdk_rdev_t *rdev;
-
-
if (mddev->bitmap)
bitmap_daemon_work(mddev);

@@ -7138,34 +7173,7 @@ void md_check_recovery(mddev_t *mddev)
goto unlock;
}
if (mddev->sync_thread) {
- /* resync has finished, collect result */
- md_unregister_thread(mddev->sync_thread);
- mddev->sync_thread = NULL;
- if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
- !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
- /* success...*/
- /* activate any spares */
- if (mddev->pers->spare_active(mddev))
- sysfs_notify(&mddev->kobj, NULL,
- "degraded");
- }
- if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
- mddev->pers->finish_reshape)
- mddev->pers->finish_reshape(mddev);
- md_update_sb(mddev, 1);
-
- /* if array is no-longer degraded, then any saved_raid_disk
- * information must be scrapped
- */
- if (!mddev->degraded)
- list_for_each_entry(rdev, &mddev->disks, same_set)
- rdev->saved_raid_disk = -1;
-
- mddev->recovery = 0;
- /* flag recovery needed just to double check */
- set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
- sysfs_notify_dirent_safe(mddev->sysfs_action);
- md_new_event(mddev);
+ reap_sync_thread(mddev);
goto unlock;
}
/* Set RUNNING before clearing NEEDED to avoid
@@ -7223,7 +7231,11 @@ void md_check_recovery(mddev_t *mddev)
" thread...\n",
mdname(mddev));
/* leave the spares where they are, it shouldn't hurt */
- mddev->recovery = 0;
+ clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+ clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
+ clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
+ clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
} else
md_wakeup_thread(mddev->sync_thread);
sysfs_notify_dirent_safe(mddev->sysfs_action);


--
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html

[md PATCH 06/13] md: md_stop_writes requires mddev_lock.

am 11.01.2011 23:12:46 von NeilBrown

As md_stop_writes manipulates the sync_thread and calls md_update_sb,
it need to be called with mddev_lock held.

In all internal cases it is, but the symbol is exported for dm-raid to
call and in that case the lock won't be help.
Do make an exported version which takes the lock, and an internal
version which does not.

Signed-off-by: NeilBrown
---

drivers/md/md.c | 13 ++++++++++---
1 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 540347c..f43ff29 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4704,7 +4704,7 @@ static void md_clean(mddev_t *mddev)
mddev->plug = NULL;
}

-void md_stop_writes(mddev_t *mddev)
+static void __md_stop_writes(mddev_t *mddev)
{
if (mddev->sync_thread) {
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
@@ -4724,6 +4724,13 @@ void md_stop_writes(mddev_t *mddev)
md_update_sb(mddev, 1);
}
}
+
+void md_stop_writes(mddev_t *mddev)
+{
+ mddev_lock(mddev);
+ __md_stop_writes(mddev);
+ mddev_unlock(mddev);
+}
EXPORT_SYMBOL_GPL(md_stop_writes);

void md_stop(mddev_t *mddev)
@@ -4748,7 +4755,7 @@ static int md_set_readonly(mddev_t *mddev, int is_open)
goto out;
}
if (mddev->pers) {
- md_stop_writes(mddev);
+ __md_stop_writes(mddev);

err = -ENXIO;
if (mddev->ro==1)
@@ -4785,7 +4792,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
if (mddev->ro)
set_disk_ro(disk, 0);

- md_stop_writes(mddev);
+ __md_stop_writes(mddev);
md_stop(mddev);
mddev->queue->merge_bvec_fn = NULL;
mddev->queue->unplug_fn = NULL;


--
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html

[md PATCH 05/13] md/raid5: use sysfs_notify_dirent_safe to avoid NULL

am 11.01.2011 23:12:46 von NeilBrown

From: Jonathan Brassow

With the module parameter 'start_dirty_degraded' set,
raid5_spare_active() previously called sysfs_notify_dirent() with a NULL
argument (rdev->sysfs_state) when a rebuild finished.

Signed-off-by: Jonathan Brassow
Signed-off-by: Mike Snitzer
---

drivers/md/raid5.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 316fbe7..9212c07 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5338,7 +5338,7 @@ static int raid5_spare_active(mddev_t *mddev)
&& !test_bit(Faulty, &tmp->rdev->flags)
&& !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
count++;
- sysfs_notify_dirent(tmp->rdev->sysfs_state);
+ sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
}
}
spin_lock_irqsave(&conf->device_lock, flags);


--
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html

[md PATCH 04/13] md: Ensure no IO request to get md device before it

am 11.01.2011 23:12:46 von NeilBrown

When an md device is in the process of coming on line it is possible
for an IO request (typically a partition table probe) to get through
before the array is fully initialised, which can cause unexpected
behaviour (e.g. a crash).

So explicitly record when the array is ready for IO and don't allow IO
through until then.

There is no possibility for a similar problem when the array is going
off-line as there must only be one 'open' at that time, and it is busy
off-lining the array and so cannot send IO requests. So no memory
barrier is needed in md_stop()

This has been a bug since commit 409c57f3801 in 2.6.30 which
introduced md_make_request. Before then, each personality would
register its own make_request_fn when it was ready.
This is suitable for any stable kernel from 2.6.30.y onwards.

Cc:
Signed-off-by: NeilBrown
Reported-by: "Hawrylewicz Czarnowski, Przemyslaw"
---

drivers/md/md.c | 8 ++++++--
drivers/md/md.h | 3 ++-
2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index a301912..540347c 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -288,10 +288,12 @@ static int md_make_request(struct request_queue *q, struct bio *bio)
int rv;
int cpu;

- if (mddev == NULL || mddev->pers == NULL) {
+ if (mddev == NULL || mddev->pers == NULL
+ || !mddev->ready) {
bio_io_error(bio);
return 0;
}
+ smp_rmb(); /* Ensure implications of 'active' are visible */
rcu_read_lock();
if (mddev->suspended) {
DEFINE_WAIT(__wait);
@@ -4564,7 +4566,8 @@ int md_run(mddev_t *mddev)
mddev->safemode_timer.data = (unsigned long) mddev;
mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
mddev->in_sync = 1;
-
+ smp_wmb();
+ mddev->ready = 1;
list_for_each_entry(rdev, &mddev->disks, same_set)
if (rdev->raid_disk >= 0) {
char nm[20];
@@ -4725,6 +4728,7 @@ EXPORT_SYMBOL_GPL(md_stop_writes);

void md_stop(mddev_t *mddev)
{
+ mddev->ready = 0;
mddev->pers->stop(mddev);
if (mddev->pers->sync_request && mddev->to_remove == NULL)
mddev->to_remove = &md_redundancy_group;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index d05bab5..229675a 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -148,7 +148,8 @@ struct mddev_s
* are happening, so run/
* takeover/stop are not safe
*/
-
+ int ready; /* See when safe to pass
+ * IO requests down */
struct gendisk *gendisk;

struct kobject kobj;


--
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html

[md PATCH 03/13] md: Fix single printks with multiple KERN_<level>s

am 11.01.2011 23:12:46 von NeilBrown

From: Joe Perches

Noticed-by: Russell King
Signed-off-by: Joe Perches
Signed-off-by: NeilBrown
---

drivers/md/raid1.c | 5 +++--
drivers/md/raid10.c | 5 +++--
drivers/md/raid5.c | 1 -
3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 845cf95..d50056b 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1027,8 +1027,9 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
} else
set_bit(Faulty, &rdev->flags);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
- printk(KERN_ALERT "md/raid1:%s: Disk failure on %s, disabling device.\n"
- KERN_ALERT "md/raid1:%s: Operation continuing on %d devices.\n",
+ printk(KERN_ALERT
+ "md/raid1:%s: Disk failure on %s, disabling device.\n"
+ "md/raid1:%s: Operation continuing on %d devices.\n",
mdname(mddev), bdevname(rdev->bdev, b),
mdname(mddev), conf->raid_disks - mddev->degraded);
}
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 0641674..03825cb 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1051,8 +1051,9 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
}
set_bit(Faulty, &rdev->flags);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
- printk(KERN_ALERT "md/raid10:%s: Disk failure on %s, disabling device.\n"
- KERN_ALERT "md/raid10:%s: Operation continuing on %d devices.\n",
+ printk(KERN_ALERT
+ "md/raid10:%s: Disk failure on %s, disabling device.\n"
+ "md/raid10:%s: Operation continuing on %d devices.\n",
mdname(mddev), bdevname(rdev->bdev, b),
mdname(mddev), conf->raid_disks - mddev->degraded);
}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index dc574f3..316fbe7 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1721,7 +1721,6 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
set_bit(Faulty, &rdev->flags);
printk(KERN_ALERT
"md/raid:%s: Disk failure on %s, disabling device.\n"
- KERN_ALERT
"md/raid:%s: Operation continuing on %d devices.\n",
mdname(mddev),
bdevname(rdev->bdev, b),


--
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html

[md PATCH 02/13] md: fix regression resulting in delays in clearing

am 11.01.2011 23:12:46 von NeilBrown

commit 589a594be1fb (2.6.37-rc4) fixed a problem were md_thread would
sometimes call the ->run function at a bad time.

If an error is detected during array start up after the md_thread has
been started, the md_thread is killed. This resulted in the ->run
function being called once. However the array may not be in a state
that it is safe to call ->run.

However the fix imposed meant that ->run was not called on a timeout.
This means that when an array goes idle, bitmap bits do not get
cleared promptly. While the array is busy the bits will still be
cleared when appropriate so this is not very serious. There is no
risk to data.

Change the test so that we only avoid calling ->run when the thread
is being stopped. This more explicitly address the problem situation.

This is suitable for any -stable kernel to which 589a594be1fb was
applied.

Cc: stable@kernel.org
Signed-off-by: NeilBrown
---

drivers/md/md.c | 3 ++-
1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 0da25da..a301912 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -6042,7 +6042,8 @@ static int md_thread(void * arg)
|| kthread_should_stop(),
thread->timeout);

- if (test_and_clear_bit(THREAD_WAKEUP, &thread->flags))
+ clear_bit(THREAD_WAKEUP, &thread->flags);
+ if (!kthread_should_stop())
thread->run(thread->mddev);
}



--
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html

[md PATCH 01/13] md: fix regression with re-adding devices to arrays

am 11.01.2011 23:12:46 von NeilBrown

Commit 1a855a0606 (2.6.37-rc4) fixed a problem where devices were
re-added when they shouldn't be but caused a regression in a less
common case that means sometimes devices cannot be re-added when they
should be.

In particular, when re-adding a device to an array without metadata
we should always access the device, but after the above commit we
didn't.

This patch sets the In_sync flag in that case so that the re-add
succeeds.

This patch is suitable for any -stable kernel to which 1a855a0606 was
applied.

Cc: stable@kernel.org
Signed-off-by: NeilBrown
---

drivers/md/md.c | 5 +++--
1 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 175c424..0da25da 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5159,9 +5159,10 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
/* set saved_raid_disk if appropriate */
if (!mddev->persistent) {
if (info->state & (1< - info->raid_disk < mddev->raid_disks)
+ info->raid_disk < mddev->raid_disks) {
rdev->raid_disk = info->raid_disk;
- else
+ set_bit(In_sync, &rdev->flags);
+ } else
rdev->raid_disk = -1;
} else
super_types[mddev->major_version].


--
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html

[md PATCH 10/13] md: separate meta and data devs

am 11.01.2011 23:12:47 von NeilBrown

From: Jonathan Brassow

Allow the metadata to be on a separate device from the
data.

This doesn't mean the data and metadata will by on separate
physical devices - it simply gives device-mapper and userspace
tools more flexibility.

Signed-off-by: NeilBrown
---

drivers/md/bitmap.c | 6 +++++-
drivers/md/md.c | 10 ++++++----
drivers/md/md.h | 6 ++++++
3 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 1977765..6cf5871 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -264,14 +264,18 @@ static mdk_rdev_t *next_active_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
{
mdk_rdev_t *rdev = NULL;
+ struct block_device *bdev;
mddev_t *mddev = bitmap->mddev;

while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
int size = PAGE_SIZE;
loff_t offset = mddev->bitmap_info.offset;
+
+ bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev;
+
if (page->index == bitmap->file_pages-1)
size = roundup(bitmap->last_page_size,
- bdev_logical_block_size(rdev->bdev));
+ bdev_logical_block_size(bdev));
/* Just make sure we aren't corrupting data or
* metadata
*/
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 0bc10cc..b98a85f 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -765,7 +765,7 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
*/
struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);

- bio->bi_bdev = rdev->bdev;
+ bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev;
bio->bi_sector = sector;
bio_add_page(bio, page, size, 0);
bio->bi_private = rdev;
@@ -803,7 +803,8 @@ int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size,

rw |= REQ_SYNC | REQ_UNPLUG;

- bio->bi_bdev = rdev->bdev;
+ bio->bi_bdev = (metadata_op && rdev->meta_bdev) ?
+ rdev->meta_bdev : rdev->bdev;
if (metadata_op)
bio->bi_sector = sector + rdev->sb_start;
else
@@ -4435,7 +4436,9 @@ int md_run(mddev_t *mddev)
* We don't want the data to overlap the metadata,
* Internal Bitmap issues have been handled elsewhere.
*/
- if (rdev->data_offset < rdev->sb_start) {
+ if (rdev->meta_bdev) {
+ /* Nothing to check */;
+ } else if (rdev->data_offset < rdev->sb_start) {
if (mddev->dev_sectors &&
rdev->data_offset + mddev->dev_sectors
> rdev->sb_start) {
@@ -5532,7 +5535,6 @@ static int update_size(mddev_t *mddev, sector_t num_sectors)
* sb_start or, if that is * of each device. If num_sectors is zero, we find the largest size
* that fits.
-
*/
if (mddev->sync_thread)
return -EBUSY;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 7e4f358..eec517c 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -60,6 +60,12 @@ struct mdk_rdev_s
mddev_t *mddev; /* RAID array if running */
int last_events; /* IO event timestamp */

+ /*
+ * If meta_bdev is non-NULL, it means that a separate device is
+ * being used to store the metadata (superblock/bitmap) which
+ * would otherwise be contained on the same device as the data (bdev).
+ */
+ struct block_device *meta_bdev;
struct block_device *bdev; /* block device handle */

struct page *sb_page;


--
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html

[md PATCH 08/13] md-new-param-to-calc_dev_sboffset

am 11.01.2011 23:12:47 von NeilBrown

From: Jonathan Brassow

When we allow for separate devices for data and metadata
in a later patch, we will need to be able to calculate
the superblock offset based on more than the bdev.

Signed-off-by: Jonathan Brassow
---

drivers/md/md.c | 12 ++++++------
1 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index a91dc59..0a0d7c2 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -705,9 +705,9 @@ static struct mdk_personality *find_pers(int level, char *clevel)
}

/* return the offset of the super block in 512byte sectors */
-static inline sector_t calc_dev_sboffset(struct block_device *bdev)
+static inline sector_t calc_dev_sboffset(mdk_rdev_t *rdev)
{
- sector_t num_sectors = i_size_read(bdev->bd_inode) / 512;
+ sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512;
return MD_NEW_SIZE_SECTORS(num_sectors);
}

@@ -991,7 +991,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
*
* It also happens to be a multiple of 4Kb.
*/
- rdev->sb_start = calc_dev_sboffset(rdev->bdev);
+ rdev->sb_start = calc_dev_sboffset(rdev);

ret = read_disk_sb(rdev, MD_SB_BYTES);
if (ret) return ret;
@@ -1332,7 +1332,7 @@ super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
return 0; /* component must fit device */
if (rdev->mddev->bitmap_info.offset)
return 0; /* can't move bitmap */
- rdev->sb_start = calc_dev_sboffset(rdev->bdev);
+ rdev->sb_start = calc_dev_sboffset(rdev);
if (!num_sectors || num_sectors > rdev->sb_start)
num_sectors = rdev->sb_start;
md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
@@ -5249,7 +5249,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
printk(KERN_INFO "md: nonpersistent superblock ...\n");
rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
} else
- rdev->sb_start = calc_dev_sboffset(rdev->bdev);
+ rdev->sb_start = calc_dev_sboffset(rdev);
rdev->sectors = rdev->sb_start;

err = bind_rdev_to_array(rdev, mddev);
@@ -5316,7 +5316,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
}

if (mddev->persistent)
- rdev->sb_start = calc_dev_sboffset(rdev->bdev);
+ rdev->sb_start = calc_dev_sboffset(rdev);
else
rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;



--
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html

[md PATCH 09/13] md-new-param-to_sync_page_io

am 11.01.2011 23:12:47 von NeilBrown

From: Jonathan Brassow

Add new parameter to 'sync_page_io'.

The new parameter allows us to distinguish between metadata and data
operations. This becomes important later when we add the ability to
use separate devices for data and metadata.

Signed-off-by: Jonathan Brassow
---

drivers/md/bitmap.c | 4 ++--
drivers/md/md.c | 9 ++++++---
drivers/md/md.h | 4 ++--
drivers/md/raid1.c | 28 ++++++++++++----------------
drivers/md/raid10.c | 12 ++++++------
5 files changed, 28 insertions(+), 29 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 5a1ffe3..1977765 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -210,11 +210,11 @@ static struct page *read_sb_page(mddev_t *mddev, loff_t offset,
|| test_bit(Faulty, &rdev->flags))
continue;

- target = rdev->sb_start + offset + index * (PAGE_SIZE/512);
+ target = offset + index * (PAGE_SIZE/512);

if (sync_page_io(rdev, target,
roundup(size, bdev_logical_block_size(rdev->bdev)),
- page, READ)) {
+ page, READ, true)) {
page->index = index;
attach_page_buffers(page, NULL); /* so that free_buffer will
* quietly no-op */
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 0a0d7c2..0bc10cc 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -795,7 +795,7 @@ static void bi_complete(struct bio *bio, int error)
}

int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size,
- struct page *page, int rw)
+ struct page *page, int rw, bool metadata_op)
{
struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev);
struct completion event;
@@ -804,7 +804,10 @@ int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size,
rw |= REQ_SYNC | REQ_UNPLUG;

bio->bi_bdev = rdev->bdev;
- bio->bi_sector = sector;
+ if (metadata_op)
+ bio->bi_sector = sector + rdev->sb_start;
+ else
+ bio->bi_sector = sector + rdev->data_offset;
bio_add_page(bio, page, size, 0);
init_completion(&event);
bio->bi_private = &event;
@@ -829,7 +832,7 @@ static int read_disk_sb(mdk_rdev_t * rdev, int size)
return 0;


- if (!sync_page_io(rdev, rdev->sb_start, size, rdev->sb_page, READ))
+ if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, true))
goto fail;
rdev->sb_loaded = 1;
return 0;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 229675a..7e4f358 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -498,8 +498,8 @@ extern void md_flush_request(mddev_t *mddev, struct bio *bio);
extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
sector_t sector, int size, struct page *page);
extern void md_super_wait(mddev_t *mddev);
-extern int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size,
- struct page *page, int rw);
+extern int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size,
+ struct page *page, int rw, bool metadata_op);
extern void md_do_sync(mddev_t *mddev);
extern void md_new_event(mddev_t *mddev);
extern int md_allow_write(mddev_t *mddev);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index d50056b..a23ffa3 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1365,10 +1365,10 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
*/
rdev = conf->mirrors[d].rdev;
if (sync_page_io(rdev,
- sect + rdev->data_offset,
+ sect,
s<<9,
bio->bi_io_vec[idx].bv_page,
- READ)) {
+ READ, false)) {
success = 1;
break;
}
@@ -1391,10 +1391,10 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
rdev = conf->mirrors[d].rdev;
atomic_add(s, &rdev->corrected_errors);
if (sync_page_io(rdev,
- sect + rdev->data_offset,
+ sect,
s<<9,
bio->bi_io_vec[idx].bv_page,
- WRITE) == 0)
+ WRITE, false) == 0)
md_error(mddev, rdev);
}
d = start;
@@ -1406,10 +1406,10 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
continue;
rdev = conf->mirrors[d].rdev;
if (sync_page_io(rdev,
- sect + rdev->data_offset,
+ sect,
s<<9,
bio->bi_io_vec[idx].bv_page,
- READ) == 0)
+ READ, false) == 0)
md_error(mddev, rdev);
}
} else {
@@ -1489,10 +1489,8 @@ static void fix_read_error(conf_t *conf, int read_disk,
rdev = conf->mirrors[d].rdev;
if (rdev &&
test_bit(In_sync, &rdev->flags) &&
- sync_page_io(rdev,
- sect + rdev->data_offset,
- s<<9,
- conf->tmppage, READ))
+ sync_page_io(rdev, sect, s<<9,
+ conf->tmppage, READ, false))
success = 1;
else {
d++;
@@ -1515,9 +1513,8 @@ static void fix_read_error(conf_t *conf, int read_disk,
rdev = conf->mirrors[d].rdev;
if (rdev &&
test_bit(In_sync, &rdev->flags)) {
- if (sync_page_io(rdev,
- sect + rdev->data_offset,
- s<<9, conf->tmppage, WRITE)
+ if (sync_page_io(rdev, sect, s<<9,
+ conf->tmppage, WRITE, false)
== 0)
/* Well, this device is dead */
md_error(mddev, rdev);
@@ -1532,9 +1529,8 @@ static void fix_read_error(conf_t *conf, int read_disk,
rdev = conf->mirrors[d].rdev;
if (rdev &&
test_bit(In_sync, &rdev->flags)) {
- if (sync_page_io(rdev,
- sect + rdev->data_offset,
- s<<9, conf->tmppage, READ)
+ if (sync_page_io(rdev, sect, s<<9,
+ conf->tmppage, READ, false)
== 0)
/* Well, this device is dead */
md_error(mddev, rdev);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 03825cb..69b6595 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1560,9 +1560,9 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
rcu_read_unlock();
success = sync_page_io(rdev,
r10_bio->devs[sl].addr +
- sect + rdev->data_offset,
+ sect,
s<<9,
- conf->tmppage, READ);
+ conf->tmppage, READ, false);
rdev_dec_pending(rdev, mddev);
rcu_read_lock();
if (success)
@@ -1599,8 +1599,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
atomic_add(s, &rdev->corrected_errors);
if (sync_page_io(rdev,
r10_bio->devs[sl].addr +
- sect + rdev->data_offset,
- s<<9, conf->tmppage, WRITE)
+ sect,
+ s<<9, conf->tmppage, WRITE, false)
== 0) {
/* Well, this device is dead */
printk(KERN_NOTICE
@@ -1636,9 +1636,9 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
rcu_read_unlock();
if (sync_page_io(rdev,
r10_bio->devs[sl].addr +
- sect + rdev->data_offset,
+ sect,
s<<9, conf->tmppage,
- READ) == 0) {
+ READ, false) == 0) {
/* Well, this device is dead */
printk(KERN_NOTICE
"md/raid10:%s: unable to read back "


--
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html

[md PATCH 11/13] md: Don"t let implementation detail of curr_resync

am 11.01.2011 23:12:47 von NeilBrown

mddev->curr_resync has artificial values of '1' and '2' which are used
by the which ensures only one resync is happening at a time on any
given device.

These values are internal and should never be exposed to user-space
(except when translated appropriately as in the 'pending' status in
/proc/mdstat).

Unfortunately they are as ->curr_resync is assigned to
->curr_resync_completed and that value is directly visible through
sysfs.

So change the assignments to ->curr_resync_completed to get the same
valued from elsewhere in a form that doesn't have the magic '1' or '2'
values.

Signed-off-by: NeilBrown
---

drivers/md/bitmap.c | 2 +-
drivers/md/md.c | 5 ++---
drivers/md/raid5.c | 4 ++--
3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 6cf5871..9a35320 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1546,7 +1546,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
wait_event(bitmap->mddev->recovery_wait,
atomic_read(&bitmap->mddev->recovery_active) == 0);

- bitmap->mddev->curr_resync_completed = bitmap->mddev->curr_resync;
+ bitmap->mddev->curr_resync_completed = sector;
set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags);
sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1);
s = 0;
diff --git a/drivers/md/md.c b/drivers/md/md.c
index b98a85f..42b1d9e 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -6824,7 +6824,7 @@ void md_do_sync(mddev_t *mddev)
desc, mdname(mddev));
mddev->curr_resync = j;
}
- mddev->curr_resync_completed = mddev->curr_resync;
+ mddev->curr_resync_completed = j;

while (j < max_sectors) {
sector_t sectors;
@@ -6842,8 +6842,7 @@ void md_do_sync(mddev_t *mddev)
md_unplug(mddev);
wait_event(mddev->recovery_wait,
atomic_read(&mddev->recovery_active) == 0);
- mddev->curr_resync_completed =
- mddev->curr_resync;
+ mddev->curr_resync_completed = j;
set_bit(MD_CHANGE_CLEAN, &mddev->flags);
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 9212c07..d223a6c 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4236,7 +4236,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
wait_event(conf->wait_for_overlap,
atomic_read(&conf->reshape_stripes)==0);
mddev->reshape_position = conf->reshape_progress;
- mddev->curr_resync_completed = mddev->curr_resync;
+ mddev->curr_resync_completed = sector_nr;
conf->reshape_checkpoint = jiffies;
set_bit(MD_CHANGE_DEVS, &mddev->flags);
md_wakeup_thread(mddev->thread);
@@ -4337,7 +4337,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
wait_event(conf->wait_for_overlap,
atomic_read(&conf->reshape_stripes) == 0);
mddev->reshape_position = conf->reshape_progress;
- mddev->curr_resync_completed = mddev->curr_resync + reshape_sectors;
+ mddev->curr_resync_completed = sector_nr;
conf->reshape_checkpoint = jiffies;
set_bit(MD_CHANGE_DEVS, &mddev->flags);
md_wakeup_thread(mddev->thread);


--
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html

[md PATCH 12/13] md: allow suspend_lo and suspend_hi to decrease as

am 11.01.2011 23:12:47 von NeilBrown

The sysfs attributes 'suspend_lo' and 'suspend_hi' describe a region
to which read/writes are suspended so that the under lying data can be
manipulated without user-space noticing.
Currently the window they describe can only move forwards along the
device. However this is an unnecessary restriction which will cause
problems with planned developments.
So relax this restriction and allow these endpoint to move
arbitrarily.

Signed-off-by: NeilBrown
---

drivers/md/md.c | 32 ++++++++++++++++++++------------
1 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 42b1d9e..c0cba8d 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4016,19 +4016,24 @@ suspend_lo_store(mddev_t *mddev, const char *buf, size_t len)
{
char *e;
unsigned long long new = simple_strtoull(buf, &e, 10);
+ unsigned long long old = mddev->suspend_lo;

if (mddev->pers == NULL ||
mddev->pers->quiesce == NULL)
return -EINVAL;
if (buf == e || (*e && *e != '\n'))
return -EINVAL;
- if (new >= mddev->suspend_hi ||
- (new > mddev->suspend_lo && new < mddev->suspend_hi)) {
- mddev->suspend_lo = new;
+
+ mddev->suspend_lo = new;
+ if (new >= old)
+ /* Shrinking suspended region */
mddev->pers->quiesce(mddev, 2);
- return len;
- } else
- return -EINVAL;
+ else {
+ /* Expanding suspended region - need to wait */
+ mddev->pers->quiesce(mddev, 1);
+ mddev->pers->quiesce(mddev, 0);
+ }
+ return len;
}
static struct md_sysfs_entry md_suspend_lo =
__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
@@ -4045,20 +4050,23 @@ suspend_hi_store(mddev_t *mddev, const char *buf, size_t len)
{
char *e;
unsigned long long new = simple_strtoull(buf, &e, 10);
+ unsigned long long old = mddev->suspend_hi;

if (mddev->pers == NULL ||
mddev->pers->quiesce == NULL)
return -EINVAL;
if (buf == e || (*e && *e != '\n'))
return -EINVAL;
- if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) ||
- (new > mddev->suspend_lo && new > mddev->suspend_hi)) {
- mddev->suspend_hi = new;
+
+ if (new <= old)
+ /* Shrinking suspended region */
+ mddev->pers->quiesce(mddev, 2);
+ else {
+ /* Expanding suspended region - need to wait */
mddev->pers->quiesce(mddev, 1);
mddev->pers->quiesce(mddev, 0);
- return len;
- } else
- return -EINVAL;
+ }
+ return len;
}
static struct md_sysfs_entry md_suspend_hi =
__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);


--
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html

[md PATCH 13/13] md: fix sync_completed reporting for very large

am 11.01.2011 23:12:47 von NeilBrown

=46rom: Rémi Rérolle

The values exported in the sync_completed file are unsigned long, which
overflows with very large drives, resulting in wrong values reported.

Since sync_completed uses sectors as unit, we'll start getting wrong
values with components larger than 2TB.

This patch simply replaces the use of unsigned long by unsigned long lo=
ng.

Signed-off-by: Rémi Rérolle
Signed-off-by: NeilBrown
---

drivers/md/md.c | 4 ++--
1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index c0cba8d..126453f 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3918,7 +3918,7 @@ static struct md_sysfs_entry md_sync_speed =3D __=
ATTR_RO(sync_speed);
static ssize_t
sync_completed_show(mddev_t *mddev, char *page)
{
- unsigned long max_sectors, resync;
+ unsigned long long max_sectors, resync;
=20
if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
return sprintf(page, "none\n");
@@ -3929,7 +3929,7 @@ sync_completed_show(mddev_t *mddev, char *page)
max_sectors =3D mddev->dev_sectors;
=20
resync =3D mddev->curr_resync_completed;
- return sprintf(page, "%lu / %lu\n", resync, max_sectors);
+ return sprintf(page, "%llu / %llu\n", resync, max_sectors);
}
=20
static struct md_sysfs_entry md_sync_completed =3D __ATTR_RO(sync_comp=
leted);


--
To unsubscribe from this list: send the line "unsubscribe linux-raid" i=
n
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html