whiterose

linux unikernel
Log | Files | Refs | README | LICENSE | git clone https://git.ne02ptzero.me/git/whiterose

commit 7abe849315c870c1d3f3cb4b302e827aaa28348e
parent 71f4d95b23654ec2b347bd15b1260d68ca9ea5ea
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Fri, 26 Oct 2018 13:00:44 -0700

Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md

Pull md updates from Shaohua Li:
 "This mainly improves raid10 cluster and fixes some bugs:

   - raid10 cluster improvements from Guoqing

   - Memory leak fixes from Jack and Xiao

   - raid10 hang fix from Alex

   - raid5 block faulty device fix from Mariusz

   - metadata update fix from Neil

   - Invalid disk role fix from Me

   - Other clearnups"

* 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md:
  MD: Memory leak when flush bio size is zero
  md: fix memleak for mempool
  md-cluster: remove suspend_info
  md-cluster: send BITMAP_NEEDS_SYNC message if reshaping is interrupted
  md-cluster/bitmap: don't call md_bitmap_sync_with_cluster during reshaping stage
  md-cluster/raid10: don't call remove_and_add_spares during reshaping stage
  md-cluster/raid10: call update_size in md_reap_sync_thread
  md-cluster: introduce resync_info_get interface for sanity check
  md-cluster/raid10: support add disk under grow mode
  md-cluster/raid10: resize all the bitmaps before start reshape
  MD: fix invalid stored role for a disk - try2
  md/bitmap: use mddev_suspend/resume instead of ->quiesce()
  md: remove redundant code that is no longer reachable
  md: allow metadata updates while suspending an array - fix
  MD: fix invalid stored role for a disk
  md/raid10: Fix raid10 replace hang when new added disk faulty
  raid5: block failing device if raid will be failed

Diffstat:
Mdrivers/md/md-bitmap.c | 9+++++----
Mdrivers/md/md-cluster.c | 234+++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------------
Mdrivers/md/md-cluster.h | 2++
Mdrivers/md/md.c | 113++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------
Mdrivers/md/md.h | 1+
Mdrivers/md/raid1.c | 1+
Mdrivers/md/raid10.c | 109+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------
Mdrivers/md/raid5-cache.c | 2--
Mdrivers/md/raid5.c | 12++++++++++++
9 files changed, 356 insertions(+), 127 deletions(-)

diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c @@ -2288,9 +2288,9 @@ location_store(struct mddev *mddev, const char *buf, size_t len) goto out; } if (mddev->pers) { - mddev->pers->quiesce(mddev, 1); + mddev_suspend(mddev); md_bitmap_destroy(mddev); - mddev->pers->quiesce(mddev, 0); + mddev_resume(mddev); } mddev->bitmap_info.offset = 0; if (mddev->bitmap_info.file) { @@ -2327,8 +2327,8 @@ location_store(struct mddev *mddev, const char *buf, size_t len) mddev->bitmap_info.offset = offset; if (mddev->pers) { struct bitmap *bitmap; - mddev->pers->quiesce(mddev, 1); bitmap = md_bitmap_create(mddev, -1); + mddev_suspend(mddev); if (IS_ERR(bitmap)) rv = PTR_ERR(bitmap); else { @@ -2337,11 +2337,12 @@ location_store(struct mddev *mddev, const char *buf, size_t len) if (rv) mddev->bitmap_info.offset = 0; } - mddev->pers->quiesce(mddev, 0); if (rv) { md_bitmap_destroy(mddev); + mddev_resume(mddev); goto out; } + mddev_resume(mddev); } } } diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c @@ -33,13 +33,6 @@ struct dlm_lock_resource { int mode; }; -struct suspend_info { - int slot; - sector_t lo; - sector_t hi; - struct list_head list; -}; - struct resync_info { __le64 lo; __le64 hi; @@ -80,7 +73,13 @@ struct md_cluster_info { struct dlm_lock_resource **other_bitmap_lockres; struct dlm_lock_resource *resync_lockres; struct list_head suspend_list; + spinlock_t suspend_lock; + /* record the region which write should be suspended */ + sector_t suspend_lo; + sector_t suspend_hi; + int suspend_from; /* the slot which broadcast suspend_lo/hi */ + struct md_thread *recovery_thread; unsigned long recovery_map; /* communication loc resources */ @@ -105,6 +104,7 @@ enum msg_type { RE_ADD, BITMAP_NEEDS_SYNC, CHANGE_CAPACITY, + BITMAP_RESIZE, }; struct cluster_msg { @@ -270,25 +270,22 @@ static void add_resync_info(struct dlm_lock_resource *lockres, ri->hi = cpu_to_le64(hi); } -static struct suspend_info *read_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres) +static int read_resync_info(struct mddev *mddev, + struct dlm_lock_resource *lockres) { struct resync_info ri; - struct suspend_info *s = NULL; - sector_t hi = 0; + struct md_cluster_info *cinfo = mddev->cluster_info; + int ret = 0; dlm_lock_sync(lockres, DLM_LOCK_CR); memcpy(&ri, lockres->lksb.sb_lvbptr, sizeof(struct resync_info)); - hi = le64_to_cpu(ri.hi); - if (hi > 0) { - s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL); - if (!s) - goto out; - s->hi = hi; - s->lo = le64_to_cpu(ri.lo); + if (le64_to_cpu(ri.hi) > 0) { + cinfo->suspend_hi = le64_to_cpu(ri.hi); + cinfo->suspend_lo = le64_to_cpu(ri.lo); + ret = 1; } dlm_unlock_sync(lockres); -out: - return s; + return ret; } static void recover_bitmaps(struct md_thread *thread) @@ -298,7 +295,6 @@ static void recover_bitmaps(struct md_thread *thread) struct dlm_lock_resource *bm_lockres; char str[64]; int slot, ret; - struct suspend_info *s, *tmp; sector_t lo, hi; while (cinfo->recovery_map) { @@ -325,13 +321,17 @@ static void recover_bitmaps(struct md_thread *thread) /* Clear suspend_area associated with the bitmap */ spin_lock_irq(&cinfo->suspend_lock); - list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list) - if (slot == s->slot) { - list_del(&s->list); - kfree(s); - } + cinfo->suspend_hi = 0; + cinfo->suspend_lo = 0; + cinfo->suspend_from = -1; spin_unlock_irq(&cinfo->suspend_lock); + /* Kick off a reshape if needed */ + if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) && + test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + mddev->reshape_position != MaxSector) + md_wakeup_thread(mddev->sync_thread); + if (hi > 0) { if (lo < mddev->recovery_cp) mddev->recovery_cp = lo; @@ -434,34 +434,23 @@ static void ack_bast(void *arg, int mode) } } -static void __remove_suspend_info(struct md_cluster_info *cinfo, int slot) -{ - struct suspend_info *s, *tmp; - - list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list) - if (slot == s->slot) { - list_del(&s->list); - kfree(s); - break; - } -} - static void remove_suspend_info(struct mddev *mddev, int slot) { struct md_cluster_info *cinfo = mddev->cluster_info; mddev->pers->quiesce(mddev, 1); spin_lock_irq(&cinfo->suspend_lock); - __remove_suspend_info(cinfo, slot); + cinfo->suspend_hi = 0; + cinfo->suspend_lo = 0; spin_unlock_irq(&cinfo->suspend_lock); mddev->pers->quiesce(mddev, 0); } - static void process_suspend_info(struct mddev *mddev, int slot, sector_t lo, sector_t hi) { struct md_cluster_info *cinfo = mddev->cluster_info; - struct suspend_info *s; + struct mdp_superblock_1 *sb = NULL; + struct md_rdev *rdev; if (!hi) { /* @@ -475,6 +464,12 @@ static void process_suspend_info(struct mddev *mddev, return; } + rdev_for_each(rdev, mddev) + if (rdev->raid_disk > -1 && !test_bit(Faulty, &rdev->flags)) { + sb = page_address(rdev->sb_page); + break; + } + /* * The bitmaps are not same for different nodes * if RESYNCING is happening in one node, then @@ -487,26 +482,26 @@ static void process_suspend_info(struct mddev *mddev, * sync_low/hi is used to record the region which * arrived in the previous RESYNCING message, * - * Call bitmap_sync_with_cluster to clear - * NEEDED_MASK and set RESYNC_MASK since - * resync thread is running in another node, - * so we don't need to do the resync again - * with the same section */ - md_bitmap_sync_with_cluster(mddev, cinfo->sync_low, cinfo->sync_hi, lo, hi); + * Call md_bitmap_sync_with_cluster to clear NEEDED_MASK + * and set RESYNC_MASK since resync thread is running + * in another node, so we don't need to do the resync + * again with the same section. + * + * Skip md_bitmap_sync_with_cluster in case reshape + * happening, because reshaping region is small and + * we don't want to trigger lots of WARN. + */ + if (sb && !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) + md_bitmap_sync_with_cluster(mddev, cinfo->sync_low, + cinfo->sync_hi, lo, hi); cinfo->sync_low = lo; cinfo->sync_hi = hi; - s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL); - if (!s) - return; - s->slot = slot; - s->lo = lo; - s->hi = hi; mddev->pers->quiesce(mddev, 1); spin_lock_irq(&cinfo->suspend_lock); - /* Remove existing entry (if exists) before adding */ - __remove_suspend_info(cinfo, slot); - list_add(&s->list, &cinfo->suspend_list); + cinfo->suspend_from = slot; + cinfo->suspend_lo = lo; + cinfo->suspend_hi = hi; spin_unlock_irq(&cinfo->suspend_lock); mddev->pers->quiesce(mddev, 0); } @@ -612,6 +607,11 @@ static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) case BITMAP_NEEDS_SYNC: __recover_slot(mddev, le32_to_cpu(msg->slot)); break; + case BITMAP_RESIZE: + if (le64_to_cpu(msg->high) != mddev->pers->size(mddev, 0, 0)) + ret = md_bitmap_resize(mddev->bitmap, + le64_to_cpu(msg->high), 0, 0); + break; default: ret = -1; pr_warn("%s:%d Received unknown message from %d\n", @@ -800,7 +800,6 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots) struct md_cluster_info *cinfo = mddev->cluster_info; int i, ret = 0; struct dlm_lock_resource *bm_lockres; - struct suspend_info *s; char str[64]; sector_t lo, hi; @@ -819,16 +818,13 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots) bm_lockres->flags |= DLM_LKF_NOQUEUE; ret = dlm_lock_sync(bm_lockres, DLM_LOCK_PW); if (ret == -EAGAIN) { - s = read_resync_info(mddev, bm_lockres); - if (s) { + if (read_resync_info(mddev, bm_lockres)) { pr_info("%s:%d Resync[%llu..%llu] in progress on %d\n", __func__, __LINE__, - (unsigned long long) s->lo, - (unsigned long long) s->hi, i); - spin_lock_irq(&cinfo->suspend_lock); - s->slot = i; - list_add(&s->list, &cinfo->suspend_list); - spin_unlock_irq(&cinfo->suspend_lock); + (unsigned long long) cinfo->suspend_lo, + (unsigned long long) cinfo->suspend_hi, + i); + cinfo->suspend_from = i; } ret = 0; lockres_free(bm_lockres); @@ -1001,10 +997,17 @@ static int leave(struct mddev *mddev) if (!cinfo) return 0; - /* BITMAP_NEEDS_SYNC message should be sent when node + /* + * BITMAP_NEEDS_SYNC message should be sent when node * is leaving the cluster with dirty bitmap, also we - * can only deliver it when dlm connection is available */ - if (cinfo->slot_number > 0 && mddev->recovery_cp != MaxSector) + * can only deliver it when dlm connection is available. + * + * Also, we should send BITMAP_NEEDS_SYNC message in + * case reshaping is interrupted. + */ + if ((cinfo->slot_number > 0 && mddev->recovery_cp != MaxSector) || + (mddev->reshape_position != MaxSector && + test_bit(MD_CLOSING, &mddev->flags))) resync_bitmap(mddev); set_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state); @@ -1102,6 +1105,80 @@ static void metadata_update_cancel(struct mddev *mddev) unlock_comm(cinfo); } +static int update_bitmap_size(struct mddev *mddev, sector_t size) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + struct cluster_msg cmsg = {0}; + int ret; + + cmsg.type = cpu_to_le32(BITMAP_RESIZE); + cmsg.high = cpu_to_le64(size); + ret = sendmsg(cinfo, &cmsg, 0); + if (ret) + pr_err("%s:%d: failed to send BITMAP_RESIZE message (%d)\n", + __func__, __LINE__, ret); + return ret; +} + +static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsize) +{ + struct bitmap_counts *counts; + char str[64]; + struct dlm_lock_resource *bm_lockres; + struct bitmap *bitmap = mddev->bitmap; + unsigned long my_pages = bitmap->counts.pages; + int i, rv; + + /* + * We need to ensure all the nodes can grow to a larger + * bitmap size before make the reshaping. + */ + rv = update_bitmap_size(mddev, newsize); + if (rv) + return rv; + + for (i = 0; i < mddev->bitmap_info.nodes; i++) { + if (i == md_cluster_ops->slot_number(mddev)) + continue; + + bitmap = get_bitmap_from_slot(mddev, i); + if (IS_ERR(bitmap)) { + pr_err("can't get bitmap from slot %d\n", i); + goto out; + } + counts = &bitmap->counts; + + /* + * If we can hold the bitmap lock of one node then + * the slot is not occupied, update the pages. + */ + snprintf(str, 64, "bitmap%04d", i); + bm_lockres = lockres_init(mddev, str, NULL, 1); + if (!bm_lockres) { + pr_err("Cannot initialize %s lock\n", str); + goto out; + } + bm_lockres->flags |= DLM_LKF_NOQUEUE; + rv = dlm_lock_sync(bm_lockres, DLM_LOCK_PW); + if (!rv) + counts->pages = my_pages; + lockres_free(bm_lockres); + + if (my_pages != counts->pages) + /* + * Let's revert the bitmap size if one node + * can't resize bitmap + */ + goto out; + } + + return 0; +out: + md_bitmap_free(bitmap); + update_bitmap_size(mddev, oldsize); + return -1; +} + /* * return 0 if all the bitmaps have the same sync_size */ @@ -1243,6 +1320,16 @@ static int resync_start(struct mddev *mddev) return dlm_lock_sync_interruptible(cinfo->resync_lockres, DLM_LOCK_EX, mddev); } +static void resync_info_get(struct mddev *mddev, sector_t *lo, sector_t *hi) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + + spin_lock_irq(&cinfo->suspend_lock); + *lo = cinfo->suspend_lo; + *hi = cinfo->suspend_hi; + spin_unlock_irq(&cinfo->suspend_lock); +} + static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi) { struct md_cluster_info *cinfo = mddev->cluster_info; @@ -1295,21 +1382,14 @@ static int area_resyncing(struct mddev *mddev, int direction, { struct md_cluster_info *cinfo = mddev->cluster_info; int ret = 0; - struct suspend_info *s; if ((direction == READ) && test_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state)) return 1; spin_lock_irq(&cinfo->suspend_lock); - if (list_empty(&cinfo->suspend_list)) - goto out; - list_for_each_entry(s, &cinfo->suspend_list, list) - if (hi > s->lo && lo < s->hi) { - ret = 1; - break; - } -out: + if (hi > cinfo->suspend_lo && lo < cinfo->suspend_hi) + ret = 1; spin_unlock_irq(&cinfo->suspend_lock); return ret; } @@ -1482,6 +1562,7 @@ static struct md_cluster_operations cluster_ops = { .resync_start = resync_start, .resync_finish = resync_finish, .resync_info_update = resync_info_update, + .resync_info_get = resync_info_get, .metadata_update_start = metadata_update_start, .metadata_update_finish = metadata_update_finish, .metadata_update_cancel = metadata_update_cancel, @@ -1492,6 +1573,7 @@ static struct md_cluster_operations cluster_ops = { .remove_disk = remove_disk, .load_bitmaps = load_bitmaps, .gather_bitmaps = gather_bitmaps, + .resize_bitmaps = resize_bitmaps, .lock_all_bitmaps = lock_all_bitmaps, .unlock_all_bitmaps = unlock_all_bitmaps, .update_size = update_size, diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h @@ -14,6 +14,7 @@ struct md_cluster_operations { int (*leave)(struct mddev *mddev); int (*slot_number)(struct mddev *mddev); int (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi); + void (*resync_info_get)(struct mddev *mddev, sector_t *lo, sector_t *hi); int (*metadata_update_start)(struct mddev *mddev); int (*metadata_update_finish)(struct mddev *mddev); void (*metadata_update_cancel)(struct mddev *mddev); @@ -26,6 +27,7 @@ struct md_cluster_operations { int (*remove_disk)(struct mddev *mddev, struct md_rdev *rdev); void (*load_bitmaps)(struct mddev *mddev, int total_slots); int (*gather_bitmaps)(struct md_rdev *rdev); + int (*resize_bitmaps)(struct mddev *mddev, sector_t newsize, sector_t oldsize); int (*lock_all_bitmaps)(struct mddev *mddev); void (*unlock_all_bitmaps)(struct mddev *mddev); void (*update_size)(struct mddev *mddev, sector_t old_dev_sectors); diff --git a/drivers/md/md.c b/drivers/md/md.c @@ -452,10 +452,11 @@ static void md_end_flush(struct bio *fbio) rdev_dec_pending(rdev, mddev); if (atomic_dec_and_test(&fi->flush_pending)) { - if (bio->bi_iter.bi_size == 0) + if (bio->bi_iter.bi_size == 0) { /* an empty barrier - all done */ bio_endio(bio); - else { + mempool_free(fi, mddev->flush_pool); + } else { INIT_WORK(&fi->flush_work, submit_flushes); queue_work(md_wq, &fi->flush_work); } @@ -509,10 +510,11 @@ void md_flush_request(struct mddev *mddev, struct bio *bio) rcu_read_unlock(); if (atomic_dec_and_test(&fi->flush_pending)) { - if (bio->bi_iter.bi_size == 0) + if (bio->bi_iter.bi_size == 0) { /* an empty barrier - all done */ bio_endio(bio); - else { + mempool_free(fi, mddev->flush_pool); + } else { INIT_WORK(&fi->flush_work, submit_flushes); queue_work(md_wq, &fi->flush_work); } @@ -5904,14 +5906,6 @@ static void __md_stop(struct mddev *mddev) mddev->to_remove = &md_redundancy_group; module_put(pers->owner); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); -} - -void md_stop(struct mddev *mddev) -{ - /* stop the array and free an attached data structures. - * This is called from dm-raid - */ - __md_stop(mddev); if (mddev->flush_bio_pool) { mempool_destroy(mddev->flush_bio_pool); mddev->flush_bio_pool = NULL; @@ -5920,6 +5914,14 @@ void md_stop(struct mddev *mddev) mempool_destroy(mddev->flush_pool); mddev->flush_pool = NULL; } +} + +void md_stop(struct mddev *mddev) +{ + /* stop the array and free an attached data structures. + * This is called from dm-raid + */ + __md_stop(mddev); bioset_exit(&mddev->bio_set); bioset_exit(&mddev->sync_set); } @@ -8370,9 +8372,17 @@ void md_do_sync(struct md_thread *thread) else if (!mddev->bitmap) j = mddev->recovery_cp; - } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { max_sectors = mddev->resync_max_sectors; - else { + /* + * If the original node aborts reshaping then we continue the + * reshaping, so set j again to avoid restart reshape from the + * first beginning + */ + if (mddev_is_clustered(mddev) && + mddev->reshape_position != MaxSector) + j = mddev->reshape_position; + } else { /* recovery follows the physical size of devices */ max_sectors = mddev->dev_sectors; j = MaxSector; @@ -8623,8 +8633,10 @@ void md_do_sync(struct md_thread *thread) mddev_lock_nointr(mddev); md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0)); mddev_unlock(mddev); - set_capacity(mddev->gendisk, mddev->array_sectors); - revalidate_disk(mddev->gendisk); + if (!mddev_is_clustered(mddev)) { + set_capacity(mddev->gendisk, mddev->array_sectors); + revalidate_disk(mddev->gendisk); + } } spin_lock(&mddev->lock); @@ -8790,6 +8802,18 @@ static void md_start_sync(struct work_struct *ws) */ void md_check_recovery(struct mddev *mddev) { + if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags) && mddev->sb_flags) { + /* Write superblock - thread that called mddev_suspend() + * holds reconfig_mutex for us. + */ + set_bit(MD_UPDATING_SB, &mddev->flags); + smp_mb__after_atomic(); + if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags)) + md_update_sb(mddev, 0); + clear_bit_unlock(MD_UPDATING_SB, &mddev->flags); + wake_up(&mddev->sb_wait); + } + if (mddev->suspended) return; @@ -8949,16 +8973,6 @@ void md_check_recovery(struct mddev *mddev) unlock: wake_up(&mddev->sb_wait); mddev_unlock(mddev); - } else if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags) && mddev->sb_flags) { - /* Write superblock - thread that called mddev_suspend() - * holds reconfig_mutex for us. - */ - set_bit(MD_UPDATING_SB, &mddev->flags); - smp_mb__after_atomic(); - if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags)) - md_update_sb(mddev, 0); - clear_bit_unlock(MD_UPDATING_SB, &mddev->flags); - wake_up(&mddev->sb_wait); } } EXPORT_SYMBOL(md_check_recovery); @@ -8966,6 +8980,8 @@ EXPORT_SYMBOL(md_check_recovery); void md_reap_sync_thread(struct mddev *mddev) { struct md_rdev *rdev; + sector_t old_dev_sectors = mddev->dev_sectors; + bool is_reshaped = false; /* resync has finished, collect result */ md_unregister_thread(&mddev->sync_thread); @@ -8980,8 +8996,11 @@ void md_reap_sync_thread(struct mddev *mddev) } } if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && - mddev->pers->finish_reshape) + mddev->pers->finish_reshape) { mddev->pers->finish_reshape(mddev); + if (mddev_is_clustered(mddev)) + is_reshaped = true; + } /* If array is no-longer degraded, then any saved_raid_disk * information must be scrapped. @@ -9002,6 +9021,14 @@ void md_reap_sync_thread(struct mddev *mddev) clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + /* + * We call md_cluster_ops->update_size here because sync_size could + * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared, + * so it is time to update size across cluster. + */ + if (mddev_is_clustered(mddev) && is_reshaped + && !test_bit(MD_CLOSING, &mddev->flags)) + md_cluster_ops->update_size(mddev, old_dev_sectors); wake_up(&resync_wait); /* flag recovery needed just to double check */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); @@ -9201,8 +9228,12 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) } if (role != rdev2->raid_disk) { - /* got activated */ - if (rdev2->raid_disk == -1 && role != 0xffff) { + /* + * got activated except reshape is happening. + */ + if (rdev2->raid_disk == -1 && role != 0xffff && + !(le32_to_cpu(sb->feature_map) & + MD_FEATURE_RESHAPE_ACTIVE)) { rdev2->saved_raid_disk = role; ret = remove_and_add_spares(mddev, rdev2); pr_info("Activated spare: %s\n", @@ -9228,6 +9259,30 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) if (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) update_raid_disks(mddev, le32_to_cpu(sb->raid_disks)); + /* + * Since mddev->delta_disks has already updated in update_raid_disks, + * so it is time to check reshape. + */ + if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) && + (le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { + /* + * reshape is happening in the remote node, we need to + * update reshape_position and call start_reshape. + */ + mddev->reshape_position = sb->reshape_position; + if (mddev->pers->update_reshape_pos) + mddev->pers->update_reshape_pos(mddev); + if (mddev->pers->start_reshape) + mddev->pers->start_reshape(mddev); + } else if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) && + mddev->reshape_position != MaxSector && + !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { + /* reshape is just done in another node. */ + mddev->reshape_position = MaxSector; + if (mddev->pers->update_reshape_pos) + mddev->pers->update_reshape_pos(mddev); + } + /* Finally set the event to be up to date */ mddev->events = le64_to_cpu(sb->events); } diff --git a/drivers/md/md.h b/drivers/md/md.h @@ -557,6 +557,7 @@ struct md_personality int (*check_reshape) (struct mddev *mddev); int (*start_reshape) (struct mddev *mddev); void (*finish_reshape) (struct mddev *mddev); + void (*update_reshape_pos) (struct mddev *mddev); /* quiesce suspends or resumes internal processing. * 1 - stop new actions and wait for action io to complete * 0 - return to normal behaviour diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c @@ -1734,6 +1734,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) */ if (rdev->saved_raid_disk >= 0 && rdev->saved_raid_disk >= first && + rdev->saved_raid_disk < conf->raid_disks && conf->mirrors[rdev->saved_raid_disk].rdev == NULL) first = last = rdev->saved_raid_disk; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c @@ -25,6 +25,7 @@ #include <linux/seq_file.h> #include <linux/ratelimit.h> #include <linux/kthread.h> +#include <linux/raid/md_p.h> #include <trace/events/block.h> #include "md.h" #include "raid10.h" @@ -1808,6 +1809,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) first = last = rdev->raid_disk; if (rdev->saved_raid_disk >= first && + rdev->saved_raid_disk < conf->geo.raid_disks && conf->mirrors[rdev->saved_raid_disk].rdev == NULL) mirror = rdev->saved_raid_disk; else @@ -3079,6 +3081,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, sector_t sect; int must_sync; int any_working; + int need_recover = 0; + int need_replace = 0; struct raid10_info *mirror = &conf->mirrors[i]; struct md_rdev *mrdev, *mreplace; @@ -3086,11 +3090,15 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, mrdev = rcu_dereference(mirror->rdev); mreplace = rcu_dereference(mirror->replacement); - if ((mrdev == NULL || - test_bit(Faulty, &mrdev->flags) || - test_bit(In_sync, &mrdev->flags)) && - (mreplace == NULL || - test_bit(Faulty, &mreplace->flags))) { + if (mrdev != NULL && + !test_bit(Faulty, &mrdev->flags) && + !test_bit(In_sync, &mrdev->flags)) + need_recover = 1; + if (mreplace != NULL && + !test_bit(Faulty, &mreplace->flags)) + need_replace = 1; + + if (!need_recover && !need_replace) { rcu_read_unlock(); continue; } @@ -3213,7 +3221,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, r10_bio->devs[1].devnum = i; r10_bio->devs[1].addr = to_addr; - if (!test_bit(In_sync, &mrdev->flags)) { + if (need_recover) { bio = r10_bio->devs[1].bio; bio->bi_next = biolist; biolist = bio; @@ -3230,16 +3238,11 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, bio = r10_bio->devs[1].repl_bio; if (bio) bio->bi_end_io = NULL; - /* Note: if mreplace != NULL, then bio + /* Note: if need_replace, then bio * cannot be NULL as r10buf_pool_alloc will * have allocated it. - * So the second test here is pointless. - * But it keeps semantic-checkers happy, and - * this comment keeps human reviewers - * happy. */ - if (mreplace == NULL || bio == NULL || - test_bit(Faulty, &mreplace->flags)) + if (!need_replace) break; bio->bi_next = biolist; biolist = bio; @@ -4286,12 +4289,46 @@ static int raid10_start_reshape(struct mddev *mddev) spin_unlock_irq(&conf->device_lock); if (mddev->delta_disks && mddev->bitmap) { - ret = md_bitmap_resize(mddev->bitmap, - raid10_size(mddev, 0, conf->geo.raid_disks), - 0, 0); + struct mdp_superblock_1 *sb = NULL; + sector_t oldsize, newsize; + + oldsize = raid10_size(mddev, 0, 0); + newsize = raid10_size(mddev, 0, conf->geo.raid_disks); + + if (!mddev_is_clustered(mddev)) { + ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0); + if (ret) + goto abort; + else + goto out; + } + + rdev_for_each(rdev, mddev) { + if (rdev->raid_disk > -1 && + !test_bit(Faulty, &rdev->flags)) + sb = page_address(rdev->sb_page); + } + + /* + * some node is already performing reshape, and no need to + * call md_bitmap_resize again since it should be called when + * receiving BITMAP_RESIZE msg + */ + if ((sb && (le32_to_cpu(sb->feature_map) & + MD_FEATURE_RESHAPE_ACTIVE)) || (oldsize == newsize)) + goto out; + + ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0); if (ret) goto abort; + + ret = md_cluster_ops->resize_bitmaps(mddev, newsize, oldsize); + if (ret) { + md_bitmap_resize(mddev->bitmap, oldsize, 0, 0); + goto abort; + } } +out: if (mddev->delta_disks > 0) { rdev_for_each(rdev, mddev) if (rdev->raid_disk < 0 && @@ -4568,6 +4605,32 @@ read_more: r10_bio->master_bio = read_bio; r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum; + /* + * Broadcast RESYNC message to other nodes, so all nodes would not + * write to the region to avoid conflict. + */ + if (mddev_is_clustered(mddev) && conf->cluster_sync_high <= sector_nr) { + struct mdp_superblock_1 *sb = NULL; + int sb_reshape_pos = 0; + + conf->cluster_sync_low = sector_nr; + conf->cluster_sync_high = sector_nr + CLUSTER_RESYNC_WINDOW_SECTORS; + sb = page_address(rdev->sb_page); + if (sb) { + sb_reshape_pos = le64_to_cpu(sb->reshape_position); + /* + * Set cluster_sync_low again if next address for array + * reshape is less than cluster_sync_low. Since we can't + * update cluster_sync_low until it has finished reshape. + */ + if (sb_reshape_pos < conf->cluster_sync_low) + conf->cluster_sync_low = sb_reshape_pos; + } + + md_cluster_ops->resync_info_update(mddev, conf->cluster_sync_low, + conf->cluster_sync_high); + } + /* Now find the locations in the new layout */ __raid10_find_phys(&conf->geo, r10_bio); @@ -4719,6 +4782,19 @@ static void end_reshape(struct r10conf *conf) conf->fullsync = 0; } +static void raid10_update_reshape_pos(struct mddev *mddev) +{ + struct r10conf *conf = mddev->private; + sector_t lo, hi; + + md_cluster_ops->resync_info_get(mddev, &lo, &hi); + if (((mddev->reshape_position <= hi) && (mddev->reshape_position >= lo)) + || mddev->reshape_position == MaxSector) + conf->reshape_progress = mddev->reshape_position; + else + WARN_ON_ONCE(1); +} + static int handle_reshape_read_error(struct mddev *mddev, struct r10bio *r10_bio) { @@ -4887,6 +4963,7 @@ static struct md_personality raid10_personality = .check_reshape = raid10_check_reshape, .start_reshape = raid10_start_reshape, .finish_reshape = raid10_finish_reshape, + .update_reshape_pos = raid10_update_reshape_pos, .congested = raid10_congested, }; diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c @@ -3151,8 +3151,6 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) set_bit(MD_HAS_JOURNAL, &conf->mddev->flags); return 0; - rcu_assign_pointer(conf->log, NULL); - md_unregister_thread(&log->reclaim_thread); reclaim_thread: mempool_exit(&log->meta_pool); out_mempool: diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c @@ -2681,6 +2681,18 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev) pr_debug("raid456: error called\n"); spin_lock_irqsave(&conf->device_lock, flags); + + if (test_bit(In_sync, &rdev->flags) && + mddev->degraded == conf->max_degraded) { + /* + * Don't allow to achieve failed state + * Don't try to recover this device + */ + conf->recovery_disabled = mddev->recovery_disabled; + spin_unlock_irqrestore(&conf->device_lock, flags); + return; + } + set_bit(Faulty, &rdev->flags); clear_bit(In_sync, &rdev->flags); mddev->degraded = raid5_calc_degraded(conf);