RAID5 IO處理之重構代碼詳解

1 作用
當陣列降級時,可以添加一塊新盤進行重構,以恢復陣列的冗余。
2 發起重構
可以通過以下命令創建md并發起重構:
mdadm -C /dev/md0 --force --run -l 5 -n 3 -c 128K /dev/sd[b-d] --assume-clean
mdadm --manage -f /dev/md0 /dev/sdb
mdadm --manage -a /dev/md0 /dev/sde
相關代碼邏輯如下:
2.1 設置磁盤異常函數調用關系:
md_ioctl() /* SET_DISK_FAULTY */
 \_ set_disk_faulty()
     \_ md_error()
         \_ error() /* raid5.c */
             \_ md_wakeup_thread() /* raid5d */
raid5d()
 \_ md_check_recovery()
     \_ remove_and_add_spares()
這里主要是設置成員磁盤異常的邏輯,代碼邏輯如下:
static void error(struct mddev *mddev, struct md_rdev *rdev){ spin_lock_irqsave(&conf->device_lock, flags); /* 清除成員磁盤“同步”狀態標記 */ clear_bit(In_sync, &rdev->flags); /* 重新計算md降級狀態 */ mddev->degraded = calc_degraded(conf); spin_unlock_irqrestore(&conf->device_lock, flags); /* 打斷同步 */ set_bit(MD_RECOVERY_INTR, &mddev->recovery); /* 設置成員磁盤為異常狀態 */ set_bit(Blocked, &rdev->flags); set_bit(Faulty, &rdev->flags); /* 設置md發生磁盤狀態改變 */ set_bit(MD_CHANGE_DEVS, &mddev->flags);}2.2 添加新盤函數調用關系:
md_ioctl() /* ADD_NEW_DISK */
 \_ add_new_disk()
     \_ md_wakeup_thread() /* raid5d */
raid5d()
 \_ md_check_recovery()
     \_ remove_and_add_spares()
     \_ md_register_thread() /* md_do_sync */
md_do_sync()
 \_ sync_request() /* raid5d */
這里需要注意,在加盤時沒有像前文replacement中描述的那樣設置磁盤為WantReplacement狀態,所以不會將新的磁盤賦值給舊盤的replacement指針。主要為設置重構相關標記,邏輯如下:
static int remove_and_add_spares(struct mddev *mddev,struct md_rdev *this){ int spares = 0; rdev_for_each(rdev, mddev) {/* 新添加的磁盤未設置相關標記自增spares */if (rdev->raid_disk >= 0 &&!test_bit(In_sync, &rdev->flags) &&!test_bit(Journal, &rdev->flags) &&!test_bit(Faulty, &rdev->flags))spares++; } return spares;}void md_check_recovery(struct mddev *mddev){ if (mddev_trylock(mddev)) {int spares = 0;/* 如上描述,在remove_and_add_spares返回spares為1 */if ((spares = remove_and_add_spares(mddev, NULL))) {/* 設置md為重構狀態 */clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);}if (mddev->pers->sync_request) {if (spares) {/* We are adding a device or devices to an array* which has the bitmap stored on all devices.* So make sure all bitmap pages get written*/bitmap_write_all(mddev->bitmap);}/* 創建重構線程 */mddev->sync_thread = md_register_thread(md_do_sync,mddev,"resync");/* 創建失敗則清除相關標記 */if (!mddev->sync_thread) {printk(KERN_ERR "%s: could not start resync"" thread...\n",mdname(mddev));/* leave the spares where they are, it shouldn't hurt */clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);} else {/* 成功則喚醒線程開始執行 */md_wakeup_thread(mddev->sync_thread);}}mddev_unlock(mddev); }}static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster){ /* 獲取一個空閑條帶 */ sh = get_active_stripe(conf, sector_nr, 0, 1, 0); if (sh == NULL) {sh = get_active_stripe(conf, sector_nr, 0, 0, 0);schedule_timeout_uninterruptible(1); } /* 設置同步標記 */ set_bit(STRIPE_SYNC_REQUESTED, &sh->state); /* 將條帶推入條帶狀態機處理 */ handle_stripe(sh); release_stripe(sh); return STRIPE_SECTORS;}3 條帶處理我們依舊通過分析條帶各輪次的處理來解析重構過程中代碼執行流程及IO發生的情況 。
3.1 下發讀請求函數調用關系:
handle_stripe()
 \_ analyse_stripe()
 \_ handle_stripe_fill()
     \_ fetch_block()
 \_ ops_run_io()
代碼邏輯如下:
static void handle_stripe(struct stripe_head *sh){ /* 在sync_request中設置了該標記 */ if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {spin_lock(&sh->stripe_lock);/* 此時條帶不是處理DISCARD請求 */if (!test_bit(STRIPE_DISCARD, &sh->state)/* 清掉STRIPE_SYNC_REQUESTED標記 */&& test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {/* 設置條帶同步中標記 */set_bit(STRIPE_SYNCING, &sh->state);/* 清除條帶一致狀態的標記 */clear_bit(STRIPE_INSYNC, &sh->state);}spin_unlock(&sh->stripe_lock); } clear_bit(STRIPE_DELAYED, &sh->state); /* 解析條帶狀態 */ analyse_stripe(sh, &s); /* s.syncing為真且第一輪條帶處理時s.uptodate + s.compute等于0條件滿足進入handle_stripe_fill */ if (s.to_read || s.non_overwrite|| (conf->level == 6 && s.to_write && s.failed)|| (s.syncing && (s.uptodate + s.compute < disks))|| s.replacing|| s.expanding)handle_stripe_fill(sh, &s, disks); /* 此時 s.locked == 0 條件不成立不會進入該if分支 */ if ((s.syncing || s.replacing) && s.locked == 0&& test_bit(STRIPE_INSYNC, &sh->state)) {md_done_sync(conf->mddev, STRIPE_SECTORS, 1);clear_bit(STRIPE_SYNCING, &sh->state);if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))wake_up(&conf->wait_for_overlap); } /* 下發讀請求 */ ops_run_io(sh, &s);}static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s){ int do_recovery = 0; /* 遍歷所有條帶/設備 */ rcu_read_lock(); for (i=disks; i--; ) {/* 新加入的成員磁盤重構完成之前不處于同步狀態 , 滿足if條件 */if (!test_bit(R5_Insync, &dev->flags)) {/* 加上raid6在內最大支持壞2塊磁盤 */if (s->failed < 2)s->failed_num[s->failed] = i;/* 自增failed */s->failed++;/* rdev指向新盤且新盤不是Faulty狀態(舊盤是),滿足if條件設置do_recovery */if (rdev && !test_bit(Faulty, &rdev->flags))do_recovery = 1;} } /* 在handle_stripe中設置了該標記 */ if (test_bit(STRIPE_SYNCING, &sh->state)) {/* do_recovery條件滿足,設置 s->syncing = 1 表明條帶在做重構 */if (do_recovery|| sh->sector >= conf->mddev->recovery_cp|| test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery)))s->syncing = 1;elses->replacing = 1; } rcu_read_unlock();}static void handle_stripe_fill(struct stripe_head *sh,struct stripe_head_state *s,int disks){ int i; /* 未設置條帶狀態進入fetch_block 
*/ if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state&& !sh->reconstruct_state)for (i = disks; i--; )if (fetch_block(sh, s, i, disks))break; set_bit(STRIPE_HANDLE, &sh->state);}static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,int disk_idx, int disks){ struct r5dev *dev = &sh->dev[disk_idx]; struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],&sh->dev[s->failed_num[1]] }; /* 此時所有條帶/設備都未發起請求且未包含最新數據 */ /* 滿足s->syncing條件進入第一層if */ if (!test_bit(R5_LOCKED, &dev->flags)&& !test_bit(R5_UPTODATE, &dev->flags)&& (dev->toread|| (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags))|| s->syncing || s->expanding|| (s->replacing && want_replace(sh, disk_idx))|| (s->failed >= 1 && fdev[0]->toread)|| (s->failed >= 2 && fdev[1]->toread)|| (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite&& !test_bit(R5_OVERWRITE, &fdev[0]->flags))|| (sh->raid_conf->level == 6 && s->failed && s->to_write))) {/* we would like to get this block, possibly by computing it,* otherwise read it if the backing disk is insync*/BUG_ON(test_bit(R5_Wantcompute, &dev->flags));BUG_ON(test_bit(R5_Wantread, &dev->flags));/** 對所有正??勺x的成員磁盤下發讀請求* 需要注意的是,如果是raid5,因為只有一個冗余,因此重構是需要向所有其他磁盤下發讀的* 但是如果是raid6,因為有兩個冗余,在只有一個成員磁盤異常的情況下* 可以少讀一塊盤 , 但是實際沒有這么做還是都讀了,在后續處理中會用* 計算出來的值和讀出來的值進行比較如果不相等則重新寫一次進行修復*/if (test_bit(R5_Insync, &dev->flags)) {set_bit(R5_LOCKED, &dev->flags);set_bit(R5_Wantread, &dev->flags);/* 自增locked計數 */s->locked++;} } return 0;}static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s){ /* 遍歷所有條帶/設備 */ for (i = disks; i--; ) {/* 對設置了讀標記的下發讀請求 */if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))rw = READ;/* 跳過其他不需要讀的設備 */elsecontinue;if (rdev) {bio_reset(bi);bi->bi_bdev = rdev->bdev;bi->bi_rw = rw;bi->bi_end_io = raid5_end_read_request;bi->bi_private = sh;atomic_inc(&sh->count);if (use_new_offset(conf, sh))bi->bi_sector = (sh->sector + rdev->new_data_offset);elsebi->bi_sector = (sh->sector + rdev->data_offset);if 
(test_bit(R5_ReadNoMerge, &sh->dev[i].flags))bi->bi_rw |= REQ_FLUSH;bi->bi_vcnt = 1;bi->bi_io_vec[0].bv_len = STRIPE_SIZE;bi->bi_io_vec[0].bv_offset = 0;bi->bi_size = STRIPE_SIZE;/* 提交bio */generic_make_request(bi);} }}

推薦閱讀