RAID5 IO處理之條帶讀代碼詳解

除了對齊讀流程中讀失敗通過條帶重試的場景會進入到條帶讀,當IO覆蓋范圍超過一個chunk時也會進入條帶讀(如向chunk為4K的RAID下發起始位置為1K大小為4K的IO),接下來我們就這部分邏輯進行分析。
1 IO加入鏈表
首先 bio 通過 add_stripe_bio() 函數被掛載到條帶頭指向成員磁盤設備的 toread 上,代碼如下所示:
【RAID5 IO處理之條帶讀代碼詳解】
/*
 * Queue a bio on a stripe member device's pending-read list.
 * (Excerpt: only the read-request handling logic is kept; write handling
 * from the original kernel function is elided.)
 *
 * Returns 1 when the bio was queued, 0 when it overlaps an already-queued
 * bio and the caller must retry after that bio completes.
 */
static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
{
	struct bio **bip;
	struct r5conf *conf = sh->raid_conf;

	spin_lock_irq(&sh->stripe_lock);
	/*
	 * Get the address of this dev's toread list head.
	 * Bios are kept sorted by start sector when inserted, and a
	 * pointer-to-pointer makes the ordered insertion below simple.
	 */
	bip = &sh->dev[dd_idx].toread;
	/*
	 * Walk the currently queued read bios and check whether the new
	 * bio's sector range overlaps an existing one. If so, jump to
	 * the overlap label, set the flag and return 0: the caller must
	 * wait until the overlapping bio has completed before retrying.
	 */
	while (*bip && (*bip)->bi_sector < bi->bi_sector) {
		if (bio_end_sector(*bip) > bi->bi_sector)
			goto overlap;
		bip = & (*bip)->bi_next;
	}
	if (*bip && (*bip)->bi_sector < bio_end_sector(bi))
		goto overlap;
	BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
	/* Insert the bio, ordered by start sector, into the toread list to await processing. */
	if (*bip)
		bi->bi_next = *bip;
	*bip = bi;
	/* Bump the bio's bi_phys_segments active-stripe count. */
	raid5_inc_bi_active_stripes(bi);
	spin_unlock_irq(&sh->stripe_lock);
	return 1;
overlap:
	set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
	spin_unlock_irq(&sh->stripe_lock);
	return 0;
}
2 條帶處理
條帶處理的函數入口為 handle_active_stripes(),代碼如下所示:
#define MAX_STRIPE_BATCH 8static int handle_active_stripes(struct r5conf *conf){ struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; int i, batch_size = 0; while (batch_size < MAX_STRIPE_BATCH &&/* 根據優先級獲取一個待處理條帶 */(sh = __get_priority_stripe(conf)) != NULL)batch[batch_size++] = sh; if (batch_size == 0)return batch_size; spin_unlock_irq(&conf->device_lock); /* 調用handle_stripe函數處理條帶 */ for (i = 0; i < batch_size; i++)handle_stripe(batch[i]); cond_resched(); spin_lock_irq(&conf->device_lock); for (i = 0; i < batch_size; i++)__release_stripe(conf, batch[i]); return batch_size;}handle_stripe() 是條帶處理的主要函數 。一個條帶從開始到結束需要調用幾次 handle_stripe() 及相關函數 。本文討論如下四種場景:

  • 讀成功
  • IO所在磁盤異常
  • 讀IO報錯
  • 陣列超冗余
接下來根據不同場景下每輪處理的內容進行代碼分析 , 貼出的代碼只包含當前處理的相關內容 。
2.1 讀成功
正常的條帶讀會經過以下三輪的條帶處理,讀取成功后將數據返回給調用者。
2.1.1 下發讀請求
函數調用關係如下:
handle_stripe()
 \_ analyse_stripe()
 \_ handle_stripe_fill()
    \_ fetch_block()
 \_ ops_run_io()
各函數執行的代碼邏輯如下:
/*
 * Main stripe state machine (excerpt: local declarations such as
 * `s`, `conf` and `disks` are elided in this article's listing).
 */
static void handle_stripe(struct stripe_head *sh)
{
	/* Analyse the stripe state via analyse_stripe(). */
	analyse_stripe(sh, &s);
	/* s.to_read is non-zero here, so we enter handle_stripe_fill(). */
	if (s.to_read || s.non_overwrite
	    || (conf->level == 6 && s.to_write && s.failed)
	    || (s.syncing && (s.uptodate + s.compute < disks))
	    || s.replacing
	    || s.expanding)
		handle_stripe_fill(sh, &s, disks);
	/* Let ops_run_io() check whether any request needs to be issued. */
	ops_run_io(sh, &s);
}
/*
 * Gather per-device state for the stripe (excerpt: loop locals such as
 * `i`, `dev` and `rdev` are elided in this article's listing).
 */
static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
{
	rcu_read_lock();
	for (i = disks; i--; ) {
		/* Count pending read requests. */
		if (dev->toread)
			s->to_read++;
		/* The stripe/device state is normal (member disk in sync). */
		if (test_bit(In_sync, &rdev->flags))
			set_bit(R5_Insync, &dev->flags);
	}
	rcu_read_unlock();
}
/*
 * Decide, per device, whether a block must be read (or computed) by
 * calling fetch_block() for each disk in the stripe.
 */
static void handle_stripe_fill(struct stripe_head *sh,
			       struct stripe_head_state *s,
			       int disks)
{
	int i;

	/* No compute/check/reconstruct in progress on this stripe, so the condition holds and we enter the if. */
	if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
	    !sh->reconstruct_state)
		for (i = disks; i--; )
			if (fetch_block(sh, s, i, disks))
				break;
	set_bit(STRIPE_HANDLE, &sh->state);
}
/*
 * Mark a device for reading when it has pending bios and holds no
 * up-to-date data yet.
 */
static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
		       int disk_idx, int disks)
{
	struct r5dev *dev = &sh->dev[disk_idx];
	struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],
				  &sh->dev[s->failed_num[1]] };

	/* No IO has been issued on this dev yet, so neither R5_LOCKED nor R5_UPTODATE is set. */
	if (!test_bit(R5_LOCKED, &dev->flags) &&
	    !test_bit(R5_UPTODATE, &dev->flags) &&
	    /* dev->toread is non-NULL, so the outermost if is entered. */
	    dev->toread) {
		/* R5_Insync was set in analyse_stripe(). */
		if (test_bit(R5_Insync, &dev->flags)) {
			/* Set R5_LOCKED: the corresponding disk has IO in progress. */
			set_bit(R5_LOCKED, &dev->flags);
			/* Set R5_Wantread: a read request must be issued. */
			set_bit(R5_Wantread, &dev->flags);
			/* Count devices with IO operations in flight. */
			s->locked++;
		}
	}
	return 0;
}
/*
 * Issue the actual device IO for each flagged device (excerpt: locals
 * such as `bi`, `rbi`, `rw`, `rdev`, `rrdev` and `replace_only` are
 * elided in this article's listing).
 */
static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
{
	struct r5conf *conf = sh->raid_conf;
	int i, disks = sh->disks;

	might_sleep();

	for (i = disks; i--; ) {
		bi = &sh->dev[i].req;
		rbi = &sh->dev[i].rreq; /* For writing to replacement */
		rcu_read_lock();
		rrdev = rcu_dereference(conf->disks[i].replacement);
		smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */
		rdev = rcu_dereference(conf->disks[i].rdev);
		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
			if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
				rw = WRITE_FUA;
			else
				rw = WRITE;
			if (test_bit(R5_Discard, &sh->dev[i].flags))
				rw |= REQ_DISCARD;
		} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
			/* The request type is set to read. */
			rw = READ;
		else if (test_and_clear_bit(R5_WantReplace,
					    &sh->dev[i].flags)) {
			rw = WRITE;
			replace_only = 1;
		} else
			/* Nothing to do for this device; skip it. */
			continue;
		if (rdev) {
			set_bit(STRIPE_IO_STARTED, &sh->state);
			/*
			 * Set up the bio parameters: re-point the bio at the
			 * member block device, set the start sector and the
			 * IO-completion callback.
			 */
			bio_reset(bi);
			bi->bi_bdev = rdev->bdev;
			bi->bi_rw = rw;
			bi->bi_end_io = (rw & WRITE)
				? raid5_end_write_request
				: raid5_end_read_request;
			bi->bi_private = sh;
			atomic_inc(&sh->count);
			if (use_new_offset(conf, sh))
				bi->bi_sector = (sh->sector + rdev->new_data_offset);
			else
				bi->bi_sector = (sh->sector + rdev->data_offset);
			if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
				bi->bi_rw |= REQ_FLUSH;
			bi->bi_vcnt = 1;
			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
			bi->bi_io_vec[0].bv_offset = 0;
			bi->bi_size = STRIPE_SIZE;
			if (rrdev)
				set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
			/* Submit the request to the lower block device via generic_make_request(). */
			generic_make_request(bi);
		}
	}
}

推薦閱讀