1 /*
2  * raid5.c : Multiple Devices driver for Linux
3  *         Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
4  *         Copyright (C) 1999, 2000 Ingo Molnar
5  *         Copyright (C) 2002, 2003 H. Peter Anvin
6  *
7  * RAID-4/5/6 management functions.
8  * Thanks to Penguin Computing for making the RAID-6 development possible
9  * by donating a test server!
10  *
11  * This program is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU General Public License as published by
13  * the Free Software Foundation; either version 2, or (at your option)
14  * any later version.
15  *
16  * You should have received a copy of the GNU General Public License
17  * (for example /usr/src/linux/COPYING); if not, write to the Free
18  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19  */
20
21 /*
22  * BITMAP UNPLUGGING:
23  *
24  * The sequencing for updating the bitmap reliably is a little
25  * subtle (and I got it wrong the first time) so it deserves some
26  * explanation.
27  *
28  * We group bitmap updates into batches.  Each batch has a number.
29  * We may write out several batches at once, but that isn't very important.
30  * conf->seq_write is the number of the last batch successfully written.
31  * conf->seq_flush is the number of the last batch that was closed to
32  *    new additions.
33  * When we discover that we will need to write to any block in a stripe
34  * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
35  * the number of the batch it will be in. This is seq_flush+1.
36  * When we are ready to do a write, if that batch hasn't been written yet,
37  *   we plug the array and queue the stripe for later.
38  * When an unplug happens, we increment seq_flush, thus closing the current
39  *   batch.
40  * When we notice that seq_flush > seq_write, we write out all pending updates
41  * to the bitmap, and advance seq_write to where seq_flush was.
42  * This may occasionally write a bit out twice, but is sure never to
43  * miss any bits.
44  */
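/*
 * A simplified sketch of the flush step this scheme leads to (the real code
 * lives in raid5d() later in this file and also takes device_lock around the
 * update; treat this as an illustration, not the exact implementation):
 *
 *	if (conf->seq_flush != conf->seq_write) {
 *		int seq = conf->seq_flush;
 *		bitmap_unplug(mddev->bitmap);	(write out pending bitmap bits)
 *		conf->seq_write = seq;		(advance to the closed batch)
 *		activate_bit_delay(conf, conf->temp_inactive_list);
 *	}
 */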
45
46 #include <linux/blkdev.h>
47 #include <linux/kthread.h>
48 #include <linux/raid/pq.h>
49 #include <linux/async_tx.h>
50 #include <linux/module.h>
51 #include <linux/async.h>
52 #include <linux/seq_file.h>
53 #include <linux/cpu.h>
54 #include <linux/slab.h>
55 #include <linux/ratelimit.h>
56 #include <linux/nodemask.h>
57 #include <linux/flex_array.h>
58 #include <linux/sched/signal.h>
59
60 #include <trace/events/block.h>
61 #include <linux/list_sort.h>
62
63 #include "md.h"
64 #include "raid5.h"
65 #include "raid0.h"
66 #include "bitmap.h"
67 #include "raid5-log.h"
68
69 #define UNSUPPORTED_MDDEV_FLAGS (1L << MD_FAILFAST_SUPPORTED)
70
71 #define cpu_to_group(cpu) cpu_to_node(cpu)
72 #define ANY_GROUP NUMA_NO_NODE
73
74 static bool devices_handle_discard_safely = false;
75 module_param(devices_handle_discard_safely, bool, 0644);
76 MODULE_PARM_DESC(devices_handle_discard_safely,
77                  "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
78 static struct workqueue_struct *raid5_wq;
79
80 static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
81 {
82         int hash = (sect >> STRIPE_SHIFT) & HASH_MASK;
83         return &conf->stripe_hashtbl[hash];
84 }
85
86 static inline int stripe_hash_locks_hash(sector_t sect)
87 {
88         return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK;
89 }
90
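/*
 * Lock ordering used by the helpers below: a stripe hash lock is always taken
 * before conf->device_lock, and the two are released in the reverse order.
 * lock_all_device_hash_locks_irq() nests the per-hash locks in index order
 * (spin_lock_nest_lock) so lockdep can validate the scheme.
 */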
91 static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
92 {
93         spin_lock_irq(conf->hash_locks + hash);
94         spin_lock(&conf->device_lock);
95 }
96
97 static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
98 {
99         spin_unlock(&conf->device_lock);
100         spin_unlock_irq(conf->hash_locks + hash);
101 }
102
103 static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
104 {
105         int i;
106         spin_lock_irq(conf->hash_locks);
107         for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
108                 spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
109         spin_lock(&conf->device_lock);
110 }
111
112 static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
113 {
114         int i;
115         spin_unlock(&conf->device_lock);
116         for (i = NR_STRIPE_HASH_LOCKS - 1; i; i--)
117                 spin_unlock(conf->hash_locks + i);
118         spin_unlock_irq(conf->hash_locks);
119 }
120
121 /* Find first data disk in a raid6 stripe */
122 static inline int raid6_d0(struct stripe_head *sh)
123 {
124         if (sh->ddf_layout)
125                 /* ddf always starts from the first device */
126                 return 0;
127         /* md starts just after Q block */
128         if (sh->qd_idx == sh->disks - 1)
129                 return 0;
130         else
131                 return sh->qd_idx + 1;
132 }
133 static inline int raid6_next_disk(int disk, int raid_disks)
134 {
135         disk++;
136         return (disk < raid_disks) ? disk : 0;
137 }
138
139 /* When walking through the disks in a raid6 stripe, starting at raid6_d0,
140  * we need to map each disk to a 'slot', where the data disks are slots
141  * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
142  * is raid_disks-1.  This helper does that mapping.
143  */
144 static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
145                              int *count, int syndrome_disks)
146 {
147         int slot = *count;
148
149         if (sh->ddf_layout)
150                 (*count)++;
151         if (idx == sh->pd_idx)
152                 return syndrome_disks;
153         if (idx == sh->qd_idx)
154                 return syndrome_disks + 1;
155         if (!sh->ddf_layout)
156                 (*count)++;
157         return slot;
158 }
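/*
 * For example, with a 5-device md-layout (non-DDF) RAID-6 stripe where
 * pd_idx == 3 and qd_idx == 4, syndrome_disks is 3: walking from raid6_d0()
 * the data devices 0, 1 and 2 land in slots 0, 1 and 2, the parity device
 * maps to slot 3 (syndrome_disks) and the Q device to slot 4
 * (syndrome_disks + 1).
 */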
159
160 static void print_raid5_conf (struct r5conf *conf);
161
162 static int stripe_operations_active(struct stripe_head *sh)
163 {
164         return sh->check_state || sh->reconstruct_state ||
165                test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
166                test_bit(STRIPE_COMPUTE_RUN, &sh->state);
167 }
168
169 static bool stripe_is_lowprio(struct stripe_head *sh)
170 {
171         return (test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) ||
172                 test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) &&
173                !test_bit(STRIPE_R5C_CACHING, &sh->state);
174 }
175
176 static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
177 {
178         struct r5conf *conf = sh->raid_conf;
179         struct r5worker_group *group;
180         int thread_cnt;
181         int i, cpu = sh->cpu;
182
183         if (!cpu_online(cpu)) {
184                 cpu = cpumask_any(cpu_online_mask);
185                 sh->cpu = cpu;
186         }
187
188         if (list_empty(&sh->lru)) {
189                 struct r5worker_group *group;
190                 group = conf->worker_groups + cpu_to_group(cpu);
191                 if (stripe_is_lowprio(sh))
192                         list_add_tail(&sh->lru, &group->loprio_list);
193                 else
194                         list_add_tail(&sh->lru, &group->handle_list);
195                 group->stripes_cnt++;
196                 sh->group = group;
197         }
198
199         if (conf->worker_cnt_per_group == 0) {
200                 md_wakeup_thread(conf->mddev->thread);
201                 return;
202         }
203
204         group = conf->worker_groups + cpu_to_group(sh->cpu);
205
206         group->workers[0].working = true;
207         /* at least one worker should run to avoid race */
208         queue_work_on(sh->cpu, raid5_wq, &group->workers[0].work);
209
210         thread_cnt = group->stripes_cnt / MAX_STRIPE_BATCH - 1;
211         /* wakeup more workers */
212         for (i = 1; i < conf->worker_cnt_per_group && thread_cnt > 0; i++) {
213                 if (group->workers[i].working == false) {
214                         group->workers[i].working = true;
215                         queue_work_on(sh->cpu, raid5_wq,
216                                       &group->workers[i].work);
217                         thread_cnt--;
218                 }
219         }
220 }
221
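/*
 * Drop the last reference to @sh.  Called with conf->device_lock held; decides
 * where the stripe goes next: the delayed or bitmap lists if handling must
 * wait, the handle/loprio lists (or a worker group) if it can be handled now,
 * the r5c cached-stripe lists if write-back caching still holds its data in
 * the journal, or *temp_inactive_list once it is truly idle.
 */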
222 static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
223                               struct list_head *temp_inactive_list)
224 {
225         int i;
226         int injournal = 0;      /* number of data pages with R5_InJournal */
227
228         BUG_ON(!list_empty(&sh->lru));
229         BUG_ON(atomic_read(&conf->active_stripes)==0);
230
231         if (r5c_is_writeback(conf->log))
232                 for (i = sh->disks; i--; )
233                         if (test_bit(R5_InJournal, &sh->dev[i].flags))
234                                 injournal++;
235         /*
236          * In the following cases, the stripe cannot be released to cached
237          * lists. Therefore, we make the stripe write out and set
238          * STRIPE_HANDLE:
239          *   1. when quiesce is in progress in r5c write-back mode;
240          *   2. when resync is requested for the stripe.
241          */
242         if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) ||
243             (conf->quiesce && r5c_is_writeback(conf->log) &&
244              !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0)) {
245                 if (test_bit(STRIPE_R5C_CACHING, &sh->state))
246                         r5c_make_stripe_write_out(sh);
247                 set_bit(STRIPE_HANDLE, &sh->state);
248         }
249
250         if (test_bit(STRIPE_HANDLE, &sh->state)) {
251                 if (test_bit(STRIPE_DELAYED, &sh->state) &&
252                     !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
253                         list_add_tail(&sh->lru, &conf->delayed_list);
254                 else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
255                            sh->bm_seq - conf->seq_write > 0)
256                         list_add_tail(&sh->lru, &conf->bitmap_list);
257                 else {
258                         clear_bit(STRIPE_DELAYED, &sh->state);
259                         clear_bit(STRIPE_BIT_DELAY, &sh->state);
260                         if (conf->worker_cnt_per_group == 0) {
261                                 if (stripe_is_lowprio(sh))
262                                         list_add_tail(&sh->lru,
263                                                         &conf->loprio_list);
264                                 else
265                                         list_add_tail(&sh->lru,
266                                                         &conf->handle_list);
267                         } else {
268                                 raid5_wakeup_stripe_thread(sh);
269                                 return;
270                         }
271                 }
272                 md_wakeup_thread(conf->mddev->thread);
273         } else {
274                 BUG_ON(stripe_operations_active(sh));
275                 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
276                         if (atomic_dec_return(&conf->preread_active_stripes)
277                             < IO_THRESHOLD)
278                                 md_wakeup_thread(conf->mddev->thread);
279                 atomic_dec(&conf->active_stripes);
280                 if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
281                         if (!r5c_is_writeback(conf->log))
282                                 list_add_tail(&sh->lru, temp_inactive_list);
283                         else {
284                                 WARN_ON(test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags));
285                                 if (injournal == 0)
286                                         list_add_tail(&sh->lru, temp_inactive_list);
287                                 else if (injournal == conf->raid_disks - conf->max_degraded) {
288                                         /* full stripe */
289                                         if (!test_and_set_bit(STRIPE_R5C_FULL_STRIPE, &sh->state))
290                                                 atomic_inc(&conf->r5c_cached_full_stripes);
291                                         if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state))
292                                                 atomic_dec(&conf->r5c_cached_partial_stripes);
293                                         list_add_tail(&sh->lru, &conf->r5c_full_stripe_list);
294                                         r5c_check_cached_full_stripe(conf);
295                                 } else
296                                         /*
297                                          * STRIPE_R5C_PARTIAL_STRIPE is set in
298                                          * r5c_try_caching_write(). No need to
299                                          * set it again.
300                                          */
301                                         list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list);
302                         }
303                 }
304         }
305 }
306
307 static void __release_stripe(struct r5conf *conf, struct stripe_head *sh,
308                              struct list_head *temp_inactive_list)
309 {
310         if (atomic_dec_and_test(&sh->count))
311                 do_release_stripe(conf, sh, temp_inactive_list);
312 }
313
314 /*
315  * @hash can be NR_STRIPE_HASH_LOCKS, in which case temp_inactive_list is an
316  * array with one list per hash lock.
317  * Be careful: only one task can add/delete stripes from temp_inactive_list at
318  * a given time. Adding stripes only takes the device lock, while deleting
319  * stripes only takes the hash lock.
320  */
321 static void release_inactive_stripe_list(struct r5conf *conf,
322                                          struct list_head *temp_inactive_list,
323                                          int hash)
324 {
325         int size;
326         bool do_wakeup = false;
327         unsigned long flags;
328
329         if (hash == NR_STRIPE_HASH_LOCKS) {
330                 size = NR_STRIPE_HASH_LOCKS;
331                 hash = NR_STRIPE_HASH_LOCKS - 1;
332         } else
333                 size = 1;
334         while (size) {
335                 struct list_head *list = &temp_inactive_list[size - 1];
336
337                 /*
338                  * We don't hold any lock here yet, so raid5_get_active_stripe()
339                  * might remove stripes from the list.
340                  */
341                 if (!list_empty_careful(list)) {
342                         spin_lock_irqsave(conf->hash_locks + hash, flags);
343                         if (list_empty(conf->inactive_list + hash) &&
344                             !list_empty(list))
345                                 atomic_dec(&conf->empty_inactive_list_nr);
346                         list_splice_tail_init(list, conf->inactive_list + hash);
347                         do_wakeup = true;
348                         spin_unlock_irqrestore(conf->hash_locks + hash, flags);
349                 }
350                 size--;
351                 hash--;
352         }
353
354         if (do_wakeup) {
355                 wake_up(&conf->wait_for_stripe);
356                 if (atomic_read(&conf->active_stripes) == 0)
357                         wake_up(&conf->wait_for_quiescent);
358                 if (conf->retry_read_aligned)
359                         md_wakeup_thread(conf->mddev->thread);
360         }
361 }
362
363 /* should hold conf->device_lock already */
364 static int release_stripe_list(struct r5conf *conf,
365                                struct list_head *temp_inactive_list)
366 {
367         struct stripe_head *sh, *t;
368         int count = 0;
369         struct llist_node *head;
370
371         head = llist_del_all(&conf->released_stripes);
372         head = llist_reverse_order(head);
373         llist_for_each_entry_safe(sh, t, head, release_list) {
374                 int hash;
375
376                 /* sh could be re-added after STRIPE_ON_RELEASE_LIST is cleared */
377                 smp_mb();
378                 clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state);
379                 /*
380                  * Don't worry if the bit is set again here: when it is, the
381                  * stripe's count is always > 1. The same holds for the
382                  * STRIPE_ON_UNPLUG_LIST bit.
383                  */
384                 hash = sh->hash_lock_index;
385                 __release_stripe(conf, sh, &temp_inactive_list[hash]);
386                 count++;
387         }
388
389         return count;
390 }
391
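/*
 * Drop a reference to @sh.  In the common case the stripe is pushed onto the
 * lockless conf->released_stripes llist and the md thread is woken to do the
 * actual release in a batch; the slow path (no md thread, or the stripe is
 * already on the release list) releases it directly under device_lock.
 */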
392 void raid5_release_stripe(struct stripe_head *sh)
393 {
394         struct r5conf *conf = sh->raid_conf;
395         unsigned long flags;
396         struct list_head list;
397         int hash;
398         bool wakeup;
399
400         /* Avoid release_list until the last reference.
401          */
402         if (atomic_add_unless(&sh->count, -1, 1))
403                 return;
404
405         if (unlikely(!conf->mddev->thread) ||
406                 test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state))
407                 goto slow_path;
408         wakeup = llist_add(&sh->release_list, &conf->released_stripes);
409         if (wakeup)
410                 md_wakeup_thread(conf->mddev->thread);
411         return;
412 slow_path:
413         local_irq_save(flags);
414         /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */
415         if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
416                 INIT_LIST_HEAD(&list);
417                 hash = sh->hash_lock_index;
418                 do_release_stripe(conf, sh, &list);
419                 spin_unlock(&conf->device_lock);
420                 release_inactive_stripe_list(conf, &list, hash);
421         }
422         local_irq_restore(flags);
423 }
424
425 static inline void remove_hash(struct stripe_head *sh)
426 {
427         pr_debug("remove_hash(), stripe %llu\n",
428                 (unsigned long long)sh->sector);
429
430         hlist_del_init(&sh->hash);
431 }
432
433 static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh)
434 {
435         struct hlist_head *hp = stripe_hash(conf, sh->sector);
436
437         pr_debug("insert_hash(), stripe %llu\n",
438                 (unsigned long long)sh->sector);
439
440         hlist_add_head(&sh->hash, hp);
441 }
442
443 /* find an idle stripe, make sure it is unhashed, and return it. */
444 static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash)
445 {
446         struct stripe_head *sh = NULL;
447         struct list_head *first;
448
449         if (list_empty(conf->inactive_list + hash))
450                 goto out;
451         first = (conf->inactive_list + hash)->next;
452         sh = list_entry(first, struct stripe_head, lru);
453         list_del_init(first);
454         remove_hash(sh);
455         atomic_inc(&conf->active_stripes);
456         BUG_ON(hash != sh->hash_lock_index);
457         if (list_empty(conf->inactive_list + hash))
458                 atomic_inc(&conf->empty_inactive_list_nr);
459 out:
460         return sh;
461 }
462
463 static void shrink_buffers(struct stripe_head *sh)
464 {
465         struct page *p;
466         int i;
467         int num = sh->raid_conf->pool_size;
468
469         for (i = 0; i < num ; i++) {
470                 WARN_ON(sh->dev[i].page != sh->dev[i].orig_page);
471                 p = sh->dev[i].page;
472                 if (!p)
473                         continue;
474                 sh->dev[i].page = NULL;
475                 put_page(p);
476         }
477 }
478
479 static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
480 {
481         int i;
482         int num = sh->raid_conf->pool_size;
483
484         for (i = 0; i < num; i++) {
485                 struct page *page;
486
487                 if (!(page = alloc_page(gfp))) {
488                         return 1;
489                 }
490                 sh->dev[i].page = page;
491                 sh->dev[i].orig_page = page;
492         }
493
494         return 0;
495 }
496
497 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
498                             struct stripe_head *sh);
499
500 static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
501 {
502         struct r5conf *conf = sh->raid_conf;
503         int i, seq;
504
505         BUG_ON(atomic_read(&sh->count) != 0);
506         BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
507         BUG_ON(stripe_operations_active(sh));
508         BUG_ON(sh->batch_head);
509
510         pr_debug("init_stripe called, stripe %llu\n",
511                 (unsigned long long)sector);
512 retry:
513         seq = read_seqcount_begin(&conf->gen_lock);
514         sh->generation = conf->generation - previous;
515         sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
516         sh->sector = sector;
517         stripe_set_idx(sector, conf, previous, sh);
518         sh->state = 0;
519
520         for (i = sh->disks; i--; ) {
521                 struct r5dev *dev = &sh->dev[i];
522
523                 if (dev->toread || dev->read || dev->towrite || dev->written ||
524                     test_bit(R5_LOCKED, &dev->flags)) {
525                         pr_err("sector=%llx i=%d %p %p %p %p %d\n",
526                                (unsigned long long)sh->sector, i, dev->toread,
527                                dev->read, dev->towrite, dev->written,
528                                test_bit(R5_LOCKED, &dev->flags));
529                         WARN_ON(1);
530                 }
531                 dev->flags = 0;
532                 dev->sector = raid5_compute_blocknr(sh, i, previous);
533         }
534         if (read_seqcount_retry(&conf->gen_lock, seq))
535                 goto retry;
536         sh->overwrite_disks = 0;
537         insert_hash(conf, sh);
538         sh->cpu = smp_processor_id();
539         set_bit(STRIPE_BATCH_READY, &sh->state);
540 }
541
542 static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
543                                          short generation)
544 {
545         struct stripe_head *sh;
546
547         pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
548         hlist_for_each_entry(sh, stripe_hash(conf, sector), hash)
549                 if (sh->sector == sector && sh->generation == generation)
550                         return sh;
551         pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
552         return NULL;
553 }
554
555 /*
556  * Need to check if array has failed when deciding whether to:
557  *  - start an array
558  *  - remove non-faulty devices
559  *  - add a spare
560  *  - allow a reshape
561  * This determination is simple when no reshape is happening.
562  * However if there is a reshape, we need to carefully check
563  * both the before and after sections.
564  * This is because some failed devices may only affect one
565  * of the two sections, and some non-in_sync devices may
566  * be in_sync in the section most affected by failed devices.
567  */
568 int raid5_calc_degraded(struct r5conf *conf)
569 {
570         int degraded, degraded2;
571         int i;
572
573         rcu_read_lock();
574         degraded = 0;
575         for (i = 0; i < conf->previous_raid_disks; i++) {
576                 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
577                 if (rdev && test_bit(Faulty, &rdev->flags))
578                         rdev = rcu_dereference(conf->disks[i].replacement);
579                 if (!rdev || test_bit(Faulty, &rdev->flags))
580                         degraded++;
581                 else if (test_bit(In_sync, &rdev->flags))
582                         ;
583                 else
584                         /* not in-sync or faulty.
585                          * If the reshape increases the number of devices,
586                          * this is being recovered by the reshape, so
587                          * this 'previous' section is not in_sync.
588                          * If the number of devices is being reduced however,
589                          * the device can only be part of the array if
590                          * we are reverting a reshape, so this section will
591                          * be in-sync.
592                          */
593                         if (conf->raid_disks >= conf->previous_raid_disks)
594                                 degraded++;
595         }
596         rcu_read_unlock();
597         if (conf->raid_disks == conf->previous_raid_disks)
598                 return degraded;
599         rcu_read_lock();
600         degraded2 = 0;
601         for (i = 0; i < conf->raid_disks; i++) {
602                 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
603                 if (rdev && test_bit(Faulty, &rdev->flags))
604                         rdev = rcu_dereference(conf->disks[i].replacement);
605                 if (!rdev || test_bit(Faulty, &rdev->flags))
606                         degraded2++;
607                 else if (test_bit(In_sync, &rdev->flags))
608                         ;
609                 else
610                         /* not in-sync or faulty.
611                          * If reshape increases the number of devices, this
612                          * section has already been recovered, else it
613                          * almost certainly hasn't.
614                          */
615                         if (conf->raid_disks <= conf->previous_raid_disks)
616                                 degraded2++;
617         }
618         rcu_read_unlock();
619         if (degraded2 > degraded)
620                 return degraded2;
621         return degraded;
622 }
623
624 static int has_failed(struct r5conf *conf)
625 {
626         int degraded;
627
628         if (conf->mddev->reshape_position == MaxSector)
629                 return conf->mddev->degraded > conf->max_degraded;
630
631         degraded = raid5_calc_degraded(conf);
632         if (degraded > conf->max_degraded)
633                 return 1;
634         return 0;
635 }
636
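/*
 * Find or allocate the stripe covering @sector and take a reference to it.
 * Unless @noquiesce is set this may block while the array is quiesced, and
 * unless @noblock is set it may block waiting for an inactive stripe to
 * become available; with @noblock it can return NULL instead.
 */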
637 struct stripe_head *
638 raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
639                         int previous, int noblock, int noquiesce)
640 {
641         struct stripe_head *sh;
642         int hash = stripe_hash_locks_hash(sector);
643         int inc_empty_inactive_list_flag;
644
645         pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
646
647         spin_lock_irq(conf->hash_locks + hash);
648
649         do {
650                 wait_event_lock_irq(conf->wait_for_quiescent,
651                                     conf->quiesce == 0 || noquiesce,
652                                     *(conf->hash_locks + hash));
653                 sh = __find_stripe(conf, sector, conf->generation - previous);
654                 if (!sh) {
655                         if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) {
656                                 sh = get_free_stripe(conf, hash);
657                                 if (!sh && !test_bit(R5_DID_ALLOC,
658                                                      &conf->cache_state))
659                                         set_bit(R5_ALLOC_MORE,
660                                                 &conf->cache_state);
661                         }
662                         if (noblock && sh == NULL)
663                                 break;
664
665                         r5c_check_stripe_cache_usage(conf);
666                         if (!sh) {
667                                 set_bit(R5_INACTIVE_BLOCKED,
668                                         &conf->cache_state);
669                                 r5l_wake_reclaim(conf->log, 0);
670                                 wait_event_lock_irq(
671                                         conf->wait_for_stripe,
672                                         !list_empty(conf->inactive_list + hash) &&
673                                         (atomic_read(&conf->active_stripes)
674                                          < (conf->max_nr_stripes * 3 / 4)
675                                          || !test_bit(R5_INACTIVE_BLOCKED,
676                                                       &conf->cache_state)),
677                                         *(conf->hash_locks + hash));
678                                 clear_bit(R5_INACTIVE_BLOCKED,
679                                           &conf->cache_state);
680                         } else {
681                                 init_stripe(sh, sector, previous);
682                                 atomic_inc(&sh->count);
683                         }
684                 } else if (!atomic_inc_not_zero(&sh->count)) {
685                         spin_lock(&conf->device_lock);
686                         if (!atomic_read(&sh->count)) {
687                                 if (!test_bit(STRIPE_HANDLE, &sh->state))
688                                         atomic_inc(&conf->active_stripes);
689                                 BUG_ON(list_empty(&sh->lru) &&
690                                        !test_bit(STRIPE_EXPANDING, &sh->state));
691                                 inc_empty_inactive_list_flag = 0;
692                                 if (!list_empty(conf->inactive_list + hash))
693                                         inc_empty_inactive_list_flag = 1;
694                                 list_del_init(&sh->lru);
695                                 if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag)
696                                         atomic_inc(&conf->empty_inactive_list_nr);
697                                 if (sh->group) {
698                                         sh->group->stripes_cnt--;
699                                         sh->group = NULL;
700                                 }
701                         }
702                         atomic_inc(&sh->count);
703                         spin_unlock(&conf->device_lock);
704                 }
705         } while (sh == NULL);
706
707         spin_unlock_irq(conf->hash_locks + hash);
708         return sh;
709 }
710
711 static bool is_full_stripe_write(struct stripe_head *sh)
712 {
713         BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded));
714         return sh->overwrite_disks == (sh->disks - sh->raid_conf->max_degraded);
715 }
716
717 static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
718 {
719         if (sh1 > sh2) {
720                 spin_lock_irq(&sh2->stripe_lock);
721                 spin_lock_nested(&sh1->stripe_lock, 1);
722         } else {
723                 spin_lock_irq(&sh1->stripe_lock);
724                 spin_lock_nested(&sh2->stripe_lock, 1);
725         }
726 }
727
728 static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
729 {
730         spin_unlock(&sh1->stripe_lock);
731         spin_unlock_irq(&sh2->stripe_lock);
732 }
733
734 /* Only a freshly initialized full-stripe normal write can be added to a batch list */
735 static bool stripe_can_batch(struct stripe_head *sh)
736 {
737         struct r5conf *conf = sh->raid_conf;
738
739         if (conf->log || raid5_has_ppl(conf))
740                 return false;
741         return test_bit(STRIPE_BATCH_READY, &sh->state) &&
742                 !test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
743                 is_full_stripe_write(sh);
744 }
745
746 /* We only search backwards, i.e. for the stripe immediately preceding this one */
747 static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh)
748 {
749         struct stripe_head *head;
750         sector_t head_sector, tmp_sec;
751         int hash;
752         int dd_idx;
753         int inc_empty_inactive_list_flag;
754
755         /* Don't cross chunks, so stripe pd_idx/qd_idx is the same */
756         tmp_sec = sh->sector;
757         if (!sector_div(tmp_sec, conf->chunk_sectors))
758                 return;
759         head_sector = sh->sector - STRIPE_SECTORS;
760
761         hash = stripe_hash_locks_hash(head_sector);
762         spin_lock_irq(conf->hash_locks + hash);
763         head = __find_stripe(conf, head_sector, conf->generation);
764         if (head && !atomic_inc_not_zero(&head->count)) {
765                 spin_lock(&conf->device_lock);
766                 if (!atomic_read(&head->count)) {
767                         if (!test_bit(STRIPE_HANDLE, &head->state))
768                                 atomic_inc(&conf->active_stripes);
769                         BUG_ON(list_empty(&head->lru) &&
770                                !test_bit(STRIPE_EXPANDING, &head->state));
771                         inc_empty_inactive_list_flag = 0;
772                         if (!list_empty(conf->inactive_list + hash))
773                                 inc_empty_inactive_list_flag = 1;
774                         list_del_init(&head->lru);
775                         if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag)
776                                 atomic_inc(&conf->empty_inactive_list_nr);
777                         if (head->group) {
778                                 head->group->stripes_cnt--;
779                                 head->group = NULL;
780                         }
781                 }
782                 atomic_inc(&head->count);
783                 spin_unlock(&conf->device_lock);
784         }
785         spin_unlock_irq(conf->hash_locks + hash);
786
787         if (!head)
788                 return;
789         if (!stripe_can_batch(head))
790                 goto out;
791
792         lock_two_stripes(head, sh);
793         /* clear_batch_ready clears the flag */
794         if (!stripe_can_batch(head) || !stripe_can_batch(sh))
795                 goto unlock_out;
796
797         if (sh->batch_head)
798                 goto unlock_out;
799
800         dd_idx = 0;
801         while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx)
802                 dd_idx++;
803         if (head->dev[dd_idx].towrite->bi_opf != sh->dev[dd_idx].towrite->bi_opf ||
804             bio_op(head->dev[dd_idx].towrite) != bio_op(sh->dev[dd_idx].towrite))
805                 goto unlock_out;
806
807         if (head->batch_head) {
808                 spin_lock(&head->batch_head->batch_lock);
809                 /* This batch list is already running */
810                 if (!stripe_can_batch(head)) {
811                         spin_unlock(&head->batch_head->batch_lock);
812                         goto unlock_out;
813                 }
814                 /*
815                  * We must assign this stripe's batch_head while holding the
816                  * batch_lock; otherwise clear_batch_ready() of the batch head
817                  * stripe could clear this stripe's BATCH_READY bit before
818                  * stripe->batch_head is assigned, which would confuse
819                  * clear_batch_ready() for this stripe.
820                  */
821                 sh->batch_head = head->batch_head;
822
823                 /*
824                  * at this point, head's BATCH_READY could be cleared, but we
825                  * can still add the stripe to batch list
826                  */
827                 list_add(&sh->batch_list, &head->batch_list);
828                 spin_unlock(&head->batch_head->batch_lock);
829         } else {
830                 head->batch_head = head;
831                 sh->batch_head = head->batch_head;
832                 spin_lock(&head->batch_lock);
833                 list_add_tail(&sh->batch_list, &head->batch_list);
834                 spin_unlock(&head->batch_lock);
835         }
836
837         if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
838                 if (atomic_dec_return(&conf->preread_active_stripes)
839                     < IO_THRESHOLD)
840                         md_wakeup_thread(conf->mddev->thread);
841
842         if (test_and_clear_bit(STRIPE_BIT_DELAY, &sh->state)) {
843                 int seq = sh->bm_seq;
844                 if (test_bit(STRIPE_BIT_DELAY, &sh->batch_head->state) &&
845                     sh->batch_head->bm_seq > seq)
846                         seq = sh->batch_head->bm_seq;
847                 set_bit(STRIPE_BIT_DELAY, &sh->batch_head->state);
848                 sh->batch_head->bm_seq = seq;
849         }
850
851         atomic_inc(&sh->count);
852 unlock_out:
853         unlock_two_stripes(head, sh);
854 out:
855         raid5_release_stripe(head);
856 }
857
858 /* Determine if 'data_offset' or 'new_data_offset' should be used
859  * in this stripe_head.
860  */
861 static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
862 {
863         sector_t progress = conf->reshape_progress;
864         /* Need a memory barrier to make sure we see the value
865          * of conf->generation, or ->data_offset that was set before
866          * reshape_progress was updated.
867          */
868         smp_rmb();
869         if (progress == MaxSector)
870                 return 0;
871         if (sh->generation == conf->generation - 1)
872                 return 0;
873         /* We are in a reshape, and this is a new-generation stripe,
874          * so use new_data_offset.
875          */
876         return 1;
877 }
878
879 static void dispatch_bio_list(struct bio_list *tmp)
880 {
881         struct bio *bio;
882
883         while ((bio = bio_list_pop(tmp)))
884                 generic_make_request(bio);
885 }
886
887 static int cmp_stripe(void *priv, struct list_head *a, struct list_head *b)
888 {
889         const struct r5pending_data *da = list_entry(a,
890                                 struct r5pending_data, sibling);
891         const struct r5pending_data *db = list_entry(b,
892                                 struct r5pending_data, sibling);
893         if (da->sector > db->sector)
894                 return 1;
895         if (da->sector < db->sector)
896                 return -1;
897         return 0;
898 }
899
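/*
 * Move the bios of up to @target deferred r5pending_data entries onto @list
 * for submission.  The pending list is first sorted by sector, and scanning
 * resumes from conf->next_pending_data so successive flushes walk the sorted
 * list round-robin and issue I/O in roughly ascending sector order.
 */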
900 static void dispatch_defer_bios(struct r5conf *conf, int target,
901                                 struct bio_list *list)
902 {
903         struct r5pending_data *data;
904         struct list_head *first, *next = NULL;
905         int cnt = 0;
906
907         if (conf->pending_data_cnt == 0)
908                 return;
909
910         list_sort(NULL, &conf->pending_list, cmp_stripe);
911
912         first = conf->pending_list.next;
913
914         /* temporarily move the head */
915         if (conf->next_pending_data)
916                 list_move_tail(&conf->pending_list,
917                                 &conf->next_pending_data->sibling);
918
919         while (!list_empty(&conf->pending_list)) {
920                 data = list_first_entry(&conf->pending_list,
921                         struct r5pending_data, sibling);
922                 if (&data->sibling == first)
923                         first = data->sibling.next;
924                 next = data->sibling.next;
925
926                 bio_list_merge(list, &data->bios);
927                 list_move(&data->sibling, &conf->free_list);
928                 cnt++;
929                 if (cnt >= target)
930                         break;
931         }
932         conf->pending_data_cnt -= cnt;
933         BUG_ON(conf->pending_data_cnt < 0 || cnt < target);
934
935         if (next != &conf->pending_list)
936                 conf->next_pending_data = list_entry(next,
937                                 struct r5pending_data, sibling);
938         else
939                 conf->next_pending_data = NULL;
940         /* list isn't empty */
941         if (first != &conf->pending_list)
942                 list_move_tail(&conf->pending_list, first);
943 }
944
945 static void flush_deferred_bios(struct r5conf *conf)
946 {
947         struct bio_list tmp = BIO_EMPTY_LIST;
948
949         if (conf->pending_data_cnt == 0)
950                 return;
951
952         spin_lock(&conf->pending_bios_lock);
953         dispatch_defer_bios(conf, conf->pending_data_cnt, &tmp);
954         BUG_ON(conf->pending_data_cnt != 0);
955         spin_unlock(&conf->pending_bios_lock);
956
957         dispatch_bio_list(&tmp);
958 }
959
960 static void defer_issue_bios(struct r5conf *conf, sector_t sector,
961                                 struct bio_list *bios)
962 {
963         struct bio_list tmp = BIO_EMPTY_LIST;
964         struct r5pending_data *ent;
965
966         spin_lock(&conf->pending_bios_lock);
967         ent = list_first_entry(&conf->free_list, struct r5pending_data,
968                                                         sibling);
969         list_move_tail(&ent->sibling, &conf->pending_list);
970         ent->sector = sector;
971         bio_list_init(&ent->bios);
972         bio_list_merge(&ent->bios, bios);
973         conf->pending_data_cnt++;
974         if (conf->pending_data_cnt >= PENDING_IO_MAX)
975                 dispatch_defer_bios(conf, PENDING_IO_ONE_FLUSH, &tmp);
976
977         spin_unlock(&conf->pending_bios_lock);
978
979         dispatch_bio_list(&tmp);
980 }
981
982 static void
983 raid5_end_read_request(struct bio *bi);
984 static void
985 raid5_end_write_request(struct bio *bi);
986
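/*
 * Issue the I/O that the stripe state machine has requested: for each device
 * flagged R5_Wantwrite, R5_Wantread or R5_WantReplace, set up the embedded
 * req/rreq bio, point it at the rdev (or its replacement), and either submit
 * it directly or, when batched dispatch is in use, defer it so writes can be
 * issued in sorted batches.
 */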
987 static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
988 {
989         struct r5conf *conf = sh->raid_conf;
990         int i, disks = sh->disks;
991         struct stripe_head *head_sh = sh;
992         struct bio_list pending_bios = BIO_EMPTY_LIST;
993         bool should_defer;
994
995         might_sleep();
996
997         if (log_stripe(sh, s) == 0)
998                 return;
999
1000         should_defer = conf->batch_bio_dispatch && conf->group_cnt;
1001
1002         for (i = disks; i--; ) {
1003                 int op, op_flags = 0;
1004                 int replace_only = 0;
1005                 struct bio *bi, *rbi;
1006                 struct md_rdev *rdev, *rrdev = NULL;
1007
1008                 sh = head_sh;
1009                 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
1010                         op = REQ_OP_WRITE;
1011                         if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
1012                                 op_flags = REQ_FUA;
1013                         if (test_bit(R5_Discard, &sh->dev[i].flags))
1014                                 op = REQ_OP_DISCARD;
1015                 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
1016                         op = REQ_OP_READ;
1017                 else if (test_and_clear_bit(R5_WantReplace,
1018                                             &sh->dev[i].flags)) {
1019                         op = REQ_OP_WRITE;
1020                         replace_only = 1;
1021                 } else
1022                         continue;
1023                 if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags))
1024                         op_flags |= REQ_SYNC;
1025
1026 again:
1027                 bi = &sh->dev[i].req;
1028                 rbi = &sh->dev[i].rreq; /* For writing to replacement */
1029
1030                 rcu_read_lock();
1031                 rrdev = rcu_dereference(conf->disks[i].replacement);
1032                 smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */
1033                 rdev = rcu_dereference(conf->disks[i].rdev);
1034                 if (!rdev) {
1035                         rdev = rrdev;
1036                         rrdev = NULL;
1037                 }
1038                 if (op_is_write(op)) {
1039                         if (replace_only)
1040                                 rdev = NULL;
1041                         if (rdev == rrdev)
1042                                 /* We raced and saw duplicates */
1043                                 rrdev = NULL;
1044                 } else {
1045                         if (test_bit(R5_ReadRepl, &head_sh->dev[i].flags) && rrdev)
1046                                 rdev = rrdev;
1047                         rrdev = NULL;
1048                 }
1049
1050                 if (rdev && test_bit(Faulty, &rdev->flags))
1051                         rdev = NULL;
1052                 if (rdev)
1053                         atomic_inc(&rdev->nr_pending);
1054                 if (rrdev && test_bit(Faulty, &rrdev->flags))
1055                         rrdev = NULL;
1056                 if (rrdev)
1057                         atomic_inc(&rrdev->nr_pending);
1058                 rcu_read_unlock();
1059
1060                 /* We have already checked bad blocks for reads.  Now we
1061                  * need to check for writes.  We never accept write errors
1062                  * on the replacement, so we don't need to check rrdev.
1063                  */
1064                 while (op_is_write(op) && rdev &&
1065                        test_bit(WriteErrorSeen, &rdev->flags)) {
1066                         sector_t first_bad;
1067                         int bad_sectors;
1068                         int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
1069                                               &first_bad, &bad_sectors);
1070                         if (!bad)
1071                                 break;
1072
1073                         if (bad < 0) {
1074                                 set_bit(BlockedBadBlocks, &rdev->flags);
1075                                 if (!conf->mddev->external &&
1076                                     conf->mddev->sb_flags) {
1077                                         /* It is very unlikely, but we might
1078                                          * still need to write out the
1079                                          * bad block log - better give it
1080                                          * a chance */
1081                                         md_check_recovery(conf->mddev);
1082                                 }
1083                                 /*
1084                                  * Because md_wait_for_blocked_rdev
1085                                  * will dec nr_pending, we must
1086                                  * increment it first.
1087                                  */
1088                                 atomic_inc(&rdev->nr_pending);
1089                                 md_wait_for_blocked_rdev(rdev, conf->mddev);
1090                         } else {
1091                                 /* Acknowledged bad block - skip the write */
1092                                 rdev_dec_pending(rdev, conf->mddev);
1093                                 rdev = NULL;
1094                         }
1095                 }
1096
1097                 if (rdev) {
1098                         if (s->syncing || s->expanding || s->expanded
1099                             || s->replacing)
1100                                 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
1101
1102                         set_bit(STRIPE_IO_STARTED, &sh->state);
1103
1104                         bio_set_dev(bi, rdev->bdev);
1105                         bio_set_op_attrs(bi, op, op_flags);
1106                         bi->bi_end_io = op_is_write(op)
1107                                 ? raid5_end_write_request
1108                                 : raid5_end_read_request;
1109                         bi->bi_private = sh;
1110
1111                         pr_debug("%s: for %llu schedule op %d on disc %d\n",
1112                                 __func__, (unsigned long long)sh->sector,
1113                                 bi->bi_opf, i);
1114                         atomic_inc(&sh->count);
1115                         if (sh != head_sh)
1116                                 atomic_inc(&head_sh->count);
1117                         if (use_new_offset(conf, sh))
1118                                 bi->bi_iter.bi_sector = (sh->sector
1119                                                  + rdev->new_data_offset);
1120                         else
1121                                 bi->bi_iter.bi_sector = (sh->sector
1122                                                  + rdev->data_offset);
1123                         if (test_bit(R5_ReadNoMerge, &head_sh->dev[i].flags))
1124                                 bi->bi_opf |= REQ_NOMERGE;
1125
1126                         if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
1127                                 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
1128
1129                         if (!op_is_write(op) &&
1130                             test_bit(R5_InJournal, &sh->dev[i].flags))
1131                                 /*
1132                                  * We are issuing a read for a page in the
1133                                  * journal; this must be preparing for prexor
1134                                  * in rmw, so read the data into orig_page.
1135                                  */
1136                                 sh->dev[i].vec.bv_page = sh->dev[i].orig_page;
1137                         else
1138                                 sh->dev[i].vec.bv_page = sh->dev[i].page;
1139                         bi->bi_vcnt = 1;
1140                         bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
1141                         bi->bi_io_vec[0].bv_offset = 0;
1142                         bi->bi_iter.bi_size = STRIPE_SIZE;
1143                         /*
1144                          * If this is a discard request, set bi_vcnt to 0. We don't
1145                          * want to confuse SCSI because SCSI will replace the payload.
1146                          */
1147                         if (op == REQ_OP_DISCARD)
1148                                 bi->bi_vcnt = 0;
1149                         if (rrdev)
1150                                 set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
1151
1152                         if (conf->mddev->gendisk)
1153                                 trace_block_bio_remap(bi->bi_disk->queue,
1154                                                       bi, disk_devt(conf->mddev->gendisk),
1155                                                       sh->dev[i].sector);
1156                         if (should_defer && op_is_write(op))
1157                                 bio_list_add(&pending_bios, bi);
1158                         else
1159                                 generic_make_request(bi);
1160                 }
1161                 if (rrdev) {
1162                         if (s->syncing || s->expanding || s->expanded
1163                             || s->replacing)
1164                                 md_sync_acct(rrdev->bdev, STRIPE_SECTORS);
1165
1166                         set_bit(STRIPE_IO_STARTED, &sh->state);
1167
1168                         bio_set_dev(rbi, rrdev->bdev);
1169                         bio_set_op_attrs(rbi, op, op_flags);
1170                         BUG_ON(!op_is_write(op));
1171                         rbi->bi_end_io = raid5_end_write_request;
1172                         rbi->bi_private = sh;
1173
1174                         pr_debug("%s: for %llu schedule op %d on "
1175                                  "replacement disc %d\n",
1176                                 __func__, (unsigned long long)sh->sector,
1177                                 rbi->bi_opf, i);
1178                         atomic_inc(&sh->count);
1179                         if (sh != head_sh)
1180                                 atomic_inc(&head_sh->count);
1181                         if (use_new_offset(conf, sh))
1182                                 rbi->bi_iter.bi_sector = (sh->sector
1183                                                   + rrdev->new_data_offset);
1184                         else
1185                                 rbi->bi_iter.bi_sector = (sh->sector
1186                                                   + rrdev->data_offset);
1187                         if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
1188                                 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
1189                         sh->dev[i].rvec.bv_page = sh->dev[i].page;
1190                         rbi->bi_vcnt = 1;
1191                         rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
1192                         rbi->bi_io_vec[0].bv_offset = 0;
1193                         rbi->bi_iter.bi_size = STRIPE_SIZE;
1194                         /*
1195                          * If this is a discard request, set bi_vcnt to 0. We don't
1196                          * want to confuse SCSI because SCSI will replace the payload.
1197                          */
1198                         if (op == REQ_OP_DISCARD)
1199                                 rbi->bi_vcnt = 0;
1200                         if (conf->mddev->gendisk)
1201                                 trace_block_bio_remap(rbi->bi_disk->queue,
1202                                                       rbi, disk_devt(conf->mddev->gendisk),
1203                                                       sh->dev[i].sector);
1204                         if (should_defer && op_is_write(op))
1205                                 bio_list_add(&pending_bios, rbi);
1206                         else
1207                                 generic_make_request(rbi);
1208                 }
1209                 if (!rdev && !rrdev) {
1210                         if (op_is_write(op))
1211                                 set_bit(STRIPE_DEGRADED, &sh->state);
1212                         pr_debug("skip op %d on disc %d for sector %llu\n",
1213                                 bi->bi_opf, i, (unsigned long long)sh->sector);
1214                         clear_bit(R5_LOCKED, &sh->dev[i].flags);
1215                         set_bit(STRIPE_HANDLE, &sh->state);
1216                 }
1217
1218                 if (!head_sh->batch_head)
1219                         continue;
1220                 sh = list_first_entry(&sh->batch_list, struct stripe_head,
1221                                       batch_list);
1222                 if (sh != head_sh)
1223                         goto again;
1224         }
1225
1226         if (should_defer && !bio_list_empty(&pending_bios))
1227                 defer_issue_bios(conf, head_sh->sector, &pending_bios);
1228 }
1229
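/*
 * Copy data between @bio and the stripe page at @sector using the async_tx
 * API, chaining behind @tx.  @frombio selects the direction (bio to page when
 * set).  When skip_copy is enabled, a full page-aligned write avoids the copy
 * entirely by handing the bio's page to the stripe via @page, unless
 * @no_skipcopy forbids it.
 */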
1230 static struct dma_async_tx_descriptor *
1231 async_copy_data(int frombio, struct bio *bio, struct page **page,
1232         sector_t sector, struct dma_async_tx_descriptor *tx,
1233         struct stripe_head *sh, int no_skipcopy)
1234 {
1235         struct bio_vec bvl;
1236         struct bvec_iter iter;
1237         struct page *bio_page;
1238         int page_offset;
1239         struct async_submit_ctl submit;
1240         enum async_tx_flags flags = 0;
1241
1242         if (bio->bi_iter.bi_sector >= sector)
1243                 page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512;
1244         else
1245                 page_offset = (signed)(sector - bio->bi_iter.bi_sector) * -512;
1246
1247         if (frombio)
1248                 flags |= ASYNC_TX_FENCE;
1249         init_async_submit(&submit, flags, tx, NULL, NULL, NULL);
1250
1251         bio_for_each_segment(bvl, bio, iter) {
1252                 int len = bvl.bv_len;
1253                 int clen;
1254                 int b_offset = 0;
1255
1256                 if (page_offset < 0) {
1257                         b_offset = -page_offset;
1258                         page_offset += b_offset;
1259                         len -= b_offset;
1260                 }
1261
1262                 if (len > 0 && page_offset + len > STRIPE_SIZE)
1263                         clen = STRIPE_SIZE - page_offset;
1264                 else
1265                         clen = len;
1266
1267                 if (clen > 0) {
1268                         b_offset += bvl.bv_offset;
1269                         bio_page = bvl.bv_page;
1270                         if (frombio) {
1271                                 if (sh->raid_conf->skip_copy &&
1272                                     b_offset == 0 && page_offset == 0 &&
1273                                     clen == STRIPE_SIZE &&
1274                                     !no_skipcopy)
1275                                         *page = bio_page;
1276                                 else
1277                                         tx = async_memcpy(*page, bio_page, page_offset,
1278                                                   b_offset, clen, &submit);
1279                         } else
1280                                 tx = async_memcpy(bio_page, *page, b_offset,
1281                                                   page_offset, clen, &submit);
1282                 }
1283                 /* chain the operations */
1284                 submit.depend_tx = tx;
1285
1286                 if (clen < len) /* hit end of page */
1287                         break;
1288                 page_offset +=  len;
1289         }
1290
1291         return tx;
1292 }
1293
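/*
 * Completion callback for a biofill operation: end all read bios that
 * were satisfied from the stripe cache and queue the stripe for
 * further handling.
 */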
1294 static void ops_complete_biofill(void *stripe_head_ref)
1295 {
1296         struct stripe_head *sh = stripe_head_ref;
1297         int i;
1298
1299         pr_debug("%s: stripe %llu\n", __func__,
1300                 (unsigned long long)sh->sector);
1301
1302         /* clear completed biofills */
1303         for (i = sh->disks; i--; ) {
1304                 struct r5dev *dev = &sh->dev[i];
1305
1306                 /* acknowledge completion of a biofill operation and
1307                  * check if we need to reply to a read request;
1308                  * new R5_Wantfill requests are held off until
1309                  * !STRIPE_BIOFILL_RUN
1310                  */
1311                 if (test_and_clear_bit(R5_Wantfill, &dev->flags)) {
1312                         struct bio *rbi, *rbi2;
1313
1314                         BUG_ON(!dev->read);
1315                         rbi = dev->read;
1316                         dev->read = NULL;
1317                         while (rbi && rbi->bi_iter.bi_sector <
1318                                 dev->sector + STRIPE_SECTORS) {
1319                                 rbi2 = r5_next_bio(rbi, dev->sector);
1320                                 bio_endio(rbi);
1321                                 rbi = rbi2;
1322                         }
1323                 }
1324         }
1325         clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
1326
1327         set_bit(STRIPE_HANDLE, &sh->state);
1328         raid5_release_stripe(sh);
1329 }
1330
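/*
 * Satisfy queued read requests by copying data from the stripe cache
 * pages into the bios attached to devices flagged with R5_Wantfill.
 */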
1331 static void ops_run_biofill(struct stripe_head *sh)
1332 {
1333         struct dma_async_tx_descriptor *tx = NULL;
1334         struct async_submit_ctl submit;
1335         int i;
1336
1337         BUG_ON(sh->batch_head);
1338         pr_debug("%s: stripe %llu\n", __func__,
1339                 (unsigned long long)sh->sector);
1340
1341         for (i = sh->disks; i--; ) {
1342                 struct r5dev *dev = &sh->dev[i];
1343                 if (test_bit(R5_Wantfill, &dev->flags)) {
1344                         struct bio *rbi;
1345                         spin_lock_irq(&sh->stripe_lock);
1346                         dev->read = rbi = dev->toread;
1347                         dev->toread = NULL;
1348                         spin_unlock_irq(&sh->stripe_lock);
1349                         while (rbi && rbi->bi_iter.bi_sector <
1350                                 dev->sector + STRIPE_SECTORS) {
1351                                 tx = async_copy_data(0, rbi, &dev->page,
1352                                                      dev->sector, tx, sh, 0);
1353                                 rbi = r5_next_bio(rbi, dev->sector);
1354                         }
1355                 }
1356         }
1357
1358         atomic_inc(&sh->count);
1359         init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
1360         async_trigger_callback(&submit);
1361 }
1362
1363 static void mark_target_uptodate(struct stripe_head *sh, int target)
1364 {
1365         struct r5dev *tgt;
1366
1367         if (target < 0)
1368                 return;
1369
1370         tgt = &sh->dev[target];
1371         set_bit(R5_UPTODATE, &tgt->flags);
1372         BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1373         clear_bit(R5_Wantcompute, &tgt->flags);
1374 }
1375
1376 static void ops_complete_compute(void *stripe_head_ref)
1377 {
1378         struct stripe_head *sh = stripe_head_ref;
1379
1380         pr_debug("%s: stripe %llu\n", __func__,
1381                 (unsigned long long)sh->sector);
1382
1383         /* mark the computed target(s) as uptodate */
1384         mark_target_uptodate(sh, sh->ops.target);
1385         mark_target_uptodate(sh, sh->ops.target2);
1386
1387         clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
1388         if (sh->check_state == check_state_compute_run)
1389                 sh->check_state = check_state_compute_result;
1390         set_bit(STRIPE_HANDLE, &sh->state);
1391         raid5_release_stripe(sh);
1392 }
1393
1394 /* return a pointer to the address conversion region of the scribble buffer */
1395 static addr_conv_t *to_addr_conv(struct stripe_head *sh,
1396                                  struct raid5_percpu *percpu, int i)
1397 {
1398         void *addr;
1399
1400         addr = flex_array_get(percpu->scribble, i);
1401         return addr + sizeof(struct page *) * (sh->disks + 2);
1402 }
1403
1404 /* return a pointer to the page list region of the scribble buffer */
1405 static struct page **to_addr_page(struct raid5_percpu *percpu, int i)
1406 {
1407         void *addr;
1408
1409         addr = flex_array_get(percpu->scribble, i);
1410         return addr;
1411 }
1412
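/*
 * Compute a missing block for RAID4/5 by xoring the remaining blocks
 * of the stripe into the target device's page.
 */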
1413 static struct dma_async_tx_descriptor *
1414 ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
1415 {
1416         int disks = sh->disks;
1417         struct page **xor_srcs = to_addr_page(percpu, 0);
1418         int target = sh->ops.target;
1419         struct r5dev *tgt = &sh->dev[target];
1420         struct page *xor_dest = tgt->page;
1421         int count = 0;
1422         struct dma_async_tx_descriptor *tx;
1423         struct async_submit_ctl submit;
1424         int i;
1425
1426         BUG_ON(sh->batch_head);
1427
1428         pr_debug("%s: stripe %llu block: %d\n",
1429                 __func__, (unsigned long long)sh->sector, target);
1430         BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1431
1432         for (i = disks; i--; )
1433                 if (i != target)
1434                         xor_srcs[count++] = sh->dev[i].page;
1435
1436         atomic_inc(&sh->count);
1437
1438         init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
1439                           ops_complete_compute, sh, to_addr_conv(sh, percpu, 0));
1440         if (unlikely(count == 1))
1441                 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
1442         else
1443                 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
1444
1445         return tx;
1446 }
1447
1448 /* set_syndrome_sources - populate source buffers for gen_syndrome
1449  * @srcs - (struct page *) array of size sh->disks
1450  * @sh - stripe_head to parse
1451  *
1452  * Populates srcs in proper layout order for the stripe and returns the
1453  * 'count' of sources to be used in a call to async_gen_syndrome.  The P
1454  * destination buffer is recorded in srcs[count] and the Q destination
1455  * is recorded in srcs[count+1].
1456  */
1457 static int set_syndrome_sources(struct page **srcs,
1458                                 struct stripe_head *sh,
1459                                 int srctype)
1460 {
1461         int disks = sh->disks;
1462         int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
1463         int d0_idx = raid6_d0(sh);
1464         int count;
1465         int i;
1466
1467         for (i = 0; i < disks; i++)
1468                 srcs[i] = NULL;
1469
1470         count = 0;
1471         i = d0_idx;
1472         do {
1473                 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1474                 struct r5dev *dev = &sh->dev[i];
1475
1476                 if (i == sh->qd_idx || i == sh->pd_idx ||
1477                     (srctype == SYNDROME_SRC_ALL) ||
1478                     (srctype == SYNDROME_SRC_WANT_DRAIN &&
1479                      (test_bit(R5_Wantdrain, &dev->flags) ||
1480                       test_bit(R5_InJournal, &dev->flags))) ||
1481                     (srctype == SYNDROME_SRC_WRITTEN &&
1482                      (dev->written ||
1483                       test_bit(R5_InJournal, &dev->flags)))) {
1484                         if (test_bit(R5_InJournal, &dev->flags))
1485                                 srcs[slot] = sh->dev[i].orig_page;
1486                         else
1487                                 srcs[slot] = sh->dev[i].page;
1488                 }
1489                 i = raid6_next_disk(i, disks);
1490         } while (i != d0_idx);
1491
1492         return syndrome_disks;
1493 }
1494
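/*
 * RAID6: compute one missing block.  A missing Q is regenerated from
 * the syndrome sources; a missing data or P block is rebuilt with a
 * plain xor of the other blocks.
 */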
1495 static struct dma_async_tx_descriptor *
1496 ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
1497 {
1498         int disks = sh->disks;
1499         struct page **blocks = to_addr_page(percpu, 0);
1500         int target;
1501         int qd_idx = sh->qd_idx;
1502         struct dma_async_tx_descriptor *tx;
1503         struct async_submit_ctl submit;
1504         struct r5dev *tgt;
1505         struct page *dest;
1506         int i;
1507         int count;
1508
1509         BUG_ON(sh->batch_head);
1510         if (sh->ops.target < 0)
1511                 target = sh->ops.target2;
1512         else if (sh->ops.target2 < 0)
1513                 target = sh->ops.target;
1514         else
1515                 /* we should only have one valid target */
1516                 BUG();
1517         BUG_ON(target < 0);
1518         pr_debug("%s: stripe %llu block: %d\n",
1519                 __func__, (unsigned long long)sh->sector, target);
1520
1521         tgt = &sh->dev[target];
1522         BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1523         dest = tgt->page;
1524
1525         atomic_inc(&sh->count);
1526
1527         if (target == qd_idx) {
1528                 count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
1529                 blocks[count] = NULL; /* regenerating p is not necessary */
1530                 BUG_ON(blocks[count+1] != dest); /* q should already be set */
1531                 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1532                                   ops_complete_compute, sh,
1533                                   to_addr_conv(sh, percpu, 0));
1534                 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
1535         } else {
1536                 /* Compute any data- or p-drive using XOR */
1537                 count = 0;
1538                 for (i = disks; i-- ; ) {
1539                         if (i == target || i == qd_idx)
1540                                 continue;
1541                         blocks[count++] = sh->dev[i].page;
1542                 }
1543
1544                 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
1545                                   NULL, ops_complete_compute, sh,
1546                                   to_addr_conv(sh, percpu, 0));
1547                 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
1548         }
1549
1550         return tx;
1551 }
1552
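/*
 * RAID6: compute two missing blocks.  Depending on which combination
 * of data, P and Q is missing, this either regenerates the syndrome or
 * uses the raid6 two-failure recovery routines.
 */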
1553 static struct dma_async_tx_descriptor *
1554 ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
1555 {
1556         int i, count, disks = sh->disks;
1557         int syndrome_disks = sh->ddf_layout ? disks : disks-2;
1558         int d0_idx = raid6_d0(sh);
1559         int faila = -1, failb = -1;
1560         int target = sh->ops.target;
1561         int target2 = sh->ops.target2;
1562         struct r5dev *tgt = &sh->dev[target];
1563         struct r5dev *tgt2 = &sh->dev[target2];
1564         struct dma_async_tx_descriptor *tx;
1565         struct page **blocks = to_addr_page(percpu, 0);
1566         struct async_submit_ctl submit;
1567
1568         BUG_ON(sh->batch_head);
1569         pr_debug("%s: stripe %llu block1: %d block2: %d\n",
1570                  __func__, (unsigned long long)sh->sector, target, target2);
1571         BUG_ON(target < 0 || target2 < 0);
1572         BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1573         BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));
1574
1575         /* we need to open-code set_syndrome_sources to handle the
1576          * slot number conversion for 'faila' and 'failb'
1577          */
1578         for (i = 0; i < disks ; i++)
1579                 blocks[i] = NULL;
1580         count = 0;
1581         i = d0_idx;
1582         do {
1583                 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1584
1585                 blocks[slot] = sh->dev[i].page;
1586
1587                 if (i == target)
1588                         faila = slot;
1589                 if (i == target2)
1590                         failb = slot;
1591                 i = raid6_next_disk(i, disks);
1592         } while (i != d0_idx);
1593
1594         BUG_ON(faila == failb);
1595         if (failb < faila)
1596                 swap(faila, failb);
1597         pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
1598                  __func__, (unsigned long long)sh->sector, faila, failb);
1599
1600         atomic_inc(&sh->count);
1601
1602         if (failb == syndrome_disks+1) {
1603                 /* Q disk is one of the missing disks */
1604                 if (faila == syndrome_disks) {
1605                         /* Missing P+Q, just recompute */
1606                         init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1607                                           ops_complete_compute, sh,
1608                                           to_addr_conv(sh, percpu, 0));
1609                         return async_gen_syndrome(blocks, 0, syndrome_disks+2,
1610                                                   STRIPE_SIZE, &submit);
1611                 } else {
1612                         struct page *dest;
1613                         int data_target;
1614                         int qd_idx = sh->qd_idx;
1615
1616                         /* Missing D+Q: recompute D from P, then recompute Q */
1617                         if (target == qd_idx)
1618                                 data_target = target2;
1619                         else
1620                                 data_target = target;
1621
1622                         count = 0;
1623                         for (i = disks; i-- ; ) {
1624                                 if (i == data_target || i == qd_idx)
1625                                         continue;
1626                                 blocks[count++] = sh->dev[i].page;
1627                         }
1628                         dest = sh->dev[data_target].page;
1629                         init_async_submit(&submit,
1630                                           ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
1631                                           NULL, NULL, NULL,
1632                                           to_addr_conv(sh, percpu, 0));
1633                         tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
1634                                        &submit);
1635
1636                         count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
1637                         init_async_submit(&submit, ASYNC_TX_FENCE, tx,
1638                                           ops_complete_compute, sh,
1639                                           to_addr_conv(sh, percpu, 0));
1640                         return async_gen_syndrome(blocks, 0, count+2,
1641                                                   STRIPE_SIZE, &submit);
1642                 }
1643         } else {
1644                 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1645                                   ops_complete_compute, sh,
1646                                   to_addr_conv(sh, percpu, 0));
1647                 if (failb == syndrome_disks) {
1648                         /* We're missing D+P. */
1649                         return async_raid6_datap_recov(syndrome_disks+2,
1650                                                        STRIPE_SIZE, faila,
1651                                                        blocks, &submit);
1652                 } else {
1653                         /* We're missing D+D. */
1654                         return async_raid6_2data_recov(syndrome_disks+2,
1655                                                        STRIPE_SIZE, faila, failb,
1656                                                        blocks, &submit);
1657                 }
1658         }
1659 }
1660
1661 static void ops_complete_prexor(void *stripe_head_ref)
1662 {
1663         struct stripe_head *sh = stripe_head_ref;
1664
1665         pr_debug("%s: stripe %llu\n", __func__,
1666                 (unsigned long long)sh->sector);
1667
1668         if (r5c_is_writeback(sh->raid_conf->log))
1669                 /*
1670                  * raid5-cache write back uses orig_page during prexor.
1671                  * After prexor, it is time to free orig_page
1672                  */
1673                 r5c_release_extra_page(sh);
1674 }
1675
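/*
 * Subtract (xor out) the old contents of the blocks that are about to
 * be rewritten from the existing parity, as the first step of a
 * read-modify-write.
 */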
1676 static struct dma_async_tx_descriptor *
1677 ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
1678                 struct dma_async_tx_descriptor *tx)
1679 {
1680         int disks = sh->disks;
1681         struct page **xor_srcs = to_addr_page(percpu, 0);
1682         int count = 0, pd_idx = sh->pd_idx, i;
1683         struct async_submit_ctl submit;
1684
1685         /* existing parity data subtracted */
1686         struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
1687
1688         BUG_ON(sh->batch_head);
1689         pr_debug("%s: stripe %llu\n", __func__,
1690                 (unsigned long long)sh->sector);
1691
1692         for (i = disks; i--; ) {
1693                 struct r5dev *dev = &sh->dev[i];
1694                 /* Only process blocks that are known to be uptodate */
1695                 if (test_bit(R5_InJournal, &dev->flags))
1696                         xor_srcs[count++] = dev->orig_page;
1697                 else if (test_bit(R5_Wantdrain, &dev->flags))
1698                         xor_srcs[count++] = dev->page;
1699         }
1700
1701         init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
1702                           ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
1703         tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
1704
1705         return tx;
1706 }
1707
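/*
 * RAID6 equivalent of the prexor step: remove the old contents of the
 * to-be-rewritten blocks from the P/Q syndrome.
 */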
1708 static struct dma_async_tx_descriptor *
1709 ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu,
1710                 struct dma_async_tx_descriptor *tx)
1711 {
1712         struct page **blocks = to_addr_page(percpu, 0);
1713         int count;
1714         struct async_submit_ctl submit;
1715
1716         pr_debug("%s: stripe %llu\n", __func__,
1717                 (unsigned long long)sh->sector);
1718
1719         count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_WANT_DRAIN);
1720
1721         init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx,
1722                           ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
1723         tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE,  &submit);
1724
1725         return tx;
1726 }
1727
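/*
 * Drain queued write bios into the stripe cache: copy the data of each
 * bio attached to a device flagged with R5_Wantdrain into the
 * corresponding device page (or record it as a discard / skip-copy
 * candidate), walking every stripe in a batch.
 */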
1728 static struct dma_async_tx_descriptor *
1729 ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
1730 {
1731         struct r5conf *conf = sh->raid_conf;
1732         int disks = sh->disks;
1733         int i;
1734         struct stripe_head *head_sh = sh;
1735
1736         pr_debug("%s: stripe %llu\n", __func__,
1737                 (unsigned long long)sh->sector);
1738
1739         for (i = disks; i--; ) {
1740                 struct r5dev *dev;
1741                 struct bio *chosen;
1742
1743                 sh = head_sh;
1744                 if (test_and_clear_bit(R5_Wantdrain, &head_sh->dev[i].flags)) {
1745                         struct bio *wbi;
1746
1747 again:
1748                         dev = &sh->dev[i];
1749                         /*
1750                          * clear R5_InJournal, so when rewriting a page in
1751                          * journal, it is not skipped by r5l_log_stripe()
1752                          */
1753                         clear_bit(R5_InJournal, &dev->flags);
1754                         spin_lock_irq(&sh->stripe_lock);
1755                         chosen = dev->towrite;
1756                         dev->towrite = NULL;
1757                         sh->overwrite_disks = 0;
1758                         BUG_ON(dev->written);
1759                         wbi = dev->written = chosen;
1760                         spin_unlock_irq(&sh->stripe_lock);
1761                         WARN_ON(dev->page != dev->orig_page);
1762
1763                         while (wbi && wbi->bi_iter.bi_sector <
1764                                 dev->sector + STRIPE_SECTORS) {
1765                                 if (wbi->bi_opf & REQ_FUA)
1766                                         set_bit(R5_WantFUA, &dev->flags);
1767                                 if (wbi->bi_opf & REQ_SYNC)
1768                                         set_bit(R5_SyncIO, &dev->flags);
1769                                 if (bio_op(wbi) == REQ_OP_DISCARD)
1770                                         set_bit(R5_Discard, &dev->flags);
1771                                 else {
1772                                         tx = async_copy_data(1, wbi, &dev->page,
1773                                                              dev->sector, tx, sh,
1774                                                              r5c_is_writeback(conf->log));
1775                                         if (dev->page != dev->orig_page &&
1776                                             !r5c_is_writeback(conf->log)) {
1777                                                 set_bit(R5_SkipCopy, &dev->flags);
1778                                                 clear_bit(R5_UPTODATE, &dev->flags);
1779                                                 clear_bit(R5_OVERWRITE, &dev->flags);
1780                                         }
1781                                 }
1782                                 wbi = r5_next_bio(wbi, dev->sector);
1783                         }
1784
1785                         if (head_sh->batch_head) {
1786                                 sh = list_first_entry(&sh->batch_list,
1787                                                       struct stripe_head,
1788                                                       batch_list);
1789                                 if (sh == head_sh)
1790                                         continue;
1791                                 goto again;
1792                         }
1793                 }
1794         }
1795
1796         return tx;
1797 }
1798
1799 static void ops_complete_reconstruct(void *stripe_head_ref)
1800 {
1801         struct stripe_head *sh = stripe_head_ref;
1802         int disks = sh->disks;
1803         int pd_idx = sh->pd_idx;
1804         int qd_idx = sh->qd_idx;
1805         int i;
1806         bool fua = false, sync = false, discard = false;
1807
1808         pr_debug("%s: stripe %llu\n", __func__,
1809                 (unsigned long long)sh->sector);
1810
1811         for (i = disks; i--; ) {
1812                 fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
1813                 sync |= test_bit(R5_SyncIO, &sh->dev[i].flags);
1814                 discard |= test_bit(R5_Discard, &sh->dev[i].flags);
1815         }
1816
1817         for (i = disks; i--; ) {
1818                 struct r5dev *dev = &sh->dev[i];
1819
1820                 if (dev->written || i == pd_idx || i == qd_idx) {
1821                         if (!discard && !test_bit(R5_SkipCopy, &dev->flags))
1822                                 set_bit(R5_UPTODATE, &dev->flags);
1823                         if (fua)
1824                                 set_bit(R5_WantFUA, &dev->flags);
1825                         if (sync)
1826                                 set_bit(R5_SyncIO, &dev->flags);
1827                 }
1828         }
1829
1830         if (sh->reconstruct_state == reconstruct_state_drain_run)
1831                 sh->reconstruct_state = reconstruct_state_drain_result;
1832         else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
1833                 sh->reconstruct_state = reconstruct_state_prexor_drain_result;
1834         else {
1835                 BUG_ON(sh->reconstruct_state != reconstruct_state_run);
1836                 sh->reconstruct_state = reconstruct_state_result;
1837         }
1838
1839         set_bit(STRIPE_HANDLE, &sh->state);
1840         raid5_release_stripe(sh);
1841 }
1842
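/*
 * Generate new parity for a RAID4/5 stripe, either by xoring the
 * written blocks into the prexor'd parity (read-modify-write) or by
 * recomputing parity from all data blocks (reconstruct-write).
 * Handles batched stripes.
 */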
1843 static void
1844 ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
1845                      struct dma_async_tx_descriptor *tx)
1846 {
1847         int disks = sh->disks;
1848         struct page **xor_srcs;
1849         struct async_submit_ctl submit;
1850         int count, pd_idx = sh->pd_idx, i;
1851         struct page *xor_dest;
1852         int prexor = 0;
1853         unsigned long flags;
1854         int j = 0;
1855         struct stripe_head *head_sh = sh;
1856         int last_stripe;
1857
1858         pr_debug("%s: stripe %llu\n", __func__,
1859                 (unsigned long long)sh->sector);
1860
1861         for (i = 0; i < sh->disks; i++) {
1862                 if (pd_idx == i)
1863                         continue;
1864                 if (!test_bit(R5_Discard, &sh->dev[i].flags))
1865                         break;
1866         }
1867         if (i >= sh->disks) {
1868                 atomic_inc(&sh->count);
1869                 set_bit(R5_Discard, &sh->dev[pd_idx].flags);
1870                 ops_complete_reconstruct(sh);
1871                 return;
1872         }
1873 again:
1874         count = 0;
1875         xor_srcs = to_addr_page(percpu, j);
1876         /* check if prexor is active, which means we only process blocks
1877          * that are part of a read-modify-write (written)
1878          */
1879         if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
1880                 prexor = 1;
1881                 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
1882                 for (i = disks; i--; ) {
1883                         struct r5dev *dev = &sh->dev[i];
1884                         if (head_sh->dev[i].written ||
1885                             test_bit(R5_InJournal, &head_sh->dev[i].flags))
1886                                 xor_srcs[count++] = dev->page;
1887                 }
1888         } else {
1889                 xor_dest = sh->dev[pd_idx].page;
1890                 for (i = disks; i--; ) {
1891                         struct r5dev *dev = &sh->dev[i];
1892                         if (i != pd_idx)
1893                                 xor_srcs[count++] = dev->page;
1894                 }
1895         }
1896
1897         /* 1/ if we prexor'd then the dest is reused as a source
1898          * 2/ if we did not prexor then we are redoing the parity;
1899          * set ASYNC_TX_XOR_DROP_DST or ASYNC_TX_XOR_ZERO_DST
1900          * respectively for the synchronous xor case
1901          */
1902         last_stripe = !head_sh->batch_head ||
1903                 list_first_entry(&sh->batch_list,
1904                                  struct stripe_head, batch_list) == head_sh;
1905         if (last_stripe) {
1906                 flags = ASYNC_TX_ACK |
1907                         (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
1908
1909                 atomic_inc(&head_sh->count);
1910                 init_async_submit(&submit, flags, tx, ops_complete_reconstruct, head_sh,
1911                                   to_addr_conv(sh, percpu, j));
1912         } else {
1913                 flags = prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST;
1914                 init_async_submit(&submit, flags, tx, NULL, NULL,
1915                                   to_addr_conv(sh, percpu, j));
1916         }
1917
1918         if (unlikely(count == 1))
1919                 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
1920         else
1921                 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
1922         if (!last_stripe) {
1923                 j++;
1924                 sh = list_first_entry(&sh->batch_list, struct stripe_head,
1925                                       batch_list);
1926                 goto again;
1927         }
1928 }
1929
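/*
 * Generate a new P/Q syndrome for a RAID6 stripe, for either the
 * read-modify-write or the reconstruct-write path.  Handles batched
 * stripes.
 */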
1930 static void
1931 ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
1932                      struct dma_async_tx_descriptor *tx)
1933 {
1934         struct async_submit_ctl submit;
1935         struct page **blocks;
1936         int count, i, j = 0;
1937         struct stripe_head *head_sh = sh;
1938         int last_stripe;
1939         int synflags;
1940         unsigned long txflags;
1941
1942         pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
1943
1944         for (i = 0; i < sh->disks; i++) {
1945                 if (sh->pd_idx == i || sh->qd_idx == i)
1946                         continue;
1947                 if (!test_bit(R5_Discard, &sh->dev[i].flags))
1948                         break;
1949         }
1950         if (i >= sh->disks) {
1951                 atomic_inc(&sh->count);
1952                 set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
1953                 set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
1954                 ops_complete_reconstruct(sh);
1955                 return;
1956         }
1957
1958 again:
1959         blocks = to_addr_page(percpu, j);
1960
1961         if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
1962                 synflags = SYNDROME_SRC_WRITTEN;
1963                 txflags = ASYNC_TX_ACK | ASYNC_TX_PQ_XOR_DST;
1964         } else {
1965                 synflags = SYNDROME_SRC_ALL;
1966                 txflags = ASYNC_TX_ACK;
1967         }
1968
1969         count = set_syndrome_sources(blocks, sh, synflags);
1970         last_stripe = !head_sh->batch_head ||
1971                 list_first_entry(&sh->batch_list,
1972                                  struct stripe_head, batch_list) == head_sh;
1973
1974         if (last_stripe) {
1975                 atomic_inc(&head_sh->count);
1976                 init_async_submit(&submit, txflags, tx, ops_complete_reconstruct,
1977                                   head_sh, to_addr_conv(sh, percpu, j));
1978         } else
1979                 init_async_submit(&submit, 0, tx, NULL, NULL,
1980                                   to_addr_conv(sh, percpu, j));
1981         tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE,  &submit);
1982         if (!last_stripe) {
1983                 j++;
1984                 sh = list_first_entry(&sh->batch_list, struct stripe_head,
1985                                       batch_list);
1986                 goto again;
1987         }
1988 }
1989
1990 static void ops_complete_check(void *stripe_head_ref)
1991 {
1992         struct stripe_head *sh = stripe_head_ref;
1993
1994         pr_debug("%s: stripe %llu\n", __func__,
1995                 (unsigned long long)sh->sector);
1996
1997         sh->check_state = check_state_check_result;
1998         set_bit(STRIPE_HANDLE, &sh->state);
1999         raid5_release_stripe(sh);
2000 }
2001
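/*
 * Verify the parity of a RAID4/5 stripe by xor-validating P against
 * the data blocks; the result lands in sh->ops.zero_sum_result.
 */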
2002 static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
2003 {
2004         int disks = sh->disks;
2005         int pd_idx = sh->pd_idx;
2006         int qd_idx = sh->qd_idx;
2007         struct page *xor_dest;
2008         struct page **xor_srcs = to_addr_page(percpu, 0);
2009         struct dma_async_tx_descriptor *tx;
2010         struct async_submit_ctl submit;
2011         int count;
2012         int i;
2013
2014         pr_debug("%s: stripe %llu\n", __func__,
2015                 (unsigned long long)sh->sector);
2016
2017         BUG_ON(sh->batch_head);
2018         count = 0;
2019         xor_dest = sh->dev[pd_idx].page;
2020         xor_srcs[count++] = xor_dest;
2021         for (i = disks; i--; ) {
2022                 if (i == pd_idx || i == qd_idx)
2023                         continue;
2024                 xor_srcs[count++] = sh->dev[i].page;
2025         }
2026
2027         init_async_submit(&submit, 0, NULL, NULL, NULL,
2028                           to_addr_conv(sh, percpu, 0));
2029         tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
2030                            &sh->ops.zero_sum_result, &submit);
2031
2032         atomic_inc(&sh->count);
2033         init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL);
2034         tx = async_trigger_callback(&submit);
2035 }
2036
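/*
 * Verify the Q syndrome of a RAID6 stripe, and also P when 'checkp'
 * is set; the result lands in sh->ops.zero_sum_result.
 */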
2037 static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
2038 {
2039         struct page **srcs = to_addr_page(percpu, 0);
2040         struct async_submit_ctl submit;
2041         int count;
2042
2043         pr_debug("%s: stripe %llu checkp: %d\n", __func__,
2044                 (unsigned long long)sh->sector, checkp);
2045
2046         BUG_ON(sh->batch_head);
2047         count = set_syndrome_sources(srcs, sh, SYNDROME_SRC_ALL);
2048         if (!checkp)
2049                 srcs[count] = NULL;
2050
2051         atomic_inc(&sh->count);
2052         init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
2053                           sh, to_addr_conv(sh, percpu, 0));
2054         async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
2055                            &sh->ops.zero_sum_result, percpu->spare_page, &submit);
2056 }
2057
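/*
 * Dispatch the per-stripe operations requested in 'ops_request'
 * (biofill, compute, prexor, partial parity, biodrain, reconstruct,
 * check) using this CPU's percpu scribble resources, chaining them
 * through the async_tx API.
 */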
2058 static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
2059 {
2060         int overlap_clear = 0, i, disks = sh->disks;
2061         struct dma_async_tx_descriptor *tx = NULL;
2062         struct r5conf *conf = sh->raid_conf;
2063         int level = conf->level;
2064         struct raid5_percpu *percpu;
2065         unsigned long cpu;
2066
2067         cpu = get_cpu();
2068         percpu = per_cpu_ptr(conf->percpu, cpu);
2069         if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
2070                 ops_run_biofill(sh);
2071                 overlap_clear++;
2072         }
2073
2074         if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
2075                 if (level < 6)
2076                         tx = ops_run_compute5(sh, percpu);
2077                 else {
2078                         if (sh->ops.target2 < 0 || sh->ops.target < 0)
2079                                 tx = ops_run_compute6_1(sh, percpu);
2080                         else
2081                                 tx = ops_run_compute6_2(sh, percpu);
2082                 }
2083                 /* terminate the chain if reconstruct is not set to be run */
2084                 if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
2085                         async_tx_ack(tx);
2086         }
2087
2088         if (test_bit(STRIPE_OP_PREXOR, &ops_request)) {
2089                 if (level < 6)
2090                         tx = ops_run_prexor5(sh, percpu, tx);
2091                 else
2092                         tx = ops_run_prexor6(sh, percpu, tx);
2093         }
2094
2095         if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request))
2096                 tx = ops_run_partial_parity(sh, percpu, tx);
2097
2098         if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
2099                 tx = ops_run_biodrain(sh, tx);
2100                 overlap_clear++;
2101         }
2102
2103         if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
2104                 if (level < 6)
2105                         ops_run_reconstruct5(sh, percpu, tx);
2106                 else
2107                         ops_run_reconstruct6(sh, percpu, tx);
2108         }
2109
2110         if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
2111                 if (sh->check_state == check_state_run)
2112                         ops_run_check_p(sh, percpu);
2113                 else if (sh->check_state == check_state_run_q)
2114                         ops_run_check_pq(sh, percpu, 0);
2115                 else if (sh->check_state == check_state_run_pq)
2116                         ops_run_check_pq(sh, percpu, 1);
2117                 else
2118                         BUG();
2119         }
2120
2121         if (overlap_clear && !sh->batch_head)
2122                 for (i = disks; i--; ) {
2123                         struct r5dev *dev = &sh->dev[i];
2124                         if (test_and_clear_bit(R5_Overlap, &dev->flags))
2125                                 wake_up(&sh->raid_conf->wait_for_overlap);
2126                 }
2127         put_cpu();
2128 }
2129
2130 static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh)
2131 {
2132         if (sh->ppl_page)
2133                 __free_page(sh->ppl_page);
2134         kmem_cache_free(sc, sh);
2135 }
2136
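/*
 * Allocate and initialise a stripe_head from the given slab cache,
 * including the per-device bios and, when PPL is enabled, the PPL
 * page.
 */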
2137 static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
2138         int disks, struct r5conf *conf)
2139 {
2140         struct stripe_head *sh;
2141         int i;
2142
2143         sh = kmem_cache_zalloc(sc, gfp);
2144         if (sh) {
2145                 spin_lock_init(&sh->stripe_lock);
2146                 spin_lock_init(&sh->batch_lock);
2147                 INIT_LIST_HEAD(&sh->batch_list);
2148                 INIT_LIST_HEAD(&sh->lru);
2149                 INIT_LIST_HEAD(&sh->r5c);
2150                 INIT_LIST_HEAD(&sh->log_list);
2151                 atomic_set(&sh->count, 1);
2152                 sh->raid_conf = conf;
2153                 sh->log_start = MaxSector;
2154                 for (i = 0; i < disks; i++) {
2155                         struct r5dev *dev = &sh->dev[i];
2156
2157                         bio_init(&dev->req, &dev->vec, 1);
2158                         bio_init(&dev->rreq, &dev->rvec, 1);
2159                 }
2160
2161                 if (raid5_has_ppl(conf)) {
2162                         sh->ppl_page = alloc_page(gfp);
2163                         if (!sh->ppl_page) {
2164                                 free_stripe(sc, sh);
2165                                 sh = NULL;
2166                         }
2167                 }
2168         }
2169         return sh;
2170 }
2171 static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
2172 {
2173         struct stripe_head *sh;
2174
2175         sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size, conf);
2176         if (!sh)
2177                 return 0;
2178
2179         if (grow_buffers(sh, gfp)) {
2180                 shrink_buffers(sh);
2181                 free_stripe(conf->slab_cache, sh);
2182                 return 0;
2183         }
2184         sh->hash_lock_index =
2185                 conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
2186         /* we just created an active stripe so... */
2187         atomic_inc(&conf->active_stripes);
2188
2189         raid5_release_stripe(sh);
2190         conf->max_nr_stripes++;
2191         return 1;
2192 }
2193
2194 static int grow_stripes(struct r5conf *conf, int num)
2195 {
2196         struct kmem_cache *sc;
2197         int devs = max(conf->raid_disks, conf->previous_raid_disks);
2198
2199         if (conf->mddev->gendisk)
2200                 sprintf(conf->cache_name[0],
2201                         "raid%d-%s", conf->level, mdname(conf->mddev));
2202         else
2203                 sprintf(conf->cache_name[0],
2204                         "raid%d-%p", conf->level, conf->mddev);
2205         sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]);
2206
2207         conf->active_name = 0;
2208         sc = kmem_cache_create(conf->cache_name[conf->active_name],
2209                                sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
2210                                0, 0, NULL);
2211         if (!sc)
2212                 return 1;
2213         conf->slab_cache = sc;
2214         conf->pool_size = devs;
2215         while (num--)
2216                 if (!grow_one_stripe(conf, GFP_KERNEL))
2217                         return 1;
2218
2219         return 0;
2220 }
2221
2222 /**
2223  * scribble_alloc - allocate percpu scribble regions of the required size
2224  * @num - total number of disks in the array
2225  *
2226  * The size must be enough to contain:
2227  * 1/ a struct page pointer for each device in the array +2
2228  * 2/ room to convert each entry in (1) to its corresponding dma
2229  *    (dma_map_page()) or page (page_address()) address.
2230  *
2231  * Note: the +2 is for the destination buffers of the ddf/raid6 case where we
2232  * calculate over all devices (not just the data blocks), using zeros in place
2233  * of the P and Q blocks.
2234  */
2235 static struct flex_array *scribble_alloc(int num, int cnt, gfp_t flags)
2236 {
2237         struct flex_array *ret;
2238         size_t len;
2239
2240         len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);
2241         ret = flex_array_alloc(len, cnt, flags);
2242         if (!ret)
2243                 return NULL;
2244         /* always prealloc all elements, so no locking is required */
2245         if (flex_array_prealloc(ret, 0, cnt, flags)) {
2246                 flex_array_free(ret);
2247                 return NULL;
2248         }
2249         return ret;
2250 }
2251
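/*
 * Re-allocate the percpu scribble buffers so they are large enough for
 * 'new_disks' devices and 'new_sectors' sectors.  The buffers are
 * never shrunk.
 */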
2252 static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
2253 {
2254         unsigned long cpu;
2255         int err = 0;
2256
2257         /*
2258          * Never shrink. And mddev_suspend() could deadlock if this is called
2259          * from raid5d. In that case, scribble_disks and scribble_sectors
2260          * should already equal new_disks and new_sectors
2261          */
2262         if (conf->scribble_disks >= new_disks &&
2263             conf->scribble_sectors >= new_sectors)
2264                 return 0;
2265         mddev_suspend(conf->mddev);
2266         get_online_cpus();
2267         for_each_present_cpu(cpu) {
2268                 struct raid5_percpu *percpu;
2269                 struct flex_array *scribble;
2270
2271                 percpu = per_cpu_ptr(conf->percpu, cpu);
2272                 scribble = scribble_alloc(new_disks,
2273                                           new_sectors / STRIPE_SECTORS,
2274                                           GFP_NOIO);
2275
2276                 if (scribble) {
2277                         flex_array_free(percpu->scribble);
2278                         percpu->scribble = scribble;
2279                 } else {
2280                         err = -ENOMEM;
2281                         break;
2282                 }
2283         }
2284         put_online_cpus();
2285         mddev_resume(conf->mddev);
2286         if (!err) {
2287                 conf->scribble_disks = new_disks;
2288                 conf->scribble_sectors = new_sectors;
2289         }
2290         return err;
2291 }
2292
2293 static int resize_stripes(struct r5conf *conf, int newsize)
2294 {
2295         /* Make all the stripes able to hold 'newsize' devices.
2296          * New slots in each stripe get 'page' set to a new page.
2297          *
2298          * This happens in stages:
2299          * 1/ create a new kmem_cache and allocate the required number of
2300          *    stripe_heads.
2301          * 2/ gather all the old stripe_heads and transfer the pages across
2302          *    to the new stripe_heads.  This will have the side effect of
2303          *    freezing the array as once all stripe_heads have been collected,
2304          *    no IO will be possible.  Old stripe heads are freed once their
2305          *    pages have been transferred over, and the old kmem_cache is
2306          *    freed when all stripes are done.
2307          * 3/ reallocate conf->disks to be suitably bigger.  If this fails,
2308          *    we simply return a failure status - no need to clean anything up.
2309          * 4/ allocate new pages for the new slots in the new stripe_heads.
2310          *    If this fails, we don't bother trying to shrink the
2311          *    stripe_heads down again, we just leave them as they are.
2312          *    As each stripe_head is processed the new one is released into
2313          *    active service.
2314          *
2315          * Once step2 is started, we cannot afford to wait for a write,
2316          * so we use GFP_NOIO allocations.
2317          */
2318         struct stripe_head *osh, *nsh;
2319         LIST_HEAD(newstripes);
2320         struct disk_info *ndisks;
2321         int err = 0;
2322         struct kmem_cache *sc;
2323         int i;
2324         int hash, cnt;
2325
2326         md_allow_write(conf->mddev);
2327
2328         /* Step 1 */
2329         sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
2330                                sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
2331                                0, 0, NULL);
2332         if (!sc)
2333                 return -ENOMEM;
2334
2335         /* Need to ensure auto-resizing doesn't interfere */
2336         mutex_lock(&conf->cache_size_mutex);
2337
2338         for (i = conf->max_nr_stripes; i; i--) {
2339                 nsh = alloc_stripe(sc, GFP_KERNEL, newsize, conf);
2340                 if (!nsh)
2341                         break;
2342
2343                 list_add(&nsh->lru, &newstripes);
2344         }
2345         if (i) {
2346                 /* didn't get enough, give up */
2347                 while (!list_empty(&newstripes)) {
2348                         nsh = list_entry(newstripes.next, struct stripe_head, lru);
2349                         list_del(&nsh->lru);
2350                         free_stripe(sc, nsh);
2351                 }
2352                 kmem_cache_destroy(sc);
2353                 mutex_unlock(&conf->cache_size_mutex);
2354                 return -ENOMEM;
2355         }
2356         /* Step 2 - Must use GFP_NOIO now.
2357          * OK, we have enough stripes, start collecting inactive
2358          * stripes and copying them over
2359          */
2360         hash = 0;
2361         cnt = 0;
2362         list_for_each_entry(nsh, &newstripes, lru) {
2363                 lock_device_hash_lock(conf, hash);
2364                 wait_event_cmd(conf->wait_for_stripe,
2365                                     !list_empty(conf->inactive_list + hash),
2366                                     unlock_device_hash_lock(conf, hash),
2367                                     lock_device_hash_lock(conf, hash));
2368                 osh = get_free_stripe(conf, hash);
2369                 unlock_device_hash_lock(conf, hash);
2370
2371                 for (i = 0; i < conf->pool_size; i++) {
2372                         nsh->dev[i].page = osh->dev[i].page;
2373                         nsh->dev[i].orig_page = osh->dev[i].page;
2374                 }
2375                 nsh->hash_lock_index = hash;
2376                 free_stripe(conf->slab_cache, osh);
2377                 cnt++;
2378                 if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS +
2379                     !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) {
2380                         hash++;
2381                         cnt = 0;
2382                 }
2383         }
2384         kmem_cache_destroy(conf->slab_cache);
2385
2386         /* Step 3.
2387          * At this point, we are holding all the stripes so the array
2388          * is completely stalled, so now is a good time to resize
2389          * conf->disks and the scribble region
2390          */
2391         ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
2392         if (ndisks) {
2393                 for (i = 0; i < conf->pool_size; i++)
2394                         ndisks[i] = conf->disks[i];
2395
2396                 for (i = conf->pool_size; i < newsize; i++) {
2397                         ndisks[i].extra_page = alloc_page(GFP_NOIO);
2398                         if (!ndisks[i].extra_page)
2399                                 err = -ENOMEM;
2400                 }
2401
2402                 if (err) {
2403                         for (i = conf->pool_size; i < newsize; i++)
2404                                 if (ndisks[i].extra_page)
2405                                         put_page(ndisks[i].extra_page);
2406                         kfree(ndisks);
2407                 } else {
2408                         kfree(conf->disks);
2409                         conf->disks = ndisks;
2410                 }
2411         } else
2412                 err = -ENOMEM;
2413
2414         mutex_unlock(&conf->cache_size_mutex);
2415
2416         conf->slab_cache = sc;
2417         conf->active_name = 1-conf->active_name;
2418
2419         /* Step 4, return new stripes to service */
2420         while(!list_empty(&newstripes)) {
2421                 nsh = list_entry(newstripes.next, struct stripe_head, lru);
2422                 list_del_init(&nsh->lru);
2423
2424                 for (i=conf->raid_disks; i < newsize; i++)
2425                         if (nsh->dev[i].page == NULL) {
2426                                 struct page *p = alloc_page(GFP_NOIO);
2427                                 nsh->dev[i].page = p;
2428                                 nsh->dev[i].orig_page = p;
2429                                 if (!p)
2430                                         err = -ENOMEM;
2431                         }
2432                 raid5_release_stripe(nsh);
2433         }
2434         /* critical section passed, GFP_NOIO no longer needed */
2435
2436         if (!err)
2437                 conf->pool_size = newsize;
2438         return err;
2439 }
2440
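/*
 * Remove one inactive stripe from the cache and free it, shrinking
 * max_nr_stripes.  Returns 1 if a stripe was freed, 0 if none was
 * available.
 */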
2441 static int drop_one_stripe(struct r5conf *conf)
2442 {
2443         struct stripe_head *sh;
2444         int hash = (conf->max_nr_stripes - 1) & STRIPE_HASH_LOCKS_MASK;
2445
2446         spin_lock_irq(conf->hash_locks + hash);
2447         sh = get_free_stripe(conf, hash);
2448         spin_unlock_irq(conf->hash_locks + hash);
2449         if (!sh)
2450                 return 0;
2451         BUG_ON(atomic_read(&sh->count));
2452         shrink_buffers(sh);
2453         free_stripe(conf->slab_cache, sh);
2454         atomic_dec(&conf->active_stripes);
2455         conf->max_nr_stripes--;
2456         return 1;
2457 }
2458
2459 static void shrink_stripes(struct r5conf *conf)
2460 {
2461         while (conf->max_nr_stripes &&
2462                drop_one_stripe(conf))
2463                 ;
2464
2465         kmem_cache_destroy(conf->slab_cache);
2466         conf->slab_cache = NULL;
2467 }
2468
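/*
 * bi_end_io for reads issued on behalf of a stripe: on success mark
 * the device page uptodate and clear any earlier read-error state; on
 * failure record the error and decide whether to retry, record bad
 * blocks or fail the device.
 */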
2469 static void raid5_end_read_request(struct bio * bi)
2470 {
2471         struct stripe_head *sh = bi->bi_private;
2472         struct r5conf *conf = sh->raid_conf;
2473         int disks = sh->disks, i;
2474         char b[BDEVNAME_SIZE];
2475         struct md_rdev *rdev = NULL;
2476         sector_t s;
2477
2478         for (i=0 ; i<disks; i++)
2479                 if (bi == &sh->dev[i].req)
2480                         break;
2481
2482         pr_debug("end_read_request %llu/%d, count: %d, error %d.\n",
2483                 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
2484                 bi->bi_status);
2485         if (i == disks) {
2486                 bio_reset(bi);
2487                 BUG();
2488                 return;
2489         }
2490         if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
2491                 /* If replacement finished while this request was outstanding,
2492                  * 'replacement' might be NULL already.
2493                  * In that case it moved down to 'rdev'.
2494                  * rdev is not removed until all requests are finished.
2495                  */
2496                 rdev = conf->disks[i].replacement;
2497         if (!rdev)
2498                 rdev = conf->disks[i].rdev;
2499
2500         if (use_new_offset(conf, sh))
2501                 s = sh->sector + rdev->new_data_offset;
2502         else
2503                 s = sh->sector + rdev->data_offset;
2504         if (!bi->bi_status) {
2505                 set_bit(R5_UPTODATE, &sh->dev[i].flags);
2506                 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
2507                         /* Note that this cannot happen on a
2508                          * replacement device.  We just fail those on
2509                          * any error
2510                          */
2511                         pr_info_ratelimited(
2512                                 "md/raid:%s: read error corrected (%lu sectors at %llu on %s)\n",
2513                                 mdname(conf->mddev), STRIPE_SECTORS,
2514                                 (unsigned long long)s,
2515                                 bdevname(rdev->bdev, b));
2516                         atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
2517                         clear_bit(R5_ReadError, &sh->dev[i].flags);
2518                         clear_bit(R5_ReWrite, &sh->dev[i].flags);
2519                 } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
2520                         clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2521
2522                 if (test_bit(R5_InJournal, &sh->dev[i].flags))
2523                         /*
2524                          * end of a read for a page in the journal; this
2525                          * must be preparing for prexor in rmw
2526                          */
2527                         set_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags);
2528
2529                 if (atomic_read(&rdev->read_errors))
2530                         atomic_set(&rdev->read_errors, 0);
2531         } else {
2532                 const char *bdn = bdevname(rdev->bdev, b);
2533                 int retry = 0;
2534                 int set_bad = 0;
2535
2536                 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
2537                 atomic_inc(&rdev->read_errors);
2538                 if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
2539                         pr_warn_ratelimited(
2540                                 "md/raid:%s: read error on replacement device (sector %llu on %s).\n",
2541                                 mdname(conf->mddev),
2542                                 (unsigned long long)s,
2543                                 bdn);
2544                 else if (conf->mddev->degraded >= conf->max_degraded) {
2545                         set_bad = 1;
2546                         pr_warn_ratelimited(
2547                                 "md/raid:%s: read error not correctable (sector %llu on %s).\n",
2548                                 mdname(conf->mddev),
2549                                 (unsigned long long)s,
2550                                 bdn);
2551                 } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) {
2552                         /* Oh, no!!! */
2553                         set_bad = 1;
2554                         pr_warn_ratelimited(
2555                                 "md/raid:%s: read error NOT corrected!! (sector %llu on %s).\n",
2556                                 mdname(conf->mddev),
2557                                 (unsigned long long)s,
2558                                 bdn);
2559                 } else if (atomic_read(&rdev->read_errors)
2560                          > conf->max_nr_stripes)
2561                         pr_warn("md/raid:%s: Too many read errors, failing device %s.\n",
2562                                mdname(conf->mddev), bdn);
2563                 else
2564                         retry = 1;
2565                 if (set_bad && test_bit(In_sync, &rdev->flags)
2566                     && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
2567                         retry = 1;
2568                 if (retry)
2569                         if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
2570                                 set_bit(R5_ReadError, &sh->dev[i].flags);
2571                                 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2572                         } else
2573                                 set_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2574                 else {
2575                         clear_bit(R5_ReadError, &sh->dev[i].flags);
2576                         clear_bit(R5_ReWrite, &sh->dev[i].flags);
2577                         if (!(set_bad
2578                               && test_bit(In_sync, &rdev->flags)
2579                               && rdev_set_badblocks(
2580                                       rdev, sh->sector, STRIPE_SECTORS, 0)))
2581                                 md_error(conf->mddev, rdev);
2582                 }
2583         }
2584         rdev_dec_pending(rdev, conf->mddev);
2585         bio_reset(bi);
2586         clear_bit(R5_LOCKED, &sh->dev[i].flags);
2587         set_bit(STRIPE_HANDLE, &sh->state);
2588         raid5_release_stripe(sh);
2589 }
2590
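/*
 * bi_end_io for writes issued on behalf of a stripe: record write
 * errors (or bad blocks that were successfully overwritten) against
 * the rdev or its replacement, then release the stripe for further
 * handling.
 */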
2591 static void raid5_end_write_request(struct bio *bi)
2592 {
2593         struct stripe_head *sh = bi->bi_private;
2594         struct r5conf *conf = sh->raid_conf;
2595         int disks = sh->disks, i;
2596         struct md_rdev *uninitialized_var(rdev);
2597         sector_t first_bad;
2598         int bad_sectors;
2599         int replacement = 0;
2600
2601         for (i = 0 ; i < disks; i++) {
2602                 if (bi == &sh->dev[i].req) {
2603                         rdev = conf->disks[i].rdev;
2604                         break;
2605                 }
2606                 if (bi == &sh->dev[i].rreq) {
2607                         rdev = conf->disks[i].replacement;
2608                         if (rdev)
2609                                 replacement = 1;
2610                         else
2611                                 /* rdev was removed and 'replacement'
2612                                  * replaced it.  rdev is not removed
2613                                  * until all requests are finished.
2614                                  */
2615                                 rdev = conf->disks[i].rdev;
2616                         break;
2617                 }
2618         }
2619         pr_debug("end_write_request %llu/%d, count %d, error: %d.\n",
2620                 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
2621                 bi->bi_status);
2622         if (i == disks) {
2623                 bio_reset(bi);
2624                 BUG();
2625                 return;
2626         }
2627
2628         if (replacement) {
2629                 if (bi->bi_status)
2630                         md_error(conf->mddev, rdev);
2631                 else if (is_badblock(rdev, sh->sector,
2632                                      STRIPE_SECTORS,
2633                                      &first_bad, &bad_sectors))
2634                         set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
2635         } else {
2636                 if (bi->bi_status) {
2637                         set_bit(STRIPE_DEGRADED, &sh->state);
2638                         set_bit(WriteErrorSeen, &rdev->flags);
2639                         set_bit(R5_WriteError, &sh->dev[i].flags);
2640                         if (!test_and_set_bit(WantReplacement, &rdev->flags))
2641                                 set_bit(MD_RECOVERY_NEEDED,
2642                                         &rdev->mddev->recovery);
2643                 } else if (is_badblock(rdev, sh->sector,
2644                                        STRIPE_SECTORS,
2645                                        &first_bad, &bad_sectors)) {
2646                         set_bit(R5_MadeGood, &sh->dev[i].flags);
2647                         if (test_bit(R5_ReadError, &sh->dev[i].flags))
2648                                 /* That was a successful write so make
2649                                  * sure it looks like we already did
2650                                  * a re-write.
2651                                  */
2652                                 set_bit(R5_ReWrite, &sh->dev[i].flags);
2653                 }
2654         }
2655         rdev_dec_pending(rdev, conf->mddev);
2656
2657         if (sh->batch_head && bi->bi_status && !replacement)
2658                 set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state);
2659
2660         bio_reset(bi);
2661         if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
2662                 clear_bit(R5_LOCKED, &sh->dev[i].flags);
2663         set_bit(STRIPE_HANDLE, &sh->state);
2664         raid5_release_stripe(sh);
2665
2666         if (sh->batch_head && sh != sh->batch_head)
2667                 raid5_release_stripe(sh->batch_head);
2668 }
2669
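/*
 * Error handler for the raid4/5/6 personalities: mark the failed device
 * Blocked and Faulty, recompute the degraded count, interrupt any resync
 * in progress and request a superblock update.
 */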
2670 static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
2671 {
2672         char b[BDEVNAME_SIZE];
2673         struct r5conf *conf = mddev->private;
2674         unsigned long flags;
2675         pr_debug("raid456: error called\n");
2676
2677         spin_lock_irqsave(&conf->device_lock, flags);
2678         clear_bit(In_sync, &rdev->flags);
2679         mddev->degraded = raid5_calc_degraded(conf);
2680         spin_unlock_irqrestore(&conf->device_lock, flags);
2681         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
2682
2683         set_bit(Blocked, &rdev->flags);
2684         set_bit(Faulty, &rdev->flags);
2685         set_mask_bits(&mddev->sb_flags, 0,
2686                       BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
2687         pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n"
2688                 "md/raid:%s: Operation continuing on %d devices.\n",
2689                 mdname(mddev),
2690                 bdevname(rdev->bdev, b),
2691                 mdname(mddev),
2692                 conf->raid_disks - mddev->degraded);
2693         r5c_update_on_rdev_error(mddev, rdev);
2694 }
2695
2696 /*
2697  * Input: a 'big' sector number,
2698  * Output: index of the data and parity disk, and the sector # in them.
2699  */
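/*
 * Illustrative example (hypothetical geometry, not taken from any real
 * array): a 4-device RAID-5 (data_disks = 3) with 128-sector chunks and
 * the left-symmetric layout (ALGORITHM_LEFT_SYMMETRIC), mapping array
 * sector 1000:
 *
 *   chunk_offset = 1000 % 128 = 104     chunk_number = 1000 / 128 = 7
 *   dd_idx       = 7 % 3 = 1            stripe       = 7 / 3 = 2
 *   pd_idx       = 3 - (2 % 4) = 1
 *   dd_idx       = (1 + 1 + 1) % 4 = 3
 *   new_sector   = 2 * 128 + 104 = 360
 *
 * so array sector 1000 lands on sector 360 of device 3, with the parity
 * for that stripe on device 1.
 */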
2700 sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
2701                               int previous, int *dd_idx,
2702                               struct stripe_head *sh)
2703 {
2704         sector_t stripe, stripe2;
2705         sector_t chunk_number;
2706         unsigned int chunk_offset;
2707         int pd_idx, qd_idx;
2708         int ddf_layout = 0;
2709         sector_t new_sector;
2710         int algorithm = previous ? conf->prev_algo
2711                                  : conf->algorithm;
2712         int sectors_per_chunk = previous ? conf->prev_chunk_sectors
2713                                          : conf->chunk_sectors;
2714         int raid_disks = previous ? conf->previous_raid_disks
2715                                   : conf->raid_disks;
2716         int data_disks = raid_disks - conf->max_degraded;
2717
2718         /* First compute the information on this sector */
2719
2720         /*
2721          * Compute the chunk number and the sector offset inside the chunk
2722          */
2723         chunk_offset = sector_div(r_sector, sectors_per_chunk);
2724         chunk_number = r_sector;
2725
2726         /*
2727          * Compute the stripe number
2728          */
2729         stripe = chunk_number;
2730         *dd_idx = sector_div(stripe, data_disks);
2731         stripe2 = stripe;
2732         /*
2733          * Select the parity disk based on the user selected algorithm.
2734          */
2735         pd_idx = qd_idx = -1;
2736         switch(conf->level) {
2737         case 4:
2738                 pd_idx = data_disks;
2739                 break;
2740         case 5:
2741                 switch (algorithm) {
2742                 case ALGORITHM_LEFT_ASYMMETRIC:
2743                         pd_idx = data_disks - sector_div(stripe2, raid_disks);
2744                         if (*dd_idx >= pd_idx)
2745                                 (*dd_idx)++;
2746                         break;
2747                 case ALGORITHM_RIGHT_ASYMMETRIC:
2748                         pd_idx = sector_div(stripe2, raid_disks);
2749                         if (*dd_idx >= pd_idx)
2750                                 (*dd_idx)++;
2751                         break;
2752                 case ALGORITHM_LEFT_SYMMETRIC:
2753                         pd_idx = data_disks - sector_div(stripe2, raid_disks);
2754                         *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
2755                         break;
2756                 case ALGORITHM_RIGHT_SYMMETRIC:
2757                         pd_idx = sector_div(stripe2, raid_disks);
2758                         *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
2759                         break;
2760                 case ALGORITHM_PARITY_0:
2761                         pd_idx = 0;
2762                         (*dd_idx)++;
2763                         break;
2764                 case ALGORITHM_PARITY_N:
2765                         pd_idx = data_disks;
2766                         break;
2767                 default:
2768                         BUG();
2769                 }
2770                 break;
2771         case 6:
2772
2773                 switch (algorithm) {
2774                 case ALGORITHM_LEFT_ASYMMETRIC:
2775                         pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2776                         qd_idx = pd_idx + 1;
2777                         if (pd_idx == raid_disks-1) {
2778                                 (*dd_idx)++;    /* Q D D D P */
2779                                 qd_idx = 0;
2780                         } else if (*dd_idx >= pd_idx)
2781                                 (*dd_idx) += 2; /* D D P Q D */
2782                         break;
2783                 case ALGORITHM_RIGHT_ASYMMETRIC:
2784                         pd_idx = sector_div(stripe2, raid_disks);
2785                         qd_idx = pd_idx + 1;
2786                         if (pd_idx == raid_disks-1) {
2787                                 (*dd_idx)++;    /* Q D D D P */
2788                                 qd_idx = 0;
2789                         } else if (*dd_idx >= pd_idx)
2790                                 (*dd_idx) += 2; /* D D P Q D */
2791                         break;
2792                 case ALGORITHM_LEFT_SYMMETRIC:
2793                         pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2794                         qd_idx = (pd_idx + 1) % raid_disks;
2795                         *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
2796                         break;
2797                 case ALGORITHM_RIGHT_SYMMETRIC:
2798                         pd_idx = sector_div(stripe2, raid_disks);
2799                         qd_idx = (pd_idx + 1) % raid_disks;
2800                         *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
2801                         break;
2802
2803                 case ALGORITHM_PARITY_0:
2804                         pd_idx = 0;
2805                         qd_idx = 1;
2806                         (*dd_idx) += 2;
2807                         break;
2808                 case ALGORITHM_PARITY_N:
2809                         pd_idx = data_disks;
2810                         qd_idx = data_disks + 1;
2811                         break;
2812
2813                 case ALGORITHM_ROTATING_ZERO_RESTART:
2814                         /* Exactly the same as RIGHT_ASYMMETRIC, but the
2815                          * order of blocks for computing Q is different.
2816                          */
2817                         pd_idx = sector_div(stripe2, raid_disks);
2818                         qd_idx = pd_idx + 1;
2819                         if (pd_idx == raid_disks-1) {
2820                                 (*dd_idx)++;    /* Q D D D P */
2821                                 qd_idx = 0;
2822                         } else if (*dd_idx >= pd_idx)
2823                                 (*dd_idx) += 2; /* D D P Q D */
2824                         ddf_layout = 1;
2825                         break;
2826
2827                 case ALGORITHM_ROTATING_N_RESTART:
2828                         /* Same as left_asymmetric, but the first stripe is
2829                          * D D D P Q  rather than
2830                          * Q D D D P
2831                          */
2832                         stripe2 += 1;
2833                         pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2834                         qd_idx = pd_idx + 1;
2835                         if (pd_idx == raid_disks-1) {
2836                                 (*dd_idx)++;    /* Q D D D P */
2837                                 qd_idx = 0;
2838                         } else if (*dd_idx >= pd_idx)
2839                                 (*dd_idx) += 2; /* D D P Q D */
2840                         ddf_layout = 1;
2841                         break;
2842
2843                 case ALGORITHM_ROTATING_N_CONTINUE:
2844                         /* Same as left_symmetric but Q is before P */
2845                         pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2846                         qd_idx = (pd_idx + raid_disks - 1) % raid_disks;
2847                         *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
2848                         ddf_layout = 1;
2849                         break;
2850
2851                 case ALGORITHM_LEFT_ASYMMETRIC_6:
2852                         /* RAID5 left_asymmetric, with Q on last device */
2853                         pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
2854                         if (*dd_idx >= pd_idx)
2855                                 (*dd_idx)++;
2856                         qd_idx = raid_disks - 1;
2857                         break;
2858
2859                 case ALGORITHM_RIGHT_ASYMMETRIC_6:
2860                         pd_idx = sector_div(stripe2, raid_disks-1);
2861                         if (*dd_idx >= pd_idx)
2862                                 (*dd_idx)++;
2863                         qd_idx = raid_disks - 1;
2864                         break;
2865
2866                 case ALGORITHM_LEFT_SYMMETRIC_6:
2867                         pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
2868                         *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
2869                         qd_idx = raid_disks - 1;
2870                         break;
2871
2872                 case ALGORITHM_RIGHT_SYMMETRIC_6:
2873                         pd_idx = sector_div(stripe2, raid_disks-1);
2874                         *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
2875                         qd_idx = raid_disks - 1;
2876                         break;
2877
2878                 case ALGORITHM_PARITY_0_6:
2879                         pd_idx = 0;
2880                         (*dd_idx)++;
2881                         qd_idx = raid_disks - 1;
2882                         break;
2883
2884                 default:
2885                         BUG();
2886                 }
2887                 break;
2888         }
2889
2890         if (sh) {
2891                 sh->pd_idx = pd_idx;
2892                 sh->qd_idx = qd_idx;
2893                 sh->ddf_layout = ddf_layout;
2894         }
2895         /*
2896          * Finally, compute the new sector number
2897          */
2898         new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset;
2899         return new_sector;
2900 }
2901
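/*
 * Inverse of raid5_compute_sector(): given a device index within a stripe,
 * recover the array ('big') sector stored there.  Continuing the example
 * above, device 3, sector 360 maps back to array sector 1000:
 *
 *   chunk_offset = 360 % 128 = 104      stripe = 360 / 128 = 2
 *   i            = 3 - (pd_idx + 1) = 1
 *   chunk_number = 2 * 3 + 1 = 7
 *   r_sector     = 7 * 128 + 104 = 1000
 *
 * The result is cross-checked by running the forward mapping again; 0 is
 * returned on a mismatch and for parity blocks, which hold no array data.
 */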
2902 sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
2903 {
2904         struct r5conf *conf = sh->raid_conf;
2905         int raid_disks = sh->disks;
2906         int data_disks = raid_disks - conf->max_degraded;
2907         sector_t new_sector = sh->sector, check;
2908         int sectors_per_chunk = previous ? conf->prev_chunk_sectors
2909                                          : conf->chunk_sectors;
2910         int algorithm = previous ? conf->prev_algo
2911                                  : conf->algorithm;
2912         sector_t stripe;
2913         int chunk_offset;
2914         sector_t chunk_number;
2915         int dummy1, dd_idx = i;
2916         sector_t r_sector;
2917         struct stripe_head sh2;
2918
2919         chunk_offset = sector_div(new_sector, sectors_per_chunk);
2920         stripe = new_sector;
2921
2922         if (i == sh->pd_idx)
2923                 return 0;
2924         switch(conf->level) {
2925         case 4: break;
2926         case 5:
2927                 switch (algorithm) {
2928                 case ALGORITHM_LEFT_ASYMMETRIC:
2929                 case ALGORITHM_RIGHT_ASYMMETRIC:
2930                         if (i > sh->pd_idx)
2931                                 i--;
2932                         break;
2933                 case ALGORITHM_LEFT_SYMMETRIC:
2934                 case ALGORITHM_RIGHT_SYMMETRIC:
2935                         if (i < sh->pd_idx)
2936                                 i += raid_disks;
2937                         i -= (sh->pd_idx + 1);
2938                         break;
2939                 case ALGORITHM_PARITY_0:
2940                         i -= 1;
2941                         break;
2942                 case ALGORITHM_PARITY_N:
2943                         break;
2944                 default:
2945                         BUG();
2946                 }
2947                 break;
2948         case 6:
2949                 if (i == sh->qd_idx)
2950                         return 0; /* It is the Q disk */
2951                 switch (algorithm) {
2952                 case ALGORITHM_LEFT_ASYMMETRIC:
2953                 case ALGORITHM_RIGHT_ASYMMETRIC:
2954                 case ALGORITHM_ROTATING_ZERO_RESTART:
2955                 case ALGORITHM_ROTATING_N_RESTART:
2956                         if (sh->pd_idx == raid_disks-1)
2957                                 i--;    /* Q D D D P */
2958                         else if (i > sh->pd_idx)
2959                                 i -= 2; /* D D P Q D */
2960                         break;
2961                 case ALGORITHM_LEFT_SYMMETRIC:
2962                 case ALGORITHM_RIGHT_SYMMETRIC:
2963                         if (sh->pd_idx == raid_disks-1)
2964                                 i--; /* Q D D D P */
2965                         else {
2966                                 /* D D P Q D */
2967                                 if (i < sh->pd_idx)
2968                                         i += raid_disks;
2969                                 i -= (sh->pd_idx + 2);
2970                         }
2971                         break;
2972                 case ALGORITHM_PARITY_0:
2973                         i -= 2;
2974                         break;
2975                 case ALGORITHM_PARITY_N:
2976                         break;
2977                 case ALGORITHM_ROTATING_N_CONTINUE:
2978                         /* Like left_symmetric, but P is before Q */
2979                         if (sh->pd_idx == 0)
2980                                 i--;    /* P D D D Q */
2981                         else {
2982                                 /* D D Q P D */
2983                                 if (i < sh->pd_idx)
2984                                         i += raid_disks;
2985                                 i -= (sh->pd_idx + 1);
2986                         }
2987                         break;
2988                 case ALGORITHM_LEFT_ASYMMETRIC_6:
2989                 case ALGORITHM_RIGHT_ASYMMETRIC_6:
2990                         if (i > sh->pd_idx)
2991                                 i--;
2992                         break;
2993                 case ALGORITHM_LEFT_SYMMETRIC_6:
2994                 case ALGORITHM_RIGHT_SYMMETRIC_6:
2995                         if (i < sh->pd_idx)
2996                                 i += data_disks + 1;
2997                         i -= (sh->pd_idx + 1);
2998                         break;
2999                 case ALGORITHM_PARITY_0_6:
3000                         i -= 1;
3001                         break;
3002                 default:
3003                         BUG();
3004                 }
3005                 break;
3006         }
3007
3008         chunk_number = stripe * data_disks + i;
3009         r_sector = chunk_number * sectors_per_chunk + chunk_offset;
3010
3011         check = raid5_compute_sector(conf, r_sector,
3012                                      previous, &dummy1, &sh2);
3013         if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx
3014                 || sh2.qd_idx != sh->qd_idx) {
3015                 pr_warn("md/raid:%s: compute_blocknr: map not correct\n",
3016                         mdname(conf->mddev));
3017                 return 0;
3018         }
3019         return r_sector;
3020 }
3021
3022 /*
3023  * There are cases where we want handle_stripe_dirtying() and
3024  * schedule_reconstruction() to delay towrite to some dev of a stripe.
3025  *
3026  * This function checks whether we want to delay the towrite. Specifically,
3027  * we delay the towrite when:
3028  *
3029  *   1. degraded stripe has a non-overwrite to the missing dev, AND this
3030  *      stripe has data in journal (for other devices).
3031  *
3032  *      In this case, when reading data for the non-overwrite dev, it is
3033  *      necessary to handle complex rmw of write back cache (prexor with
3034  *      orig_page, and xor with page). To keep read path simple, we would
3035  *      like to flush data in journal to RAID disks first, so complex rmw
3036  *      is handled in the write path (handle_stripe_dirtying).
3037  *
3038  *   2. when journal space is critical (R5C_LOG_CRITICAL=1)
3039  *
3040  *      It is important to be able to flush all stripes in raid5-cache.
3041  *      Therefore, we need to reserve some space on the journal device for
3042  *      these flushes. If the flush operation includes pending writes to the
3043  *      stripe, we need to reserve (conf->raid_disks + 1) pages per stripe
3044  *      for the flush out. If we exclude these pending writes from the flush
3045  *      operation, we only need (conf->max_degraded + 1) pages per stripe.
3046  *      Therefore, excluding pending writes in these cases enables more
3047  *      efficient use of the journal device (see the worked numbers below).
3048  *
3049  *      Note: To make sure the stripe makes progress, we only delay
3050  *      towrite for stripes with data already in journal (injournal > 0).
3051  *      When LOG_CRITICAL, stripes with injournal == 0 will be sent to
3052  *      the no_space_stripes list.
3053  *
3054  *   3. during journal failure
3055  *      On journal failure, we try to flush all cached data to raid disks
3056  *      based on data in the stripe cache. The array is read-only to upper
3057  *      layers, so we skip all pending writes.
3058  *
3059  */
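/*
 * Worked numbers for case 2 above (hypothetical geometry): on an 8-device
 * RAID-6 array (max_degraded = 2), flushing a stripe together with its
 * pending writes requires reserving conf->raid_disks + 1 = 9 pages of
 * journal space, while flushing only the data already in the journal needs
 * just conf->max_degraded + 1 = 3 pages.
 */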
3060 static inline bool delay_towrite(struct r5conf *conf,
3061                                  struct r5dev *dev,
3062                                  struct stripe_head_state *s)
3063 {
3064         /* case 1 above */
3065         if (!test_bit(R5_OVERWRITE, &dev->flags) &&
3066             !test_bit(R5_Insync, &dev->flags) && s->injournal)
3067                 return true;
3068         /* case 2 above */
3069         if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
3070             s->injournal > 0)
3071                 return true;
3072         /* case 3 above */
3073         if (s->log_failed && s->injournal)
3074                 return true;
3075         return false;
3076 }
3077
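/*
 * Queue the asynchronous operations needed to compute new parity for a
 * stripe and lock every device that takes part.  With 'rcw' the new data
 * is drained into the stripe cache and parity is reconstructed from all
 * devices; otherwise a read-modify-write is scheduled (prexor with the old
 * data, then drain and reconstruct).  's->locked' counts the devices left
 * locked for the async path.
 */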
3078 static void
3079 schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
3080                          int rcw, int expand)
3081 {
3082         int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks;
3083         struct r5conf *conf = sh->raid_conf;
3084         int level = conf->level;
3085
3086         if (rcw) {
3087                 /*
3088                  * In some cases, handle_stripe_dirtying initially decides to
3089                  * run rmw and allocates an extra page for prexor. However, rcw
3090                  * turns out to be cheaper later on. Free the extra page now,
3091                  * because we won't be able to do that in ops_complete_prexor().
3092                  */
3093                 r5c_release_extra_page(sh);
3094
3095                 for (i = disks; i--; ) {
3096                         struct r5dev *dev = &sh->dev[i];
3097
3098                         if (dev->towrite && !delay_towrite(conf, dev, s)) {
3099                                 set_bit(R5_LOCKED, &dev->flags);
3100                                 set_bit(R5_Wantdrain, &dev->flags);
3101                                 if (!expand)
3102                                         clear_bit(R5_UPTODATE, &dev->flags);
3103                                 s->locked++;
3104                         } else if (test_bit(R5_InJournal, &dev->flags)) {
3105                                 set_bit(R5_LOCKED, &dev->flags);
3106                                 s->locked++;
3107                         }
3108                 }
3109                 /* if we are not expanding this is a proper write request, and
3110                  * there will be bios with new data to be drained into the
3111                  * stripe cache
3112                  */
3113                 if (!expand) {
3114                         if (!s->locked)
3115                                 /* False alarm, nothing to do */
3116                                 return;
3117                         sh->reconstruct_state = reconstruct_state_drain_run;
3118                         set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
3119                 } else
3120                         sh->reconstruct_state = reconstruct_state_run;
3121
3122                 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
3123
3124                 if (s->locked + conf->max_degraded == disks)
3125                         if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
3126                                 atomic_inc(&conf->pending_full_writes);
3127         } else {
3128                 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
3129                         test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
3130                 BUG_ON(level == 6 &&
3131                         (!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) ||
3132                            test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags))));
3133
3134                 for (i = disks; i--; ) {
3135                         struct r5dev *dev = &sh->dev[i];
3136                         if (i == pd_idx || i == qd_idx)
3137                                 continue;
3138
3139                         if (dev->towrite &&
3140                             (test_bit(R5_UPTODATE, &dev->flags) ||
3141                              test_bit(R5_Wantcompute, &dev->flags))) {
3142                                 set_bit(R5_Wantdrain, &dev->flags);
3143                                 set_bit(R5_LOCKED, &dev->flags);
3144                                 clear_bit(R5_UPTODATE, &dev->flags);
3145                                 s->locked++;
3146                         } else if (test_bit(R5_InJournal, &dev->flags)) {
3147                                 set_bit(R5_LOCKED, &dev->flags);
3148                                 s->locked++;
3149                         }
3150                 }
3151                 if (!s->locked)
3152                         /* False alarm - nothing to do */
3153                         return;
3154                 sh->reconstruct_state = reconstruct_state_prexor_drain_run;
3155                 set_bit(STRIPE_OP_PREXOR, &s->ops_request);
3156                 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
3157                 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
3158         }
3159
3160         /* keep the parity disk(s) locked while asynchronous operations
3161          * are in flight
3162          */
3163         set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
3164         clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
3165         s->locked++;
3166
3167         if (level == 6) {
3168                 int qd_idx = sh->qd_idx;
3169                 struct r5dev *dev = &sh->dev[qd_idx];
3170
3171                 set_bit(R5_LOCKED, &dev->flags);
3172                 clear_bit(R5_UPTODATE, &dev->flags);
3173                 s->locked++;
3174         }
3175
3176         if (raid5_has_ppl(sh->raid_conf) && sh->ppl_page &&
3177             test_bit(STRIPE_OP_BIODRAIN, &s->ops_request) &&