Btrfs: fix scrub race leading to use-after-free
fs/btrfs/scrub.c
1 /*
2  * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18
19 #include <linux/blkdev.h>
20 #include <linux/ratelimit.h>
21 #include "ctree.h"
22 #include "volumes.h"
23 #include "disk-io.h"
24 #include "ordered-data.h"
25 #include "transaction.h"
26 #include "backref.h"
27 #include "extent_io.h"
28 #include "dev-replace.h"
29 #include "check-integrity.h"
30 #include "rcu-string.h"
31 #include "raid56.h"
32
33 /*
34  * This is only the first step towards a full-featured scrub. It reads all
35  * extents and super blocks and verifies the checksums. In case a bad checksum
36  * is found or the extent cannot be read, good data will be written back if
37  * any can be found.
38  *
39  * Future enhancements:
40  *  - In case an unrepairable extent is encountered, track which files are
41  *    affected and report them
42  *  - track and record media errors, throw out bad devices
43  *  - add a mode to also read unallocated space
44  */
45
46 struct scrub_block;
47 struct scrub_ctx;
48
49 /*
50  * the following three values only influence the performance.
51  * The last one configures the number of parallel and outstanding I/O
52  * operations. The first two values configure an upper limit for the number
53  * of (dynamically allocated) pages that are added to a bio.
54  */
55 #define SCRUB_PAGES_PER_RD_BIO  32      /* 128k per bio */
56 #define SCRUB_PAGES_PER_WR_BIO  32      /* 128k per bio */
57 #define SCRUB_BIOS_PER_SCTX     64      /* 8MB per device in flight */
58
59 /*
60  * the following value times PAGE_SIZE needs to be large enough to match the
61  * largest node/leaf/sector size that shall be supported.
62  * Values larger than BTRFS_STRIPE_LEN are not supported.
63  */
64 #define SCRUB_MAX_PAGES_PER_BLOCK       16      /* 64k per node/leaf/sector */
65
66 struct scrub_recover {
67         atomic_t                refs;
68         struct btrfs_bio        *bbio;
69         u64                     map_length;
70 };
71
72 struct scrub_page {
73         struct scrub_block      *sblock;
74         struct page             *page;
75         struct btrfs_device     *dev;
76         struct list_head        list;
77         u64                     flags;  /* extent flags */
78         u64                     generation;
79         u64                     logical;
80         u64                     physical;
81         u64                     physical_for_dev_replace;
82         atomic_t                refs;
83         struct {
84                 unsigned int    mirror_num:8;
85                 unsigned int    have_csum:1;
86                 unsigned int    io_error:1;
87         };
88         u8                      csum[BTRFS_CSUM_SIZE];
89
90         struct scrub_recover    *recover;
91 };
92
93 struct scrub_bio {
94         int                     index;
95         struct scrub_ctx        *sctx;
96         struct btrfs_device     *dev;
97         struct bio              *bio;
98         int                     err;
99         u64                     logical;
100         u64                     physical;
101 #if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
102         struct scrub_page       *pagev[SCRUB_PAGES_PER_WR_BIO];
103 #else
104         struct scrub_page       *pagev[SCRUB_PAGES_PER_RD_BIO];
105 #endif
106         int                     page_count;
107         int                     next_free;
108         struct btrfs_work       work;
109 };
110
111 struct scrub_block {
112         struct scrub_page       *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
113         int                     page_count;
114         atomic_t                outstanding_pages;
115         atomic_t                refs; /* free mem on transition to zero */
116         struct scrub_ctx        *sctx;
117         struct scrub_parity     *sparity;
118         struct {
119                 unsigned int    header_error:1;
120                 unsigned int    checksum_error:1;
121                 unsigned int    no_io_error_seen:1;
122                 unsigned int    generation_error:1; /* also sets header_error */
123
124                 /* The following is for the data used to check parity, */
125                 /* i.e. the data that carries a checksum */
126                 unsigned int    data_corrected:1;
127         };
128 };
129
130 /* Used for the chunks with parity stripes, such as RAID5/6 */
131 struct scrub_parity {
132         struct scrub_ctx        *sctx;
133
134         struct btrfs_device     *scrub_dev;
135
136         u64                     logic_start;
137
138         u64                     logic_end;
139
140         int                     nsectors;
141
142         int                     stripe_len;
143
144         atomic_t                refs;
145
146         struct list_head        spages;
147
148         /* Work of parity check and repair */
149         struct btrfs_work       work;
150
151         /* Mark the parity blocks which have data */
152         unsigned long           *dbitmap;
153
154         /*
155          * Mark the parity blocks which have data, but for which errors
156          * occurred when reading or checking that data
157          */
158         unsigned long           *ebitmap;
159
160         unsigned long           bitmap[0];
161 };
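/*
 * Illustrative sketch, compiled out and not part of scrub.c: the trailing
 * bitmap[0] member above is the usual C flexible-array idiom.  The helper
 * below is only an assumption of how a caller could allocate the struct
 * plus both bitmaps in one chunk and point dbitmap/ebitmap into the
 * trailing space; the function name is made up.
 */
#if 0
static struct scrub_parity *example_alloc_scrub_parity(int nsectors)
{
	struct scrub_parity *sparity;
	int bitmap_len = sizeof(long) * BITS_TO_LONGS(nsectors);

	sparity = kzalloc(sizeof(*sparity) + 2 * bitmap_len, GFP_NOFS);
	if (!sparity)
		return NULL;

	sparity->nsectors = nsectors;
	/* first half of the trailing area: parity blocks that carry data */
	sparity->dbitmap = sparity->bitmap;
	/* second half: data blocks that hit read/check errors */
	sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
	return sparity;
}
#endif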
162
163 struct scrub_wr_ctx {
164         struct scrub_bio *wr_curr_bio;
165         struct btrfs_device *tgtdev;
166         int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
167         atomic_t flush_all_writes;
168         struct mutex wr_lock;
169 };
170
171 struct scrub_ctx {
172         struct scrub_bio        *bios[SCRUB_BIOS_PER_SCTX];
173         struct btrfs_root       *dev_root;
174         int                     first_free;
175         int                     curr;
176         atomic_t                bios_in_flight;
177         atomic_t                workers_pending;
178         spinlock_t              list_lock;
179         wait_queue_head_t       list_wait;
180         u16                     csum_size;
181         struct list_head        csum_list;
182         atomic_t                cancel_req;
183         int                     readonly;
184         int                     pages_per_rd_bio;
185         u32                     sectorsize;
186         u32                     nodesize;
187
188         int                     is_dev_replace;
189         struct scrub_wr_ctx     wr_ctx;
190
191         /*
192          * statistics
193          */
194         struct btrfs_scrub_progress stat;
195         spinlock_t              stat_lock;
196 };
197
198 struct scrub_fixup_nodatasum {
199         struct scrub_ctx        *sctx;
200         struct btrfs_device     *dev;
201         u64                     logical;
202         struct btrfs_root       *root;
203         struct btrfs_work       work;
204         int                     mirror_num;
205 };
206
207 struct scrub_nocow_inode {
208         u64                     inum;
209         u64                     offset;
210         u64                     root;
211         struct list_head        list;
212 };
213
214 struct scrub_copy_nocow_ctx {
215         struct scrub_ctx        *sctx;
216         u64                     logical;
217         u64                     len;
218         int                     mirror_num;
219         u64                     physical_for_dev_replace;
220         struct list_head        inodes;
221         struct btrfs_work       work;
222 };
223
224 struct scrub_warning {
225         struct btrfs_path       *path;
226         u64                     extent_item_size;
227         const char              *errstr;
228         sector_t                sector;
229         u64                     logical;
230         struct btrfs_device     *dev;
231 };
232
233 static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
234 static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
235 static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
236 static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
237 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
238 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
239                                      struct scrub_block *sblocks_for_recheck);
240 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
241                                 struct scrub_block *sblock, int is_metadata,
242                                 int have_csum, u8 *csum, u64 generation,
243                                 u16 csum_size, int retry_failed_mirror);
244 static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
245                                          struct scrub_block *sblock,
246                                          int is_metadata, int have_csum,
247                                          const u8 *csum, u64 generation,
248                                          u16 csum_size);
249 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
250                                              struct scrub_block *sblock_good);
251 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
252                                             struct scrub_block *sblock_good,
253                                             int page_num, int force_write);
254 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
255 static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
256                                            int page_num);
257 static int scrub_checksum_data(struct scrub_block *sblock);
258 static int scrub_checksum_tree_block(struct scrub_block *sblock);
259 static int scrub_checksum_super(struct scrub_block *sblock);
260 static void scrub_block_get(struct scrub_block *sblock);
261 static void scrub_block_put(struct scrub_block *sblock);
262 static void scrub_page_get(struct scrub_page *spage);
263 static void scrub_page_put(struct scrub_page *spage);
264 static void scrub_parity_get(struct scrub_parity *sparity);
265 static void scrub_parity_put(struct scrub_parity *sparity);
266 static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
267                                     struct scrub_page *spage);
268 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
269                        u64 physical, struct btrfs_device *dev, u64 flags,
270                        u64 gen, int mirror_num, u8 *csum, int force,
271                        u64 physical_for_dev_replace);
272 static void scrub_bio_end_io(struct bio *bio, int err);
273 static void scrub_bio_end_io_worker(struct btrfs_work *work);
274 static void scrub_block_complete(struct scrub_block *sblock);
275 static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
276                                u64 extent_logical, u64 extent_len,
277                                u64 *extent_physical,
278                                struct btrfs_device **extent_dev,
279                                int *extent_mirror_num);
280 static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
281                               struct scrub_wr_ctx *wr_ctx,
282                               struct btrfs_fs_info *fs_info,
283                               struct btrfs_device *dev,
284                               int is_dev_replace);
285 static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);
286 static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
287                                     struct scrub_page *spage);
288 static void scrub_wr_submit(struct scrub_ctx *sctx);
289 static void scrub_wr_bio_end_io(struct bio *bio, int err);
290 static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
291 static int write_page_nocow(struct scrub_ctx *sctx,
292                             u64 physical_for_dev_replace, struct page *page);
293 static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
294                                       struct scrub_copy_nocow_ctx *ctx);
295 static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
296                             int mirror_num, u64 physical_for_dev_replace);
297 static void copy_nocow_pages_worker(struct btrfs_work *work);
298 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
299 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
300
301
302 static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
303 {
304         atomic_inc(&sctx->bios_in_flight);
305 }
306
307 static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
308 {
309         struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
310
311         /*
312          * Hold the scrub_lock while doing the wakeup to ensure the
313          * sctx (and its wait queue list_wait) isn't destroyed/freed
314          * during the wakeup.
315          */
316         mutex_lock(&fs_info->scrub_lock);
317         atomic_dec(&sctx->bios_in_flight);
318         wake_up(&sctx->list_wait);
319         mutex_unlock(&fs_info->scrub_lock);
320 }
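/*
 * Illustrative sketch, compiled out and not part of scrub.c: an assumed
 * shape for the waiting side that the scrub_lock above protects against.
 * The helper name is made up.  Without taking scrub_lock around the
 * atomic_dec()/wake_up() pair above, the waiter below could see
 * bios_in_flight drop to zero, free the sctx, and let the worker's later
 * wake_up(&sctx->list_wait) dereference freed memory.
 */
#if 0
static void example_wait_and_free(struct btrfs_fs_info *fs_info,
				  struct scrub_ctx *sctx)
{
	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);

	mutex_lock(&fs_info->scrub_lock);	/* excludes the dec + wake_up */
	scrub_free_ctx(sctx);
	mutex_unlock(&fs_info->scrub_lock);
}
#endif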
321
322 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
323 {
324         while (atomic_read(&fs_info->scrub_pause_req)) {
325                 mutex_unlock(&fs_info->scrub_lock);
326                 wait_event(fs_info->scrub_pause_wait,
327                    atomic_read(&fs_info->scrub_pause_req) == 0);
328                 mutex_lock(&fs_info->scrub_lock);
329         }
330 }
331
332 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
333 {
334         atomic_inc(&fs_info->scrubs_paused);
335         wake_up(&fs_info->scrub_pause_wait);
336
337         mutex_lock(&fs_info->scrub_lock);
338         __scrub_blocked_if_needed(fs_info);
339         atomic_dec(&fs_info->scrubs_paused);
340         mutex_unlock(&fs_info->scrub_lock);
341
342         wake_up(&fs_info->scrub_pause_wait);
343 }
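/*
 * Illustrative sketch, compiled out and not part of scrub.c: an assumed
 * shape for the other half of the pause handshake.  The helper names are
 * made up.  A pauser raises scrub_pause_req and waits until every running
 * scrub has parked itself in __scrub_blocked_if_needed() (scrubs_paused
 * catches up with scrubs_running); dropping the request and waking
 * scrub_pause_wait lets the scrubs continue.
 */
#if 0
static void example_pause_all_scrubs(struct btrfs_fs_info *fs_info)
{
	atomic_inc(&fs_info->scrub_pause_req);
	wait_event(fs_info->scrub_pause_wait,
		   atomic_read(&fs_info->scrubs_paused) ==
		   atomic_read(&fs_info->scrubs_running));
}

static void example_resume_all_scrubs(struct btrfs_fs_info *fs_info)
{
	atomic_dec(&fs_info->scrub_pause_req);
	wake_up(&fs_info->scrub_pause_wait);
}
#endif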
344
345 /*
346  * used for workers that require transaction commits (i.e., for the
347  * NOCOW case)
348  */
349 static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
350 {
351         struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
352
353         /*
354          * increment scrubs_running to prevent cancel requests from
355          * completing as long as a worker is running. we must also
356          * increment scrubs_paused to prevent deadlocking on pause
357          * requests used for transaction commits (as the worker uses a
358          * transaction context). it is safe to regard the worker
359          * as paused for all practical matters. effectively, we only
360          * prevent cancellation requests from completing.
361          */
362         mutex_lock(&fs_info->scrub_lock);
363         atomic_inc(&fs_info->scrubs_running);
364         atomic_inc(&fs_info->scrubs_paused);
365         mutex_unlock(&fs_info->scrub_lock);
366
367         /*
368          * checking the @scrubs_running == @scrubs_paused condition
369          * inside wait_event() is not an atomic operation, which means
370          * we may inc/dec @scrubs_running/@scrubs_paused at any time.
371          * Wake up @scrub_pause_wait as often as we can so that a
372          * blocked transaction commit waits as little as possible.
373          */
374         wake_up(&fs_info->scrub_pause_wait);
375
376         atomic_inc(&sctx->workers_pending);
377 }
378
379 /* used for workers that require transaction commits */
380 static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
381 {
382         struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
383
384         /*
385          * see scrub_pending_trans_workers_inc() for why we're pretending
386          * to be paused in the scrub counters
387          */
388         mutex_lock(&fs_info->scrub_lock);
389         atomic_dec(&fs_info->scrubs_running);
390         atomic_dec(&fs_info->scrubs_paused);
391         atomic_dec(&sctx->workers_pending);
392         wake_up(&fs_info->scrub_pause_wait);
393         /*
394          * Hold the scrub_lock while doing the wakeup to ensure the
395          * sctx (and its wait queue list_wait) isn't destroyed/freed
396          * during the wakeup.
397          */
398         wake_up(&sctx->list_wait);
399         mutex_unlock(&fs_info->scrub_lock);
400 }
401
402 static void scrub_free_csums(struct scrub_ctx *sctx)
403 {
404         while (!list_empty(&sctx->csum_list)) {
405                 struct btrfs_ordered_sum *sum;
406                 sum = list_first_entry(&sctx->csum_list,
407                                        struct btrfs_ordered_sum, list);
408                 list_del(&sum->list);
409                 kfree(sum);
410         }
411 }
412
413 static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
414 {
415         int i;
416
417         if (!sctx)
418                 return;
419
420         scrub_free_wr_ctx(&sctx->wr_ctx);
421
422         /* this can happen when scrub is cancelled */
423         if (sctx->curr != -1) {
424                 struct scrub_bio *sbio = sctx->bios[sctx->curr];
425
426                 for (i = 0; i < sbio->page_count; i++) {
427                         WARN_ON(!sbio->pagev[i]->page);
428                         scrub_block_put(sbio->pagev[i]->sblock);
429                 }
430                 bio_put(sbio->bio);
431         }
432
433         for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
434                 struct scrub_bio *sbio = sctx->bios[i];
435
436                 if (!sbio)
437                         break;
438                 kfree(sbio);
439         }
440
441         scrub_free_csums(sctx);
442         kfree(sctx);
443 }
444
445 static noinline_for_stack
446 struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
447 {
448         struct scrub_ctx *sctx;
449         int             i;
450         struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
451         int pages_per_rd_bio;
452         int ret;
453
454         /*
455          * the setting of pages_per_rd_bio is correct for scrub but might
456          * be wrong for the dev_replace code where we might read from
457          * different devices in the initial huge bios. However, that
458          * code is able to correctly handle the case when adding a page
459          * to a bio fails.
460          */
461         if (dev->bdev)
462                 pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO,
463                                          bio_get_nr_vecs(dev->bdev));
464         else
465                 pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
466         sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
467         if (!sctx)
468                 goto nomem;
469         sctx->is_dev_replace = is_dev_replace;
470         sctx->pages_per_rd_bio = pages_per_rd_bio;
471         sctx->curr = -1;
472         sctx->dev_root = dev->dev_root;
473         for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
474                 struct scrub_bio *sbio;
475
476                 sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
477                 if (!sbio)
478                         goto nomem;
479                 sctx->bios[i] = sbio;
480
481                 sbio->index = i;
482                 sbio->sctx = sctx;
483                 sbio->page_count = 0;
484                 btrfs_init_work(&sbio->work, btrfs_scrub_helper,
485                                 scrub_bio_end_io_worker, NULL, NULL);
486
487                 if (i != SCRUB_BIOS_PER_SCTX - 1)
488                         sctx->bios[i]->next_free = i + 1;
489                 else
490                         sctx->bios[i]->next_free = -1;
491         }
492         sctx->first_free = 0;
493         sctx->nodesize = dev->dev_root->nodesize;
494         sctx->sectorsize = dev->dev_root->sectorsize;
495         atomic_set(&sctx->bios_in_flight, 0);
496         atomic_set(&sctx->workers_pending, 0);
497         atomic_set(&sctx->cancel_req, 0);
498         sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
499         INIT_LIST_HEAD(&sctx->csum_list);
500
501         spin_lock_init(&sctx->list_lock);
502         spin_lock_init(&sctx->stat_lock);
503         init_waitqueue_head(&sctx->list_wait);
504
505         ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info,
506                                  fs_info->dev_replace.tgtdev, is_dev_replace);
507         if (ret) {
508                 scrub_free_ctx(sctx);
509                 return ERR_PTR(ret);
510         }
511         return sctx;
512
513 nomem:
514         scrub_free_ctx(sctx);
515         return ERR_PTR(-ENOMEM);
516 }
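/*
 * Illustrative sketch, compiled out and not part of scrub.c: how the
 * first_free/next_free chain built in scrub_setup_ctx() is assumed to be
 * consumed and replenished.  The bios[] array forms a free list threaded
 * through array indices; the helper names are made up.
 */
#if 0
static struct scrub_bio *example_get_free_bio(struct scrub_ctx *sctx)
{
	struct scrub_bio *sbio = NULL;

	spin_lock(&sctx->list_lock);
	if (sctx->first_free != -1) {
		/* unlink the head of the free list */
		sbio = sctx->bios[sctx->first_free];
		sctx->first_free = sbio->next_free;
		sbio->next_free = -1;
	}
	spin_unlock(&sctx->list_lock);
	return sbio;		/* NULL means all bios are in flight */
}

static void example_put_free_bio(struct scrub_ctx *sctx, struct scrub_bio *sbio)
{
	spin_lock(&sctx->list_lock);
	/* push the slot back and wake anyone waiting for a free bio */
	sbio->next_free = sctx->first_free;
	sctx->first_free = sbio->index;
	spin_unlock(&sctx->list_lock);
	wake_up(&sctx->list_wait);
}
#endif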
517
518 static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
519                                      void *warn_ctx)
520 {
521         u64 isize;
522         u32 nlink;
523         int ret;
524         int i;
525         struct extent_buffer *eb;
526         struct btrfs_inode_item *inode_item;
527         struct scrub_warning *swarn = warn_ctx;
528         struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
529         struct inode_fs_paths *ipath = NULL;
530         struct btrfs_root *local_root;
531         struct btrfs_key root_key;
532         struct btrfs_key key;
533
534         root_key.objectid = root;
535         root_key.type = BTRFS_ROOT_ITEM_KEY;
536         root_key.offset = (u64)-1;
537         local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
538         if (IS_ERR(local_root)) {
539                 ret = PTR_ERR(local_root);
540                 goto err;
541         }
542
543         /*
544          * this makes the path point to (inum INODE_ITEM ioff)
545          */
546         key.objectid = inum;
547         key.type = BTRFS_INODE_ITEM_KEY;
548         key.offset = 0;
549
550         ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
551         if (ret) {
552                 btrfs_release_path(swarn->path);
553                 goto err;
554         }
555
556         eb = swarn->path->nodes[0];
557         inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
558                                         struct btrfs_inode_item);
559         isize = btrfs_inode_size(eb, inode_item);
560         nlink = btrfs_inode_nlink(eb, inode_item);
561         btrfs_release_path(swarn->path);
562
563         ipath = init_ipath(4096, local_root, swarn->path);
564         if (IS_ERR(ipath)) {
565                 ret = PTR_ERR(ipath);
566                 ipath = NULL;
567                 goto err;
568         }
569         ret = paths_from_inode(inum, ipath);
570
571         if (ret < 0)
572                 goto err;
573
574         /*
575          * we deliberately ignore the fact that ipath might have been too
576          * small to hold all of the paths here
577          */
578         for (i = 0; i < ipath->fspath->elem_cnt; ++i)
579                 printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
580                         "%s, sector %llu, root %llu, inode %llu, offset %llu, "
581                         "length %llu, links %u (path: %s)\n", swarn->errstr,
582                         swarn->logical, rcu_str_deref(swarn->dev->name),
583                         (unsigned long long)swarn->sector, root, inum, offset,
584                         min(isize - offset, (u64)PAGE_SIZE), nlink,
585                         (char *)(unsigned long)ipath->fspath->val[i]);
586
587         free_ipath(ipath);
588         return 0;
589
590 err:
591         printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
592                 "%s, sector %llu, root %llu, inode %llu, offset %llu: path "
593                 "resolving failed with ret=%d\n", swarn->errstr,
594                 swarn->logical, rcu_str_deref(swarn->dev->name),
595                 (unsigned long long)swarn->sector, root, inum, offset, ret);
596
597         free_ipath(ipath);
598         return 0;
599 }
600
601 static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
602 {
603         struct btrfs_device *dev;
604         struct btrfs_fs_info *fs_info;
605         struct btrfs_path *path;
606         struct btrfs_key found_key;
607         struct extent_buffer *eb;
608         struct btrfs_extent_item *ei;
609         struct scrub_warning swarn;
610         unsigned long ptr = 0;
611         u64 extent_item_pos;
612         u64 flags = 0;
613         u64 ref_root;
614         u32 item_size;
615         u8 ref_level;
616         int ret;
617
618         WARN_ON(sblock->page_count < 1);
619         dev = sblock->pagev[0]->dev;
620         fs_info = sblock->sctx->dev_root->fs_info;
621
622         path = btrfs_alloc_path();
623         if (!path)
624                 return;
625
626         swarn.sector = (sblock->pagev[0]->physical) >> 9;
627         swarn.logical = sblock->pagev[0]->logical;
628         swarn.errstr = errstr;
629         swarn.dev = NULL;
630
631         ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
632                                   &flags);
633         if (ret < 0)
634                 goto out;
635
636         extent_item_pos = swarn.logical - found_key.objectid;
637         swarn.extent_item_size = found_key.offset;
638
639         eb = path->nodes[0];
640         ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
641         item_size = btrfs_item_size_nr(eb, path->slots[0]);
642
643         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
644                 do {
645                         ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
646                                                       item_size, &ref_root,
647                                                       &ref_level);
648                         printk_in_rcu(KERN_WARNING
649                                 "BTRFS: %s at logical %llu on dev %s, "
650                                 "sector %llu: metadata %s (level %d) in tree "
651                                 "%llu\n", errstr, swarn.logical,
652                                 rcu_str_deref(dev->name),
653                                 (unsigned long long)swarn.sector,
654                                 ref_level ? "node" : "leaf",
655                                 ret < 0 ? -1 : ref_level,
656                                 ret < 0 ? -1 : ref_root);
657                 } while (ret != 1);
658                 btrfs_release_path(path);
659         } else {
660                 btrfs_release_path(path);
661                 swarn.path = path;
662                 swarn.dev = dev;
663                 iterate_extent_inodes(fs_info, found_key.objectid,
664                                         extent_item_pos, 1,
665                                         scrub_print_warning_inode, &swarn);
666         }
667
668 out:
669         btrfs_free_path(path);
670 }
671
672 static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
673 {
674         struct page *page = NULL;
675         unsigned long index;
676         struct scrub_fixup_nodatasum *fixup = fixup_ctx;
677         int ret;
678         int corrected = 0;
679         struct btrfs_key key;
680         struct inode *inode = NULL;
681         struct btrfs_fs_info *fs_info;
682         u64 end = offset + PAGE_SIZE - 1;
683         struct btrfs_root *local_root;
684         int srcu_index;
685
686         key.objectid = root;
687         key.type = BTRFS_ROOT_ITEM_KEY;
688         key.offset = (u64)-1;
689
690         fs_info = fixup->root->fs_info;
691         srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
692
693         local_root = btrfs_read_fs_root_no_name(fs_info, &key);
694         if (IS_ERR(local_root)) {
695                 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
696                 return PTR_ERR(local_root);
697         }
698
699         key.type = BTRFS_INODE_ITEM_KEY;
700         key.objectid = inum;
701         key.offset = 0;
702         inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
703         srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
704         if (IS_ERR(inode))
705                 return PTR_ERR(inode);
706
707         index = offset >> PAGE_CACHE_SHIFT;
708
709         page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
710         if (!page) {
711                 ret = -ENOMEM;
712                 goto out;
713         }
714
715         if (PageUptodate(page)) {
716                 if (PageDirty(page)) {
717                         /*
718                          * we need to write the data to the defective sector. the
719                          * data that was in that sector is not in memory,
720                          * because the page was modified. we must not write the
721                          * modified page to that sector.
722                          *
723                          * TODO: what could be done here: wait for the delalloc
724                          *       runner to write out that page (might involve
725                          *       COW) and see whether the sector is still
726                          *       referenced afterwards.
727                          *
728                          * For the meantime, we'll treat this error as
729                          * uncorrectable, although there is a chance that a
730                          * later scrub will find the bad sector again when
731                          * there is no dirty page in memory by then.
732                          */
733                         ret = -EIO;
734                         goto out;
735                 }
736                 ret = repair_io_failure(inode, offset, PAGE_SIZE,
737                                         fixup->logical, page,
738                                         offset - page_offset(page),
739                                         fixup->mirror_num);
740                 unlock_page(page);
741                 corrected = !ret;
742         } else {
743                 /*
744                  * we need to get good data first. the general readpage path
745                  * will call repair_io_failure for us; we just have to make
746                  * sure we read the bad mirror.
747                  */
748                 ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
749                                         EXTENT_DAMAGED, GFP_NOFS);
750                 if (ret) {
751                         /* set_extent_bits should give proper error */
752                         WARN_ON(ret > 0);
753                         if (ret > 0)
754                                 ret = -EFAULT;
755                         goto out;
756                 }
757
758                 ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
759                                                 btrfs_get_extent,
760                                                 fixup->mirror_num);
761                 wait_on_page_locked(page);
762
763                 corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
764                                                 end, EXTENT_DAMAGED, 0, NULL);
765                 if (!corrected)
766                         clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
767                                                 EXTENT_DAMAGED, GFP_NOFS);
768         }
769
770 out:
771         if (page)
772                 put_page(page);
773
774         iput(inode);
775
776         if (ret < 0)
777                 return ret;
778
779         if (ret == 0 && corrected) {
780                 /*
781                  * we only need to call readpage for one of the inodes belonging
782                  * to this extent. so make iterate_extent_inodes stop
783                  */
784                 return 1;
785         }
786
787         return -EIO;
788 }
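/*
 * Illustrative sketch, compiled out and not part of scrub.c: the callback
 * contract assumed by the return values above.  Iterators in the
 * iterate_inodes_from_logical() style call the callback once per
 * (inum, offset, root) tuple and stop as soon as it returns non-zero;
 * a negative value is an error, the "return 1" above means "done".
 * All names below are made up.
 */
#if 0
static int example_iterate_inodes(const u64 *inums, int count, void *ctx,
				  int (*cb)(u64 inum, u64 offset, u64 root,
					    void *ctx))
{
	int i;
	int ret = 0;

	for (i = 0; i < count; i++) {
		ret = cb(inums[i], 0, 0, ctx);
		if (ret)	/* error (< 0) or early stop (> 0) */
			break;
	}
	return ret < 0 ? ret : 0;
}
#endif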
789
790 static void scrub_fixup_nodatasum(struct btrfs_work *work)
791 {
792         int ret;
793         struct scrub_fixup_nodatasum *fixup;
794         struct scrub_ctx *sctx;
795         struct btrfs_trans_handle *trans = NULL;
796         struct btrfs_path *path;
797         int uncorrectable = 0;
798
799         fixup = container_of(work, struct scrub_fixup_nodatasum, work);
800         sctx = fixup->sctx;
801
802         path = btrfs_alloc_path();
803         if (!path) {
804                 spin_lock(&sctx->stat_lock);
805                 ++sctx->stat.malloc_errors;
806                 spin_unlock(&sctx->stat_lock);
807                 uncorrectable = 1;
808                 goto out;
809         }
810
811         trans = btrfs_join_transaction(fixup->root);
812         if (IS_ERR(trans)) {
813                 uncorrectable = 1;
814                 goto out;
815         }
816
817         /*
818          * the idea is to trigger a regular read through the standard path. we
819          * read a page from the (failed) logical address by specifying the
820          * corresponding copynum of the failed sector. thus, that readpage is
821          * expected to fail.
822          * that is the point where on-the-fly error correction will kick in
823          * (once it's finished) and rewrite the failed sector if a good copy
824          * can be found.
825          */
826         ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
827                                                 path, scrub_fixup_readpage,
828                                                 fixup);
829         if (ret < 0) {
830                 uncorrectable = 1;
831                 goto out;
832         }
833         WARN_ON(ret != 1);
834
835         spin_lock(&sctx->stat_lock);
836         ++sctx->stat.corrected_errors;
837         spin_unlock(&sctx->stat_lock);
838
839 out:
840         if (trans && !IS_ERR(trans))
841                 btrfs_end_transaction(trans, fixup->root);
842         if (uncorrectable) {
843                 spin_lock(&sctx->stat_lock);
844                 ++sctx->stat.uncorrectable_errors;
845                 spin_unlock(&sctx->stat_lock);
846                 btrfs_dev_replace_stats_inc(
847                         &sctx->dev_root->fs_info->dev_replace.
848                         num_uncorrectable_read_errors);
849                 printk_ratelimited_in_rcu(KERN_ERR "BTRFS: "
850                     "unable to fixup (nodatasum) error at logical %llu on dev %s\n",
851                         fixup->logical, rcu_str_deref(fixup->dev->name));
852         }
853
854         btrfs_free_path(path);
855         kfree(fixup);
856
857         scrub_pending_trans_workers_dec(sctx);
858 }
859
860 static inline void scrub_get_recover(struct scrub_recover *recover)
861 {
862         atomic_inc(&recover->refs);
863 }
864
865 static inline void scrub_put_recover(struct scrub_recover *recover)
866 {
867         if (atomic_dec_and_test(&recover->refs)) {
868                 btrfs_put_bbio(recover->bbio);
869                 kfree(recover);
870         }
871 }
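/*
 * Illustrative sketch, compiled out and not part of scrub.c: the intended
 * lifecycle of the get/put pair above.  Whoever stores an extra pointer to
 * a scrub_recover takes a reference; the final scrub_put_recover() (the
 * atomic_dec_and_test() hitting zero) releases the bbio and frees the
 * struct, so the order of the puts does not matter.  The helper name is
 * made up.
 */
#if 0
static void example_recover_lifecycle(struct scrub_recover *recover,
				      struct scrub_page *spage)
{
	/* the creator starts out with refs == 1 */
	spage->recover = recover;
	scrub_get_recover(recover);		/* the page holds a reference */

	/* ... later, each holder drops its reference independently ... */
	scrub_put_recover(spage->recover);	/* the page's reference */
	spage->recover = NULL;
	scrub_put_recover(recover);		/* the creator's, frees it */
}
#endif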
872
873 /*
874  * scrub_handle_errored_block gets called when either verification of the
875  * pages failed or the bio failed to read, e.g. with EIO. In the latter
876  * case, this function handles all pages in the bio, even though only one
877  * may be bad.
878  * The goal of this function is to repair the errored block by using the
879  * contents of one of the mirrors.
880  */
881 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
882 {
883         struct scrub_ctx *sctx = sblock_to_check->sctx;
884         struct btrfs_device *dev;
885         struct btrfs_fs_info *fs_info;
886         u64 length;
887         u64 logical;
888         u64 generation;
889         unsigned int failed_mirror_index;
890         unsigned int is_metadata;
891         unsigned int have_csum;
892         u8 *csum;
893         struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
894         struct scrub_block *sblock_bad;
895         int ret;
896         int mirror_index;
897         int page_num;
898         int success;
899         static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
900                                       DEFAULT_RATELIMIT_BURST);
901
902         BUG_ON(sblock_to_check->page_count < 1);
903         fs_info = sctx->dev_root->fs_info;
904         if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
905                 /*
906                  * if we find an error in a super block, we just report it.
907                  * Super blocks get rewritten with the next transaction
908                  * commit anyway
909                  */
910                 spin_lock(&sctx->stat_lock);
911                 ++sctx->stat.super_errors;
912                 spin_unlock(&sctx->stat_lock);
913                 return 0;
914         }
915         length = sblock_to_check->page_count * PAGE_SIZE;
916         logical = sblock_to_check->pagev[0]->logical;
917         generation = sblock_to_check->pagev[0]->generation;
918         BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
919         failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
920         is_metadata = !(sblock_to_check->pagev[0]->flags &
921                         BTRFS_EXTENT_FLAG_DATA);
922         have_csum = sblock_to_check->pagev[0]->have_csum;
923         csum = sblock_to_check->pagev[0]->csum;
924         dev = sblock_to_check->pagev[0]->dev;
925
926         if (sctx->is_dev_replace && !is_metadata && !have_csum) {
927                 sblocks_for_recheck = NULL;
928                 goto nodatasum_case;
929         }
930
931         /*
932          * read all mirrors one after the other. This includes
933          * re-reading the extent or metadata block that failed (which
934          * is the reason this fixup code is called), this time page
935          * by page, in order to know which pages
936          * caused I/O errors and which ones are good (for all mirrors).
937          * It is the goal to handle the situation when more than one
938          * mirror contains I/O errors, but the errors do not
939          * overlap, i.e. the data can be repaired by selecting the
940          * pages from those mirrors without I/O error on the
941          * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
942          * would be that mirror #1 has an I/O error on the first page,
943          * the second page is good, and mirror #2 has an I/O error on
944          * the second page, but the first page is good.
945          * Then the first page of the first mirror can be repaired by
946          * taking the first page of the second mirror, and the
947          * second page of the second mirror can be repaired by
948          * copying the contents of the 2nd page of the 1st mirror.
949          * One more note: if the pages of one mirror contain I/O
950          * errors, the checksum cannot be verified. In order to get
951          * the best data for repairing, the first attempt is to find
952          * a mirror without I/O errors and with a validated checksum.
953          * Only if this is not possible, the pages are picked from
954          * mirrors with I/O errors without considering the checksum.
955          * If the latter is the case, at the end, the checksum of the
956          * repaired area is verified in order to correctly maintain
957          * the statistics.
958          */
959
960         sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS *
961                                      sizeof(*sblocks_for_recheck),
962                                      GFP_NOFS);
963         if (!sblocks_for_recheck) {
964                 spin_lock(&sctx->stat_lock);
965                 sctx->stat.malloc_errors++;
966                 sctx->stat.read_errors++;
967                 sctx->stat.uncorrectable_errors++;
968                 spin_unlock(&sctx->stat_lock);
969                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
970                 goto out;
971         }
972
973         /* setup the context, map the logical blocks and alloc the pages */
974         ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
975         if (ret) {
976                 spin_lock(&sctx->stat_lock);
977                 sctx->stat.read_errors++;
978                 sctx->stat.uncorrectable_errors++;
979                 spin_unlock(&sctx->stat_lock);
980                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
981                 goto out;
982         }
983         BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
984         sblock_bad = sblocks_for_recheck + failed_mirror_index;
985
986         /* build and submit the bios for the failed mirror, check checksums */
987         scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
988                             csum, generation, sctx->csum_size, 1);
989
990         if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
991             sblock_bad->no_io_error_seen) {
992                 /*
993                  * the error disappeared after reading page by page, or
994                  * the area was part of a huge bio and other parts of the
995                  * bio caused I/O errors, or the block layer merged several
996                  * read requests into one and the error is caused by a
997                  * different bio (usually one of the two latter cases is
998                  * the cause)
999                  */
1000                 spin_lock(&sctx->stat_lock);
1001                 sctx->stat.unverified_errors++;
1002                 sblock_to_check->data_corrected = 1;
1003                 spin_unlock(&sctx->stat_lock);
1004
1005                 if (sctx->is_dev_replace)
1006                         scrub_write_block_to_dev_replace(sblock_bad);
1007                 goto out;
1008         }
1009
1010         if (!sblock_bad->no_io_error_seen) {
1011                 spin_lock(&sctx->stat_lock);
1012                 sctx->stat.read_errors++;
1013                 spin_unlock(&sctx->stat_lock);
1014                 if (__ratelimit(&_rs))
1015                         scrub_print_warning("i/o error", sblock_to_check);
1016                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
1017         } else if (sblock_bad->checksum_error) {
1018                 spin_lock(&sctx->stat_lock);
1019                 sctx->stat.csum_errors++;
1020                 spin_unlock(&sctx->stat_lock);
1021                 if (__ratelimit(&_rs))
1022                         scrub_print_warning("checksum error", sblock_to_check);
1023                 btrfs_dev_stat_inc_and_print(dev,
1024                                              BTRFS_DEV_STAT_CORRUPTION_ERRS);
1025         } else if (sblock_bad->header_error) {
1026                 spin_lock(&sctx->stat_lock);
1027                 sctx->stat.verify_errors++;
1028                 spin_unlock(&sctx->stat_lock);
1029                 if (__ratelimit(&_rs))
1030                         scrub_print_warning("checksum/header error",
1031                                             sblock_to_check);
1032                 if (sblock_bad->generation_error)
1033                         btrfs_dev_stat_inc_and_print(dev,
1034                                 BTRFS_DEV_STAT_GENERATION_ERRS);
1035                 else
1036                         btrfs_dev_stat_inc_and_print(dev,
1037                                 BTRFS_DEV_STAT_CORRUPTION_ERRS);
1038         }
1039
1040         if (sctx->readonly) {
1041                 ASSERT(!sctx->is_dev_replace);
1042                 goto out;
1043         }
1044
1045         if (!is_metadata && !have_csum) {
1046                 struct scrub_fixup_nodatasum *fixup_nodatasum;
1047
1048                 WARN_ON(sctx->is_dev_replace);
1049
1050 nodatasum_case:
1051
1052                 /*
1053                  * !is_metadata and !have_csum, this means that the data
1054                  * might not be COW'ed, that it might be modified
1055                  * concurrently. The general strategy to work on the
1056                  * commit root does not help in the case when COW is not
1057                  * used.
1058                  */
1059                 fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
1060                 if (!fixup_nodatasum)
1061                         goto did_not_correct_error;
1062                 fixup_nodatasum->sctx = sctx;
1063                 fixup_nodatasum->dev = dev;
1064                 fixup_nodatasum->logical = logical;
1065                 fixup_nodatasum->root = fs_info->extent_root;
1066                 fixup_nodatasum->mirror_num = failed_mirror_index + 1;
1067                 scrub_pending_trans_workers_inc(sctx);
1068                 btrfs_init_work(&fixup_nodatasum->work, btrfs_scrub_helper,
1069                                 scrub_fixup_nodatasum, NULL, NULL);
1070                 btrfs_queue_work(fs_info->scrub_workers,
1071                                  &fixup_nodatasum->work);
1072                 goto out;
1073         }
1074
1075         /*
1076          * now build and submit the bios for the other mirrors, check
1077          * checksums.
1078          * First try to pick the mirror which is completely without I/O
1079          * errors and also does not have a checksum error.
1080          * If one is found, and if a checksum is present, the full block
1081          * that is known to contain an error is rewritten. Afterwards
1082          * the block is known to be corrected.
1083          * If a mirror is found which is completely correct, and no
1084          * checksum is present, only those pages are rewritten that had
1085          * an I/O error in the block to be repaired, since it cannot be
1086          * determined which copy of the other pages is better (and it
1087          * could happen otherwise that a correct page would be
1088          * overwritten by a bad one).
1089          */
1090         for (mirror_index = 0;
1091              mirror_index < BTRFS_MAX_MIRRORS &&
1092              sblocks_for_recheck[mirror_index].page_count > 0;
1093              mirror_index++) {
1094                 struct scrub_block *sblock_other;
1095
1096                 if (mirror_index == failed_mirror_index)
1097                         continue;
1098                 sblock_other = sblocks_for_recheck + mirror_index;
1099
1100                 /* build and submit the bios, check checksums */
1101                 scrub_recheck_block(fs_info, sblock_other, is_metadata,
1102                                     have_csum, csum, generation,
1103                                     sctx->csum_size, 0);
1104
1105                 if (!sblock_other->header_error &&
1106                     !sblock_other->checksum_error &&
1107                     sblock_other->no_io_error_seen) {
1108                         if (sctx->is_dev_replace) {
1109                                 scrub_write_block_to_dev_replace(sblock_other);
1110                                 goto corrected_error;
1111                         } else {
1112                                 ret = scrub_repair_block_from_good_copy(
1113                                                 sblock_bad, sblock_other);
1114                                 if (!ret)
1115                                         goto corrected_error;
1116                         }
1117                 }
1118         }
1119
1120         if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
1121                 goto did_not_correct_error;
1122
1123         /*
1124          * In case of I/O errors in the area that is supposed to be
1125          * repaired, continue by picking good copies of those pages.
1126          * Select the good pages from mirrors to rewrite bad pages from
1127          * the area to fix. Afterwards verify the checksum of the block
1128          * that is supposed to be repaired. This verification step is
1129          * only done for the purpose of statistics counting and for the
1130          * final scrub report on whether errors remain.
1131          * A perfect algorithm could make use of the checksum and try
1132          * all possible combinations of pages from the different mirrors
1133          * until the checksum verification succeeds. For example, when
1134          * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
1135          * of mirror #2 is readable but the final checksum test fails,
1136          * then the 2nd page of mirror #3 could be tried to see whether
1137          * the final checksum then succeeds. But this would be a rare
1138          * exception and is therefore not implemented. At least
1139          * overwriting the good copy is avoided.
1140          * A more useful improvement would be to pick the sectors
1141          * without I/O error based on sector sizes (512 bytes on legacy
1142          * disks) instead of on PAGE_SIZE. Then maybe 512 bytes of one
1143          * mirror could be repaired by taking 512 bytes of a different
1144          * mirror, even if other 512 byte sectors in the same PAGE_SIZE
1145          * area are unreadable.
1146          */
1147         success = 1;
1148         for (page_num = 0; page_num < sblock_bad->page_count;
1149              page_num++) {
1150                 struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1151                 struct scrub_block *sblock_other = NULL;
1152
1153                 /* skip no-io-error page in scrub */
1154                 if (!page_bad->io_error && !sctx->is_dev_replace)
1155                         continue;
1156
1157                 /* try to find no-io-error page in mirrors */
1158                 if (page_bad->io_error) {
1159                         for (mirror_index = 0;
1160                              mirror_index < BTRFS_MAX_MIRRORS &&
1161                              sblocks_for_recheck[mirror_index].page_count > 0;
1162                              mirror_index++) {
1163                                 if (!sblocks_for_recheck[mirror_index].
1164                                     pagev[page_num]->io_error) {
1165                                         sblock_other = sblocks_for_recheck +
1166                                                        mirror_index;
1167                                         break;
1168                                 }
1169                         }
1170                         if (!sblock_other)
1171                                 success = 0;
1172                 }
1173
1174                 if (sctx->is_dev_replace) {
1175                         /*
1176                          * did not find a mirror to fetch the page
1177                          * from. scrub_write_page_to_dev_replace()
1178                          * handles this case (page->io_error), by
1179                          * filling the block with zeros before
1180                          * submitting the write request
1181                          */
1182                         if (!sblock_other)
1183                                 sblock_other = sblock_bad;
1184
1185                         if (scrub_write_page_to_dev_replace(sblock_other,
1186                                                             page_num) != 0) {
1187                                 btrfs_dev_replace_stats_inc(
1188                                         &sctx->dev_root->
1189                                         fs_info->dev_replace.
1190                                         num_write_errors);
1191                                 success = 0;
1192                         }
1193                 } else if (sblock_other) {
1194                         ret = scrub_repair_page_from_good_copy(sblock_bad,
1195                                                                sblock_other,
1196                                                                page_num, 0);
1197                         if (0 == ret)
1198                                 page_bad->io_error = 0;
1199                         else
1200                                 success = 0;
1201                 }
1202         }
1203
1204         if (success && !sctx->is_dev_replace) {
1205                 if (is_metadata || have_csum) {
1206                         /*
1207                          * need to verify the checksum now that all
1208                          * sectors on disk are repaired (the write
1209                          * request for data to be repaired is on its way).
1210                          * Just be lazy and use scrub_recheck_block()
1211                          * which re-reads the data before the checksum
1212                          * is verified, but most likely the data comes out
1213                          * of the page cache.
1214                          */
1215                         scrub_recheck_block(fs_info, sblock_bad,
1216                                             is_metadata, have_csum, csum,
1217                                             generation, sctx->csum_size, 1);
1218                         if (!sblock_bad->header_error &&
1219                             !sblock_bad->checksum_error &&
1220                             sblock_bad->no_io_error_seen)
1221                                 goto corrected_error;
1222                         else
1223                                 goto did_not_correct_error;
1224                 } else {
1225 corrected_error:
1226                         spin_lock(&sctx->stat_lock);
1227                         sctx->stat.corrected_errors++;
1228                         sblock_to_check->data_corrected = 1;
1229                         spin_unlock(&sctx->stat_lock);
1230                         printk_ratelimited_in_rcu(KERN_ERR
1231                                 "BTRFS: fixed up error at logical %llu on dev %s\n",
1232                                 logical, rcu_str_deref(dev->name));
1233                 }
1234         } else {
1235 did_not_correct_error:
1236                 spin_lock(&sctx->stat_lock);
1237                 sctx->stat.uncorrectable_errors++;
1238                 spin_unlock(&sctx->stat_lock);
1239                 printk_ratelimited_in_rcu(KERN_ERR
1240                         "BTRFS: unable to fixup (regular) error at logical %llu on dev %s\n",
1241                         logical, rcu_str_deref(dev->name));
1242         }
1243
1244 out:
1245         if (sblocks_for_recheck) {
1246                 for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
1247                      mirror_index++) {
1248                         struct scrub_block *sblock = sblocks_for_recheck +
1249                                                      mirror_index;
1250                         struct scrub_recover *recover;
1251                         int page_index;
1252
1253                         for (page_index = 0; page_index < sblock->page_count;
1254                              page_index++) {
1255                                 sblock->pagev[page_index]->sblock = NULL;
1256                                 recover = sblock->pagev[page_index]->recover;
1257                                 if (recover) {
1258                                         scrub_put_recover(recover);
1259                                         sblock->pagev[page_index]->recover =
1260                                                                         NULL;
1261                                 }
1262                                 scrub_page_put(sblock->pagev[page_index]);
1263                         }
1264                 }
1265                 kfree(sblocks_for_recheck);
1266         }
1267
1268         return 0;
1269 }
1270
1271 static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio)
1272 {
1273         if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
1274                 return 2;
1275         else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
1276                 return 3;
1277         else
1278                 return (int)bbio->num_stripes;
1279 }
1280
1281 static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
1282                                                  u64 *raid_map,
1283                                                  u64 mapped_length,
1284                                                  int nstripes, int mirror,
1285                                                  int *stripe_index,
1286                                                  u64 *stripe_offset)
1287 {
1288         int i;
1289
1290         if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
1291                 /* RAID5/6 */
1292                 for (i = 0; i < nstripes; i++) {
1293                         if (raid_map[i] == RAID6_Q_STRIPE ||
1294                             raid_map[i] == RAID5_P_STRIPE)
1295                                 continue;
1296
1297                         if (logical >= raid_map[i] &&
1298                             logical < raid_map[i] + mapped_length)
1299                                 break;
1300                 }
1301
1302                 *stripe_index = i;
1303                 *stripe_offset = logical - raid_map[i];
1304         } else {
1305                 /* The other RAID types */
1306                 *stripe_index = mirror;
1307                 *stripe_offset = 0;
1308         }
1309 }
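/*
 * Illustrative example for scrub_stripe_index_and_offset() (made-up numbers):
 * with raid_map = { 65536, 131072, RAID5_P_STRIPE } and mapped_length = 65536,
 * a logical address of 70000 falls into the range of raid_map[0], so
 * *stripe_index = 0 and *stripe_offset = 70000 - 65536 = 4464.  For the
 * non-RAID56 profiles the mirror number simply selects the stripe and the
 * offset is 0.
 */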
1310
1311 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
1312                                      struct scrub_block *sblocks_for_recheck)
1313 {
1314         struct scrub_ctx *sctx = original_sblock->sctx;
1315         struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
1316         u64 length = original_sblock->page_count * PAGE_SIZE;
1317         u64 logical = original_sblock->pagev[0]->logical;
1318         struct scrub_recover *recover;
1319         struct btrfs_bio *bbio;
1320         u64 sublen;
1321         u64 mapped_length;
1322         u64 stripe_offset;
1323         int stripe_index;
1324         int page_index = 0;
1325         int mirror_index;
1326         int nmirrors;
1327         int ret;
1328
1329         /*
1330          * note: the two members refs and outstanding_pages
1331          * are not used (and not set) in the blocks that are used for
1332          * the recheck procedure
1333          */
1334
1335         while (length > 0) {
1336                 sublen = min_t(u64, length, PAGE_SIZE);
1337                 mapped_length = sublen;
1338                 bbio = NULL;
1339
1340                 /*
1341                  * with a length of PAGE_SIZE, each returned stripe
1342                  * represents one mirror
1343                  */
1344                 ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical,
1345                                        &mapped_length, &bbio, 0, 1);
1346                 if (ret || !bbio || mapped_length < sublen) {
1347                         btrfs_put_bbio(bbio);
1348                         return -EIO;
1349                 }
1350
1351                 recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
1352                 if (!recover) {
1353                         btrfs_put_bbio(bbio);
1354                         return -ENOMEM;
1355                 }
1356
1357                 atomic_set(&recover->refs, 1);
1358                 recover->bbio = bbio;
1359                 recover->map_length = mapped_length;
1360
1361                 BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
1362
1363                 nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS);
1364
1365                 for (mirror_index = 0; mirror_index < nmirrors;
1366                      mirror_index++) {
1367                         struct scrub_block *sblock;
1368                         struct scrub_page *page;
1369
1370                         sblock = sblocks_for_recheck + mirror_index;
1371                         sblock->sctx = sctx;
1372                         page = kzalloc(sizeof(*page), GFP_NOFS);
1373                         if (!page) {
1374 leave_nomem:
1375                                 spin_lock(&sctx->stat_lock);
1376                                 sctx->stat.malloc_errors++;
1377                                 spin_unlock(&sctx->stat_lock);
1378                                 scrub_put_recover(recover);
1379                                 return -ENOMEM;
1380                         }
1381                         scrub_page_get(page);
1382                         sblock->pagev[page_index] = page;
1383                         page->logical = logical;
1384
1385                         scrub_stripe_index_and_offset(logical,
1386                                                       bbio->map_type,
1387                                                       bbio->raid_map,
1388                                                       mapped_length,
1389                                                       bbio->num_stripes -
1390                                                       bbio->num_tgtdevs,
1391                                                       mirror_index,
1392                                                       &stripe_index,
1393                                                       &stripe_offset);
1394                         page->physical = bbio->stripes[stripe_index].physical +
1395                                          stripe_offset;
1396                         page->dev = bbio->stripes[stripe_index].dev;
1397
1398                         BUG_ON(page_index >= original_sblock->page_count);
1399                         page->physical_for_dev_replace =
1400                                 original_sblock->pagev[page_index]->
1401                                 physical_for_dev_replace;
1402                         /* for missing devices, dev->bdev is NULL */
1403                         page->mirror_num = mirror_index + 1;
1404                         sblock->page_count++;
1405                         page->page = alloc_page(GFP_NOFS);
1406                         if (!page->page)
1407                                 goto leave_nomem;
1408
1409                         scrub_get_recover(recover);
1410                         page->recover = recover;
1411                 }
1412                 scrub_put_recover(recover);
1413                 length -= sublen;
1414                 logical += sublen;
1415                 page_index++;
1416         }
1417
1418         return 0;
1419 }
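/*
 * The result of scrub_setup_recheck_block() is a small matrix:
 * sblocks_for_recheck[mirror].pagev[page] describes every available mirror of
 * every page of the original block.  As an illustration, a 16KiB tree block
 * on a two-copy chunk with 4KiB pages yields two recheck blocks of 4 pages
 * each, covering the same logical range but pointing at the physical
 * locations of their respective mirrors.
 */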
1420
1421 struct scrub_bio_ret {
1422         struct completion event;
1423         int error;
1424 };
1425
1426 static void scrub_bio_wait_endio(struct bio *bio, int error)
1427 {
1428         struct scrub_bio_ret *ret = bio->bi_private;
1429
1430         ret->error = error;
1431         complete(&ret->event);
1432 }
1433
1434 static inline int scrub_is_page_on_raid56(struct scrub_page *page)
1435 {
1436         return page->recover &&
1437                (page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
1438 }
1439
1440 static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
1441                                         struct bio *bio,
1442                                         struct scrub_page *page)
1443 {
1444         struct scrub_bio_ret done;
1445         int ret;
1446
1447         init_completion(&done.event);
1448         done.error = 0;
1449         bio->bi_iter.bi_sector = page->logical >> 9;
1450         bio->bi_private = &done;
1451         bio->bi_end_io = scrub_bio_wait_endio;
1452
1453         ret = raid56_parity_recover(fs_info->fs_root, bio, page->recover->bbio,
1454                                     page->recover->map_length,
1455                                     page->mirror_num, 0);
1456         if (ret)
1457                 return ret;
1458
1459         wait_for_completion(&done.event);
1460         if (done.error)
1461                 return -EIO;
1462
1463         return 0;
1464 }
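/*
 * scrub_submit_raid56_bio_wait() turns the asynchronous raid56 recovery into
 * a synchronous read: the on-stack scrub_bio_ret holds a completion, the
 * end_io callback records the error and signals it, and the caller simply
 * waits before checking the result.
 */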
1465
1466 /*
1467  * this function checks the on-disk data for checksum errors, header errors
1468  * and read I/O errors. If any I/O error occurs, the exact pages that failed
1469  * are marked as bad. The goal is to enable scrub to take the good pages
1470  * from all mirrors so that the failed pages of the mirror currently being
1471  * handled can be repaired.
1472  */
1473 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1474                                 struct scrub_block *sblock, int is_metadata,
1475                                 int have_csum, u8 *csum, u64 generation,
1476                                 u16 csum_size, int retry_failed_mirror)
1477 {
1478         int page_num;
1479
1480         sblock->no_io_error_seen = 1;
1481         sblock->header_error = 0;
1482         sblock->checksum_error = 0;
1483
1484         for (page_num = 0; page_num < sblock->page_count; page_num++) {
1485                 struct bio *bio;
1486                 struct scrub_page *page = sblock->pagev[page_num];
1487
1488                 if (page->dev->bdev == NULL) {
1489                         page->io_error = 1;
1490                         sblock->no_io_error_seen = 0;
1491                         continue;
1492                 }
1493
1494                 WARN_ON(!page->page);
1495                 bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
1496                 if (!bio) {
1497                         page->io_error = 1;
1498                         sblock->no_io_error_seen = 0;
1499                         continue;
1500                 }
1501                 bio->bi_bdev = page->dev->bdev;
1502
1503                 bio_add_page(bio, page->page, PAGE_SIZE, 0);
1504                 if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) {
1505                         if (scrub_submit_raid56_bio_wait(fs_info, bio, page))
1506                                 sblock->no_io_error_seen = 0;
1507                 } else {
1508                         bio->bi_iter.bi_sector = page->physical >> 9;
1509
1510                         if (btrfsic_submit_bio_wait(READ, bio))
1511                                 sblock->no_io_error_seen = 0;
1512                 }
1513
1514                 bio_put(bio);
1515         }
1516
1517         if (sblock->no_io_error_seen)
1518                 scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
1519                                              have_csum, csum, generation,
1520                                              csum_size);
1521
1522         return;
1523 }
1524
1525 static inline int scrub_check_fsid(u8 fsid[],
1526                                    struct scrub_page *spage)
1527 {
1528         struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices;
1529         int ret;
1530
1531         ret = memcmp(fsid, fs_devices->fsid, BTRFS_UUID_SIZE);
1532         return !ret;
1533 }
1534
1535 static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1536                                          struct scrub_block *sblock,
1537                                          int is_metadata, int have_csum,
1538                                          const u8 *csum, u64 generation,
1539                                          u16 csum_size)
1540 {
1541         int page_num;
1542         u8 calculated_csum[BTRFS_CSUM_SIZE];
1543         u32 crc = ~(u32)0;
1544         void *mapped_buffer;
1545
1546         WARN_ON(!sblock->pagev[0]->page);
1547         if (is_metadata) {
1548                 struct btrfs_header *h;
1549
1550                 mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
1551                 h = (struct btrfs_header *)mapped_buffer;
1552
1553                 if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h) ||
1554                     !scrub_check_fsid(h->fsid, sblock->pagev[0]) ||
1555                     memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1556                            BTRFS_UUID_SIZE)) {
1557                         sblock->header_error = 1;
1558                 } else if (generation != btrfs_stack_header_generation(h)) {
1559                         sblock->header_error = 1;
1560                         sblock->generation_error = 1;
1561                 }
1562                 csum = h->csum;
1563         } else {
1564                 if (!have_csum)
1565                         return;
1566
1567                 mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
1568         }
1569
1570         for (page_num = 0;;) {
1571                 if (page_num == 0 && is_metadata)
1572                         crc = btrfs_csum_data(
1573                                 ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE,
1574                                 crc, PAGE_SIZE - BTRFS_CSUM_SIZE);
1575                 else
1576                         crc = btrfs_csum_data(mapped_buffer, crc, PAGE_SIZE);
1577
1578                 kunmap_atomic(mapped_buffer);
1579                 page_num++;
1580                 if (page_num >= sblock->page_count)
1581                         break;
1582                 WARN_ON(!sblock->pagev[page_num]->page);
1583
1584                 mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page);
1585         }
1586
1587         btrfs_csum_final(crc, calculated_csum);
1588         if (memcmp(calculated_csum, csum, csum_size))
1589                 sblock->checksum_error = 1;
1590 }
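/*
 * Checksum arithmetic in scrub_recheck_block_checksum(), assuming 4KiB pages
 * and a 32-byte csum area: for a 16KiB tree block the loop covers
 * PAGE_SIZE - BTRFS_CSUM_SIZE = 4064 bytes of page 0 (the stored checksum is
 * skipped) plus the full 4096 bytes of pages 1-3, i.e. 16384 - 32 bytes in
 * total, which is exactly the range the on-disk header checksum protects.
 */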
1591
1592 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1593                                              struct scrub_block *sblock_good)
1594 {
1595         int page_num;
1596         int ret = 0;
1597
1598         for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
1599                 int ret_sub;
1600
1601                 ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
1602                                                            sblock_good,
1603                                                            page_num, 1);
1604                 if (ret_sub)
1605                         ret = ret_sub;
1606         }
1607
1608         return ret;
1609 }
1610
1611 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1612                                             struct scrub_block *sblock_good,
1613                                             int page_num, int force_write)
1614 {
1615         struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1616         struct scrub_page *page_good = sblock_good->pagev[page_num];
1617
1618         BUG_ON(page_bad->page == NULL);
1619         BUG_ON(page_good->page == NULL);
1620         if (force_write || sblock_bad->header_error ||
1621             sblock_bad->checksum_error || page_bad->io_error) {
1622                 struct bio *bio;
1623                 int ret;
1624
1625                 if (!page_bad->dev->bdev) {
1626                         printk_ratelimited(KERN_WARNING "BTRFS: "
1627                                 "scrub_repair_page_from_good_copy(bdev == NULL) "
1628                                 "is unexpected!\n");
1629                         return -EIO;
1630                 }
1631
1632                 bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
1633                 if (!bio)
1634                         return -EIO;
1635                 bio->bi_bdev = page_bad->dev->bdev;
1636                 bio->bi_iter.bi_sector = page_bad->physical >> 9;
1637
1638                 ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
1639                 if (PAGE_SIZE != ret) {
1640                         bio_put(bio);
1641                         return -EIO;
1642                 }
1643
1644                 if (btrfsic_submit_bio_wait(WRITE, bio)) {
1645                         btrfs_dev_stat_inc_and_print(page_bad->dev,
1646                                 BTRFS_DEV_STAT_WRITE_ERRS);
1647                         btrfs_dev_replace_stats_inc(
1648                                 &sblock_bad->sctx->dev_root->fs_info->
1649                                 dev_replace.num_write_errors);
1650                         bio_put(bio);
1651                         return -EIO;
1652                 }
1653                 bio_put(bio);
1654         }
1655
1656         return 0;
1657 }
1658
1659 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1660 {
1661         int page_num;
1662
1663         /*
1664          * This block is used to check the parity on the source device, so
1665          * the data does not need to be written to the destination device.
1666          */
1667         if (sblock->sparity)
1668                 return;
1669
1670         for (page_num = 0; page_num < sblock->page_count; page_num++) {
1671                 int ret;
1672
1673                 ret = scrub_write_page_to_dev_replace(sblock, page_num);
1674                 if (ret)
1675                         btrfs_dev_replace_stats_inc(
1676                                 &sblock->sctx->dev_root->fs_info->dev_replace.
1677                                 num_write_errors);
1678         }
1679 }
1680
1681 static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
1682                                            int page_num)
1683 {
1684         struct scrub_page *spage = sblock->pagev[page_num];
1685
1686         BUG_ON(spage->page == NULL);
1687         if (spage->io_error) {
1688                 void *mapped_buffer = kmap_atomic(spage->page);
1689
1690                 memset(mapped_buffer, 0, PAGE_CACHE_SIZE);
1691                 flush_dcache_page(spage->page);
1692                 kunmap_atomic(mapped_buffer);
1693         }
1694         return scrub_add_page_to_wr_bio(sblock->sctx, spage);
1695 }
1696
1697 static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
1698                                     struct scrub_page *spage)
1699 {
1700         struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1701         struct scrub_bio *sbio;
1702         int ret;
1703
1704         mutex_lock(&wr_ctx->wr_lock);
1705 again:
1706         if (!wr_ctx->wr_curr_bio) {
1707                 wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),
1708                                               GFP_NOFS);
1709                 if (!wr_ctx->wr_curr_bio) {
1710                         mutex_unlock(&wr_ctx->wr_lock);
1711                         return -ENOMEM;
1712                 }
1713                 wr_ctx->wr_curr_bio->sctx = sctx;
1714                 wr_ctx->wr_curr_bio->page_count = 0;
1715         }
1716         sbio = wr_ctx->wr_curr_bio;
1717         if (sbio->page_count == 0) {
1718                 struct bio *bio;
1719
1720                 sbio->physical = spage->physical_for_dev_replace;
1721                 sbio->logical = spage->logical;
1722                 sbio->dev = wr_ctx->tgtdev;
1723                 bio = sbio->bio;
1724                 if (!bio) {
1725                         bio = btrfs_io_bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);
1726                         if (!bio) {
1727                                 mutex_unlock(&wr_ctx->wr_lock);
1728                                 return -ENOMEM;
1729                         }
1730                         sbio->bio = bio;
1731                 }
1732
1733                 bio->bi_private = sbio;
1734                 bio->bi_end_io = scrub_wr_bio_end_io;
1735                 bio->bi_bdev = sbio->dev->bdev;
1736                 bio->bi_iter.bi_sector = sbio->physical >> 9;
1737                 sbio->err = 0;
1738         } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1739                    spage->physical_for_dev_replace ||
1740                    sbio->logical + sbio->page_count * PAGE_SIZE !=
1741                    spage->logical) {
1742                 scrub_wr_submit(sctx);
1743                 goto again;
1744         }
1745
1746         ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1747         if (ret != PAGE_SIZE) {
1748                 if (sbio->page_count < 1) {
1749                         bio_put(sbio->bio);
1750                         sbio->bio = NULL;
1751                         mutex_unlock(&wr_ctx->wr_lock);
1752                         return -EIO;
1753                 }
1754                 scrub_wr_submit(sctx);
1755                 goto again;
1756         }
1757
1758         sbio->pagev[sbio->page_count] = spage;
1759         scrub_page_get(spage);
1760         sbio->page_count++;
1761         if (sbio->page_count == wr_ctx->pages_per_wr_bio)
1762                 scrub_wr_submit(sctx);
1763         mutex_unlock(&wr_ctx->wr_lock);
1764
1765         return 0;
1766 }
1767
1768 static void scrub_wr_submit(struct scrub_ctx *sctx)
1769 {
1770         struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1771         struct scrub_bio *sbio;
1772
1773         if (!wr_ctx->wr_curr_bio)
1774                 return;
1775
1776         sbio = wr_ctx->wr_curr_bio;
1777         wr_ctx->wr_curr_bio = NULL;
1778         WARN_ON(!sbio->bio->bi_bdev);
1779         scrub_pending_bio_inc(sctx);
1780         /* process all writes in a single worker thread, so that the block
1781          * layer can order the requests before sending them to the driver;
1782          * this doubled the write performance on spinning disks when
1783          * measured with Linux 3.5 */
1784         btrfsic_submit_bio(WRITE, sbio->bio);
1785 }
1786
1787 static void scrub_wr_bio_end_io(struct bio *bio, int err)
1788 {
1789         struct scrub_bio *sbio = bio->bi_private;
1790         struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
1791
1792         sbio->err = err;
1793         sbio->bio = bio;
1794
1795         btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper,
1796                          scrub_wr_bio_end_io_worker, NULL, NULL);
1797         btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
1798 }
1799
1800 static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
1801 {
1802         struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1803         struct scrub_ctx *sctx = sbio->sctx;
1804         int i;
1805
1806         WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
1807         if (sbio->err) {
1808                 struct btrfs_dev_replace *dev_replace =
1809                         &sbio->sctx->dev_root->fs_info->dev_replace;
1810
1811                 for (i = 0; i < sbio->page_count; i++) {
1812                         struct scrub_page *spage = sbio->pagev[i];
1813
1814                         spage->io_error = 1;
1815                         btrfs_dev_replace_stats_inc(&dev_replace->
1816                                                     num_write_errors);
1817                 }
1818         }
1819
1820         for (i = 0; i < sbio->page_count; i++)
1821                 scrub_page_put(sbio->pagev[i]);
1822
1823         bio_put(sbio->bio);
1824         kfree(sbio);
1825         scrub_pending_bio_dec(sctx);
1826 }
1827
1828 static int scrub_checksum(struct scrub_block *sblock)
1829 {
1830         u64 flags;
1831         int ret;
1832
1833         WARN_ON(sblock->page_count < 1);
1834         flags = sblock->pagev[0]->flags;
1835         ret = 0;
1836         if (flags & BTRFS_EXTENT_FLAG_DATA)
1837                 ret = scrub_checksum_data(sblock);
1838         else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1839                 ret = scrub_checksum_tree_block(sblock);
1840         else if (flags & BTRFS_EXTENT_FLAG_SUPER)
1841                 (void)scrub_checksum_super(sblock);
1842         else
1843                 WARN_ON(1);
1844         if (ret)
1845                 scrub_handle_errored_block(sblock);
1846
1847         return ret;
1848 }
1849
1850 static int scrub_checksum_data(struct scrub_block *sblock)
1851 {
1852         struct scrub_ctx *sctx = sblock->sctx;
1853         u8 csum[BTRFS_CSUM_SIZE];
1854         u8 *on_disk_csum;
1855         struct page *page;
1856         void *buffer;
1857         u32 crc = ~(u32)0;
1858         int fail = 0;
1859         u64 len;
1860         int index;
1861
1862         BUG_ON(sblock->page_count < 1);
1863         if (!sblock->pagev[0]->have_csum)
1864                 return 0;
1865
1866         on_disk_csum = sblock->pagev[0]->csum;
1867         page = sblock->pagev[0]->page;
1868         buffer = kmap_atomic(page);
1869
1870         len = sctx->sectorsize;
1871         index = 0;
1872         for (;;) {
1873                 u64 l = min_t(u64, len, PAGE_SIZE);
1874
1875                 crc = btrfs_csum_data(buffer, crc, l);
1876                 kunmap_atomic(buffer);
1877                 len -= l;
1878                 if (len == 0)
1879                         break;
1880                 index++;
1881                 BUG_ON(index >= sblock->page_count);
1882                 BUG_ON(!sblock->pagev[index]->page);
1883                 page = sblock->pagev[index]->page;
1884                 buffer = kmap_atomic(page);
1885         }
1886
1887         btrfs_csum_final(crc, csum);
1888         if (memcmp(csum, on_disk_csum, sctx->csum_size))
1889                 fail = 1;
1890
1891         return fail;
1892 }
1893
1894 static int scrub_checksum_tree_block(struct scrub_block *sblock)
1895 {
1896         struct scrub_ctx *sctx = sblock->sctx;
1897         struct btrfs_header *h;
1898         struct btrfs_root *root = sctx->dev_root;
1899         struct btrfs_fs_info *fs_info = root->fs_info;
1900         u8 calculated_csum[BTRFS_CSUM_SIZE];
1901         u8 on_disk_csum[BTRFS_CSUM_SIZE];
1902         struct page *page;
1903         void *mapped_buffer;
1904         u64 mapped_size;
1905         void *p;
1906         u32 crc = ~(u32)0;
1907         int fail = 0;
1908         int crc_fail = 0;
1909         u64 len;
1910         int index;
1911
1912         BUG_ON(sblock->page_count < 1);
1913         page = sblock->pagev[0]->page;
1914         mapped_buffer = kmap_atomic(page);
1915         h = (struct btrfs_header *)mapped_buffer;
1916         memcpy(on_disk_csum, h->csum, sctx->csum_size);
1917
1918         /*
1919          * we don't use the getter functions here, as we
1920          * a) don't have an extent buffer and
1921          * b) the page is already kmapped
1922          */
1923
1924         if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h))
1925                 ++fail;
1926
1927         if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h))
1928                 ++fail;
1929
1930         if (!scrub_check_fsid(h->fsid, sblock->pagev[0]))
1931                 ++fail;
1932
1933         if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1934                    BTRFS_UUID_SIZE))
1935                 ++fail;
1936
1937         len = sctx->nodesize - BTRFS_CSUM_SIZE;
1938         mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1939         p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1940         index = 0;
1941         for (;;) {
1942                 u64 l = min_t(u64, len, mapped_size);
1943
1944                 crc = btrfs_csum_data(p, crc, l);
1945                 kunmap_atomic(mapped_buffer);
1946                 len -= l;
1947                 if (len == 0)
1948                         break;
1949                 index++;
1950                 BUG_ON(index >= sblock->page_count);
1951                 BUG_ON(!sblock->pagev[index]->page);
1952                 page = sblock->pagev[index]->page;
1953                 mapped_buffer = kmap_atomic(page);
1954                 mapped_size = PAGE_SIZE;
1955                 p = mapped_buffer;
1956         }
1957
1958         btrfs_csum_final(crc, calculated_csum);
1959         if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1960                 ++crc_fail;
1961
1962         return fail || crc_fail;
1963 }
1964
1965 static int scrub_checksum_super(struct scrub_block *sblock)
1966 {
1967         struct btrfs_super_block *s;
1968         struct scrub_ctx *sctx = sblock->sctx;
1969         u8 calculated_csum[BTRFS_CSUM_SIZE];
1970         u8 on_disk_csum[BTRFS_CSUM_SIZE];
1971         struct page *page;
1972         void *mapped_buffer;
1973         u64 mapped_size;
1974         void *p;
1975         u32 crc = ~(u32)0;
1976         int fail_gen = 0;
1977         int fail_cor = 0;
1978         u64 len;
1979         int index;
1980
1981         BUG_ON(sblock->page_count < 1);
1982         page = sblock->pagev[0]->page;
1983         mapped_buffer = kmap_atomic(page);
1984         s = (struct btrfs_super_block *)mapped_buffer;
1985         memcpy(on_disk_csum, s->csum, sctx->csum_size);
1986
1987         if (sblock->pagev[0]->logical != btrfs_super_bytenr(s))
1988                 ++fail_cor;
1989
1990         if (sblock->pagev[0]->generation != btrfs_super_generation(s))
1991                 ++fail_gen;
1992
1993         if (!scrub_check_fsid(s->fsid, sblock->pagev[0]))
1994                 ++fail_cor;
1995
1996         len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
1997         mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1998         p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1999         index = 0;
2000         for (;;) {
2001                 u64 l = min_t(u64, len, mapped_size);
2002
2003                 crc = btrfs_csum_data(p, crc, l);
2004                 kunmap_atomic(mapped_buffer);
2005                 len -= l;
2006                 if (len == 0)
2007                         break;
2008                 index++;
2009                 BUG_ON(index >= sblock->page_count);
2010                 BUG_ON(!sblock->pagev[index]->page);
2011                 page = sblock->pagev[index]->page;
2012                 mapped_buffer = kmap_atomic(page);
2013                 mapped_size = PAGE_SIZE;
2014                 p = mapped_buffer;
2015         }
2016
2017         btrfs_csum_final(crc, calculated_csum);
2018         if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
2019                 ++fail_cor;
2020
2021         if (fail_cor + fail_gen) {
2022                 /*
2023                  * if we find an error in a super block, we just report it.
2024                  * Super blocks get rewritten with the next transaction
2025                  * commit anyway.
2026                  */
2027                 spin_lock(&sctx->stat_lock);
2028                 ++sctx->stat.super_errors;
2029                 spin_unlock(&sctx->stat_lock);
2030                 if (fail_cor)
2031                         btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
2032                                 BTRFS_DEV_STAT_CORRUPTION_ERRS);
2033                 else
2034                         btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
2035                                 BTRFS_DEV_STAT_GENERATION_ERRS);
2036         }
2037
2038         return fail_cor + fail_gen;
2039 }
2040
2041 static void scrub_block_get(struct scrub_block *sblock)
2042 {
2043         atomic_inc(&sblock->refs);
2044 }
2045
2046 static void scrub_block_put(struct scrub_block *sblock)
2047 {
2048         if (atomic_dec_and_test(&sblock->refs)) {
2049                 int i;
2050
2051                 if (sblock->sparity)
2052                         scrub_parity_put(sblock->sparity);
2053
2054                 for (i = 0; i < sblock->page_count; i++)
2055                         scrub_page_put(sblock->pagev[i]);
2056                 kfree(sblock);
2057         }
2058 }
2059
2060 static void scrub_page_get(struct scrub_page *spage)
2061 {
2062         atomic_inc(&spage->refs);
2063 }
2064
2065 static void scrub_page_put(struct scrub_page *spage)
2066 {
2067         if (atomic_dec_and_test(&spage->refs)) {
2068                 if (spage->page)
2069                         __free_page(spage->page);
2070                 kfree(spage);
2071         }
2072 }
2073
2074 static void scrub_submit(struct scrub_ctx *sctx)
2075 {
2076         struct scrub_bio *sbio;
2077
2078         if (sctx->curr == -1)
2079                 return;
2080
2081         sbio = sctx->bios[sctx->curr];
2082         sctx->curr = -1;
2083         scrub_pending_bio_inc(sctx);
2084
2085         if (!sbio->bio->bi_bdev) {
2086                 /*
2087                  * this case should not happen. It could only be hit, if
2088                  * btrfs_map_block() went wrong, for dev-replace operations
2089                  * on missing devices when no mirrors are available, but
2090                  * such a filesystem should already have failed to mount.
2091                  * The case is still handled correctly (but _very_ slowly).
2092                  */
2093                 printk_ratelimited(KERN_WARNING
2094                         "BTRFS: scrub_submit(bio bdev == NULL) is unexpected!\n");
2095                 bio_endio(sbio->bio, -EIO);
2096         } else {
2097                 btrfsic_submit_bio(READ, sbio->bio);
2098         }
2099 }
2100
2101 static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
2102                                     struct scrub_page *spage)
2103 {
2104         struct scrub_block *sblock = spage->sblock;
2105         struct scrub_bio *sbio;
2106         int ret;
2107
2108 again:
2109         /*
2110          * grab a fresh bio or wait for one to become available
2111          */
2112         while (sctx->curr == -1) {
2113                 spin_lock(&sctx->list_lock);
2114                 sctx->curr = sctx->first_free;
2115                 if (sctx->curr != -1) {
2116                         sctx->first_free = sctx->bios[sctx->curr]->next_free;
2117                         sctx->bios[sctx->curr]->next_free = -1;
2118                         sctx->bios[sctx->curr]->page_count = 0;
2119                         spin_unlock(&sctx->list_lock);
2120                 } else {
2121                         spin_unlock(&sctx->list_lock);
2122                         wait_event(sctx->list_wait, sctx->first_free != -1);
2123                 }
2124         }
2125         sbio = sctx->bios[sctx->curr];
2126         if (sbio->page_count == 0) {
2127                 struct bio *bio;
2128
2129                 sbio->physical = spage->physical;
2130                 sbio->logical = spage->logical;
2131                 sbio->dev = spage->dev;
2132                 bio = sbio->bio;
2133                 if (!bio) {
2134                         bio = btrfs_io_bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);
2135                         if (!bio)
2136                                 return -ENOMEM;
2137                         sbio->bio = bio;
2138                 }
2139
2140                 bio->bi_private = sbio;
2141                 bio->bi_end_io = scrub_bio_end_io;
2142                 bio->bi_bdev = sbio->dev->bdev;
2143                 bio->bi_iter.bi_sector = sbio->physical >> 9;
2144                 sbio->err = 0;
2145         } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
2146                    spage->physical ||
2147                    sbio->logical + sbio->page_count * PAGE_SIZE !=
2148                    spage->logical ||
2149                    sbio->dev != spage->dev) {
2150                 scrub_submit(sctx);
2151                 goto again;
2152         }
2153
2154         sbio->pagev[sbio->page_count] = spage;
2155         ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
2156         if (ret != PAGE_SIZE) {
2157                 if (sbio->page_count < 1) {
2158                         bio_put(sbio->bio);
2159                         sbio->bio = NULL;
2160                         return -EIO;
2161                 }
2162                 scrub_submit(sctx);
2163                 goto again;
2164         }
2165
2166         scrub_block_get(sblock); /* one for the page added to the bio */
2167         atomic_inc(&sblock->outstanding_pages);
2168         sbio->page_count++;
2169         if (sbio->page_count == sctx->pages_per_rd_bio)
2170                 scrub_submit(sctx);
2171
2172         return 0;
2173 }
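/*
 * A page is only appended to the current read bio while it stays physically
 * and logically contiguous and on the same device.  For example, if
 * sbio->physical is P and three pages are already queued, the next page must
 * start at P + 3 * PAGE_SIZE (and analogously for the logical address);
 * otherwise the bio is submitted first and the page is retried against a
 * fresh one via the "goto again".
 */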
2174
2175 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
2176                        u64 physical, struct btrfs_device *dev, u64 flags,
2177                        u64 gen, int mirror_num, u8 *csum, int force,
2178                        u64 physical_for_dev_replace)
2179 {
2180         struct scrub_block *sblock;
2181         int index;
2182
2183         sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
2184         if (!sblock) {
2185                 spin_lock(&sctx->stat_lock);
2186                 sctx->stat.malloc_errors++;
2187                 spin_unlock(&sctx->stat_lock);
2188                 return -ENOMEM;
2189         }
2190
2191         /* one ref inside this function, plus one for each page added to
2192          * a bio later on */
2193         atomic_set(&sblock->refs, 1);
2194         sblock->sctx = sctx;
2195         sblock->no_io_error_seen = 1;
2196
2197         for (index = 0; len > 0; index++) {
2198                 struct scrub_page *spage;
2199                 u64 l = min_t(u64, len, PAGE_SIZE);
2200
2201                 spage = kzalloc(sizeof(*spage), GFP_NOFS);
2202                 if (!spage) {
2203 leave_nomem:
2204                         spin_lock(&sctx->stat_lock);
2205                         sctx->stat.malloc_errors++;
2206                         spin_unlock(&sctx->stat_lock);
2207                         scrub_block_put(sblock);
2208                         return -ENOMEM;
2209                 }
2210                 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2211                 scrub_page_get(spage);
2212                 sblock->pagev[index] = spage;
2213                 spage->sblock = sblock;
2214                 spage->dev = dev;
2215                 spage->flags = flags;
2216                 spage->generation = gen;
2217                 spage->logical = logical;
2218                 spage->physical = physical;
2219                 spage->physical_for_dev_replace = physical_for_dev_replace;
2220                 spage->mirror_num = mirror_num;
2221                 if (csum) {
2222                         spage->have_csum = 1;
2223                         memcpy(spage->csum, csum, sctx->csum_size);
2224                 } else {
2225                         spage->have_csum = 0;
2226                 }
2227                 sblock->page_count++;
2228                 spage->page = alloc_page(GFP_NOFS);
2229                 if (!spage->page)
2230                         goto leave_nomem;
2231                 len -= l;
2232                 logical += l;
2233                 physical += l;
2234                 physical_for_dev_replace += l;
2235         }
2236
2237         WARN_ON(sblock->page_count == 0);
2238         for (index = 0; index < sblock->page_count; index++) {
2239                 struct scrub_page *spage = sblock->pagev[index];
2240                 int ret;
2241
2242                 ret = scrub_add_page_to_rd_bio(sctx, spage);
2243                 if (ret) {
2244                         scrub_block_put(sblock);
2245                         return ret;
2246                 }
2247         }
2248
2249         if (force)
2250                 scrub_submit(sctx);
2251
2252         /* last one frees, either here or in bio completion for last page */
2253         scrub_block_put(sblock);
2254         return 0;
2255 }
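/*
 * Reference counting in scrub_pages(), illustrated for a block built from
 * four pages: the block starts with refs == 1, scrub_add_page_to_rd_bio()
 * takes one extra block reference per page (refs == 5), the final
 * scrub_block_put() above drops the creation reference (refs == 4), and the
 * read completion worker drops one reference per finished page, freeing the
 * block after the last one.
 */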
2256
2257 static void scrub_bio_end_io(struct bio *bio, int err)
2258 {
2259         struct scrub_bio *sbio = bio->bi_private;
2260         struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
2261
2262         sbio->err = err;
2263         sbio->bio = bio;
2264
2265         btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
2266 }
2267
2268 static void scrub_bio_end_io_worker(struct btrfs_work *work)
2269 {
2270         struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2271         struct scrub_ctx *sctx = sbio->sctx;
2272         int i;
2273
2274         BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
2275         if (sbio->err) {
2276                 for (i = 0; i < sbio->page_count; i++) {
2277                         struct scrub_page *spage = sbio->pagev[i];
2278
2279                         spage->io_error = 1;
2280                         spage->sblock->no_io_error_seen = 0;
2281                 }
2282         }
2283
2284         /* now complete the scrub_block items that have all pages completed */
2285         for (i = 0; i < sbio->page_count; i++) {
2286                 struct scrub_page *spage = sbio->pagev[i];
2287                 struct scrub_block *sblock = spage->sblock;
2288
2289                 if (atomic_dec_and_test(&sblock->outstanding_pages))
2290                         scrub_block_complete(sblock);
2291                 scrub_block_put(sblock);
2292         }
2293
2294         bio_put(sbio->bio);
2295         sbio->bio = NULL;
2296         spin_lock(&sctx->list_lock);
2297         sbio->next_free = sctx->first_free;
2298         sctx->first_free = sbio->index;
2299         spin_unlock(&sctx->list_lock);
2300
2301         if (sctx->is_dev_replace &&
2302             atomic_read(&sctx->wr_ctx.flush_all_writes)) {
2303                 mutex_lock(&sctx->wr_ctx.wr_lock);
2304                 scrub_wr_submit(sctx);
2305                 mutex_unlock(&sctx->wr_ctx.wr_lock);
2306         }
2307
2308         scrub_pending_bio_dec(sctx);
2309 }
2310
2311 static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2312                                        unsigned long *bitmap,
2313                                        u64 start, u64 len)
2314 {
2315         int offset;
2316         int nsectors;
2317         int sectorsize = sparity->sctx->dev_root->sectorsize;
2318
2319         if (len >= sparity->stripe_len) {
2320                 bitmap_set(bitmap, 0, sparity->nsectors);
2321                 return;
2322         }
2323
2324         start -= sparity->logic_start;
2325         offset = (int)do_div(start, sparity->stripe_len);
2326         offset /= sectorsize;
2327         nsectors = (int)len / sectorsize;
2328
2329         if (offset + nsectors <= sparity->nsectors) {
2330                 bitmap_set(bitmap, offset, nsectors);
2331                 return;
2332         }
2333
2334         bitmap_set(bitmap, offset, sparity->nsectors - offset);
2335         bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
2336 }
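/*
 * Worked example for __scrub_mark_bitmap() (illustrative numbers):
 * stripe_len = 64KiB and sectorsize = 4KiB give nsectors = 16.  With
 * logic_start = 0, marking start = 120KiB, len = 16KiB yields
 * offset = (120KiB mod 64KiB) / 4KiB = 14 and 4 sectors to set; since
 * 14 + 4 > 16 the range wraps and bits 14-15 plus 0-1 are set.
 */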
2337
2338 static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
2339                                                    u64 start, u64 len)
2340 {
2341         __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
2342 }
2343
2344 static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
2345                                                   u64 start, u64 len)
2346 {
2347         __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
2348 }
2349
2350 static void scrub_block_complete(struct scrub_block *sblock)
2351 {
2352         int corrupted = 0;
2353
2354         if (!sblock->no_io_error_seen) {
2355                 corrupted = 1;
2356                 scrub_handle_errored_block(sblock);
2357         } else {
2358                 /*
2359                  * If the block has a checksum error, write it out via the
2360                  * repair mechanism; otherwise, in the dev-replace case,
2361                  * write it to the replace target device right here.
2362                  */
2363                 corrupted = scrub_checksum(sblock);
2364                 if (!corrupted && sblock->sctx->is_dev_replace)
2365                         scrub_write_block_to_dev_replace(sblock);
2366         }
2367
2368         if (sblock->sparity && corrupted && !sblock->data_corrected) {
2369                 u64 start = sblock->pagev[0]->logical;
2370                 u64 end = sblock->pagev[sblock->page_count - 1]->logical +
2371                           PAGE_SIZE;
2372
2373                 scrub_parity_mark_sectors_error(sblock->sparity,
2374                                                 start, end - start);
2375         }
2376 }
2377
2378 static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
2379                            u8 *csum)
2380 {
2381         struct btrfs_ordered_sum *sum = NULL;
2382         unsigned long index;
2383         unsigned long num_sectors;
2384
2385         while (!list_empty(&sctx->csum_list)) {
2386                 sum = list_first_entry(&sctx->csum_list,
2387                                        struct btrfs_ordered_sum, list);
2388                 if (sum->bytenr > logical)
2389                         return 0;
2390                 if (sum->bytenr + sum->len > logical)
2391                         break;
2392
2393                 ++sctx->stat.csum_discards;
2394                 list_del(&sum->list);
2395                 kfree(sum);
2396                 sum = NULL;
2397         }
2398         if (!sum)
2399                 return 0;
2400
2401         index = ((u32)(logical - sum->bytenr)) / sctx->sectorsize;
2402         num_sectors = sum->len / sctx->sectorsize;
2403         memcpy(csum, sum->sums + index, sctx->csum_size);
2404         if (index == num_sectors - 1) {
2405                 list_del(&sum->list);
2406                 kfree(sum);
2407         }
2408         return 1;
2409 }
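/*
 * Illustrative lookup in scrub_find_csum() (made-up numbers, 4KiB sectors):
 * a csum item with bytenr = 1MiB and len = 32KiB describes 8 sectors; a
 * request for logical = 1MiB + 12KiB gives index = 3 and the matching
 * checksum is copied out.  Items that end at or before the requested logical
 * are counted as discards and freed on the way, and an item is freed as soon
 * as its last sector has been consumed.
 */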
2410
2411 /* scrub_extent() tries to collect up to 64 kB for each bio */
2412 static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
2413                         u64 physical, struct btrfs_device *dev, u64 flags,
2414                         u64 gen, int mirror_num, u64 physical_for_dev_replace)
2415 {
2416         int ret;
2417         u8 csum[BTRFS_CSUM_SIZE];
2418         u32 blocksize;
2419
2420         if (flags & BTRFS_EXTENT_FLAG_DATA) {
2421                 blocksize = sctx->sectorsize;
2422                 spin_lock(&sctx->stat_lock);
2423                 sctx->stat.data_extents_scrubbed++;
2424                 sctx->stat.data_bytes_scrubbed += len;
2425                 spin_unlock(&sctx->stat_lock);
2426         } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2427                 blocksize = sctx->nodesize;
2428                 spin_lock(&sctx->stat_lock);
2429                 sctx->stat.tree_extents_scrubbed++;
2430                 sctx->stat.tree_bytes_scrubbed += len;
2431                 spin_unlock(&sctx->stat_lock);
2432         } else {
2433                 blocksize = sctx->sectorsize;
2434                 WARN_ON(1);
2435         }
2436
2437         while (len) {
2438                 u64 l = min_t(u64, len, blocksize);
2439                 int have_csum = 0;
2440
2441                 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2442                         /* push csums to sbio */
2443                         have_csum = scrub_find_csum(sctx, logical, l, csum);
2444                         if (have_csum == 0)
2445                                 ++sctx->stat.no_csum;
2446                         if (sctx->is_dev_replace && !have_csum) {
2447                                 ret = copy_nocow_pages(sctx, logical, l,
2448                                                        mirror_num,
2449                                                       physical_for_dev_replace);
2450                                 goto behind_scrub_pages;
2451                         }
2452                 }
2453                 ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
2454                                   mirror_num, have_csum ? csum : NULL, 0,
2455                                   physical_for_dev_replace);
2456 behind_scrub_pages:
2457                 if (ret)
2458                         return ret;
2459                 len -= l;
2460                 logical += l;
2461                 physical += l;
2462                 physical_for_dev_replace += l;
2463         }
2464         return 0;
2465 }
2466
2467 static int scrub_pages_for_parity(struct scrub_parity *sparity,
2468                                   u64 logical, u64 len,
2469                                   u64 physical, struct btrfs_device *dev,
2470                                   u64 flags, u64 gen, int mirror_num, u8 *csum)
2471 {
2472         struct scrub_ctx *sctx = sparity->sctx;
2473         struct scrub_block *sblock;
2474         int index;
2475
2476         sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
2477         if (!sblock) {
2478                 spin_lock(&sctx->stat_lock);
2479                 sctx->stat.malloc_errors++;
2480                 spin_unlock(&sctx->stat_lock);
2481                 return -ENOMEM;
2482         }
2483
2484         /* one ref inside this function, plus one for each page added to
2485          * a bio later on */
2486         atomic_set(&sblock->refs, 1);
2487         sblock->sctx = sctx;
2488         sblock->no_io_error_seen = 1;
2489         sblock->sparity = sparity;
2490         scrub_parity_get(sparity);
2491
2492         for (index = 0; len > 0; index++) {
2493                 struct scrub_page *spage;
2494                 u64 l = min_t(u64, len, PAGE_SIZE);
2495
2496                 spage = kzalloc(sizeof(*spage), GFP_NOFS);
2497                 if (!spage) {
2498 leave_nomem:
2499                         spin_lock(&sctx->stat_lock);
2500                         sctx->stat.malloc_errors++;
2501                         spin_unlock(&sctx->stat_lock);
2502                         scrub_block_put(sblock);
2503                         return -ENOMEM;
2504                 }
2505                 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2506                 /* For scrub block */
2507                 scrub_page_get(spage);
2508                 sblock->pagev[index] = spage;
2509                 /* For scrub parity */
2510                 scrub_page_get(spage);
2511                 list_add_tail(&spage->list, &sparity->spages);
2512                 spage->sblock = sblock;
2513                 spage->dev = dev;
2514                 spage->flags = flags;
2515                 spage->generation = gen;
2516                 spage->logical = logical;
2517                 spage->physical = physical;
2518                 spage->mirror_num = mirror_num;
2519                 if (csum) {
2520                         spage->have_csum = 1;
2521                         memcpy(spage->csum, csum, sctx->csum_size);
2522                 } else {
2523                         spage->have_csum = 0;
2524                 }
2525                 sblock->page_count++;
2526                 spage->page = alloc_page(GFP_NOFS);
2527                 if (!spage->page)
2528                         goto leave_nomem;
2529                 len -= l;
2530                 logical += l;
2531                 physical += l;
2532         }
2533
2534         WARN_ON(sblock->page_count == 0);
2535         for (index = 0; index < sblock->page_count; index++) {
2536                 struct scrub_page *spage = sblock->pagev[index];
2537                 int ret;
2538
2539                 ret = scrub_add_page_to_rd_bio(sctx, spage);
2540                 if (ret) {
2541                         scrub_block_put(sblock);
2542                         return ret;
2543                 }
2544         }
2545
2546         /* last one frees, either here or in bio completion for last page */
2547         scrub_block_put(sblock);
2548         return 0;
2549 }
2550
2551 static int scrub_extent_for_parity(struct scrub_parity *sparity,
2552                                    u64 logical, u64 len,
2553                                    u64 physical, struct btrfs_device *dev,
2554                                    u64 flags, u64 gen, int mirror_num)
2555 {
2556         struct scrub_ctx *sctx = sparity->sctx;
2557         int ret;
2558         u8 csum[BTRFS_CSUM_SIZE];
2559         u32 blocksize;
2560
2561         if (flags & BTRFS_EXTENT_FLAG_DATA) {
2562                 blocksize = sctx->sectorsize;
2563         } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2564                 blocksize = sctx->nodesize;
2565         } else {
2566                 blocksize = sctx->sectorsize;
2567                 WARN_ON(1);
2568         }
2569
2570         while (len) {
2571                 u64 l = min_t(u64, len, blocksize);
2572                 int have_csum = 0;
2573
2574                 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2575                         /* push csums to sbio */
2576                         have_csum = scrub_find_csum(sctx, logical, l, csum);
2577                         if (have_csum == 0)
2578                                 goto skip;
2579                 }
2580                 ret = scrub_pages_for_parity(sparity, logical, l, physical, dev,
2581                                              flags, gen, mirror_num,
2582                                              have_csum ? csum : NULL);
2583                 if (ret)
2584                         return ret;
2585 skip:
2586                 len -= l;
2587                 logical += l;
2588                 physical += l;
2589         }
2590         return 0;
2591 }
2592
2593 /*
2594  * Given a physical address, calculate its logical offset.
2595  * If this is a parity stripe, return the left-most data
2596  * stripe's logical offset.
2597  *
2598  * Returns 0 if it is a data stripe, 1 if it is a parity stripe.
2599  */
2600 static int get_raid56_logic_offset(u64 physical, int num,
2601                                    struct map_lookup *map, u64 *offset,
2602                                    u64 *stripe_start)
2603 {
2604         int i;
2605         int j = 0;
2606         u64 stripe_nr;
2607         u64 last_offset;
2608         int stripe_index;
2609         int rot;
2610
2611         last_offset = (physical - map->stripes[num].physical) *
2612                       nr_data_stripes(map);
2613         if (stripe_start)
2614                 *stripe_start = last_offset;
2615
2616         *offset = last_offset;
2617         for (i = 0; i < nr_data_stripes(map); i++) {
2618                 *offset = last_offset + i * map->stripe_len;
2619
2620                 stripe_nr = *offset;
2621                 do_div(stripe_nr, map->stripe_len);
2622                 do_div(stripe_nr, nr_data_stripes(map));
2623
2624                 /* Work out the disk rotation on this stripe-set */
2625                 rot = do_div(stripe_nr, map->num_stripes);
2626                 /* calculate which stripe this data is located on */
2627                 rot += i;
2628                 stripe_index = rot % map->num_stripes;
2629                 if (stripe_index == num)
2630                         return 0;
2631                 if (stripe_index < num)
2632                         j++;
2633         }
2634         *offset = last_offset + j * map->stripe_len;
2635         return 1;
2636 }
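/*
 * Worked example for get_raid56_logic_offset() (illustrative numbers): a
 * 3-device RAID5 chunk has nr_data_stripes = 2; assume stripe_len = 64KiB.
 * For the first physical stripe of device 0 the loop finds stripe_index ==
 * num at i = 0, so the chunk-relative logical offset is 0 and 0 is returned
 * (data).  The same physical stripe on device 1 matches at i = 1 (offset
 * 64KiB), while on device 2 nothing matches (it holds the parity for that
 * rotation), so 1 is returned with *offset advanced past that full stripe.
 */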
2637
2638 static void scrub_free_parity(struct scrub_parity *sparity)
2639 {
2640         struct scrub_ctx *sctx = sparity->sctx;
2641         struct scrub_page *curr, *next;
2642         int nbits;
2643
2644         nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
2645         if (nbits) {
2646                 spin_lock(&sctx->stat_lock);
2647                 sctx->stat.read_errors += nbits;
2648                 sctx->stat.uncorrectable_errors += nbits;
2649                 spin_unlock(&sctx->stat_lock);
2650         }
2651
2652         list_for_each_entry_safe(curr, next, &sparity->spages, list) {
2653                 list_del_init(&curr->list);
2654                 scrub_page_put(curr);
2655         }
2656
2657         kfree(sparity);
2658 }
2659
2660 static void scrub_parity_bio_endio(struct bio *bio, int error)
2661 {
2662         struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
2663         struct scrub_ctx *sctx = sparity->sctx;
2664
2665         if (error)
2666                 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2667                           sparity->nsectors);
2668
2669         scrub_free_parity(sparity);
2670         scrub_pending_bio_dec(sctx);
2671         bio_put(bio);
2672 }
2673
2674 static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
2675 {
2676         struct scrub_ctx *sctx = sparity->sctx;
2677         struct bio *bio;
2678         struct btrfs_raid_bio *rbio;
2679         struct scrub_page *spage;
2680         struct btrfs_bio *bbio = NULL;
2681         u64 length;
2682         int ret;
2683
2684         if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap,
2685                            sparity->nsectors))
2686                 goto out;
2687
2688         length = sparity->logic_end - sparity->logic_start + 1;
2689         ret = btrfs_map_sblock(sctx->dev_root->fs_info, WRITE,
2690                                sparity->logic_start,
2691                                &length, &bbio, 0, 1);
2692         if (ret || !bbio || !bbio->raid_map)
2693                 goto bbio_out;
2694
2695         bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
2696         if (!bio)
2697                 goto bbio_out;
2698
2699         bio->bi_iter.bi_sector = sparity->logic_start >> 9;
2700         bio->bi_private = sparity;
2701         bio->bi_end_io = scrub_parity_bio_endio;
2702
2703         rbio = raid56_parity_alloc_scrub_rbio(sctx->dev_root, bio, bbio,
2704                                               length, sparity->scrub_dev,
2705                                               sparity->dbitmap,
2706                                               sparity->nsectors);
2707         if (!rbio)
2708                 goto rbio_out;
2709
2710         list_for_each_entry(spage, &sparity->spages, list)
2711                 raid56_parity_add_scrub_pages(rbio, spage->page,
2712                                               spage->logical);
2713
2714         scrub_pending_bio_inc(sctx);
2715         raid56_parity_submit_scrub_rbio(rbio);
2716         return;
2717
2718 rbio_out:
2719         bio_put(bio);
2720 bbio_out:
2721         btrfs_put_bbio(bbio);
2722         bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2723                   sparity->nsectors);
2724         spin_lock(&sctx->stat_lock);
2725         sctx->stat.malloc_errors++;
2726         spin_unlock(&sctx->stat_lock);
2727 out:
2728         scrub_free_parity(sparity);
2729 }
2730
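/* bytes needed for one per-sector bitmap, rounded up to whole longs */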
2731 static inline int scrub_calc_parity_bitmap_len(int nsectors)
2732 {
2733         return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * (BITS_PER_LONG / 8);
2734 }
2735
2736 static void scrub_parity_get(struct scrub_parity *sparity)
2737 {
2738         atomic_inc(&sparity->refs);
2739 }
2740
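/* dropping the last reference triggers the actual parity check/repair */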
2741 static void scrub_parity_put(struct scrub_parity *sparity)
2742 {
2743         if (!atomic_dec_and_test(&sparity->refs))
2744                 return;
2745
2746         scrub_parity_check_and_repair(sparity);
2747 }
2748
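/*
 * Scrub the data stripes of one RAID5/6 full stripe in the range
 * [logic_start, logic_end]: walk the extent tree, mark the covered sectors
 * in the data bitmap, read and verify every extent that falls inside the
 * stripe and finally let scrub_parity_put() kick off the parity check.
 */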
2749 static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
2750                                                   struct map_lookup *map,
2751                                                   struct btrfs_device *sdev,
2752                                                   struct btrfs_path *path,
2753                                                   u64 logic_start,
2754                                                   u64 logic_end)
2755 {
2756         struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
2757         struct btrfs_root *root = fs_info->extent_root;
2758         struct btrfs_root *csum_root = fs_info->csum_root;
2759         struct btrfs_extent_item *extent;
2760         u64 flags;
2761         int ret;
2762         int slot;
2763         struct extent_buffer *l;
2764         struct btrfs_key key;
2765         u64 generation;
2766         u64 extent_logical;
2767         u64 extent_physical;
2768         u64 extent_len;
2769         struct btrfs_device *extent_dev;
2770         struct scrub_parity *sparity;
2771         int nsectors;
2772         int bitmap_len;
2773         int extent_mirror_num;
2774         int stop_loop = 0;
2775
2776         nsectors = map->stripe_len / root->sectorsize;
2777         bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
2778         sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
2779                           GFP_NOFS);
2780         if (!sparity) {
2781                 spin_lock(&sctx->stat_lock);
2782                 sctx->stat.malloc_errors++;
2783                 spin_unlock(&sctx->stat_lock);
2784                 return -ENOMEM;
2785         }
2786
2787         sparity->stripe_len = map->stripe_len;
2788         sparity->nsectors = nsectors;
2789         sparity->sctx = sctx;
2790         sparity->scrub_dev = sdev;
2791         sparity->logic_start = logic_start;
2792         sparity->logic_end = logic_end;
2793         atomic_set(&sparity->refs, 1);
2794         INIT_LIST_HEAD(&sparity->spages);
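        /*
         * Both bitmaps live in the trailing allocation: dbitmap marks
         * sectors that carry data, ebitmap marks sectors that had errors.
         */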
2795         sparity->dbitmap = sparity->bitmap;
2796         sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
2797
2798         ret = 0;
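        /* walk the extent tree one stripe_len at a time over the full stripe */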
2799         while (logic_start < logic_end) {
2800                 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2801                         key.type = BTRFS_METADATA_ITEM_KEY;
2802                 else
2803                         key.type = BTRFS_EXTENT_ITEM_KEY;
2804                 key.objectid = logic_start;
2805                 key.offset = (u64)-1;
2806
2807                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2808                 if (ret < 0)
2809                         goto out;
2810
2811                 if (ret > 0) {
2812                         ret = btrfs_previous_extent_item(root, path, 0);
2813                         if (ret < 0)
2814                                 goto out;
2815                         if (ret > 0) {
2816                                 btrfs_release_path(path);
2817                                 ret = btrfs_search_slot(NULL, root, &key,
2818                                                         path, 0, 0);
2819                                 if (ret < 0)
2820                                         goto out;
2821                         }
2822                 }
2823
2824                 stop_loop = 0;
2825                 while (1) {
2826                         u64 bytes;
2827
2828                         l = path->nodes[0];
2829                         slot = path->slots[0];
2830                         if (slot >= btrfs_header_nritems(l)) {
2831                                 ret = btrfs_next_leaf(root, path);
2832                                 if (ret == 0)
2833                                         continue;
2834                                 if (ret < 0)
2835                                         goto out;
2836
2837                                 stop_loop = 1;
2838                                 break;
2839                         }
2840                         btrfs_item_key_to_cpu(l, &key, slot);
2841
2842                         if (key.type == BTRFS_METADATA_ITEM_KEY)
2843                                 bytes = root->nodesize;
2844                         else
2845                                 bytes = key.offset;
2846
2847                         if (key.objectid + bytes <= logic_start)
2848                                 goto next;
2849
2850                         if (key.type != BTRFS_EXTENT_ITEM_KEY &&
2851                             key.type != BTRFS_METADATA_ITEM_KEY)
2852                                 goto next;
2853
2854                         if (key.objectid > logic_end) {
2855                                 stop_loop = 1;
2856                                 break;
2857                         }
2858
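                        /*
                         * the extent starts beyond the current stripe:
                         * fast-forward logic_start to the stripe that
                         * contains it
                         */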
2859                         while (key.objectid >= logic_start + map->stripe_len)
2860                                 logic_start += map->stripe_len;
2861
2862                         extent = btrfs_item_ptr(l, slot,
2863                                                 struct btrfs_extent_item);
2864                         flags = btrfs_extent_flags(l, extent);
2865                         generation = btrfs_extent_generation(l, extent);
2866
2867                         if (key.objectid < logic_start &&
2868                             (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
2869                                 btrfs_err(fs_info,
2870                                           "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
2871                                            key.objectid, logic_start);
2872                                 goto next;
2873                         }
2874 again:
2875                         extent_logical = key.objectid;
2876                         extent_len = bytes;
2877
2878                         if (extent_logical < logic_start) {
2879                                 extent_len -= logic_start - extent_logical;
2880                                 extent_logical = logic_start;
2881                         }
2882
2883                         if (extent_logical + extent_len >
2884                             logic_start + map->stripe_len)
2885                                 extent_len = logic_start + map->stripe_len -
2886                                              extent_logical;
2887
2888                         scrub_parity_mark_sectors_data(sparity, extent_logical,
2889                                                        extent_len);
2890
2891                         scrub_remap_extent(fs_info, extent_logical,
2892                                            extent_len, &extent_physical,
2893                                            &extent_dev,
2894                                            &extent_mirror_num);
2895
2896                         ret = btrfs_lookup_csums_range(csum_root,
2897                                                 extent_logical,
2898                                                 extent_logical + extent_len - 1,
2899                                                 &sctx->csum_list, 1);
2900                         if (ret)
2901                                 goto out;
2902
2903                         ret = scrub_extent_for_parity(sparity, extent_logical,
2904                                                       extent_len,
2905                                                       extent_physical,
2906                                                       extent_dev, flags,
2907                                                       generation,
2908                                                       extent_mirror_num);
2909                         if (ret)
2910                                 goto out;
2911
2912                         scrub_free_csums(sctx);
2913                         if (extent_logical + extent_len <
2914                             key.objectid + bytes) {
2915                                 logic_start += map->stripe_len;
2916
2917                                 if (logic_start >= logic_end) {
2918                                         stop_loop = 1;
2919                                         break;
2920                                 }
2921
2922                                 if (logic_start < key.objectid + bytes) {
2923                                         cond_resched();
2924                                         goto again;
2925                                 }
2926                         }
2927 next:
2928                         path->slots[0]++;
2929                 }
2930
2931                 btrfs_release_path(path);
2932
2933                 if (stop_loop)
2934                         break;
2935
2936                 logic_start += map->stripe_len;
2937         }
2938 out:
2939         if (ret < 0)
2940                 scrub_parity_mark_sectors_error(sparity, logic_start,
2941                                                 logic_end - logic_start + 1);
2942         scrub_parity_put(sparity);
2943         scrub_submit(sctx);
2944         mutex_lock(&sctx->wr_ctx.wr_lock);
2945         scrub_wr_submit(sctx);
2946         mutex_unlock(&sctx->wr_ctx.wr_lock);
2947
2948         btrfs_release_path(path);
2949         return ret < 0 ? ret : 0;
2950 }
2951
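/*
 * Scrub device stripe 'num' of one chunk: work out this device's start
 * offset, stride and mirror number from the RAID profile, read ahead the
 * relevant extent and csum tree ranges, then walk stripe by stripe and
 * scrub every extent found.  For RAID5/6, parity stripes are handed to
 * scrub_raid56_parity() instead.
 */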
2952 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2953                                            struct map_lookup *map,
2954                                            struct btrfs_device *scrub_dev,
2955                                            int num, u64 base, u64 length,
2956                                            int is_dev_replace)
2957 {
2958         struct btrfs_path *path, *ppath;
2959         struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
2960         struct btrfs_root *root = fs_info->extent_root;
2961         struct btrfs_root *csum_root = fs_info->csum_root;
2962         struct btrfs_extent_item *extent;
2963         struct blk_plug plug;
2964         u64 flags;
2965         int ret;
2966         int slot;
2967         u64 nstripes;
2968         struct extent_buffer *l;
2969         struct btrfs_key key;
2970         u64 physical;
2971         u64 logical;
2972         u64 logic_end;
2973         u64 physical_end;
2974         u64 generation;
2975         int mirror_num;
2976         struct reada_control *reada1;
2977         struct reada_control *reada2;
2978         struct btrfs_key key_start;
2979         struct btrfs_key key_end;
2980         u64 increment = map->stripe_len;
2981         u64 offset;
2982         u64 extent_logical;
2983         u64 extent_physical;
2984         u64 extent_len;
2985         u64 stripe_logical;
2986         u64 stripe_end;
2987         struct btrfs_device *extent_dev;
2988         int extent_mirror_num;
2989         int stop_loop = 0;
2990
2991         nstripes = length;
2992         physical = map->stripes[num].physical;
2993         offset = 0;
2994         do_div(nstripes, map->stripe_len);
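        /*
         * Per RAID profile: 'offset' is where this device's first stripe
         * starts inside the chunk, 'increment' is the logical distance
         * between two consecutive stripes on this device and 'mirror_num'
         * selects which copy this device carries.
         */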
2995         if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
2996                 offset = map->stripe_len * num;
2997                 increment = map->stripe_len * map->num_stripes;
2998                 mirror_num = 1;
2999         } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3000                 int factor = map->num_stripes / map->sub_stripes;
3001                 offset = map->stripe_len * (num / map->sub_stripes);
3002                 increment = map->stripe_len * factor;
3003                 mirror_num = num % map->sub_stripes + 1;
3004         } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
3005                 increment = map->stripe_len;
3006                 mirror_num = num % map->num_stripes + 1;
3007         } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3008                 increment = map->stripe_len;
3009                 mirror_num = num % map->num_stripes + 1;
3010         } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3011                 get_raid56_logic_offset(physical, num, map, &offset, NULL);
3012                 increment = map->stripe_len * nr_data_stripes(map);
3013                 mirror_num = 1;
3014         } else {
3015                 increment = map->stripe_len;
3016                 mirror_num = 1;
3017         }
3018
3019         path = btrfs_alloc_path();
3020         if (!path)
3021                 return -ENOMEM;
3022
3023         ppath = btrfs_alloc_path();
3024         if (!ppath) {
3025                 btrfs_free_path(path);
3026                 return -ENOMEM;
3027         }
3028
3029         /*
3030          * work on commit root. The related disk blocks are static as
3031          * long as COW is applied. This means it is safe to rewrite
3032          * them to repair disk errors without any race conditions
3033          */
3034         path->search_commit_root = 1;
3035         path->skip_locking = 1;
3036
3037         /*
3038          * trigger the readahead for the extent tree and csum tree and wait for
3039          * completion. During readahead, the scrub is officially paused
3040          * to not hold off transaction commits
3041          */
3042         logical = base + offset;
3043         physical_end = physical + nstripes * map->stripe_len;
3044         if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3045                 get_raid56_logic_offset(physical_end, num,
3046                                         map, &logic_end, NULL);
3047                 logic_end += base;
3048         } else {
3049                 logic_end = logical + increment * nstripes;
3050         }
3051         wait_event(sctx->list_wait,
3052                    atomic_read(&sctx->bios_in_flight) == 0);
3053         scrub_blocked_if_needed(fs_info);
3054
3055         /* FIXME it might be better to start readahead at commit root */
3056         key_start.objectid = logical;
3057         key_start.type = BTRFS_EXTENT_ITEM_KEY;
3058         key_start.offset = (u64)0;
3059         key_end.objectid = logic_end;
3060         key_end.type = BTRFS_METADATA_ITEM_KEY;
3061         key_end.offset = (u64)-1;
3062         reada1 = btrfs_reada_add(root, &key_start, &key_end);
3063
3064         key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3065         key_start.type = BTRFS_EXTENT_CSUM_KEY;
3066         key_start.offset = logical;
3067         key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3068         key_end.type = BTRFS_EXTENT_CSUM_KEY;
3069         key_end.offset = logic_end;
3070         reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
3071
3072         if (!IS_ERR(reada1))
3073                 btrfs_reada_wait(reada1);
3074         if (!IS_ERR(reada2))
3075                 btrfs_reada_wait(reada2);
3076
3077
3078         /*
3079          * collect all data csums for the stripe to avoid seeking during
3080          * the scrub. This might currently (with crc32) end up being about 1MB
3081          */
3082         blk_start_plug(&plug);
3083
3084         /*
3085          * now find all extents for each stripe and scrub them
3086          */
3087         ret = 0;
3088         while (physical < physical_end) {
3089                 /* for raid56, we skip parity stripe */
3090                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3091                         ret = get_raid56_logic_offset(physical, num,
3092                                         map, &logical, &stripe_logical);
3093                         logical += base;
3094                         if (ret) {
3095                                 stripe_logical += base;
3096                                 stripe_end = stripe_logical + increment - 1;
3097                                 ret = scrub_raid56_parity(sctx, map, scrub_dev,
3098                                                 ppath, stripe_logical,
3099                                                 stripe_end);
3100                                 if (ret)
3101                                         goto out;
3102                                 goto skip;
3103                         }
3104                 }
3105                 /*
3106                  * canceled?
3107                  */
3108                 if (atomic_read(&fs_info->scrub_cancel_req) ||
3109                     atomic_read(&sctx->cancel_req)) {
3110                         ret = -ECANCELED;
3111                         goto out;
3112                 }
3113                 /*
3114                  * check to see if we have to pause
3115                  */
3116                 if (atomic_read(&fs_info->scrub_pause_req)) {
3117                         /* push queued extents */
3118                         atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
3119                         scrub_submit(sctx);
3120                         mutex_lock(&sctx->wr_ctx.wr_lock);
3121                         scrub_wr_submit(sctx);
3122                         mutex_unlock(&sctx->wr_ctx.wr_lock);
3123                         wait_event(sctx->list_wait,
3124                                    atomic_read(&sctx->bios_in_flight) == 0);
3125                         atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
3126                         scrub_blocked_if_needed(fs_info);
3127                 }
3128
3129                 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
3130                         key.type = BTRFS_METADATA_ITEM_KEY;
3131                 else
3132                         key.type = BTRFS_EXTENT_ITEM_KEY;
3133                 key.objectid = logical;
3134                 key.offset = (u64)-1;
3135
3136                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3137                 if (ret < 0)
3138                         goto out;
3139
3140                 if (ret > 0) {
3141                         ret = btrfs_previous_extent_item(root, path, 0);
3142                         if (ret < 0)
3143                                 goto out;
3144                         if (ret > 0) {
3145                                 /* there's no smaller item, so stick with the
3146                                  * larger one */
3147                                 btrfs_release_path(path);
3148                                 ret = btrfs_search_slot(NULL, root, &key,
3149                                                         path, 0, 0);
3150                                 if (ret < 0)
3151                                         goto out;
3152                         }
3153                 }
3154
3155                 stop_loop = 0;
3156                 while (1) {
3157                         u64 bytes;
3158
3159                         l = path->nodes[0];
3160                         slot = path->slots[0];
3161                         if (slot >= btrfs_header_nritems(l)) {
3162                                 ret = btrfs_next_leaf(root, path);
3163                                 if (ret == 0)
3164                                         continue;
3165                                 if (ret < 0)
3166                                         goto out;
3167
3168                                 stop_loop = 1;
3169                                 break;
3170                         }
3171                         btrfs_item_key_to_cpu(l, &key, slot);
3172
3173                         if (key.type == BTRFS_METADATA_ITEM_KEY)
3174                                 bytes = root->nodesize;
3175                         else
3176                                 bytes = key.offset;
3177
3178                         if (key.objectid + bytes <= logical)
3179                                 goto next;
3180
3181                         if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3182                             key.type != BTRFS_METADATA_ITEM_KEY)
3183                                 goto next;
3184
3185                         if (key.objectid >= logical + map->stripe_len) {
3186                                 /* out of this device extent */
3187                                 if (key.objectid >= logic_end)
3188                                         stop_loop = 1;
3189                                 break;
3190                         }
3191
3192                         extent = btrfs_item_ptr(l, slot,
3193                                                 struct btrfs_extent_item);
3194                         flags = btrfs_extent_flags(l, extent);
3195                         generation = btrfs_extent_generation(l, extent);
3196
3197                         if (key.objectid < logical &&
3198                             (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
3199                                 btrfs_err(fs_info,
3200                                            "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
3201                                            key.objectid, logical);
3203                                 goto next;
3204                         }
3205
3206 again:
3207                         extent_logical = key.objectid;
3208                         extent_len = bytes;
3209
3210                         /*
3211                          * trim extent to this stripe
3212                          */
3213                         if (extent_logical < logical) {
3214                                 extent_len -= logical - extent_logical;
3215                                 extent_logical = logical;
3216                         }
3217                         if (extent_logical + extent_len >
3218                             logical + map->stripe_len) {
3219                                 extent_len = logical + map->stripe_len -
3220                                              extent_logical;
3221                         }
3222
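                        /*
                         * translate the logical offset within this stripe
                         * into the physical offset on the device
                         */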
3223                         extent_physical = extent_logical - logical + physical;
3224                         extent_dev = scrub_dev;
3225                         extent_mirror_num = mirror_num;
3226                         if (is_dev_replace)
3227                                 scrub_remap_extent(fs_info, extent_logical,
3228                                                    extent_len, &extent_physical,
3229                                                    &extent_dev,
3230                                                    &extent_mirror_num);
3231
3232                         ret = btrfs_lookup_csums_range(csum_root, logical,
3233                                                 logical + map->stripe_len - 1,
3234                                                 &sctx->csum_list, 1);
3235                         if (ret)
3236                                 goto out;
3237
3238                         ret = scrub_extent(sctx, extent_logical, extent_len,
3239                                            extent_physical, extent_dev, flags,
3240                                            generation, extent_mirror_num,
3241                                            extent_logical - logical + physical);
3242                         if (ret)
3243                                 goto out;
3244
3245                         scrub_free_csums(sctx);
3246                         if (extent_logical + extent_len <
3247                             key.objectid + bytes) {
3248                                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3249                                         /*
3250                                          * loop until we find next data stripe
3251                                          * or we have finished all stripes.
3252                                          */
3253 loop:
3254                                         physical += map->stripe_len;