fs/btrfs/scrub.c (muen/linux.git, commit 8af7372238fc13393772000189b9a083ba8f9f7e)
1 /*
2  * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 02111-1307, USA.
17  */
18
19 #include <linux/blkdev.h>
20 #include <linux/ratelimit.h>
21 #include "ctree.h"
22 #include "volumes.h"
23 #include "disk-io.h"
24 #include "ordered-data.h"
25 #include "transaction.h"
26 #include "backref.h"
27 #include "extent_io.h"
28 #include "dev-replace.h"
29 #include "check-integrity.h"
30 #include "rcu-string.h"
31 #include "raid56.h"
32
33 /*
34  * This is only the first step towards a full-featured scrub. It reads all
35  * extents and super blocks and verifies the checksums. In case a bad checksum
36  * is found or the extent cannot be read, good data will be written back if
37  * any can be found.
38  *
39  * Future enhancements:
40  *  - In case an unrepairable extent is encountered, track which files are
41  *    affected and report them
42  *  - track and record media errors, throw out bad devices
43  *  - add a mode to also read unallocated space
44  */
45
46 struct scrub_block;
47 struct scrub_ctx;
48
49 /*
50  * the following three values only influence performance.
51  * The last one configures the number of parallel and outstanding I/O
52  * operations. The first two values configure an upper limit for the number
53  * of (dynamically allocated) pages that are added to a bio.
54  */
55 #define SCRUB_PAGES_PER_RD_BIO  32      /* 128k per bio */
56 #define SCRUB_PAGES_PER_WR_BIO  32      /* 128k per bio */
57 #define SCRUB_BIOS_PER_SCTX     64      /* 8MB per device in flight */
58
59 /*
60  * the following value times PAGE_SIZE needs to be large enough to match the
61  * largest node/leaf/sector size that shall be supported.
62  * Values larger than BTRFS_STRIPE_LEN are not supported.
63  */
64 #define SCRUB_MAX_PAGES_PER_BLOCK       16      /* 64k per node/leaf/sector */
65
66 struct scrub_recover {
67         atomic_t                refs;
68         struct btrfs_bio        *bbio;
69         u64                     map_length;
70 };
71
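/*
 * State of one PAGE_SIZE piece of the data being scrubbed: the backing
 * page, the device it was read from, its logical and physical location,
 * the extent flags and generation, the mirror number, an optional checksum
 * and the I/O error state. ->recover points to shared mapping state that
 * is used when other mirrors are read for repair.
 */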
72 struct scrub_page {
73         struct scrub_block      *sblock;
74         struct page             *page;
75         struct btrfs_device     *dev;
76         struct list_head        list;
77         u64                     flags;  /* extent flags */
78         u64                     generation;
79         u64                     logical;
80         u64                     physical;
81         u64                     physical_for_dev_replace;
82         atomic_t                refs;
83         struct {
84                 unsigned int    mirror_num:8;
85                 unsigned int    have_csum:1;
86                 unsigned int    io_error:1;
87         };
88         u8                      csum[BTRFS_CSUM_SIZE];
89
90         struct scrub_recover    *recover;
91 };
92
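/*
 * One read or write bio owned by the scrub context, together with the
 * scrub_pages attached to it, the target device, the starting logical and
 * physical address, and the work item that processes its completion.
 * ->next_free links unused entries into the per-context free list.
 */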
93 struct scrub_bio {
94         int                     index;
95         struct scrub_ctx        *sctx;
96         struct btrfs_device     *dev;
97         struct bio              *bio;
98         int                     err;
99         u64                     logical;
100         u64                     physical;
101 #if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
102         struct scrub_page       *pagev[SCRUB_PAGES_PER_WR_BIO];
103 #else
104         struct scrub_page       *pagev[SCRUB_PAGES_PER_RD_BIO];
105 #endif
106         int                     page_count;
107         int                     next_free;
108         struct btrfs_work       work;
109 };
110
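/*
 * All pages that make up one unit of verification (a tree node, a leaf or
 * a data sector, at most SCRUB_MAX_PAGES_PER_BLOCK pages), plus the error
 * state that was determined when the block was checked.
 */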
111 struct scrub_block {
112         struct scrub_page       *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
113         int                     page_count;
114         atomic_t                outstanding_pages;
115         atomic_t                refs; /* free mem on transition to zero */
116         struct scrub_ctx        *sctx;
117         struct scrub_parity     *sparity;
118         struct {
119                 unsigned int    header_error:1;
120                 unsigned int    checksum_error:1;
121                 unsigned int    no_io_error_seen:1;
122                 unsigned int    generation_error:1; /* also sets header_error */
123
124                 /* The following is for the checksummed data used in */
125                 /* the parity check */
126                 unsigned int    data_corrected:1;
127         };
128 };
129
130 /* Used for the chunks with parity stripe such as RAID5/6 */
131 struct scrub_parity {
132         struct scrub_ctx        *sctx;
133
134         struct btrfs_device     *scrub_dev;
135
136         u64                     logic_start;
137
138         u64                     logic_end;
139
140         int                     nsectors;
141
142         int                     stripe_len;
143
144         atomic_t                refs;
145
146         struct list_head        spages;
147
148         /* Work of parity check and repair */
149         struct btrfs_work       work;
150
151         /* Mark the parity blocks which have data */
152         unsigned long           *dbitmap;
153
154         /*
155          * Mark the parity blocks which have data, but where errors
156          * happened while reading or checking that data
157          */
158         unsigned long           *ebitmap;
159
160         unsigned long           bitmap[0];
161 };
162
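/*
 * Write context used by dev-replace: the write bio that is currently being
 * filled, the replacement target device and the lock protecting them.
 */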
163 struct scrub_wr_ctx {
164         struct scrub_bio *wr_curr_bio;
165         struct btrfs_device *tgtdev;
166         int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
167         atomic_t flush_all_writes;
168         struct mutex wr_lock;
169 };
170
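/*
 * Per-device scrub state: the pool of SCRUB_BIOS_PER_SCTX read bios with
 * its free list (first_free/next_free), counters used to wait for
 * outstanding bios and workers, the checksum list for the range currently
 * being scrubbed, the dev-replace write context and the statistics that
 * are reported back to user space.
 */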
171 struct scrub_ctx {
172         struct scrub_bio        *bios[SCRUB_BIOS_PER_SCTX];
173         struct btrfs_root       *dev_root;
174         int                     first_free;
175         int                     curr;
176         atomic_t                bios_in_flight;
177         atomic_t                workers_pending;
178         spinlock_t              list_lock;
179         wait_queue_head_t       list_wait;
180         u16                     csum_size;
181         struct list_head        csum_list;
182         atomic_t                cancel_req;
183         int                     readonly;
184         int                     pages_per_rd_bio;
185         u32                     sectorsize;
186         u32                     nodesize;
187
188         int                     is_dev_replace;
189         struct scrub_wr_ctx     wr_ctx;
190
191         /*
192          * statistics
193          */
194         struct btrfs_scrub_progress stat;
195         spinlock_t              stat_lock;
196 };
197
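/*
 * Deferred work item for repairing a data block that has no checksum,
 * processed by scrub_fixup_nodatasum().
 */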
198 struct scrub_fixup_nodatasum {
199         struct scrub_ctx        *sctx;
200         struct btrfs_device     *dev;
201         u64                     logical;
202         struct btrfs_root       *root;
203         struct btrfs_work       work;
204         int                     mirror_num;
205 };
206
207 struct scrub_nocow_inode {
208         u64                     inum;
209         u64                     offset;
210         u64                     root;
211         struct list_head        list;
212 };
213
214 struct scrub_copy_nocow_ctx {
215         struct scrub_ctx        *sctx;
216         u64                     logical;
217         u64                     len;
218         int                     mirror_num;
219         u64                     physical_for_dev_replace;
220         struct list_head        inodes;
221         struct btrfs_work       work;
222 };
223
224 struct scrub_warning {
225         struct btrfs_path       *path;
226         u64                     extent_item_size;
227         const char              *errstr;
228         sector_t                sector;
229         u64                     logical;
230         struct btrfs_device     *dev;
231 };
232
233 static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
234 static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
235 static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
236 static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
237 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
238 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
239                                      struct scrub_block *sblocks_for_recheck);
240 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
241                                 struct scrub_block *sblock, int is_metadata,
242                                 int have_csum, u8 *csum, u64 generation,
243                                 u16 csum_size, int retry_failed_mirror);
244 static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
245                                          struct scrub_block *sblock,
246                                          int is_metadata, int have_csum,
247                                          const u8 *csum, u64 generation,
248                                          u16 csum_size);
249 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
250                                              struct scrub_block *sblock_good);
251 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
252                                             struct scrub_block *sblock_good,
253                                             int page_num, int force_write);
254 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
255 static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
256                                            int page_num);
257 static int scrub_checksum_data(struct scrub_block *sblock);
258 static int scrub_checksum_tree_block(struct scrub_block *sblock);
259 static int scrub_checksum_super(struct scrub_block *sblock);
260 static void scrub_block_get(struct scrub_block *sblock);
261 static void scrub_block_put(struct scrub_block *sblock);
262 static void scrub_page_get(struct scrub_page *spage);
263 static void scrub_page_put(struct scrub_page *spage);
264 static void scrub_parity_get(struct scrub_parity *sparity);
265 static void scrub_parity_put(struct scrub_parity *sparity);
266 static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
267                                     struct scrub_page *spage);
268 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
269                        u64 physical, struct btrfs_device *dev, u64 flags,
270                        u64 gen, int mirror_num, u8 *csum, int force,
271                        u64 physical_for_dev_replace);
272 static void scrub_bio_end_io(struct bio *bio, int err);
273 static void scrub_bio_end_io_worker(struct btrfs_work *work);
274 static void scrub_block_complete(struct scrub_block *sblock);
275 static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
276                                u64 extent_logical, u64 extent_len,
277                                u64 *extent_physical,
278                                struct btrfs_device **extent_dev,
279                                int *extent_mirror_num);
280 static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
281                               struct scrub_wr_ctx *wr_ctx,
282                               struct btrfs_fs_info *fs_info,
283                               struct btrfs_device *dev,
284                               int is_dev_replace);
285 static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);
286 static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
287                                     struct scrub_page *spage);
288 static void scrub_wr_submit(struct scrub_ctx *sctx);
289 static void scrub_wr_bio_end_io(struct bio *bio, int err);
290 static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
291 static int write_page_nocow(struct scrub_ctx *sctx,
292                             u64 physical_for_dev_replace, struct page *page);
293 static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
294                                       struct scrub_copy_nocow_ctx *ctx);
295 static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
296                             int mirror_num, u64 physical_for_dev_replace);
297 static void copy_nocow_pages_worker(struct btrfs_work *work);
298 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
299 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
300
301
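/*
 * Track the number of scrub bios in flight so that teardown and the
 * pause/cancel paths can wait on ->list_wait until all of them have
 * completed.
 */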
302 static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
303 {
304         atomic_inc(&sctx->bios_in_flight);
305 }
306
307 static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
308 {
309         atomic_dec(&sctx->bios_in_flight);
310         wake_up(&sctx->list_wait);
311 }
312
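/*
 * If a scrub pause has been requested (e.g. around a transaction commit),
 * account this scrub as paused and block until the request is withdrawn;
 * scrub_lock is dropped while waiting.
 */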
313 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
314 {
315         while (atomic_read(&fs_info->scrub_pause_req)) {
316                 mutex_unlock(&fs_info->scrub_lock);
317                 wait_event(fs_info->scrub_pause_wait,
318                    atomic_read(&fs_info->scrub_pause_req) == 0);
319                 mutex_lock(&fs_info->scrub_lock);
320         }
321 }
322
323 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
324 {
325         atomic_inc(&fs_info->scrubs_paused);
326         wake_up(&fs_info->scrub_pause_wait);
327
328         mutex_lock(&fs_info->scrub_lock);
329         __scrub_blocked_if_needed(fs_info);
330         atomic_dec(&fs_info->scrubs_paused);
331         mutex_unlock(&fs_info->scrub_lock);
332
333         wake_up(&fs_info->scrub_pause_wait);
334 }
335
336 /*
337  * used for workers that require transaction commits (i.e., for the
338  * NOCOW case)
339  */
340 static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
341 {
342         struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
343
344         /*
345          * increment scrubs_running to prevent cancel requests from
346          * completing as long as a worker is running. we must also
347          * increment scrubs_paused to prevent deadlocking on pause
348          * requests used for transaction commits (as the worker uses a
349          * transaction context). it is safe to regard the worker
350          * as paused for all practical matters. effectively, we only
351          * prevent cancellation requests from completing.
352          */
353         mutex_lock(&fs_info->scrub_lock);
354         atomic_inc(&fs_info->scrubs_running);
355         atomic_inc(&fs_info->scrubs_paused);
356         mutex_unlock(&fs_info->scrub_lock);
357
358         /*
359          * The check of the @scrubs_running == @scrubs_paused condition
360          * inside wait_event() is not an atomic operation, which means
361          * we may inc/dec @scrubs_running/@scrubs_paused at any time.
362          * Wake up @scrub_pause_wait as often as we can so that a
363          * blocked transaction commit has to wait as little as possible.
364          */
365         wake_up(&fs_info->scrub_pause_wait);
366
367         atomic_inc(&sctx->workers_pending);
368 }
369
370 /* used for workers that require transaction commits */
371 static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
372 {
373         struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
374
375         /*
376          * see scrub_pending_trans_workers_inc() for why we're pretending
377          * to be paused in the scrub counters
378          */
379         mutex_lock(&fs_info->scrub_lock);
380         atomic_dec(&fs_info->scrubs_running);
381         atomic_dec(&fs_info->scrubs_paused);
382         mutex_unlock(&fs_info->scrub_lock);
383         atomic_dec(&sctx->workers_pending);
384         wake_up(&fs_info->scrub_pause_wait);
385         wake_up(&sctx->list_wait);
386 }
387
388 static void scrub_free_csums(struct scrub_ctx *sctx)
389 {
390         while (!list_empty(&sctx->csum_list)) {
391                 struct btrfs_ordered_sum *sum;
392                 sum = list_first_entry(&sctx->csum_list,
393                                        struct btrfs_ordered_sum, list);
394                 list_del(&sum->list);
395                 kfree(sum);
396         }
397 }
398
399 static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
400 {
401         int i;
402
403         if (!sctx)
404                 return;
405
406         scrub_free_wr_ctx(&sctx->wr_ctx);
407
408         /* this can happen when scrub is cancelled */
409         if (sctx->curr != -1) {
410                 struct scrub_bio *sbio = sctx->bios[sctx->curr];
411
412                 for (i = 0; i < sbio->page_count; i++) {
413                         WARN_ON(!sbio->pagev[i]->page);
414                         scrub_block_put(sbio->pagev[i]->sblock);
415                 }
416                 bio_put(sbio->bio);
417         }
418
419         for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
420                 struct scrub_bio *sbio = sctx->bios[i];
421
422                 if (!sbio)
423                         break;
424                 kfree(sbio);
425         }
426
427         scrub_free_csums(sctx);
428         kfree(sctx);
429 }
430
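/*
 * Allocate and initialize the scrub context for one device: the pool of
 * scrub_bios and its free list, the node/sector sizes and checksum size
 * taken from the filesystem, and the write context in case this scrub is
 * driving a device replace.
 */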
431 static noinline_for_stack
432 struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
433 {
434         struct scrub_ctx *sctx;
435         int             i;
436         struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
437         int pages_per_rd_bio;
438         int ret;
439
440         /*
441          * the setting of pages_per_rd_bio is correct for scrub but might
442          * be wrong for the dev_replace code where we might read from
443          * different devices in the initial huge bios. However, that
444          * code is able to correctly handle the case when adding a page
445          * to a bio fails.
446          */
447         if (dev->bdev)
448                 pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO,
449                                          bio_get_nr_vecs(dev->bdev));
450         else
451                 pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
452         sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
453         if (!sctx)
454                 goto nomem;
455         sctx->is_dev_replace = is_dev_replace;
456         sctx->pages_per_rd_bio = pages_per_rd_bio;
457         sctx->curr = -1;
458         sctx->dev_root = dev->dev_root;
459         for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
460                 struct scrub_bio *sbio;
461
462                 sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
463                 if (!sbio)
464                         goto nomem;
465                 sctx->bios[i] = sbio;
466
467                 sbio->index = i;
468                 sbio->sctx = sctx;
469                 sbio->page_count = 0;
470                 btrfs_init_work(&sbio->work, btrfs_scrub_helper,
471                                 scrub_bio_end_io_worker, NULL, NULL);
472
473                 if (i != SCRUB_BIOS_PER_SCTX - 1)
474                         sctx->bios[i]->next_free = i + 1;
475                 else
476                         sctx->bios[i]->next_free = -1;
477         }
478         sctx->first_free = 0;
479         sctx->nodesize = dev->dev_root->nodesize;
480         sctx->sectorsize = dev->dev_root->sectorsize;
481         atomic_set(&sctx->bios_in_flight, 0);
482         atomic_set(&sctx->workers_pending, 0);
483         atomic_set(&sctx->cancel_req, 0);
484         sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
485         INIT_LIST_HEAD(&sctx->csum_list);
486
487         spin_lock_init(&sctx->list_lock);
488         spin_lock_init(&sctx->stat_lock);
489         init_waitqueue_head(&sctx->list_wait);
490
491         ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info,
492                                  fs_info->dev_replace.tgtdev, is_dev_replace);
493         if (ret) {
494                 scrub_free_ctx(sctx);
495                 return ERR_PTR(ret);
496         }
497         return sctx;
498
499 nomem:
500         scrub_free_ctx(sctx);
501         return ERR_PTR(-ENOMEM);
502 }
503
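/*
 * Callback for iterate_extent_inodes(): resolve every path of the inode
 * that references the bad extent and print one warning per path, including
 * the logical address and sector of the error.
 */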
504 static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
505                                      void *warn_ctx)
506 {
507         u64 isize;
508         u32 nlink;
509         int ret;
510         int i;
511         struct extent_buffer *eb;
512         struct btrfs_inode_item *inode_item;
513         struct scrub_warning *swarn = warn_ctx;
514         struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
515         struct inode_fs_paths *ipath = NULL;
516         struct btrfs_root *local_root;
517         struct btrfs_key root_key;
518         struct btrfs_key key;
519
520         root_key.objectid = root;
521         root_key.type = BTRFS_ROOT_ITEM_KEY;
522         root_key.offset = (u64)-1;
523         local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
524         if (IS_ERR(local_root)) {
525                 ret = PTR_ERR(local_root);
526                 goto err;
527         }
528
529         /*
530          * this makes the path point to (inum INODE_ITEM ioff)
531          */
532         key.objectid = inum;
533         key.type = BTRFS_INODE_ITEM_KEY;
534         key.offset = 0;
535
536         ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
537         if (ret) {
538                 btrfs_release_path(swarn->path);
539                 goto err;
540         }
541
542         eb = swarn->path->nodes[0];
543         inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
544                                         struct btrfs_inode_item);
545         isize = btrfs_inode_size(eb, inode_item);
546         nlink = btrfs_inode_nlink(eb, inode_item);
547         btrfs_release_path(swarn->path);
548
549         ipath = init_ipath(4096, local_root, swarn->path);
550         if (IS_ERR(ipath)) {
551                 ret = PTR_ERR(ipath);
552                 ipath = NULL;
553                 goto err;
554         }
555         ret = paths_from_inode(inum, ipath);
556
557         if (ret < 0)
558                 goto err;
559
560         /*
561          * we deliberately ignore the fact that ipath might have been too
562          * small to hold all of the paths here
563          */
564         for (i = 0; i < ipath->fspath->elem_cnt; ++i)
565                 printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
566                         "%s, sector %llu, root %llu, inode %llu, offset %llu, "
567                         "length %llu, links %u (path: %s)\n", swarn->errstr,
568                         swarn->logical, rcu_str_deref(swarn->dev->name),
569                         (unsigned long long)swarn->sector, root, inum, offset,
570                         min(isize - offset, (u64)PAGE_SIZE), nlink,
571                         (char *)(unsigned long)ipath->fspath->val[i]);
572
573         free_ipath(ipath);
574         return 0;
575
576 err:
577         printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
578                 "%s, sector %llu, root %llu, inode %llu, offset %llu: path "
579                 "resolving failed with ret=%d\n", swarn->errstr,
580                 swarn->logical, rcu_str_deref(swarn->dev->name),
581                 (unsigned long long)swarn->sector, root, inum, offset, ret);
582
583         free_ipath(ipath);
584         return 0;
585 }
586
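/*
 * Print a warning describing what references the corrupted or unreadable
 * block: for metadata, the tree and level of the backref; for data, every
 * file path that points to the affected extent.
 */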
587 static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
588 {
589         struct btrfs_device *dev;
590         struct btrfs_fs_info *fs_info;
591         struct btrfs_path *path;
592         struct btrfs_key found_key;
593         struct extent_buffer *eb;
594         struct btrfs_extent_item *ei;
595         struct scrub_warning swarn;
596         unsigned long ptr = 0;
597         u64 extent_item_pos;
598         u64 flags = 0;
599         u64 ref_root;
600         u32 item_size;
601         u8 ref_level;
602         int ret;
603
604         WARN_ON(sblock->page_count < 1);
605         dev = sblock->pagev[0]->dev;
606         fs_info = sblock->sctx->dev_root->fs_info;
607
608         path = btrfs_alloc_path();
609         if (!path)
610                 return;
611
612         swarn.sector = (sblock->pagev[0]->physical) >> 9;
613         swarn.logical = sblock->pagev[0]->logical;
614         swarn.errstr = errstr;
615         swarn.dev = NULL;
616
617         ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
618                                   &flags);
619         if (ret < 0)
620                 goto out;
621
622         extent_item_pos = swarn.logical - found_key.objectid;
623         swarn.extent_item_size = found_key.offset;
624
625         eb = path->nodes[0];
626         ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
627         item_size = btrfs_item_size_nr(eb, path->slots[0]);
628
629         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
630                 do {
631                         ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
632                                                       item_size, &ref_root,
633                                                       &ref_level);
634                         printk_in_rcu(KERN_WARNING
635                                 "BTRFS: %s at logical %llu on dev %s, "
636                                 "sector %llu: metadata %s (level %d) in tree "
637                                 "%llu\n", errstr, swarn.logical,
638                                 rcu_str_deref(dev->name),
639                                 (unsigned long long)swarn.sector,
640                                 ref_level ? "node" : "leaf",
641                                 ret < 0 ? -1 : ref_level,
642                                 ret < 0 ? -1 : ref_root);
643                 } while (ret != 1);
644                 btrfs_release_path(path);
645         } else {
646                 btrfs_release_path(path);
647                 swarn.path = path;
648                 swarn.dev = dev;
649                 iterate_extent_inodes(fs_info, found_key.objectid,
650                                         extent_item_pos, 1,
651                                         scrub_print_warning_inode, &swarn);
652         }
653
654 out:
655         btrfs_free_path(path);
656 }
657
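/*
 * Callback for iterate_inodes_from_logical(), used by the nodatasum fixup
 * worker: bring the affected page into the page cache and either rewrite
 * the bad sector directly via repair_io_failure() (if the page is already
 * uptodate) or read it from the failing mirror so that the regular
 * read-repair path can fix it. Returns 1 to stop the iteration once the
 * error has been corrected.
 */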
658 static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
659 {
660         struct page *page = NULL;
661         unsigned long index;
662         struct scrub_fixup_nodatasum *fixup = fixup_ctx;
663         int ret;
664         int corrected = 0;
665         struct btrfs_key key;
666         struct inode *inode = NULL;
667         struct btrfs_fs_info *fs_info;
668         u64 end = offset + PAGE_SIZE - 1;
669         struct btrfs_root *local_root;
670         int srcu_index;
671
672         key.objectid = root;
673         key.type = BTRFS_ROOT_ITEM_KEY;
674         key.offset = (u64)-1;
675
676         fs_info = fixup->root->fs_info;
677         srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
678
679         local_root = btrfs_read_fs_root_no_name(fs_info, &key);
680         if (IS_ERR(local_root)) {
681                 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
682                 return PTR_ERR(local_root);
683         }
684
685         key.type = BTRFS_INODE_ITEM_KEY;
686         key.objectid = inum;
687         key.offset = 0;
688         inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
689         srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
690         if (IS_ERR(inode))
691                 return PTR_ERR(inode);
692
693         index = offset >> PAGE_CACHE_SHIFT;
694
695         page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
696         if (!page) {
697                 ret = -ENOMEM;
698                 goto out;
699         }
700
701         if (PageUptodate(page)) {
702                 if (PageDirty(page)) {
703                         /*
704                          * we need to write the data to the defective sector. the
705                          * data that was in that sector is not in memory,
706                          * because the page was modified. we must not write the
707                          * modified page to that sector.
708                          *
709                          * TODO: what could be done here: wait for the delalloc
710                          *       runner to write out that page (might involve
711                          *       COW) and see whether the sector is still
712                          *       referenced afterwards.
713                          *
714                          * For the time being, we'll treat this error as
715                          * uncorrectable, although there is a chance that a
716                          * later scrub will find the bad sector again at a
717                          * time when there's no dirty page in memory.
718                          */
719                         ret = -EIO;
720                         goto out;
721                 }
722                 ret = repair_io_failure(inode, offset, PAGE_SIZE,
723                                         fixup->logical, page,
724                                         offset - page_offset(page),
725                                         fixup->mirror_num);
726                 unlock_page(page);
727                 corrected = !ret;
728         } else {
729                 /*
730                  * we need to get good data first. the general readpage path
731                  * will call repair_io_failure for us; we just have to make
732                  * sure we read the bad mirror.
733                  */
734                 ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
735                                         EXTENT_DAMAGED, GFP_NOFS);
736                 if (ret) {
737                         /* set_extent_bits should give proper error */
738                         WARN_ON(ret > 0);
739                         if (ret > 0)
740                                 ret = -EFAULT;
741                         goto out;
742                 }
743
744                 ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
745                                                 btrfs_get_extent,
746                                                 fixup->mirror_num);
747                 wait_on_page_locked(page);
748
749                 corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
750                                                 end, EXTENT_DAMAGED, 0, NULL);
751                 if (!corrected)
752                         clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
753                                                 EXTENT_DAMAGED, GFP_NOFS);
754         }
755
756 out:
757         if (page)
758                 put_page(page);
759
760         iput(inode);
761
762         if (ret < 0)
763                 return ret;
764
765         if (ret == 0 && corrected) {
766                 /*
767                  * we only need to call readpage for one of the inodes belonging
768                  * to this extent. so make iterate_extent_inodes stop
769                  */
770                 return 1;
771         }
772
773         return -EIO;
774 }
775
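/*
 * Worker for data blocks without checksum: try to repair the error by
 * triggering a regular buffered read of the affected range through each
 * inode that references it, and account the result as a corrected or
 * uncorrectable error.
 */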
776 static void scrub_fixup_nodatasum(struct btrfs_work *work)
777 {
778         int ret;
779         struct scrub_fixup_nodatasum *fixup;
780         struct scrub_ctx *sctx;
781         struct btrfs_trans_handle *trans = NULL;
782         struct btrfs_path *path;
783         int uncorrectable = 0;
784
785         fixup = container_of(work, struct scrub_fixup_nodatasum, work);
786         sctx = fixup->sctx;
787
788         path = btrfs_alloc_path();
789         if (!path) {
790                 spin_lock(&sctx->stat_lock);
791                 ++sctx->stat.malloc_errors;
792                 spin_unlock(&sctx->stat_lock);
793                 uncorrectable = 1;
794                 goto out;
795         }
796
797         trans = btrfs_join_transaction(fixup->root);
798         if (IS_ERR(trans)) {
799                 uncorrectable = 1;
800                 goto out;
801         }
802
803         /*
804          * the idea is to trigger a regular read through the standard path. we
805          * read a page from the (failed) logical address by specifying the
806          * corresponding copynum of the failed sector. thus, that readpage is
807          * expected to fail.
808          * that is the point where on-the-fly error correction will kick in
809          * (once it's finished) and rewrite the failed sector if a good copy
810          * can be found.
811          */
812         ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
813                                                 path, scrub_fixup_readpage,
814                                                 fixup);
815         if (ret < 0) {
816                 uncorrectable = 1;
817                 goto out;
818         }
819         WARN_ON(ret != 1);
820
821         spin_lock(&sctx->stat_lock);
822         ++sctx->stat.corrected_errors;
823         spin_unlock(&sctx->stat_lock);
824
825 out:
826         if (trans && !IS_ERR(trans))
827                 btrfs_end_transaction(trans, fixup->root);
828         if (uncorrectable) {
829                 spin_lock(&sctx->stat_lock);
830                 ++sctx->stat.uncorrectable_errors;
831                 spin_unlock(&sctx->stat_lock);
832                 btrfs_dev_replace_stats_inc(
833                         &sctx->dev_root->fs_info->dev_replace.
834                         num_uncorrectable_read_errors);
835                 printk_ratelimited_in_rcu(KERN_ERR "BTRFS: "
836                     "unable to fixup (nodatasum) error at logical %llu on dev %s\n",
837                         fixup->logical, rcu_str_deref(fixup->dev->name));
838         }
839
840         btrfs_free_path(path);
841         kfree(fixup);
842
843         scrub_pending_trans_workers_dec(sctx);
844 }
845
846 static inline void scrub_get_recover(struct scrub_recover *recover)
847 {
848         atomic_inc(&recover->refs);
849 }
850
851 static inline void scrub_put_recover(struct scrub_recover *recover)
852 {
853         if (atomic_dec_and_test(&recover->refs)) {
854                 btrfs_put_bbio(recover->bbio);
855                 kfree(recover);
856         }
857 }
858
859 /*
860  * scrub_handle_errored_block gets called when either verification of the
861  * pages failed or the bio failed to read, e.g. with EIO. In the latter
862  * case, this function handles all pages in the bio, even though only one
863  * may be bad.
864  * The goal of this function is to repair the errored block by using the
865  * contents of one of the mirrors.
866  */
867 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
868 {
869         struct scrub_ctx *sctx = sblock_to_check->sctx;
870         struct btrfs_device *dev;
871         struct btrfs_fs_info *fs_info;
872         u64 length;
873         u64 logical;
874         u64 generation;
875         unsigned int failed_mirror_index;
876         unsigned int is_metadata;
877         unsigned int have_csum;
878         u8 *csum;
879         struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
880         struct scrub_block *sblock_bad;
881         int ret;
882         int mirror_index;
883         int page_num;
884         int success;
885         static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
886                                       DEFAULT_RATELIMIT_BURST);
887
888         BUG_ON(sblock_to_check->page_count < 1);
889         fs_info = sctx->dev_root->fs_info;
890         if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
891                 /*
892                  * if we find an error in a super block, we just report it;
893                  * super blocks get rewritten with the next transaction
894                  * commit anyway
895                  */
896                 spin_lock(&sctx->stat_lock);
897                 ++sctx->stat.super_errors;
898                 spin_unlock(&sctx->stat_lock);
899                 return 0;
900         }
901         length = sblock_to_check->page_count * PAGE_SIZE;
902         logical = sblock_to_check->pagev[0]->logical;
903         generation = sblock_to_check->pagev[0]->generation;
904         BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
905         failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
906         is_metadata = !(sblock_to_check->pagev[0]->flags &
907                         BTRFS_EXTENT_FLAG_DATA);
908         have_csum = sblock_to_check->pagev[0]->have_csum;
909         csum = sblock_to_check->pagev[0]->csum;
910         dev = sblock_to_check->pagev[0]->dev;
911
912         if (sctx->is_dev_replace && !is_metadata && !have_csum) {
913                 sblocks_for_recheck = NULL;
914                 goto nodatasum_case;
915         }
916
917         /*
918          * read all mirrors one after the other. This includes re-reading
919          * the extent or metadata block that failed (which is the reason
920          * this fixup code was called), this time page by page, in order
921          * to know which pages caused I/O errors and which ones are good
922          * (for all mirrors).
923          * The goal is to handle the situation when more than one
924          * mirror contains I/O errors, but the errors do not
925          * overlap, i.e. the data can be repaired by selecting the
926          * pages from those mirrors without I/O error on the
927          * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
928          * would be that mirror #1 has an I/O error on the first page,
929          * the second page is good, and mirror #2 has an I/O error on
930          * the second page, but the first page is good.
931          * Then the first page of the first mirror can be repaired by
932          * taking the first page of the second mirror, and the
933          * second page of the second mirror can be repaired by
934          * copying the contents of the 2nd page of the 1st mirror.
935          * One more note: if the pages of one mirror contain I/O
936          * errors, the checksum cannot be verified. In order to get
937          * the best data for repairing, the first attempt is to find
938          * a mirror without I/O errors and with a validated checksum.
939          * Only if this is not possible, the pages are picked from
940          * mirrors with I/O errors without considering the checksum.
941          * If the latter is the case, at the end, the checksum of the
942          * repaired area is verified in order to correctly maintain
943          * the statistics.
944          */
945
946         sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS *
947                                      sizeof(*sblocks_for_recheck),
948                                      GFP_NOFS);
949         if (!sblocks_for_recheck) {
950                 spin_lock(&sctx->stat_lock);
951                 sctx->stat.malloc_errors++;
952                 sctx->stat.read_errors++;
953                 sctx->stat.uncorrectable_errors++;
954                 spin_unlock(&sctx->stat_lock);
955                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
956                 goto out;
957         }
958
959         /* setup the context, map the logical blocks and alloc the pages */
960         ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
961         if (ret) {
962                 spin_lock(&sctx->stat_lock);
963                 sctx->stat.read_errors++;
964                 sctx->stat.uncorrectable_errors++;
965                 spin_unlock(&sctx->stat_lock);
966                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
967                 goto out;
968         }
969         BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
970         sblock_bad = sblocks_for_recheck + failed_mirror_index;
971
972         /* build and submit the bios for the failed mirror, check checksums */
973         scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
974                             csum, generation, sctx->csum_size, 1);
975
976         if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
977             sblock_bad->no_io_error_seen) {
978                 /*
979                  * the error disappeared after reading page by page, or
980                  * the area was part of a huge bio and other parts of the
981                  * bio caused I/O errors, or the block layer merged several
982                  * read requests into one and the error is caused by a
983                  * different bio (usually one of the two latter cases is
984                  * the cause)
985                  */
986                 spin_lock(&sctx->stat_lock);
987                 sctx->stat.unverified_errors++;
988                 sblock_to_check->data_corrected = 1;
989                 spin_unlock(&sctx->stat_lock);
990
991                 if (sctx->is_dev_replace)
992                         scrub_write_block_to_dev_replace(sblock_bad);
993                 goto out;
994         }
995
996         if (!sblock_bad->no_io_error_seen) {
997                 spin_lock(&sctx->stat_lock);
998                 sctx->stat.read_errors++;
999                 spin_unlock(&sctx->stat_lock);
1000                 if (__ratelimit(&_rs))
1001                         scrub_print_warning("i/o error", sblock_to_check);
1002                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
1003         } else if (sblock_bad->checksum_error) {
1004                 spin_lock(&sctx->stat_lock);
1005                 sctx->stat.csum_errors++;
1006                 spin_unlock(&sctx->stat_lock);
1007                 if (__ratelimit(&_rs))
1008                         scrub_print_warning("checksum error", sblock_to_check);
1009                 btrfs_dev_stat_inc_and_print(dev,
1010                                              BTRFS_DEV_STAT_CORRUPTION_ERRS);
1011         } else if (sblock_bad->header_error) {
1012                 spin_lock(&sctx->stat_lock);
1013                 sctx->stat.verify_errors++;
1014                 spin_unlock(&sctx->stat_lock);
1015                 if (__ratelimit(&_rs))
1016                         scrub_print_warning("checksum/header error",
1017                                             sblock_to_check);
1018                 if (sblock_bad->generation_error)
1019                         btrfs_dev_stat_inc_and_print(dev,
1020                                 BTRFS_DEV_STAT_GENERATION_ERRS);
1021                 else
1022                         btrfs_dev_stat_inc_and_print(dev,
1023                                 BTRFS_DEV_STAT_CORRUPTION_ERRS);
1024         }
1025
1026         if (sctx->readonly) {
1027                 ASSERT(!sctx->is_dev_replace);
1028                 goto out;
1029         }
1030
1031         if (!is_metadata && !have_csum) {
1032                 struct scrub_fixup_nodatasum *fixup_nodatasum;
1033
1034                 WARN_ON(sctx->is_dev_replace);
1035
1036 nodatasum_case:
1037
1038                 /*
1039                  * !is_metadata and !have_csum mean that the data
1040                  * might not be COW'ed, i.e. it might be modified
1041                  * concurrently. The general strategy of working on the
1042                  * commit root does not help in the case when COW is not
1043                  * used.
1044                  */
1045                 fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
1046                 if (!fixup_nodatasum)
1047                         goto did_not_correct_error;
1048                 fixup_nodatasum->sctx = sctx;
1049                 fixup_nodatasum->dev = dev;
1050                 fixup_nodatasum->logical = logical;
1051                 fixup_nodatasum->root = fs_info->extent_root;
1052                 fixup_nodatasum->mirror_num = failed_mirror_index + 1;
1053                 scrub_pending_trans_workers_inc(sctx);
1054                 btrfs_init_work(&fixup_nodatasum->work, btrfs_scrub_helper,
1055                                 scrub_fixup_nodatasum, NULL, NULL);
1056                 btrfs_queue_work(fs_info->scrub_workers,
1057                                  &fixup_nodatasum->work);
1058                 goto out;
1059         }
1060
1061         /*
1062          * now build and submit the bios for the other mirrors, check
1063          * checksums.
1064          * First try to pick the mirror which is completely without I/O
1065          * errors and also does not have a checksum error.
1066          * If one is found, and if a checksum is present, the full block
1067          * that is known to contain an error is rewritten. Afterwards
1068          * the block is known to be corrected.
1069          * If a mirror is found which is completely correct, and no
1070          * checksum is present, only those pages are rewritten that had
1071          * an I/O error in the block to be repaired, since it cannot be
1072          * determined, which copy of the other pages is better (and it
1073          * could happen otherwise that a correct page would be
1074          * overwritten by a bad one).
1075          */
1076         for (mirror_index = 0;
1077              mirror_index < BTRFS_MAX_MIRRORS &&
1078              sblocks_for_recheck[mirror_index].page_count > 0;
1079              mirror_index++) {
1080                 struct scrub_block *sblock_other;
1081
1082                 if (mirror_index == failed_mirror_index)
1083                         continue;
1084                 sblock_other = sblocks_for_recheck + mirror_index;
1085
1086                 /* build and submit the bios, check checksums */
1087                 scrub_recheck_block(fs_info, sblock_other, is_metadata,
1088                                     have_csum, csum, generation,
1089                                     sctx->csum_size, 0);
1090
1091                 if (!sblock_other->header_error &&
1092                     !sblock_other->checksum_error &&
1093                     sblock_other->no_io_error_seen) {
1094                         if (sctx->is_dev_replace) {
1095                                 scrub_write_block_to_dev_replace(sblock_other);
1096                                 goto corrected_error;
1097                         } else {
1098                                 ret = scrub_repair_block_from_good_copy(
1099                                                 sblock_bad, sblock_other);
1100                                 if (!ret)
1101                                         goto corrected_error;
1102                         }
1103                 }
1104         }
1105
1106         if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
1107                 goto did_not_correct_error;
1108
1109         /*
1110          * In case of I/O errors in the area that is supposed to be
1111          * repaired, continue by picking good copies of those pages.
1112          * Select the good pages from mirrors to rewrite bad pages from
1113          * the area to fix. Afterwards verify the checksum of the block
1114          * that is supposed to be repaired. This verification step is
1115          * only done for the purpose of statistic counting and for the
1116          * final scrub report, whether errors remain.
1117          * A perfect algorithm could make use of the checksum and try
1118          * all possible combinations of pages from the different mirrors
1119          * until the checksum verification succeeds. For example, when
1120          * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
1121          * of mirror #2 is readable but the final checksum test fails,
1122          * then the 2nd page of mirror #3 could be tried, whether now
1123          * then the 2nd page of mirror #3 could be tried, to see whether
1124          * the final checksum then succeeds. But this would be a rare
1125          * exception and is therefore not implemented. At the very least,
1126          * overwriting the good copy is avoided.
1127          * without I/O error based on sector sizes (512 bytes on legacy
1128          * disks) instead of on PAGE_SIZE. Then maybe 512 byte of one
1129          * mirror could be repaired by taking 512 byte of a different
1130          * mirror, even if other 512 byte sectors in the same PAGE_SIZE
1131          * area are unreadable.
1132          */
1133         success = 1;
1134         for (page_num = 0; page_num < sblock_bad->page_count;
1135              page_num++) {
1136                 struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1137                 struct scrub_block *sblock_other = NULL;
1138
1139                 /* skip no-io-error page in scrub */
1140                 if (!page_bad->io_error && !sctx->is_dev_replace)
1141                         continue;
1142
1143                 /* try to find no-io-error page in mirrors */
1144                 if (page_bad->io_error) {
1145                         for (mirror_index = 0;
1146                              mirror_index < BTRFS_MAX_MIRRORS &&
1147                              sblocks_for_recheck[mirror_index].page_count > 0;
1148                              mirror_index++) {
1149                                 if (!sblocks_for_recheck[mirror_index].
1150                                     pagev[page_num]->io_error) {
1151                                         sblock_other = sblocks_for_recheck +
1152                                                        mirror_index;
1153                                         break;
1154                                 }
1155                         }
1156                         if (!sblock_other)
1157                                 success = 0;
1158                 }
1159
1160                 if (sctx->is_dev_replace) {
1161                         /*
1162                          * did not find a mirror to fetch the page
1163                          * from. scrub_write_page_to_dev_replace()
1164                          * handles this case (page->io_error), by
1165                          * filling the block with zeros before
1166                          * submitting the write request
1167                          */
1168                         if (!sblock_other)
1169                                 sblock_other = sblock_bad;
1170
1171                         if (scrub_write_page_to_dev_replace(sblock_other,
1172                                                             page_num) != 0) {
1173                                 btrfs_dev_replace_stats_inc(
1174                                         &sctx->dev_root->
1175                                         fs_info->dev_replace.
1176                                         num_write_errors);
1177                                 success = 0;
1178                         }
1179                 } else if (sblock_other) {
1180                         ret = scrub_repair_page_from_good_copy(sblock_bad,
1181                                                                sblock_other,
1182                                                                page_num, 0);
1183                         if (0 == ret)
1184                                 page_bad->io_error = 0;
1185                         else
1186                                 success = 0;
1187                 }
1188         }
1189
1190         if (success && !sctx->is_dev_replace) {
1191                 if (is_metadata || have_csum) {
1192                         /*
1193                          * need to verify the checksum now that all
1194                          * sectors on disk are repaired (the write
1195                          * request for data to be repaired is on its way).
1196                          * Just be lazy and use scrub_recheck_block()
1197                          * which re-reads the data before the checksum
1198                          * is verified, but most likely the data comes out
1199                          * of the page cache.
1200                          */
1201                         scrub_recheck_block(fs_info, sblock_bad,
1202                                             is_metadata, have_csum, csum,
1203                                             generation, sctx->csum_size, 1);
1204                         if (!sblock_bad->header_error &&
1205                             !sblock_bad->checksum_error &&
1206                             sblock_bad->no_io_error_seen)
1207                                 goto corrected_error;
1208                         else
1209                                 goto did_not_correct_error;
1210                 } else {
1211 corrected_error:
1212                         spin_lock(&sctx->stat_lock);
1213                         sctx->stat.corrected_errors++;
1214                         sblock_to_check->data_corrected = 1;
1215                         spin_unlock(&sctx->stat_lock);
1216                         printk_ratelimited_in_rcu(KERN_ERR
1217                                 "BTRFS: fixed up error at logical %llu on dev %s\n",
1218                                 logical, rcu_str_deref(dev->name));
1219                 }
1220         } else {
1221 did_not_correct_error:
1222                 spin_lock(&sctx->stat_lock);
1223                 sctx->stat.uncorrectable_errors++;
1224                 spin_unlock(&sctx->stat_lock);
1225                 printk_ratelimited_in_rcu(KERN_ERR
1226                         "BTRFS: unable to fixup (regular) error at logical %llu on dev %s\n",
1227                         logical, rcu_str_deref(dev->name));
1228         }
1229
1230 out:
1231         if (sblocks_for_recheck) {
1232                 for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
1233                      mirror_index++) {
1234                         struct scrub_block *sblock = sblocks_for_recheck +
1235                                                      mirror_index;
1236                         struct scrub_recover *recover;
1237                         int page_index;
1238
1239                         for (page_index = 0; page_index < sblock->page_count;
1240                              page_index++) {
1241                                 sblock->pagev[page_index]->sblock = NULL;
1242                                 recover = sblock->pagev[page_index]->recover;
1243                                 if (recover) {
1244                                         scrub_put_recover(recover);
1245                                         sblock->pagev[page_index]->recover =
1246                                                                         NULL;
1247                                 }
1248                                 scrub_page_put(sblock->pagev[page_index]);
1249                         }
1250                 }
1251                 kfree(sblocks_for_recheck);
1252         }
1253
1254         return 0;
1255 }
1256
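/*
 * Number of "mirrors" that the recheck code can construct for a block:
 * for RAID5 the data itself plus one reconstruction from parity, for
 * RAID6 the data plus two reconstructions, otherwise the number of
 * copies returned by the block mapping.
 */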
1257 static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio)
1258 {
1259         if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
1260                 return 2;
1261         else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
1262                 return 3;
1263         else
1264                 return (int)bbio->num_stripes;
1265 }
1266
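/*
 * Map a logical address to a stripe: for RAID5/6 search the raid_map for
 * the data stripe that covers the address (skipping P/Q stripes), for all
 * other profiles the requested mirror directly selects the stripe.
 */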
1267 static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
1268                                                  u64 *raid_map,
1269                                                  u64 mapped_length,
1270                                                  int nstripes, int mirror,
1271                                                  int *stripe_index,
1272                                                  u64 *stripe_offset)
1273 {
1274         int i;
1275
1276         if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
1277                 /* RAID5/6 */
1278                 for (i = 0; i < nstripes; i++) {
1279                         if (raid_map[i] == RAID6_Q_STRIPE ||
1280                             raid_map[i] == RAID5_P_STRIPE)
1281                                 continue;
1282
1283                         if (logical >= raid_map[i] &&
1284                             logical < raid_map[i] + mapped_length)
1285                                 break;
1286                 }
1287
1288                 *stripe_index = i;
1289                 *stripe_offset = logical - raid_map[i];
1290         } else {
1291                 /* The other RAID types */
1292                 *stripe_index = mirror;
1293                 *stripe_offset = 0;
1294         }
1295 }
1296
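/*
 * Build one scrub_block per mirror in sblocks_for_recheck for the range
 * covered by original_sblock.  The range is mapped page by page; each
 * mapping's btrfs_bio is kept alive in a refcounted scrub_recover that
 * is shared by that page's copies on all mirrors, so the RAID56 recovery
 * path can reuse it later.  Each scrub_page records the per-mirror
 * physical location of the data.
 */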
1297 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
1298                                      struct scrub_block *sblocks_for_recheck)
1299 {
1300         struct scrub_ctx *sctx = original_sblock->sctx;
1301         struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
1302         u64 length = original_sblock->page_count * PAGE_SIZE;
1303         u64 logical = original_sblock->pagev[0]->logical;
1304         struct scrub_recover *recover;
1305         struct btrfs_bio *bbio;
1306         u64 sublen;
1307         u64 mapped_length;
1308         u64 stripe_offset;
1309         int stripe_index;
1310         int page_index = 0;
1311         int mirror_index;
1312         int nmirrors;
1313         int ret;
1314
1315         /*
1316          * note: the two members refs and outstanding_pages
1317          * are not used (and not set) in the blocks that are used for
1318          * the recheck procedure
1319          */
1320
1321         while (length > 0) {
1322                 sublen = min_t(u64, length, PAGE_SIZE);
1323                 mapped_length = sublen;
1324                 bbio = NULL;
1325
1326                 /*
1327                  * with a length of PAGE_SIZE, each returned stripe
1328                  * represents one mirror
1329                  */
1330                 ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical,
1331                                        &mapped_length, &bbio, 0, 1);
1332                 if (ret || !bbio || mapped_length < sublen) {
1333                         btrfs_put_bbio(bbio);
1334                         return -EIO;
1335                 }
1336
1337                 recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
1338                 if (!recover) {
1339                         btrfs_put_bbio(bbio);
1340                         return -ENOMEM;
1341                 }
1342
1343                 atomic_set(&recover->refs, 1);
1344                 recover->bbio = bbio;
1345                 recover->map_length = mapped_length;
1346
1347                 BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
1348
1349                 nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS);
1350
1351                 for (mirror_index = 0; mirror_index < nmirrors;
1352                      mirror_index++) {
1353                         struct scrub_block *sblock;
1354                         struct scrub_page *page;
1355
1356                         sblock = sblocks_for_recheck + mirror_index;
1357                         sblock->sctx = sctx;
1358                         page = kzalloc(sizeof(*page), GFP_NOFS);
1359                         if (!page) {
1360 leave_nomem:
1361                                 spin_lock(&sctx->stat_lock);
1362                                 sctx->stat.malloc_errors++;
1363                                 spin_unlock(&sctx->stat_lock);
1364                                 scrub_put_recover(recover);
1365                                 return -ENOMEM;
1366                         }
1367                         scrub_page_get(page);
1368                         sblock->pagev[page_index] = page;
1369                         page->logical = logical;
1370
1371                         scrub_stripe_index_and_offset(logical,
1372                                                       bbio->map_type,
1373                                                       bbio->raid_map,
1374                                                       mapped_length,
1375                                                       bbio->num_stripes -
1376                                                       bbio->num_tgtdevs,
1377                                                       mirror_index,
1378                                                       &stripe_index,
1379                                                       &stripe_offset);
1380                         page->physical = bbio->stripes[stripe_index].physical +
1381                                          stripe_offset;
1382                         page->dev = bbio->stripes[stripe_index].dev;
1383
1384                         BUG_ON(page_index >= original_sblock->page_count);
1385                         page->physical_for_dev_replace =
1386                                 original_sblock->pagev[page_index]->
1387                                 physical_for_dev_replace;
1388                         /* for missing devices, dev->bdev is NULL */
1389                         page->mirror_num = mirror_index + 1;
1390                         sblock->page_count++;
1391                         page->page = alloc_page(GFP_NOFS);
1392                         if (!page->page)
1393                                 goto leave_nomem;
1394
1395                         scrub_get_recover(recover);
1396                         page->recover = recover;
1397                 }
1398                 scrub_put_recover(recover);
1399                 length -= sublen;
1400                 logical += sublen;
1401                 page_index++;
1402         }
1403
1404         return 0;
1405 }
1406
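/*
 * Helpers for reading a single page synchronously through the RAID56
 * recovery code: scrub_submit_raid56_bio_wait() hands the bio to
 * raid56_parity_recover() (which can rebuild the requested mirror from
 * the remaining stripes and parity) and blocks on a completion until the
 * end_io callback reports the result.
 */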
1407 struct scrub_bio_ret {
1408         struct completion event;
1409         int error;
1410 };
1411
1412 static void scrub_bio_wait_endio(struct bio *bio, int error)
1413 {
1414         struct scrub_bio_ret *ret = bio->bi_private;
1415
1416         ret->error = error;
1417         complete(&ret->event);
1418 }
1419
1420 static inline int scrub_is_page_on_raid56(struct scrub_page *page)
1421 {
1422         return page->recover &&
1423                (page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
1424 }
1425
1426 static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
1427                                         struct bio *bio,
1428                                         struct scrub_page *page)
1429 {
1430         struct scrub_bio_ret done;
1431         int ret;
1432
1433         init_completion(&done.event);
1434         done.error = 0;
1435         bio->bi_iter.bi_sector = page->logical >> 9;
1436         bio->bi_private = &done;
1437         bio->bi_end_io = scrub_bio_wait_endio;
1438
1439         ret = raid56_parity_recover(fs_info->fs_root, bio, page->recover->bbio,
1440                                     page->recover->map_length,
1441                                     page->mirror_num, 0);
1442         if (ret)
1443                 return ret;
1444
1445         wait_for_completion(&done.event);
1446         if (done.error)
1447                 return -EIO;
1448
1449         return 0;
1450 }
1451
1452 /*
1453  * This function checks the on-disk data for checksum errors, header
1454  * errors and read I/O errors. If any I/O error happens, the exact pages
1455  * that failed are marked as bad. The goal is to enable scrub to take
1456  * the non-failing pages from all the mirrors so that the pages that
1457  * failed in the mirror just handled can be repaired.
1458  */
1459 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1460                                 struct scrub_block *sblock, int is_metadata,
1461                                 int have_csum, u8 *csum, u64 generation,
1462                                 u16 csum_size, int retry_failed_mirror)
1463 {
1464         int page_num;
1465
1466         sblock->no_io_error_seen = 1;
1467         sblock->header_error = 0;
1468         sblock->checksum_error = 0;
1469
1470         for (page_num = 0; page_num < sblock->page_count; page_num++) {
1471                 struct bio *bio;
1472                 struct scrub_page *page = sblock->pagev[page_num];
1473
1474                 if (page->dev->bdev == NULL) {
1475                         page->io_error = 1;
1476                         sblock->no_io_error_seen = 0;
1477                         continue;
1478                 }
1479
1480                 WARN_ON(!page->page);
1481                 bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
1482                 if (!bio) {
1483                         page->io_error = 1;
1484                         sblock->no_io_error_seen = 0;
1485                         continue;
1486                 }
1487                 bio->bi_bdev = page->dev->bdev;
1488
1489                 bio_add_page(bio, page->page, PAGE_SIZE, 0);
1490                 if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) {
1491                         if (scrub_submit_raid56_bio_wait(fs_info, bio, page))
1492                                 sblock->no_io_error_seen = 0;
1493                 } else {
1494                         bio->bi_iter.bi_sector = page->physical >> 9;
1495
1496                         if (btrfsic_submit_bio_wait(READ, bio))
1497                                 sblock->no_io_error_seen = 0;
1498                 }
1499
1500                 bio_put(bio);
1501         }
1502
1503         if (sblock->no_io_error_seen)
1504                 scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
1505                                              have_csum, csum, generation,
1506                                              csum_size);
1507
1508         return;
1509 }
1510
1511 static inline int scrub_check_fsid(u8 fsid[],
1512                                    struct scrub_page *spage)
1513 {
1514         struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices;
1515         int ret;
1516
1517         ret = memcmp(fsid, fs_devices->fsid, BTRFS_UUID_SIZE);
1518         return !ret;
1519 }
1520
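/*
 * Recompute the checksum over all pages of the block.  For metadata the
 * header in the first page is validated first (bytenr, fsid, chunk tree
 * uuid, generation) and the csum embedded in the header is excluded from
 * the crc; for data blocks without a known csum nothing is checked.
 */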
1521 static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1522                                          struct scrub_block *sblock,
1523                                          int is_metadata, int have_csum,
1524                                          const u8 *csum, u64 generation,
1525                                          u16 csum_size)
1526 {
1527         int page_num;
1528         u8 calculated_csum[BTRFS_CSUM_SIZE];
1529         u32 crc = ~(u32)0;
1530         void *mapped_buffer;
1531
1532         WARN_ON(!sblock->pagev[0]->page);
1533         if (is_metadata) {
1534                 struct btrfs_header *h;
1535
1536                 mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
1537                 h = (struct btrfs_header *)mapped_buffer;
1538
1539                 if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h) ||
1540                     !scrub_check_fsid(h->fsid, sblock->pagev[0]) ||
1541                     memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1542                            BTRFS_UUID_SIZE)) {
1543                         sblock->header_error = 1;
1544                 } else if (generation != btrfs_stack_header_generation(h)) {
1545                         sblock->header_error = 1;
1546                         sblock->generation_error = 1;
1547                 }
1548                 csum = h->csum;
1549         } else {
1550                 if (!have_csum)
1551                         return;
1552
1553                 mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
1554         }
1555
1556         for (page_num = 0;;) {
1557                 if (page_num == 0 && is_metadata)
1558                         crc = btrfs_csum_data(
1559                                 ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE,
1560                                 crc, PAGE_SIZE - BTRFS_CSUM_SIZE);
1561                 else
1562                         crc = btrfs_csum_data(mapped_buffer, crc, PAGE_SIZE);
1563
1564                 kunmap_atomic(mapped_buffer);
1565                 page_num++;
1566                 if (page_num >= sblock->page_count)
1567                         break;
1568                 WARN_ON(!sblock->pagev[page_num]->page);
1569
1570                 mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page);
1571         }
1572
1573         btrfs_csum_final(crc, calculated_csum);
1574         if (memcmp(calculated_csum, csum, csum_size))
1575                 sblock->checksum_error = 1;
1576 }
1577
1578 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1579                                              struct scrub_block *sblock_good)
1580 {
1581         int page_num;
1582         int ret = 0;
1583
1584         for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
1585                 int ret_sub;
1586
1587                 ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
1588                                                            sblock_good,
1589                                                            page_num, 1);
1590                 if (ret_sub)
1591                         ret = ret_sub;
1592         }
1593
1594         return ret;
1595 }
1596
1597 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1598                                             struct scrub_block *sblock_good,
1599                                             int page_num, int force_write)
1600 {
1601         struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1602         struct scrub_page *page_good = sblock_good->pagev[page_num];
1603
1604         BUG_ON(page_bad->page == NULL);
1605         BUG_ON(page_good->page == NULL);
1606         if (force_write || sblock_bad->header_error ||
1607             sblock_bad->checksum_error || page_bad->io_error) {
1608                 struct bio *bio;
1609                 int ret;
1610
1611                 if (!page_bad->dev->bdev) {
1612                         printk_ratelimited(KERN_WARNING "BTRFS: "
1613                                 "scrub_repair_page_from_good_copy(bdev == NULL) "
1614                                 "is unexpected!\n");
1615                         return -EIO;
1616                 }
1617
1618                 bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
1619                 if (!bio)
1620                         return -EIO;
1621                 bio->bi_bdev = page_bad->dev->bdev;
1622                 bio->bi_iter.bi_sector = page_bad->physical >> 9;
1623
1624                 ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
1625                 if (PAGE_SIZE != ret) {
1626                         bio_put(bio);
1627                         return -EIO;
1628                 }
1629
1630                 if (btrfsic_submit_bio_wait(WRITE, bio)) {
1631                         btrfs_dev_stat_inc_and_print(page_bad->dev,
1632                                 BTRFS_DEV_STAT_WRITE_ERRS);
1633                         btrfs_dev_replace_stats_inc(
1634                                 &sblock_bad->sctx->dev_root->fs_info->
1635                                 dev_replace.num_write_errors);
1636                         bio_put(bio);
1637                         return -EIO;
1638                 }
1639                 bio_put(bio);
1640         }
1641
1642         return 0;
1643 }
1644
1645 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1646 {
1647         int page_num;
1648
1649         /*
1650          * This block is only used to check the parity on the source device,
1651          * so the data need not be written to the destination device.
1652          */
1653         if (sblock->sparity)
1654                 return;
1655
1656         for (page_num = 0; page_num < sblock->page_count; page_num++) {
1657                 int ret;
1658
1659                 ret = scrub_write_page_to_dev_replace(sblock, page_num);
1660                 if (ret)
1661                         btrfs_dev_replace_stats_inc(
1662                                 &sblock->sctx->dev_root->fs_info->dev_replace.
1663                                 num_write_errors);
1664         }
1665 }
1666
1667 static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
1668                                            int page_num)
1669 {
1670         struct scrub_page *spage = sblock->pagev[page_num];
1671
1672         BUG_ON(spage->page == NULL);
1673         if (spage->io_error) {
1674                 void *mapped_buffer = kmap_atomic(spage->page);
1675
1676                 memset(mapped_buffer, 0, PAGE_CACHE_SIZE);
1677                 flush_dcache_page(spage->page);
1678                 kunmap_atomic(mapped_buffer);
1679         }
1680         return scrub_add_page_to_wr_bio(sblock->sctx, spage);
1681 }
1682
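/*
 * Queue a page for writing to the dev-replace target.  Pages are
 * appended to wr_curr_bio as long as they are physically and logically
 * contiguous with what is already queued; otherwise the current bio is
 * submitted first and a new one is started.  A bio that reaches
 * pages_per_wr_bio pages is submitted immediately.
 */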
1683 static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
1684                                     struct scrub_page *spage)
1685 {
1686         struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1687         struct scrub_bio *sbio;
1688         int ret;
1689
1690         mutex_lock(&wr_ctx->wr_lock);
1691 again:
1692         if (!wr_ctx->wr_curr_bio) {
1693                 wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),
1694                                               GFP_NOFS);
1695                 if (!wr_ctx->wr_curr_bio) {
1696                         mutex_unlock(&wr_ctx->wr_lock);
1697                         return -ENOMEM;
1698                 }
1699                 wr_ctx->wr_curr_bio->sctx = sctx;
1700                 wr_ctx->wr_curr_bio->page_count = 0;
1701         }
1702         sbio = wr_ctx->wr_curr_bio;
1703         if (sbio->page_count == 0) {
1704                 struct bio *bio;
1705
1706                 sbio->physical = spage->physical_for_dev_replace;
1707                 sbio->logical = spage->logical;
1708                 sbio->dev = wr_ctx->tgtdev;
1709                 bio = sbio->bio;
1710                 if (!bio) {
1711                         bio = btrfs_io_bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);
1712                         if (!bio) {
1713                                 mutex_unlock(&wr_ctx->wr_lock);
1714                                 return -ENOMEM;
1715                         }
1716                         sbio->bio = bio;
1717                 }
1718
1719                 bio->bi_private = sbio;
1720                 bio->bi_end_io = scrub_wr_bio_end_io;
1721                 bio->bi_bdev = sbio->dev->bdev;
1722                 bio->bi_iter.bi_sector = sbio->physical >> 9;
1723                 sbio->err = 0;
1724         } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1725                    spage->physical_for_dev_replace ||
1726                    sbio->logical + sbio->page_count * PAGE_SIZE !=
1727                    spage->logical) {
1728                 scrub_wr_submit(sctx);
1729                 goto again;
1730         }
1731
1732         ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1733         if (ret != PAGE_SIZE) {
1734                 if (sbio->page_count < 1) {
1735                         bio_put(sbio->bio);
1736                         sbio->bio = NULL;
1737                         mutex_unlock(&wr_ctx->wr_lock);
1738                         return -EIO;
1739                 }
1740                 scrub_wr_submit(sctx);
1741                 goto again;
1742         }
1743
1744         sbio->pagev[sbio->page_count] = spage;
1745         scrub_page_get(spage);
1746         sbio->page_count++;
1747         if (sbio->page_count == wr_ctx->pages_per_wr_bio)
1748                 scrub_wr_submit(sctx);
1749         mutex_unlock(&wr_ctx->wr_lock);
1750
1751         return 0;
1752 }
1753
1754 static void scrub_wr_submit(struct scrub_ctx *sctx)
1755 {
1756         struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1757         struct scrub_bio *sbio;
1758
1759         if (!wr_ctx->wr_curr_bio)
1760                 return;
1761
1762         sbio = wr_ctx->wr_curr_bio;
1763         wr_ctx->wr_curr_bio = NULL;
1764         WARN_ON(!sbio->bio->bi_bdev);
1765         scrub_pending_bio_inc(sctx);
1766         /* Process all writes in a single worker thread, so that the block
1767          * layer can order the requests before sending them to the driver;
1768          * this doubled the write performance on spinning disks when
1769          * measured with Linux 3.5. */
1770         btrfsic_submit_bio(WRITE, sbio->bio);
1771 }
1772
1773 static void scrub_wr_bio_end_io(struct bio *bio, int err)
1774 {
1775         struct scrub_bio *sbio = bio->bi_private;
1776         struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
1777
1778         sbio->err = err;
1779         sbio->bio = bio;
1780
1781         btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper,
1782                          scrub_wr_bio_end_io_worker, NULL, NULL);
1783         btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
1784 }
1785
1786 static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
1787 {
1788         struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1789         struct scrub_ctx *sctx = sbio->sctx;
1790         int i;
1791
1792         WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
1793         if (sbio->err) {
1794                 struct btrfs_dev_replace *dev_replace =
1795                         &sbio->sctx->dev_root->fs_info->dev_replace;
1796
1797                 for (i = 0; i < sbio->page_count; i++) {
1798                         struct scrub_page *spage = sbio->pagev[i];
1799
1800                         spage->io_error = 1;
1801                         btrfs_dev_replace_stats_inc(&dev_replace->
1802                                                     num_write_errors);
1803                 }
1804         }
1805
1806         for (i = 0; i < sbio->page_count; i++)
1807                 scrub_page_put(sbio->pagev[i]);
1808
1809         bio_put(sbio->bio);
1810         kfree(sbio);
1811         scrub_pending_bio_dec(sctx);
1812 }
1813
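/*
 * Dispatch the checksum verification by extent type and return nonzero
 * if the block is corrupted, in which case scrub_handle_errored_block()
 * is triggered.  Super block errors are only counted and reported, never
 * repaired here, so their result is deliberately ignored.
 */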
1814 static int scrub_checksum(struct scrub_block *sblock)
1815 {
1816         u64 flags;
1817         int ret;
1818
1819         WARN_ON(sblock->page_count < 1);
1820         flags = sblock->pagev[0]->flags;
1821         ret = 0;
1822         if (flags & BTRFS_EXTENT_FLAG_DATA)
1823                 ret = scrub_checksum_data(sblock);
1824         else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1825                 ret = scrub_checksum_tree_block(sblock);
1826         else if (flags & BTRFS_EXTENT_FLAG_SUPER)
1827                 (void)scrub_checksum_super(sblock);
1828         else
1829                 WARN_ON(1);
1830         if (ret)
1831                 scrub_handle_errored_block(sblock);
1832
1833         return ret;
1834 }
1835
1836 static int scrub_checksum_data(struct scrub_block *sblock)
1837 {
1838         struct scrub_ctx *sctx = sblock->sctx;
1839         u8 csum[BTRFS_CSUM_SIZE];
1840         u8 *on_disk_csum;
1841         struct page *page;
1842         void *buffer;
1843         u32 crc = ~(u32)0;
1844         int fail = 0;
1845         u64 len;
1846         int index;
1847
1848         BUG_ON(sblock->page_count < 1);
1849         if (!sblock->pagev[0]->have_csum)
1850                 return 0;
1851
1852         on_disk_csum = sblock->pagev[0]->csum;
1853         page = sblock->pagev[0]->page;
1854         buffer = kmap_atomic(page);
1855
1856         len = sctx->sectorsize;
1857         index = 0;
1858         for (;;) {
1859                 u64 l = min_t(u64, len, PAGE_SIZE);
1860
1861                 crc = btrfs_csum_data(buffer, crc, l);
1862                 kunmap_atomic(buffer);
1863                 len -= l;
1864                 if (len == 0)
1865                         break;
1866                 index++;
1867                 BUG_ON(index >= sblock->page_count);
1868                 BUG_ON(!sblock->pagev[index]->page);
1869                 page = sblock->pagev[index]->page;
1870                 buffer = kmap_atomic(page);
1871         }
1872
1873         btrfs_csum_final(crc, csum);
1874         if (memcmp(csum, on_disk_csum, sctx->csum_size))
1875                 fail = 1;
1876
1877         return fail;
1878 }
1879
1880 static int scrub_checksum_tree_block(struct scrub_block *sblock)
1881 {
1882         struct scrub_ctx *sctx = sblock->sctx;
1883         struct btrfs_header *h;
1884         struct btrfs_root *root = sctx->dev_root;
1885         struct btrfs_fs_info *fs_info = root->fs_info;
1886         u8 calculated_csum[BTRFS_CSUM_SIZE];
1887         u8 on_disk_csum[BTRFS_CSUM_SIZE];
1888         struct page *page;
1889         void *mapped_buffer;
1890         u64 mapped_size;
1891         void *p;
1892         u32 crc = ~(u32)0;
1893         int fail = 0;
1894         int crc_fail = 0;
1895         u64 len;
1896         int index;
1897
1898         BUG_ON(sblock->page_count < 1);
1899         page = sblock->pagev[0]->page;
1900         mapped_buffer = kmap_atomic(page);
1901         h = (struct btrfs_header *)mapped_buffer;
1902         memcpy(on_disk_csum, h->csum, sctx->csum_size);
1903
1904         /*
1905          * we don't use the getter functions here, as we
1906          * a) don't have an extent buffer and
1907          * b) the page is already kmapped
1908          */
1909
1910         if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h))
1911                 ++fail;
1912
1913         if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h))
1914                 ++fail;
1915
1916         if (!scrub_check_fsid(h->fsid, sblock->pagev[0]))
1917                 ++fail;
1918
1919         if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1920                    BTRFS_UUID_SIZE))
1921                 ++fail;
1922
1923         len = sctx->nodesize - BTRFS_CSUM_SIZE;
1924         mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1925         p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1926         index = 0;
1927         for (;;) {
1928                 u64 l = min_t(u64, len, mapped_size);
1929
1930                 crc = btrfs_csum_data(p, crc, l);
1931                 kunmap_atomic(mapped_buffer);
1932                 len -= l;
1933                 if (len == 0)
1934                         break;
1935                 index++;
1936                 BUG_ON(index >= sblock->page_count);
1937                 BUG_ON(!sblock->pagev[index]->page);
1938                 page = sblock->pagev[index]->page;
1939                 mapped_buffer = kmap_atomic(page);
1940                 mapped_size = PAGE_SIZE;
1941                 p = mapped_buffer;
1942         }
1943
1944         btrfs_csum_final(crc, calculated_csum);
1945         if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1946                 ++crc_fail;
1947
1948         return fail || crc_fail;
1949 }
1950
1951 static int scrub_checksum_super(struct scrub_block *sblock)
1952 {
1953         struct btrfs_super_block *s;
1954         struct scrub_ctx *sctx = sblock->sctx;
1955         u8 calculated_csum[BTRFS_CSUM_SIZE];
1956         u8 on_disk_csum[BTRFS_CSUM_SIZE];
1957         struct page *page;
1958         void *mapped_buffer;
1959         u64 mapped_size;
1960         void *p;
1961         u32 crc = ~(u32)0;
1962         int fail_gen = 0;
1963         int fail_cor = 0;
1964         u64 len;
1965         int index;
1966
1967         BUG_ON(sblock->page_count < 1);
1968         page = sblock->pagev[0]->page;
1969         mapped_buffer = kmap_atomic(page);
1970         s = (struct btrfs_super_block *)mapped_buffer;
1971         memcpy(on_disk_csum, s->csum, sctx->csum_size);
1972
1973         if (sblock->pagev[0]->logical != btrfs_super_bytenr(s))
1974                 ++fail_cor;
1975
1976         if (sblock->pagev[0]->generation != btrfs_super_generation(s))
1977                 ++fail_gen;
1978
1979         if (!scrub_check_fsid(s->fsid, sblock->pagev[0]))
1980                 ++fail_cor;
1981
1982         len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
1983         mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1984         p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1985         index = 0;
1986         for (;;) {
1987                 u64 l = min_t(u64, len, mapped_size);
1988
1989                 crc = btrfs_csum_data(p, crc, l);
1990                 kunmap_atomic(mapped_buffer);
1991                 len -= l;
1992                 if (len == 0)
1993                         break;
1994                 index++;
1995                 BUG_ON(index >= sblock->page_count);
1996                 BUG_ON(!sblock->pagev[index]->page);
1997                 page = sblock->pagev[index]->page;
1998                 mapped_buffer = kmap_atomic(page);
1999                 mapped_size = PAGE_SIZE;
2000                 p = mapped_buffer;
2001         }
2002
2003         btrfs_csum_final(crc, calculated_csum);
2004         if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
2005                 ++fail_cor;
2006
2007         if (fail_cor + fail_gen) {
2008                 /*
2009                  * If we find an error in a super block, we just report it;
2010                  * the super blocks get rewritten with the next transaction
2011                  * commit anyway.
2012                  */
2013                 spin_lock(&sctx->stat_lock);
2014                 ++sctx->stat.super_errors;
2015                 spin_unlock(&sctx->stat_lock);
2016                 if (fail_cor)
2017                         btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
2018                                 BTRFS_DEV_STAT_CORRUPTION_ERRS);
2019                 else
2020                         btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
2021                                 BTRFS_DEV_STAT_GENERATION_ERRS);
2022         }
2023
2024         return fail_cor + fail_gen;
2025 }
2026
2027 static void scrub_block_get(struct scrub_block *sblock)
2028 {
2029         atomic_inc(&sblock->refs);
2030 }
2031
2032 static void scrub_block_put(struct scrub_block *sblock)
2033 {
2034         if (atomic_dec_and_test(&sblock->refs)) {
2035                 int i;
2036
2037                 if (sblock->sparity)
2038                         scrub_parity_put(sblock->sparity);
2039
2040                 for (i = 0; i < sblock->page_count; i++)
2041                         scrub_page_put(sblock->pagev[i]);
2042                 kfree(sblock);
2043         }
2044 }
2045
2046 static void scrub_page_get(struct scrub_page *spage)
2047 {
2048         atomic_inc(&spage->refs);
2049 }
2050
2051 static void scrub_page_put(struct scrub_page *spage)
2052 {
2053         if (atomic_dec_and_test(&spage->refs)) {
2054                 if (spage->page)
2055                         __free_page(spage->page);
2056                 kfree(spage);
2057         }
2058 }
2059
2060 static void scrub_submit(struct scrub_ctx *sctx)
2061 {
2062         struct scrub_bio *sbio;
2063
2064         if (sctx->curr == -1)
2065                 return;
2066
2067         sbio = sctx->bios[sctx->curr];
2068         sctx->curr = -1;
2069         scrub_pending_bio_inc(sctx);
2070
2071         if (!sbio->bio->bi_bdev) {
2072                 /*
2073                  * This case should not happen. If btrfs_map_block() is
2074                  * wrong, it could happen for dev-replace operations on
2075                  * missing devices when no mirrors are available, but in
2076                  * that case the mount should already have failed.
2077                  * The case is handled correctly here (just _very_ slowly).
2078                  */
2079                 printk_ratelimited(KERN_WARNING
2080                         "BTRFS: scrub_submit(bio bdev == NULL) is unexpected!\n");
2081                 bio_endio(sbio->bio, -EIO);
2082         } else {
2083                 btrfsic_submit_bio(READ, sbio->bio);
2084         }
2085 }
2086
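/*
 * Queue a page for reading.  A free scrub_bio is taken from the per-ctx
 * free list (waiting if none is available), contiguous pages for the
 * same device are batched into it, and the bio is submitted as soon as a
 * page is not contiguous, belongs to a different device, or the bio
 * reaches pages_per_rd_bio pages.
 */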
2087 static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
2088                                     struct scrub_page *spage)
2089 {
2090         struct scrub_block *sblock = spage->sblock;
2091         struct scrub_bio *sbio;
2092         int ret;
2093
2094 again:
2095         /*
2096          * grab a fresh bio or wait for one to become available
2097          */
2098         while (sctx->curr == -1) {
2099                 spin_lock(&sctx->list_lock);
2100                 sctx->curr = sctx->first_free;
2101                 if (sctx->curr != -1) {
2102                         sctx->first_free = sctx->bios[sctx->curr]->next_free;
2103                         sctx->bios[sctx->curr]->next_free = -1;
2104                         sctx->bios[sctx->curr]->page_count = 0;
2105                         spin_unlock(&sctx->list_lock);
2106                 } else {
2107                         spin_unlock(&sctx->list_lock);
2108                         wait_event(sctx->list_wait, sctx->first_free != -1);
2109                 }
2110         }
2111         sbio = sctx->bios[sctx->curr];
2112         if (sbio->page_count == 0) {
2113                 struct bio *bio;
2114
2115                 sbio->physical = spage->physical;
2116                 sbio->logical = spage->logical;
2117                 sbio->dev = spage->dev;
2118                 bio = sbio->bio;
2119                 if (!bio) {
2120                         bio = btrfs_io_bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);
2121                         if (!bio)
2122                                 return -ENOMEM;
2123                         sbio->bio = bio;
2124                 }
2125
2126                 bio->bi_private = sbio;
2127                 bio->bi_end_io = scrub_bio_end_io;
2128                 bio->bi_bdev = sbio->dev->bdev;
2129                 bio->bi_iter.bi_sector = sbio->physical >> 9;
2130                 sbio->err = 0;
2131         } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
2132                    spage->physical ||
2133                    sbio->logical + sbio->page_count * PAGE_SIZE !=
2134                    spage->logical ||
2135                    sbio->dev != spage->dev) {
2136                 scrub_submit(sctx);
2137                 goto again;
2138         }
2139
2140         sbio->pagev[sbio->page_count] = spage;
2141         ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
2142         if (ret != PAGE_SIZE) {
2143                 if (sbio->page_count < 1) {
2144                         bio_put(sbio->bio);
2145                         sbio->bio = NULL;
2146                         return -EIO;
2147                 }
2148                 scrub_submit(sctx);
2149                 goto again;
2150         }
2151
2152         scrub_block_get(sblock); /* one for the page added to the bio */
2153         atomic_inc(&sblock->outstanding_pages);
2154         sbio->page_count++;
2155         if (sbio->page_count == sctx->pages_per_rd_bio)
2156                 scrub_submit(sctx);
2157
2158         return 0;
2159 }
2160
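/*
 * Split the [logical, logical + len) range into PAGE_SIZE sized
 * scrub_pages collected in a single scrub_block and queue them on read
 * bios.  Assuming 4K pages, a 16K tree block becomes four scrub_pages
 * while a 4K data sector is a single one; the csum, if known, is copied
 * into every page of the block.
 */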
2161 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
2162                        u64 physical, struct btrfs_device *dev, u64 flags,
2163                        u64 gen, int mirror_num, u8 *csum, int force,
2164                        u64 physical_for_dev_replace)
2165 {
2166         struct scrub_block *sblock;
2167         int index;
2168
2169         sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
2170         if (!sblock) {
2171                 spin_lock(&sctx->stat_lock);
2172                 sctx->stat.malloc_errors++;
2173                 spin_unlock(&sctx->stat_lock);
2174                 return -ENOMEM;
2175         }
2176
2177         /* one ref inside this function, plus one for each page added to
2178          * a bio later on */
2179         atomic_set(&sblock->refs, 1);
2180         sblock->sctx = sctx;
2181         sblock->no_io_error_seen = 1;
2182
2183         for (index = 0; len > 0; index++) {
2184                 struct scrub_page *spage;
2185                 u64 l = min_t(u64, len, PAGE_SIZE);
2186
2187                 spage = kzalloc(sizeof(*spage), GFP_NOFS);
2188                 if (!spage) {
2189 leave_nomem:
2190                         spin_lock(&sctx->stat_lock);
2191                         sctx->stat.malloc_errors++;
2192                         spin_unlock(&sctx->stat_lock);
2193                         scrub_block_put(sblock);
2194                         return -ENOMEM;
2195                 }
2196                 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2197                 scrub_page_get(spage);
2198                 sblock->pagev[index] = spage;
2199                 spage->sblock = sblock;
2200                 spage->dev = dev;
2201                 spage->flags = flags;
2202                 spage->generation = gen;
2203                 spage->logical = logical;
2204                 spage->physical = physical;
2205                 spage->physical_for_dev_replace = physical_for_dev_replace;
2206                 spage->mirror_num = mirror_num;
2207                 if (csum) {
2208                         spage->have_csum = 1;
2209                         memcpy(spage->csum, csum, sctx->csum_size);
2210                 } else {
2211                         spage->have_csum = 0;
2212                 }
2213                 sblock->page_count++;
2214                 spage->page = alloc_page(GFP_NOFS);
2215                 if (!spage->page)
2216                         goto leave_nomem;
2217                 len -= l;
2218                 logical += l;
2219                 physical += l;
2220                 physical_for_dev_replace += l;
2221         }
2222
2223         WARN_ON(sblock->page_count == 0);
2224         for (index = 0; index < sblock->page_count; index++) {
2225                 struct scrub_page *spage = sblock->pagev[index];
2226                 int ret;
2227
2228                 ret = scrub_add_page_to_rd_bio(sctx, spage);
2229                 if (ret) {
2230                         scrub_block_put(sblock);
2231                         return ret;
2232                 }
2233         }
2234
2235         if (force)
2236                 scrub_submit(sctx);
2237
2238         /* last one frees, either here or in bio completion for last page */
2239         scrub_block_put(sblock);
2240         return 0;
2241 }
2242
2243 static void scrub_bio_end_io(struct bio *bio, int err)
2244 {
2245         struct scrub_bio *sbio = bio->bi_private;
2246         struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
2247
2248         sbio->err = err;
2249         sbio->bio = bio;
2250
2251         btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
2252 }
2253
2254 static void scrub_bio_end_io_worker(struct btrfs_work *work)
2255 {
2256         struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2257         struct scrub_ctx *sctx = sbio->sctx;
2258         int i;
2259
2260         BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
2261         if (sbio->err) {
2262                 for (i = 0; i < sbio->page_count; i++) {
2263                         struct scrub_page *spage = sbio->pagev[i];
2264
2265                         spage->io_error = 1;
2266                         spage->sblock->no_io_error_seen = 0;
2267                 }
2268         }
2269
2270         /* now complete the scrub_block items that have all pages completed */
2271         for (i = 0; i < sbio->page_count; i++) {
2272                 struct scrub_page *spage = sbio->pagev[i];
2273                 struct scrub_block *sblock = spage->sblock;
2274
2275                 if (atomic_dec_and_test(&sblock->outstanding_pages))
2276                         scrub_block_complete(sblock);
2277                 scrub_block_put(sblock);
2278         }
2279
2280         bio_put(sbio->bio);
2281         sbio->bio = NULL;
2282         spin_lock(&sctx->list_lock);
2283         sbio->next_free = sctx->first_free;
2284         sctx->first_free = sbio->index;
2285         spin_unlock(&sctx->list_lock);
2286
2287         if (sctx->is_dev_replace &&
2288             atomic_read(&sctx->wr_ctx.flush_all_writes)) {
2289                 mutex_lock(&sctx->wr_ctx.wr_lock);
2290                 scrub_wr_submit(sctx);
2291                 mutex_unlock(&sctx->wr_ctx.wr_lock);
2292         }
2293
2294         scrub_pending_bio_dec(sctx);
2295 }
2296
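/*
 * Mark the sectors of [start, start + len) in a per-stripe bitmap.  The
 * offset is taken modulo stripe_len and converted to a sector index; if
 * the range runs past the end of the stripe it wraps around to bit 0.
 * For example, with a hypothetical 64K stripe_len and 4K sectors
 * (nsectors = 16), an offset of 56K and a len of 16K sets bits 14, 15
 * and then 0, 1.
 */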
2297 static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2298                                        unsigned long *bitmap,
2299                                        u64 start, u64 len)
2300 {
2301         int offset;
2302         int nsectors;
2303         int sectorsize = sparity->sctx->dev_root->sectorsize;
2304
2305         if (len >= sparity->stripe_len) {
2306                 bitmap_set(bitmap, 0, sparity->nsectors);
2307                 return;
2308         }
2309
2310         start -= sparity->logic_start;
2311         offset = (int)do_div(start, sparity->stripe_len);
2312         offset /= sectorsize;
2313         nsectors = (int)len / sectorsize;
2314
2315         if (offset + nsectors <= sparity->nsectors) {
2316                 bitmap_set(bitmap, offset, nsectors);
2317                 return;
2318         }
2319
2320         bitmap_set(bitmap, offset, sparity->nsectors - offset);
2321         bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
2322 }
2323
2324 static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
2325                                                    u64 start, u64 len)
2326 {
2327         __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
2328 }
2329
2330 static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
2331                                                   u64 start, u64 len)
2332 {
2333         __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
2334 }
2335
2336 static void scrub_block_complete(struct scrub_block *sblock)
2337 {
2338         int corrupted = 0;
2339
2340         if (!sblock->no_io_error_seen) {
2341                 corrupted = 1;
2342                 scrub_handle_errored_block(sblock);
2343         } else {
2344                 /*
2345                  * In the dev-replace case: if the block has a checksum
2346                  * error, it is written via the repair mechanism; otherwise
2347                  * it is written to the target device right here.
2348                  */
2349                 corrupted = scrub_checksum(sblock);
2350                 if (!corrupted && sblock->sctx->is_dev_replace)
2351                         scrub_write_block_to_dev_replace(sblock);
2352         }
2353
2354         if (sblock->sparity && corrupted && !sblock->data_corrected) {
2355                 u64 start = sblock->pagev[0]->logical;
2356                 u64 end = sblock->pagev[sblock->page_count - 1]->logical +
2357                           PAGE_SIZE;
2358
2359                 scrub_parity_mark_sectors_error(sblock->sparity,
2360                                                 start, end - start);
2361         }
2362 }
2363
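/*
 * Look up the data checksum for 'logical' in the csum list that was
 * filled in ahead of time.  Sums that end before 'logical' are dropped
 * (and counted as csum_discards); on a hit the csum of the matching
 * sector is copied to 'csum' and 1 is returned, with the list entry
 * freed once its last sector has been consumed.
 */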
2364 static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
2365                            u8 *csum)
2366 {
2367         struct btrfs_ordered_sum *sum = NULL;
2368         unsigned long index;
2369         unsigned long num_sectors;
2370
2371         while (!list_empty(&sctx->csum_list)) {
2372                 sum = list_first_entry(&sctx->csum_list,
2373                                        struct btrfs_ordered_sum, list);
2374                 if (sum->bytenr > logical)
2375                         return 0;
2376                 if (sum->bytenr + sum->len > logical)
2377                         break;
2378
2379                 ++sctx->stat.csum_discards;
2380                 list_del(&sum->list);
2381                 kfree(sum);
2382                 sum = NULL;
2383         }
2384         if (!sum)
2385                 return 0;
2386
2387         index = ((u32)(logical - sum->bytenr)) / sctx->sectorsize;
2388         num_sectors = sum->len / sctx->sectorsize;
2389         memcpy(csum, sum->sums + index, sctx->csum_size);
2390         if (index == num_sectors - 1) {
2391                 list_del(&sum->list);
2392                 kfree(sum);
2393         }
2394         return 1;
2395 }
2396
2397 /* scrub extent tries to collect up to 64 kB for each bio */
2398 static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
2399                         u64 physical, struct btrfs_device *dev, u64 flags,
2400                         u64 gen, int mirror_num, u64 physical_for_dev_replace)
2401 {
2402         int ret;
2403         u8 csum[BTRFS_CSUM_SIZE];
2404         u32 blocksize;
2405
2406         if (flags & BTRFS_EXTENT_FLAG_DATA) {
2407                 blocksize = sctx->sectorsize;
2408                 spin_lock(&sctx->stat_lock);
2409                 sctx->stat.data_extents_scrubbed++;
2410                 sctx->stat.data_bytes_scrubbed += len;
2411                 spin_unlock(&sctx->stat_lock);
2412         } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2413                 blocksize = sctx->nodesize;
2414                 spin_lock(&sctx->stat_lock);
2415                 sctx->stat.tree_extents_scrubbed++;
2416                 sctx->stat.tree_bytes_scrubbed += len;
2417                 spin_unlock(&sctx->stat_lock);
2418         } else {
2419                 blocksize = sctx->sectorsize;
2420                 WARN_ON(1);
2421         }
2422
2423         while (len) {
2424                 u64 l = min_t(u64, len, blocksize);
2425                 int have_csum = 0;
2426
2427                 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2428                         /* push csums to sbio */
2429                         have_csum = scrub_find_csum(sctx, logical, l, csum);
2430                         if (have_csum == 0)
2431                                 ++sctx->stat.no_csum;
2432                         if (sctx->is_dev_replace && !have_csum) {
2433                                 ret = copy_nocow_pages(sctx, logical, l,
2434                                                        mirror_num,
2435                                                       physical_for_dev_replace);
2436                                 goto behind_scrub_pages;
2437                         }
2438                 }
2439                 ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
2440                                   mirror_num, have_csum ? csum : NULL, 0,
2441                                   physical_for_dev_replace);
2442 behind_scrub_pages:
2443                 if (ret)
2444                         return ret;
2445                 len -= l;
2446                 logical += l;
2447                 physical += l;
2448                 physical_for_dev_replace += l;
2449         }
2450         return 0;
2451 }
2452
2453 static int scrub_pages_for_parity(struct scrub_parity *sparity,
2454                                   u64 logical, u64 len,
2455                                   u64 physical, struct btrfs_device *dev,
2456                                   u64 flags, u64 gen, int mirror_num, u8 *csum)
2457 {
2458         struct scrub_ctx *sctx = sparity->sctx;
2459         struct scrub_block *sblock;
2460         int index;
2461
2462         sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
2463         if (!sblock) {
2464                 spin_lock(&sctx->stat_lock);
2465                 sctx->stat.malloc_errors++;
2466                 spin_unlock(&sctx->stat_lock);
2467                 return -ENOMEM;
2468         }
2469
2470         /* one ref inside this function, plus one for each page added to
2471          * a bio later on */
2472         atomic_set(&sblock->refs, 1);
2473         sblock->sctx = sctx;
2474         sblock->no_io_error_seen = 1;
2475         sblock->sparity = sparity;
2476         scrub_parity_get(sparity);
2477
2478         for (index = 0; len > 0; index++) {
2479                 struct scrub_page *spage;
2480                 u64 l = min_t(u64, len, PAGE_SIZE);
2481
2482                 spage = kzalloc(sizeof(*spage), GFP_NOFS);
2483                 if (!spage) {
2484 leave_nomem:
2485                         spin_lock(&sctx->stat_lock);
2486                         sctx->stat.malloc_errors++;
2487                         spin_unlock(&sctx->stat_lock);
2488                         scrub_block_put(sblock);
2489                         return -ENOMEM;
2490                 }
2491                 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2492                 /* For scrub block */
2493                 scrub_page_get(spage);
2494                 sblock->pagev[index] = spage;
2495                 /* For scrub parity */
2496                 scrub_page_get(spage);
2497                 list_add_tail(&spage->list, &sparity->spages);
2498                 spage->sblock = sblock;
2499                 spage->dev = dev;
2500                 spage->flags = flags;
2501                 spage->generation = gen;
2502                 spage->logical = logical;
2503                 spage->physical = physical;
2504                 spage->mirror_num = mirror_num;
2505                 if (csum) {
2506                         spage->have_csum = 1;
2507                         memcpy(spage->csum, csum, sctx->csum_size);
2508                 } else {
2509                         spage->have_csum = 0;
2510                 }
2511                 sblock->page_count++;
2512                 spage->page = alloc_page(GFP_NOFS);
2513                 if (!spage->page)
2514                         goto leave_nomem;
2515                 len -= l;
2516                 logical += l;
2517                 physical += l;
2518         }
2519
2520         WARN_ON(sblock->page_count == 0);
2521         for (index = 0; index < sblock->page_count; index++) {
2522                 struct scrub_page *spage = sblock->pagev[index];
2523                 int ret;
2524
2525                 ret = scrub_add_page_to_rd_bio(sctx, spage);
2526                 if (ret) {
2527                         scrub_block_put(sblock);
2528                         return ret;
2529                 }
2530         }
2531
2532         /* last one frees, either here or in bio completion for last page */
2533         scrub_block_put(sblock);
2534         return 0;
2535 }
2536
2537 static int scrub_extent_for_parity(struct scrub_parity *sparity,
2538                                    u64 logical, u64 len,
2539                                    u64 physical, struct btrfs_device *dev,
2540                                    u64 flags, u64 gen, int mirror_num)
2541 {
2542         struct scrub_ctx *sctx = sparity->sctx;
2543         int ret;
2544         u8 csum[BTRFS_CSUM_SIZE];
2545         u32 blocksize;
2546
2547         if (flags & BTRFS_EXTENT_FLAG_DATA) {
2548                 blocksize = sctx->sectorsize;
2549         } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2550                 blocksize = sctx->nodesize;
2551         } else {
2552                 blocksize = sctx->sectorsize;
2553                 WARN_ON(1);
2554         }
2555
2556         while (len) {
2557                 u64 l = min_t(u64, len, blocksize);
2558                 int have_csum = 0;
2559
2560                 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2561                         /* push csums to sbio */
2562                         have_csum = scrub_find_csum(sctx, logical, l, csum);
2563                         if (have_csum == 0)
2564                                 goto skip;
2565                 }
2566                 ret = scrub_pages_for_parity(sparity, logical, l, physical, dev,
2567                                              flags, gen, mirror_num,
2568                                              have_csum ? csum : NULL);
2569                 if (ret)
2570                         return ret;
2571 skip:
2572                 len -= l;
2573                 logical += l;
2574                 physical += l;
2575         }
2576         return 0;
2577 }
2578
2579 /*
2580  * Given a physical address, this calculates its
2581  * logical offset. If this is a parity stripe, it returns
2582  * the leftmost data stripe's logical offset.
2583  *
2584  * Returns 0 if it is a data stripe, 1 if it is a parity stripe.
2585  */
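/*
 * As a hypothetical example: a RAID5 chunk on three devices
 * (nr_data_stripes = 2) with a 64K stripe_len rotates the parity, so in
 * the second full stripe (physical offset 64K on every device) device 1
 * holds the data at logical 128K, device 2 the data at logical 192K and
 * device 0 the parity.  For num == 1 or num == 2 the function returns 0
 * with *offset = 128K resp. 192K; for num == 0 it returns 1 with
 * *offset = 128K.
 */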
2586 static int get_raid56_logic_offset(u64 physical, int num,
2587                                    struct map_lookup *map, u64 *offset,
2588                                    u64 *stripe_start)
2589 {
2590         int i;
2591         int j = 0;
2592         u64 stripe_nr;
2593         u64 last_offset;
2594         int stripe_index;
2595         int rot;
2596
2597         last_offset = (physical - map->stripes[num].physical) *
2598                       nr_data_stripes(map);
2599         if (stripe_start)
2600                 *stripe_start = last_offset;
2601
2602         *offset = last_offset;
2603         for (i = 0; i < nr_data_stripes(map); i++) {
2604                 *offset = last_offset + i * map->stripe_len;
2605
2606                 stripe_nr = *offset;
2607                 do_div(stripe_nr, map->stripe_len);
2608                 do_div(stripe_nr, nr_data_stripes(map));
2609
2610                 /* Work out the disk rotation on this stripe-set */
2611                 rot = do_div(stripe_nr, map->num_stripes);
2612                 /* calculate which stripe this data is located on */
2613                 rot += i;
2614                 stripe_index = rot % map->num_stripes;
2615                 if (stripe_index == num)
2616                         return 0;
2617                 if (stripe_index < num)
2618                         j++;
2619         }
2620         *offset = last_offset + j * map->stripe_len;
2621         return 1;
2622 }
2623
2624 static void scrub_free_parity(struct scrub_parity *sparity)
2625 {
2626         struct scrub_ctx *sctx = sparity->sctx;
2627         struct scrub_page *curr, *next;
2628         int nbits;
2629
2630         nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
2631         if (nbits) {
2632                 spin_lock(&sctx->stat_lock);
2633                 sctx->stat.read_errors += nbits;
2634                 sctx->stat.uncorrectable_errors += nbits;
2635                 spin_unlock(&sctx->stat_lock);
2636         }
2637
2638         list_for_each_entry_safe(curr, next, &sparity->spages, list) {
2639                 list_del_init(&curr->list);
2640                 scrub_page_put(curr);
2641         }
2642
2643         kfree(sparity);
2644 }
2645
2646 static void scrub_parity_bio_endio(struct bio *bio, int error)
2647 {
2648         struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
2649         struct scrub_ctx *sctx = sparity->sctx;
2650
2651         if (error)
2652                 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2653                           sparity->nsectors);
2654
2655         scrub_free_parity(sparity);
2656         scrub_pending_bio_dec(sctx);
2657         bio_put(bio);
2658 }
2659
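/*
 * Verify (and if needed rewrite) the parity for the sectors recorded in
 * dbitmap.  Sectors that already failed during the data scrub (ebitmap)
 * are removed from dbitmap first; the remaining ones are handed to a
 * scrub rbio via raid56_parity_alloc_scrub_rbio(), together with the
 * cached data pages, so the RAID56 code can recompute and compare the
 * parity.
 */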
2660 static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
2661 {
2662         struct scrub_ctx *sctx = sparity->sctx;
2663         struct bio *bio;
2664         struct btrfs_raid_bio *rbio;
2665         struct scrub_page *spage;
2666         struct btrfs_bio *bbio = NULL;
2667         u64 length;
2668         int ret;
2669
2670         if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap,
2671                            sparity->nsectors))
2672                 goto out;
2673
2674         length = sparity->logic_end - sparity->logic_start + 1;
2675         ret = btrfs_map_sblock(sctx->dev_root->fs_info, WRITE,
2676                                sparity->logic_start,
2677                                &length, &bbio, 0, 1);
2678         if (ret || !bbio || !bbio->raid_map)
2679                 goto bbio_out;
2680
2681         bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
2682         if (!bio)
2683                 goto bbio_out;
2684
2685         bio->bi_iter.bi_sector = sparity->logic_start >> 9;
2686         bio->bi_private = sparity;
2687         bio->bi_end_io = scrub_parity_bio_endio;
2688
2689         rbio = raid56_parity_alloc_scrub_rbio(sctx->dev_root, bio, bbio,
2690                                               length, sparity->scrub_dev,
2691                                               sparity->dbitmap,
2692                                               sparity->nsectors);
2693         if (!rbio)
2694                 goto rbio_out;
2695
2696         list_for_each_entry(spage, &sparity->spages, list)
2697                 raid56_parity_add_scrub_pages(rbio, spage->page,
2698                                               spage->logical);
2699
2700         scrub_pending_bio_inc(sctx);
2701         raid56_parity_submit_scrub_rbio(rbio);
2702         return;
2703
2704 rbio_out:
2705         bio_put(bio);
2706 bbio_out:
2707         btrfs_put_bbio(bbio);
2708         bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2709                   sparity->nsectors);
2710         spin_lock(&sctx->stat_lock);
2711         sctx->stat.malloc_errors++;
2712         spin_unlock(&sctx->stat_lock);
2713 out:
2714         scrub_free_parity(sparity);
2715 }
2716
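/*
 * Bytes needed for one per-sector bitmap, rounded up to whole longs.
 * E.g. assuming a 64K stripe_len and 4K sectors, nsectors is 16 and
 * this returns sizeof(long) bytes (8 on 64-bit, 4 on 32-bit).
 */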
2717 static inline int scrub_calc_parity_bitmap_len(int nsectors)
2718 {
2719         return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * (BITS_PER_LONG / 8);
2720 }
2721
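/*
 * Reference counting for the parity context: the initial reference is
 * taken when the context is set up, and dropping the last one (once
 * all pages of the stripe have been read) triggers
 * scrub_parity_check_and_repair().
 */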
2722 static void scrub_parity_get(struct scrub_parity *sparity)
2723 {
2724         atomic_inc(&sparity->refs);
2725 }
2726
2727 static void scrub_parity_put(struct scrub_parity *sparity)
2728 {
2729         if (!atomic_dec_and_test(&sparity->refs))
2730                 return;
2731
2732         scrub_parity_check_and_repair(sparity);
2733 }
2734
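/*
 * Scrub the data sectors of one RAID5/6 stripe in the logical range
 * [logic_start, logic_end].  Extent items overlapping the range are
 * looked up in the extent tree, their sectors are marked in the data
 * bitmap and read via scrub_extent_for_parity().  Once the last
 * reference on the parity context is dropped, scrub_parity_put()
 * triggers the parity check and repair for the marked sectors.
 */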
2735 static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
2736                                                   struct map_lookup *map,
2737                                                   struct btrfs_device *sdev,
2738                                                   struct btrfs_path *path,
2739                                                   u64 logic_start,
2740                                                   u64 logic_end)
2741 {
2742         struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
2743         struct btrfs_root *root = fs_info->extent_root;
2744         struct btrfs_root *csum_root = fs_info->csum_root;
2745         struct btrfs_extent_item *extent;
2746         u64 flags;
2747         int ret;
2748         int slot;
2749         struct extent_buffer *l;
2750         struct btrfs_key key;
2751         u64 generation;
2752         u64 extent_logical;
2753         u64 extent_physical;
2754         u64 extent_len;
2755         struct btrfs_device *extent_dev;
2756         struct scrub_parity *sparity;
2757         int nsectors;
2758         int bitmap_len;
2759         int extent_mirror_num;
2760         int stop_loop = 0;
2761
2762         nsectors = map->stripe_len / root->sectorsize;
2763         bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
2764         sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
2765                           GFP_NOFS);
2766         if (!sparity) {
2767                 spin_lock(&sctx->stat_lock);
2768                 sctx->stat.malloc_errors++;
2769                 spin_unlock(&sctx->stat_lock);
2770                 return -ENOMEM;
2771         }
2772
2773         sparity->stripe_len = map->stripe_len;
2774         sparity->nsectors = nsectors;
2775         sparity->sctx = sctx;
2776         sparity->scrub_dev = sdev;
2777         sparity->logic_start = logic_start;
2778         sparity->logic_end = logic_end;
2779         atomic_set(&sparity->refs, 1);
2780         INIT_LIST_HEAD(&sparity->spages);
2781         sparity->dbitmap = sparity->bitmap;
2782         sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
2783
2784         ret = 0;
2785         while (logic_start < logic_end) {
2786                 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2787                         key.type = BTRFS_METADATA_ITEM_KEY;
2788                 else
2789                         key.type = BTRFS_EXTENT_ITEM_KEY;
2790                 key.objectid = logic_start;
2791                 key.offset = (u64)-1;
2792
2793                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2794                 if (ret < 0)
2795                         goto out;
2796
2797                 if (ret > 0) {
2798                         ret = btrfs_previous_extent_item(root, path, 0);
2799                         if (ret < 0)
2800                                 goto out;
2801                         if (ret > 0) {
2802                                 btrfs_release_path(path);
2803                                 ret = btrfs_search_slot(NULL, root, &key,
2804                                                         path, 0, 0);
2805                                 if (ret < 0)
2806                                         goto out;
2807                         }
2808                 }
2809
2810                 stop_loop = 0;
2811                 while (1) {
2812                         u64 bytes;
2813
2814                         l = path->nodes[0];
2815                         slot = path->slots[0];
2816                         if (slot >= btrfs_header_nritems(l)) {
2817                                 ret = btrfs_next_leaf(root, path);
2818                                 if (ret == 0)
2819                                         continue;
2820                                 if (ret < 0)
2821                                         goto out;
2822
2823                                 stop_loop = 1;
2824                                 break;
2825                         }
2826                         btrfs_item_key_to_cpu(l, &key, slot);
2827
2828                         if (key.type == BTRFS_METADATA_ITEM_KEY)
2829                                 bytes = root->nodesize;
2830                         else
2831                                 bytes = key.offset;
2832
2833                         if (key.objectid + bytes <= logic_start)
2834                                 goto next;
2835
2836                         if (key.type != BTRFS_EXTENT_ITEM_KEY &&
2837                             key.type != BTRFS_METADATA_ITEM_KEY)
2838                                 goto next;
2839
2840                         if (key.objectid > logic_end) {
2841                                 stop_loop = 1;
2842                                 break;
2843                         }
2844
2845                         while (key.objectid >= logic_start + map->stripe_len)
2846                                 logic_start += map->stripe_len;
2847
2848                         extent = btrfs_item_ptr(l, slot,
2849                                                 struct btrfs_extent_item);
2850                         flags = btrfs_extent_flags(l, extent);
2851                         generation = btrfs_extent_generation(l, extent);
2852
2853                         if (key.objectid < logic_start &&
2854                             (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
2855                                 btrfs_err(fs_info,
2856                                           "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
2857                                            key.objectid, logic_start);
2858                                 goto next;
2859                         }
2860 again:
2861                         extent_logical = key.objectid;
2862                         extent_len = bytes;
2863
2864                         if (extent_logical < logic_start) {
2865                                 extent_len -= logic_start - extent_logical;
2866                                 extent_logical = logic_start;
2867                         }
2868
2869                         if (extent_logical + extent_len >
2870                             logic_start + map->stripe_len)
2871                                 extent_len = logic_start + map->stripe_len -
2872                                              extent_logical;
2873
2874                         scrub_parity_mark_sectors_data(sparity, extent_logical,
2875                                                        extent_len);
2876
2877                         scrub_remap_extent(fs_info, extent_logical,
2878                                            extent_len, &extent_physical,
2879                                            &extent_dev,
2880                                            &extent_mirror_num);
2881
2882                         ret = btrfs_lookup_csums_range(csum_root,
2883                                                 extent_logical,
2884                                                 extent_logical + extent_len - 1,
2885                                                 &sctx->csum_list, 1);
2886                         if (ret)
2887                                 goto out;
2888
2889                         ret = scrub_extent_for_parity(sparity, extent_logical,
2890                                                       extent_len,
2891                                                       extent_physical,
2892                                                       extent_dev, flags,
2893                                                       generation,
2894                                                       extent_mirror_num);
2895                         if (ret)
2896                                 goto out;
2897
2898                         scrub_free_csums(sctx);
2899                         if (extent_logical + extent_len <
2900                             key.objectid + bytes) {
2901                                 logic_start += map->stripe_len;
2902
2903                                 if (logic_start >= logic_end) {
2904                                         stop_loop = 1;
2905                                         break;
2906                                 }
2907
2908                                 if (logic_start < key.objectid + bytes) {
2909                                         cond_resched();
2910                                         goto again;
2911                                 }
2912                         }
2913 next:
2914                         path->slots[0]++;
2915                 }
2916
2917                 btrfs_release_path(path);
2918
2919                 if (stop_loop)
2920                         break;
2921
2922                 logic_start += map->stripe_len;
2923         }
2924 out:
2925         if (ret < 0)
2926                 scrub_parity_mark_sectors_error(sparity, logic_start,
2927                                                 logic_end - logic_start + 1);
2928         scrub_parity_put(sparity);
2929         scrub_submit(sctx);
2930         mutex_lock(&sctx->wr_ctx.wr_lock);
2931         scrub_wr_submit(sctx);
2932         mutex_unlock(&sctx->wr_ctx.wr_lock);
2933
2934         btrfs_release_path(path);
2935         return ret < 0 ? ret : 0;
2936 }
2937
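/*
 * Scrub the device extent that makes up stripe @num of a chunk.  @base
 * is the chunk's logical start and @length the device extent length.
 * The RAID profile determines how the logical address advances per
 * device stripe: e.g. with RAID0 consecutive stripe_len units on one
 * device are map->num_stripes * stripe_len apart logically, while with
 * RAID1/DUP every logical stripe_len unit is present on the device.
 * For RAID5/6, parity stripes are skipped here and handed off to
 * scrub_raid56_parity().
 */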
2938 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2939                                            struct map_lookup *map,
2940                                            struct btrfs_device *scrub_dev,
2941                                            int num, u64 base, u64 length,
2942                                            int is_dev_replace)
2943 {
2944         struct btrfs_path *path, *ppath;
2945         struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
2946         struct btrfs_root *root = fs_info->extent_root;
2947         struct btrfs_root *csum_root = fs_info->csum_root;
2948         struct btrfs_extent_item *extent;
2949         struct blk_plug plug;
2950         u64 flags;
2951         int ret;
2952         int slot;
2953         u64 nstripes;
2954         struct extent_buffer *l;
2955         struct btrfs_key key;
2956         u64 physical;
2957         u64 logical;
2958         u64 logic_end;
2959         u64 physical_end;
2960         u64 generation;
2961         int mirror_num;
2962         struct reada_control *reada1;
2963         struct reada_control *reada2;
2964         struct btrfs_key key_start;
2965         struct btrfs_key key_end;
2966         u64 increment = map->stripe_len;
2967         u64 offset;
2968         u64 extent_logical;
2969         u64 extent_physical;
2970         u64 extent_len;
2971         u64 stripe_logical;
2972         u64 stripe_end;
2973         struct btrfs_device *extent_dev;
2974         int extent_mirror_num;
2975         int stop_loop = 0;
2976
2977         nstripes = length;
2978         physical = map->stripes[num].physical;
2979         offset = 0;
2980         do_div(nstripes, map->stripe_len);
2981         if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
2982                 offset = map->stripe_len * num;
2983                 increment = map->stripe_len * map->num_stripes;
2984                 mirror_num = 1;
2985         } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2986                 int factor = map->num_stripes / map->sub_stripes;
2987                 offset = map->stripe_len * (num / map->sub_stripes);
2988                 increment = map->stripe_len * factor;
2989                 mirror_num = num % map->sub_stripes + 1;
2990         } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
2991                 increment = map->stripe_len;
2992                 mirror_num = num % map->num_stripes + 1;
2993         } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
2994                 increment = map->stripe_len;
2995                 mirror_num = num % map->num_stripes + 1;
2996         } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
2997                 get_raid56_logic_offset(physical, num, map, &offset, NULL);
2998                 increment = map->stripe_len * nr_data_stripes(map);
2999                 mirror_num = 1;
3000         } else {
3001                 increment = map->stripe_len;
3002                 mirror_num = 1;
3003         }
3004
3005         path = btrfs_alloc_path();
3006         if (!path)
3007                 return -ENOMEM;
3008
3009         ppath = btrfs_alloc_path();
3010         if (!ppath) {
3011                 btrfs_free_path(path);
3012                 return -ENOMEM;
3013         }
3014
3015         /*
3016          * work on the commit root. The related disk blocks are static as
3017          * long as COW is applied. This means it is safe to rewrite them
3018          * to repair disk errors without any race conditions.
3019          */
3020         path->search_commit_root = 1;
3021         path->skip_locking = 1;
3022
3023         /*
3024          * trigger the readahead for the extent tree and the csum tree and
3025          * wait for completion. During readahead, the scrub is officially
3026          * paused to not hold off transaction commits.
3027          */
3028         logical = base + offset;
3029         physical_end = physical + nstripes * map->stripe_len;
3030         if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3031                 get_raid56_logic_offset(physical_end, num,
3032                                         map, &logic_end, NULL);
3033                 logic_end += base;
3034         } else {
3035                 logic_end = logical + increment * nstripes;
3036         }
3037         wait_event(sctx->list_wait,
3038                    atomic_read(&sctx->bios_in_flight) == 0);
3039         scrub_blocked_if_needed(fs_info);
3040
3041         /* FIXME it might be better to start readahead at commit root */
3042         key_start.objectid = logical;
3043         key_start.type = BTRFS_EXTENT_ITEM_KEY;
3044         key_start.offset = (u64)0;
3045         key_end.objectid = logic_end;
3046         key_end.type = BTRFS_METADATA_ITEM_KEY;
3047         key_end.offset = (u64)-1;
3048         reada1 = btrfs_reada_add(root, &key_start, &key_end);
3049
3050         key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3051         key_start.type = BTRFS_EXTENT_CSUM_KEY;
3052         key_start.offset = logical;
3053         key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3054         key_end.type = BTRFS_EXTENT_CSUM_KEY;
3055         key_end.offset = logic_end;
3056         reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
3057
3058         if (!IS_ERR(reada1))
3059                 btrfs_reada_wait(reada1);
3060         if (!IS_ERR(reada2))
3061                 btrfs_reada_wait(reada2);
3062
3063
3064         /*
3065          * collect all data csums for the stripe to avoid seeking during
3066          * the scrub. With crc32 this might currently end up being about 1MB.
3067          */
3068         blk_start_plug(&plug);
3069
3070         /*
3071          * now find all extents for each stripe and scrub them
3072          */
3073         ret = 0;
3074         while (physical < physical_end) {
3075                 /* for raid56, we skip parity stripe */
3076                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3077                         ret = get_raid56_logic_offset(physical, num,
3078                                         map, &logical, &stripe_logical);
3079                         logical += base;
3080                         if (ret) {
3081                                 stripe_logical += base;
3082                                 stripe_end = stripe_logical + increment - 1;
3083                                 ret = scrub_raid56_parity(sctx, map, scrub_dev,
3084                                                 ppath, stripe_logical,
3085                                                 stripe_end);
3086                                 if (ret)
3087                                         goto out;
3088                                 goto skip;
3089                         }
3090                 }
3091                 /*
3092                  * canceled?
3093                  */
3094                 if (atomic_read(&fs_info->scrub_cancel_req) ||
3095                     atomic_read(&sctx->cancel_req)) {
3096                         ret = -ECANCELED;
3097                         goto out;
3098                 }
3099                 /*
3100                  * check to see if we have to pause
3101                  */
3102                 if (atomic_read(&fs_info->scrub_pause_req)) {
3103                         /* push queued extents */
3104                         atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
3105                         scrub_submit(sctx);
3106                         mutex_lock(&sctx->wr_ctx.wr_lock);
3107                         scrub_wr_submit(sctx);
3108                         mutex_unlock(&sctx->wr_ctx.wr_lock);
3109                         wait_event(sctx->list_wait,
3110                                    atomic_read(&sctx->bios_in_flight) == 0);
3111                         atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
3112                         scrub_blocked_if_needed(fs_info);
3113                 }
3114
3115                 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
3116                         key.type = BTRFS_METADATA_ITEM_KEY;
3117                 else
3118                         key.type = BTRFS_EXTENT_ITEM_KEY;
3119                 key.objectid = logical;
3120                 key.offset = (u64)-1;
3121
3122                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3123                 if (ret < 0)
3124                         goto out;
3125
3126                 if (ret > 0) {
3127                         ret = btrfs_previous_extent_item(root, path, 0);
3128                         if (ret < 0)
3129                                 goto out;
3130                         if (ret > 0) {
3131                                 /* there's no smaller item, so stick with the
3132                                  * larger one */
3133                                 btrfs_release_path(path);
3134                                 ret = btrfs_search_slot(NULL, root, &key,
3135                                                         path, 0, 0);
3136                                 if (ret < 0)
3137                                         goto out;
3138                         }
3139                 }
3140
3141                 stop_loop = 0;
3142                 while (1) {
3143                         u64 bytes;
3144
3145                         l = path->nodes[0];
3146                         slot = path->slots[0];
3147                         if (slot >= btrfs_header_nritems(l)) {
3148                                 ret = btrfs_next_leaf(root, path);
3149                                 if (ret == 0)
3150                                         continue;
3151                                 if (ret < 0)
3152                                         goto out;
3153
3154                                 stop_loop = 1;
3155                                 break;
3156                         }
3157                         btrfs_item_key_to_cpu(l, &key, slot);
3158
3159                         if (key.type == BTRFS_METADATA_ITEM_KEY)
3160                                 bytes = root->nodesize;
3161                         else
3162                                 bytes = key.offset;
3163
3164                         if (key.objectid + bytes <= logical)
3165                                 goto next;
3166
3167                         if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3168                             key.type != BTRFS_METADATA_ITEM_KEY)
3169                                 goto next;
3170
3171                         if (key.objectid >= logical + map->stripe_len) {
3172                                 /* out of this device extent */
3173                                 if (key.objectid >= logic_end)
3174                                         stop_loop = 1;
3175                                 break;
3176                         }
3177
3178                         extent = btrfs_item_ptr(l, slot,
3179                                                 struct btrfs_extent_item);
3180                         flags = btrfs_extent_flags(l, extent);
3181                         generation = btrfs_extent_generation(l, extent);
3182
3183                         if (key.objectid < logical &&
3184                             (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
3185                                 btrfs_err(fs_info,
3186                                            "scrub: tree block %llu spanning "
3187                                            "stripes, ignored. logical=%llu",
3188                                        key.objectid, logical);
3189                                 goto next;
3190                         }
3191
3192 again:
3193                         extent_logical = key.objectid;
3194                         extent_len = bytes;
3195
3196                         /*
3197                          * trim extent to this stripe
3198                          */
3199                         if (extent_logical < logical) {
3200                                 extent_len -= logical - extent_logical;
3201                                 extent_logical = logical;
3202                         }
3203                         if (extent_logical + extent_len >
3204                             logical + map->stripe_len) {
3205                                 extent_len = logical + map->stripe_len -
3206                                              extent_logical;
3207                         }
3208
3209                         extent_physical = extent_logical - logical + physical;
3210                         extent_dev = scrub_dev;
3211                         extent_mirror_num = mirror_num;
3212                         if (is_dev_replace)
3213                                 scrub_remap_extent(fs_info, extent_logical,
3214                                                    extent_len, &extent_physical,
3215                                                    &extent_dev,
3216                                                    &extent_mirror_num);
3217
3218                         ret = btrfs_lookup_csums_range(csum_root, logical,
3219                                                 logical + map->stripe_len - 1,
3220                                                 &sctx->csum_list, 1);
3221                         if (ret)
3222                                 goto out;
3223
3224                         ret = scrub_extent(sctx, extent_logical, extent_len,
3225                                            extent_physical, extent_dev, flags,
3226                                            generation, extent_mirror_num,
3227                                            extent_logical - logical + physical);
3228                         if (ret)
3229                                 goto out;
3230
3231                         scrub_free_csums(sctx);
3232                         if (extent_logical + extent_len <
3233                             key.objectid + bytes) {
3234                                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3235                                         /*
3236                                          * loop until we find next data stripe
3237                                          * or we have finished all stripes.
3238                                          */
3239 loop:
3240                                         physical += map->stripe_len;
3241                                         ret = get_raid56_logic_offset(physical,
3242                                                         num, map, &logical,
3243                                                         &stripe_logical);
3244                                         logical += base;
3245
3246                                         if (ret && physical < physical_end) {
3247                                                 stripe_logical += base;
3248                                                 stripe_end = stripe_logical +
3249                                                                 increment - 1;
3250                                                 ret = scrub_raid56_parity(sctx,
3251                                                         map, scrub_dev, ppath,
3252                                                         stripe_logical,
3253                                                         stripe_end);
3254                                                 if (ret)
3255                                                         goto out;
3256                                                 goto loop;
3257                                         }
3258                                 } else {
3259                                         physical += map->stripe_len;
3260                                         logical += increment;
3261                                 }
3262                                 if (logical < key.objectid + bytes) {
3263                                         cond_resched();
3264                                         goto again;
3265                                 }
3266
3267                                 if (physical >= physical_end) {
3268                                         stop_loop = 1;
3269                                         break;
3270                                 }
3271                         }
3272 next:
3273                         path->slots[0]++;
3274                 }
3275                 btrfs_release_path(path);
3276 skip:
3277                 logical += increment;
3278                 physical += map->stripe_len;
3279                 spin_lock(&sctx->stat_lock);
3280                 if (stop_loop)
3281                         sctx->stat.last_physical = map->stripes[num].physical +
3282                                                    length;
3283                 else
3284                         sctx->stat.last_physical = physical;
3285                 spin_unlock(&sctx->stat_lock);
3286                 if (stop_loop)
3287                         break;
3288         }
3289 out:
3290         /* push queued extents */
3291         scrub_submit(sctx);
3292         mutex_lock(&sctx->wr_ctx.wr_lock);
3293         scrub_wr_submit(sctx);
3294         mutex_unlock(&sctx->wr_ctx.wr_lock);
3295
3296         blk_finish_plug(&plug);
3297         btrfs_free_path(path);
3298         btrfs_free_path(ppath);
3299         return ret < 0 ? ret : 0;
3300 }
3301
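/*
 * Scrub the part of a chunk that lives on @scrub_dev.  The chunk
 * mapping is looked up in the mapping tree and every stripe of the
 * map that sits on this device at @dev_offset is scrubbed.
 */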
3302 static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
3303                                           struct btrfs_device *scrub_dev,
3304                                           u64 chunk_tree, u64 chunk_objectid,
3305                                           u64 chunk_offset, u64 length,
3306                                           u64 dev_offset, int is_dev_replace)
3307 {
3308         struct btrfs_mapping_tree *map_tree =
3309                 &sctx->dev_root->fs_info->mapping_tree;
3310         struct map_lookup *map;
3311         struct extent_map *em;
3312         int i;
3313         int ret = 0;
3314
3315         read_lock(&map_tree->map_tree.lock);
3316         em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
3317         read_unlock(&map_tree->map_tree.lock);
3318
3319         if (!em)
3320                 return -EINVAL;
3321
3322         map = (struct map_lookup *)em->bdev;
3323         if (em->start != chunk_offset)
3324                 goto out;
3325
3326         if (em->len < length)
3327                 goto out;
3328
3329         for (i = 0; i < map->num_stripes; ++i) {
3330                 if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
3331                     map->stripes[i].physical == dev_offset) {
3332                         ret = scrub_stripe(sctx, map, scrub_dev, i,
3333                                            chunk_offset, length,
3334                                            is_dev_replace);
3335                         if (ret)
3336                                 goto out;
3337                 }
3338         }
3339 out:
3340         free_extent_map(em);
3341
3342         return ret;
3343 }
3344
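/*
 * Walk all DEV_EXTENT items of @scrub_dev in the device tree and scrub
 * the corresponding chunks one by one.  After each chunk, all pending
 * read and write bios are flushed and the scrub briefly counts itself
 * as paused, so transaction commits are not held off for longer than
 * one chunk.
 */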
3345 static noinline_for_stack
3346 int scrub_enumerate_chunks(struct scrub_ctx *sctx,
3347                            struct btrfs_device *scrub_dev, u64 start, u64 end,
3348                            int is_dev_replace)
3349 {
3350         struct btrfs_dev_extent *dev_extent = NULL;
3351         struct btrfs_path *path;
3352         struct btrfs_root *root = sctx->dev_root;
3353         struct btrfs_fs_info *fs_info = root->fs_info;
3354         u64 length;
3355         u64 chunk_tree;
3356         u64 chunk_objectid;
3357         u64 chunk_offset;
3358         int ret;
3359         int slot;
3360         struct extent_buffer *l;
3361         struct btrfs_key key;
3362         struct btrfs_key found_key;
3363         struct btrfs_block_group_cache *cache;
3364         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
3365
3366         path = btrfs_alloc_path();
3367         if (!path)
3368                 return -ENOMEM;
3369
3370         path->reada = 2;
3371         path->search_commit_root = 1;
3372         path->skip_locking = 1;
3373
3374         key.objectid = scrub_dev->devid;
3375         key.offset = 0ull;
3376         key.type = BTRFS_DEV_EXTENT_KEY;
3377
3378         while (1) {
3379                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3380                 if (ret < 0)
3381                         break;
3382                 if (ret > 0) {
3383                         if (path->slots[0] >=
3384                             btrfs_header_nritems(path->nodes[0])) {
3385                                 ret = btrfs_next_leaf(root, path);
3386                                 if (ret)
3387                                         break;
3388                         }
3389                 }
3390
3391                 l = path->nodes[0];
3392                 slot = path->slots[0];
3393
3394                 btrfs_item_key_to_cpu(l, &found_key, slot);
3395
3396                 if (found_key.objectid != scrub_dev->devid)
3397                         break;
3398
3399                 if (found_key.type != BTRFS_DEV_EXTENT_KEY)
3400                         break;
3401
3402                 if (found_key.offset >= end)
3403                         break;
3404
3405                 if (found_key.offset < key.offset)
3406                         break;
3407
3408                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
3409                 length = btrfs_dev_extent_length(l, dev_extent);
3410
3411                 if (found_key.offset + length <= start)
3412                         goto skip;
3413
3414                 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
3415                 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
3416                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
3417
3418                 /*
3419                  * get a reference on the corresponding block group to prevent
3420                  * the chunk from going away while we scrub it
3421                  */
3422                 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3423
3424                 /* some chunks may have been removed but not yet committed
3425                  * to disk; skip them and continue scrubbing */
3426                 if (!cache)
3427                         goto skip;
3428
3429                 dev_replace->cursor_right = found_key.offset + length;
3430                 dev_replace->cursor_left = found_key.offset;
3431                 dev_replace->item_needs_writeback = 1;
3432                 ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid,
3433                                   chunk_offset, length, found_key.offset,
3434                                   is_dev_replace);
3435
3436                 /*
3437                  * flush, submit all pending read and write bios, afterwards
3438                  * wait for them.
3439                  * Note that in the dev replace case, a read request causes
3440                  * write requests that are submitted in the read completion
3441                  * worker. Therefore in the current situation, it is required
3442                  * that all write requests are flushed, so that all read and
3443                  * write requests are really completed when bios_in_flight
3444                  * changes to 0.
3445                  */
3446                 atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
3447                 scrub_submit(sctx);
3448                 mutex_lock(&sctx->wr_ctx.wr_lock);
3449                 scrub_wr_submit(sctx);
3450                 mutex_unlock(&sctx->wr_ctx.wr_lock);
3451
3452                 wait_event(sctx->list_wait,
3453                            atomic_read(&sctx->bios_in_flight) == 0);
3454                 atomic_inc(&fs_info->scrubs_paused);
3455                 wake_up(&fs_info->scrub_pause_wait);
3456
3457                 /*
3458                  * must be called before we decrease @scrubs_paused.
3459                  * make sure we don't block transaction commit while
3460                  * we are waiting for pending workers to finish.
3461                  */
3462                 wait_event(sctx->list_wait,
3463                            atomic_read(&sctx->workers_pending) == 0);
3464                 atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
3465
3466                 mutex_lock(&fs_info->scrub_lock);
3467                 __scrub_blocked_if_needed(fs_info);
3468                 atomic_dec(&fs_info->scrubs_paused);
3469                 mutex_unlock(&fs_info->scrub_lock);
3470                 wake_up(&fs_info->scrub_pause_wait);
3471
3472                 btrfs_put_block_group(cache);
3473                 if (ret)
3474                         break;
3475                 if (is_dev_replace &&
3476                     atomic64_read(&dev_replace->num_write_errors) > 0) {
3477                         ret = -EIO;
3478                         break;
3479                 }
3480                 if (sctx->stat.malloc_errors > 0) {
3481                         ret = -ENOMEM;
3482                         break;
3483                 }
3484
3485                 dev_replace->cursor_left = dev_replace->cursor_right;
3486                 dev_replace->item_needs_writeback = 1;
3487 skip:
3488                 key.offset = found_key.offset + length;
3489                 btrfs_release_path(path);
3490         }
3491
3492         btrfs_free_path(path);
3493
3494         /*
3495          * ret can still be 1 from search_slot or next_leaf,
3496          * that's not an error
3497          */
3498         return ret < 0 ? ret : 0;
3499 }
3500
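/*
 * Scrub all super block mirrors of @scrub_dev that lie within the
 * committed device size.  The supers are submitted through
 * scrub_pages() with BTRFS_EXTENT_FLAG_SUPER and checked against the
 * expected generation (the seed device's own generation if the device
 * belongs to a different fs_devices).
 */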
3501 static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
3502                                            struct btrfs_device *scrub_dev)
3503 {
3504         int     i;
3505         u64     bytenr;
3506         u64     gen;
3507         int     ret;
3508         struct btrfs_root *root = sctx->dev_root;
3509
3510         if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
3511                 return -EIO;
3512
3513         /* Seed devices of a new filesystem have their own generation. */
3514         if (scrub_dev->fs_devices != root->fs_info->fs_devices)
3515                 gen = scrub_dev->generation;
3516         else
3517                 gen = root->fs_info->last_trans_committed;
3518
3519         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
3520                 bytenr = btrfs_sb_offset(i);
3521                 if (bytenr + BTRFS_SUPER_INFO_SIZE >
3522                     scrub_dev->commit_total_bytes)
3523                         break;
3524
3525                 ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
3526                                   scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
3527                                   NULL, 1, bytenr);
3528                 if (ret)
3529                         return ret;
3530         }
3531         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3532
3533         return 0;
3534 }
3535
3536 /*
3537  * get a reference count on fs_info->scrub_workers. start the workers if necessary
3538  */
3539 static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
3540                                                 int is_dev_replace)
3541 {
3542         int ret = 0;
3543         int flags = WQ_FREEZABLE | WQ_UNBOUND;
3544         int max_active = fs_info->thread_pool_size;
3545
3546         if (fs_info->scrub_workers_refcnt == 0) {
3547                 if (is_dev_replace)
3548                         fs_info->scrub_workers =
3549                                 btrfs_alloc_workqueue("btrfs-scrub", flags,
3550                                                       1, 4);
3551                 else
3552                         fs_info->scrub_workers =
3553                                 btrfs_alloc_workqueue("btrfs-scrub", flags,
3554                                                       max_active, 4);
3555                 if (!fs_info->scrub_workers) {
3556                         ret = -ENOMEM;
3557                         goto out;
3558                 }
3559                 fs_info->scrub_wr_completion_workers =
3560                         btrfs_alloc_workqueue("btrfs-scrubwrc", flags,
3561                                               max_active, 2);
3562                 if (!fs_info->scrub_wr_completion_workers) {
3563                         ret = -ENOMEM;
3564                         goto out;
3565                 }
3566                 fs_info->scrub_nocow_workers =
3567                         btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0);
3568                 if (!fs_info->scrub_nocow_workers) {
3569                         ret = -ENOMEM;
3570                         goto out;
3571                 }
3572         }
3573         ++fs_info->scrub_workers_refcnt;
3574 out:
3575         return ret;
3576 }
3577
3578 static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
3579 {
3580         if (--fs_info->scrub_workers_refcnt == 0) {
3581                 btrfs_destroy_workqueue(fs_info->scrub_workers);
3582                 btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
3583                 btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
3584         }
3585         WARN_ON(fs_info->scrub_workers_refcnt < 0);
3586 }
3587
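/*
 * Entry point for scrub and for dev-replace, which reuses the scrub
 * machinery to copy the source device.  After validating the size
 * assumptions below, the device is looked up under the
 * device_list_mutex, a scrub context is set up, and the super blocks
 * plus all chunks on the device are scrubbed.  Only one scrub per
 * device can run at a time.
 */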
3588 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
3589                     u64 end, struct btrfs_scrub_progress *progress,
3590                     int readonly, int is_dev_replace)
3591 {
3592         struct scrub_ctx *sctx;
3593         int ret;
3594         struct btrfs_device *dev;
3595         struct rcu_string *name;
3596
3597         if (btrfs_fs_closing(fs_info))
3598                 return -EINVAL;
3599
3600         if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) {
3601                 /*
3602                  * in this case scrub is unable to calculate the checksum
3603                  * with the way it is implemented. Do not handle this
3604                  * situation at all because it should never happen.
3605                  */
3606                 btrfs_err(fs_info,
3607                            "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
3608                        fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN);
3609                 return -EINVAL;
3610         }
3611
3612         if (fs_info->chunk_root->sectorsize != PAGE_SIZE) {
3613                 /* not supported for data w/o checksums */
3614                 btrfs_err(fs_info,
3615                            "scrub: size assumption sectorsize != PAGE_SIZE "
3616                            "(%d != %lu) fails",
3617                        fs_info->chunk_root->sectorsize, PAGE_SIZE);
3618                 return -EINVAL;
3619         }
3620
3621         if (fs_info->chunk_root->nodesize >
3622             PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
3623             fs_info->chunk_root->sectorsize >
3624             PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
3625                 /*
3626                  * would exhaust the array bounds of pagev member in
3627                  * struct scrub_block
3628                  */
3629                 btrfs_err(fs_info, "scrub: size assumption nodesize and sectorsize "
3630                            "<= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails",
3631                        fs_info->chunk_root->nodesize,
3632                        SCRUB_MAX_PAGES_PER_BLOCK,
3633                        fs_info->chunk_root->sectorsize,
3634                        SCRUB_MAX_PAGES_PER_BLOCK);
3635                 return -EINVAL;
3636         }
3637
3638
3639         mutex_lock(&fs_info->fs_devices->device_list_mutex);
3640         dev = btrfs_find_device(fs_info, devid, NULL, NULL);
3641         if (!dev || (dev->missing && !is_dev_replace)) {
3642                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3643                 return -ENODEV;
3644         }
3645
3646         if (!is_dev_replace && !readonly && !dev->writeable) {
3647                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3648                 rcu_read_lock();
3649                 name = rcu_dereference(dev->name);
3650                 btrfs_err(fs_info, "scrub: device %s is not writable",
3651                           name->str);
3652                 rcu_read_unlock();
3653                 return -EROFS;
3654         }
3655
3656         mutex_lock(&fs_info->scrub_lock);
3657         if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
3658                 mutex_unlock(&fs_info->scrub_lock);
3659                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3660                 return -EIO;
3661         }
3662
3663         btrfs_dev_replace_lock(&fs_info->dev_replace);
3664         if (dev->scrub_device ||
3665             (!is_dev_replace &&
3666              btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
3667                 btrfs_dev_replace_unlock(&fs_info->dev_replace);
3668                 mutex_unlock(&fs_info->scrub_lock);
3669                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3670                 return -EINPROGRESS;
3671         }
3672         btrfs_dev_replace_unlock(&fs_info->dev_replace);
3673
3674         ret = scrub_workers_get(fs_info, is_dev_replace);
3675         if (ret) {
3676                 mutex_unlock(&fs_info->scrub_lock);
3677                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3678                 return ret;
3679         }
3680
3681         sctx = scrub_setup_ctx(dev, is_dev_replace);
3682         if (IS_ERR(sctx)) {
3683                 mutex_unlock(&fs_info->scrub_lock);
3684                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3685                 scrub_workers_put(fs_info);
3686                 return PTR_ERR(sctx);
3687         }
3688         sctx->readonly = readonly;
3689         dev->scrub_device = sctx;
3690         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3691
3692         /*
3693          * by checking @scrub_pause_req here, we can avoid a
3694          * race between committing a transaction and scrubbing.
3695          */
3696         __scrub_blocked_if_needed(fs_info);
3697         atomic_inc(&fs_info->scrubs_running);
3698         mutex_unlock(&fs_info->scrub_lock);
3699
3700         if (!is_dev_replace) {
3701                 /*
3702                  * by holding the device list mutex, we cannot race with
3703                  * super block writes kicked off by a log tree sync.
3704                  */
3705                 mutex_lock(&fs_info->fs_devices->device_list_mutex);
3706                 ret = scrub_supers(sctx, dev);
3707                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3708         }
3709
3710         if (!ret)
3711                 ret = scrub_enumerate_chunks(sctx, dev, start, end,
3712                                              is_dev_replace);
3713
3714         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3715         atomic_dec(&fs_info->scrubs_running);
3716         wake_up(&fs_info->scrub_pause_wait);
3717
3718         wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
3719
3720         if (progress)
3721                 memcpy(progress, &sctx->stat, sizeof(*progress));
3722
3723         mutex_lock(&fs_info->scrub_lock);
3724         dev->scrub_device = NULL;
3725         scrub_workers_put(fs_info);
3726         mutex_unlock(&fs_info->scrub_lock);
3727
3728         scrub_free_ctx(sctx);
3729
3730         return ret;
3731 }
3732
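/*
 * Pause all running scrubs: raise the pause request and wait until
 * scrubs_paused has caught up with scrubs_running, i.e. every scrub
 * has parked itself.
 */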
3733 void btrfs_scrub_pause(struct btrfs_root *root)
3734 {
3735         struct btrfs_fs_info *fs_info = root->fs_info;
3736
3737         mutex_lock(&fs_info->scrub_lock);
3738         atomic_inc(&fs_info->scrub_pause_req);
3739         while (atomic_read(&fs_info->scrubs_paused) !=
3740                atomic_read(&fs_info->scrubs_running)) {
3741                 mutex_unlock(&fs_info->scrub_lock);
3742                 wait_event(fs_info->scrub_pause_wait,
3743                            atomic_read(&fs_info->scrubs_paused) ==
3744                            atomic_read(&fs_info->scrubs_running));
3745                 mutex_lock(&fs_info->scrub_lock);
3746         }
3747         mutex_unlock(&fs_info->scrub_lock);
3748 }
3749
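/*
 * Counterpart of btrfs_scrub_pause(): drop the pause request and wake
 * up the paused scrubs so they can continue.
 */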
3750 void btrfs_scrub_continue(struct btrfs_root *root)
3751 {
3752         struct btrfs_fs_info *fs_info = root->fs_info;
3753
3754         atomic_dec(&fs_info->scrub_pause_req);
3755         wake_up(&fs_info->scrub_pause_wait);
3756 }
3757
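/*
 * Cancel all running scrubs and wait for them to finish.  Returns
 * -ENOTCONN if no scrub is running.
 */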
3758 int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
3759 {
3760         mutex_lock(&fs_info->scrub_lock);
3761         if (!atomic_read(&fs_info->scrubs_running)) {
3762                 mutex_unlock(&fs_info->scrub_lock);
3763                 return -ENOTCONN;
3764         }
3765
3766         atomic_inc(&fs_info->scrub_cancel_req);
3767         while (atomic_read(&fs_info->scrubs_running)) {
3768                 mutex_unlock(&fs_info->scrub_lock);
3769                 wait_event(fs_info->scrub_pause_wait,