// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/compat.h>
#include <linux/bit_spinlock.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/falloc.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/mount.h>
#include <linux/btrfs.h>
#include <linux/blkdev.h>
#include <linux/posix_acl_xattr.h>
#include <linux/uio.h>
#include <linux/magic.h>
#include <linux/iversion.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "print-tree.h"
#include "ordered-data.h"
#include "xattr.h"
#include "tree-log.h"
#include "volumes.h"
#include "compression.h"
#include "locking.h"
#include "free-space-cache.h"
#include "inode-map.h"
#include "backref.h"
#include "props.h"
#include "qgroup.h"
#include "dedupe.h"

struct btrfs_iget_args {
        struct btrfs_key *location;
        struct btrfs_root *root;
};

struct btrfs_dio_data {
        u64 reserve;
        u64 unsubmitted_oe_range_start;
        u64 unsubmitted_oe_range_end;
        int overwrite;
};

static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_dir_ro_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct address_space_operations btrfs_symlink_aops;
static const struct file_operations btrfs_dir_file_operations;
static const struct extent_io_ops btrfs_extent_io_ops;

static struct kmem_cache *btrfs_inode_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_path_cachep;
struct kmem_cache *btrfs_free_space_cachep;

#define S_SHIFT 12
static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
        [S_IFREG >> S_SHIFT]    = BTRFS_FT_REG_FILE,
        [S_IFDIR >> S_SHIFT]    = BTRFS_FT_DIR,
        [S_IFCHR >> S_SHIFT]    = BTRFS_FT_CHRDEV,
        [S_IFBLK >> S_SHIFT]    = BTRFS_FT_BLKDEV,
        [S_IFIFO >> S_SHIFT]    = BTRFS_FT_FIFO,
        [S_IFSOCK >> S_SHIFT]   = BTRFS_FT_SOCK,
        [S_IFLNK >> S_SHIFT]    = BTRFS_FT_SYMLINK,
};

static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct inode *inode, bool skip_writeback);
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
static noinline int cow_file_range(struct inode *inode,
                                   struct page *locked_page,
                                   u64 start, u64 end, u64 delalloc_end,
                                   int *page_started, unsigned long *nr_written,
                                   int unlock, struct btrfs_dedupe_hash *hash);
static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
                                       u64 orig_start, u64 block_start,
                                       u64 block_len, u64 orig_block_len,
                                       u64 ram_bytes, int compress_type,
                                       int type);

static void __endio_write_update_ordered(struct inode *inode,
                                         const u64 offset, const u64 bytes,
                                         const bool uptodate);

/*
 * Cleanup all submitted ordered extents in specified range to handle errors
 * from the fill_delalloc() callback.
 *
 * NOTE: caller must ensure that when an error happens, it can not call
 * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
 * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
 * to be released, which we want to happen only when finishing the ordered
 * extent (btrfs_finish_ordered_io()). Also note that the caller of the
 * fill_delalloc() callback already does proper cleanup for the first page of
 * the range, that is, it invokes the callback writepage_end_io_hook() for the
 * range of the first page.
 */
static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
                                                 const u64 offset,
                                                 const u64 bytes)
{
        unsigned long index = offset >> PAGE_SHIFT;
        unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
        struct page *page;

        while (index <= end_index) {
                page = find_get_page(inode->i_mapping, index);
                index++;
                if (!page)
                        continue;
                ClearPagePrivate2(page);
                put_page(page);
        }
        return __endio_write_update_ordered(inode, offset + PAGE_SIZE,
                                            bytes - PAGE_SIZE, false);
}

static int btrfs_dirty_inode(struct inode *inode);

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
void btrfs_test_inode_set_ops(struct inode *inode)
{
        BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
}
#endif

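/*
 * Initialize the security bits of a newly created inode: inherit ACLs
 * from the parent directory and set up the security xattr requested by
 * the LSM for this inode.
 */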
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
                                     struct inode *inode, struct inode *dir,
                                     const struct qstr *qstr)
{
        int err;

        err = btrfs_init_acl(trans, inode, dir);
        if (!err)
                err = btrfs_xattr_security_init(trans, inode, dir, qstr);
        return err;
}

/*
 * this does all the hard work for inserting an inline extent into
 * the btree.  The caller should have done a btrfs_drop_extents so that
 * no overlapping inline items exist in the btree
 */
static int insert_inline_extent(struct btrfs_trans_handle *trans,
                                struct btrfs_path *path, int extent_inserted,
                                struct btrfs_root *root, struct inode *inode,
                                u64 start, size_t size, size_t compressed_size,
                                int compress_type,
                                struct page **compressed_pages)
{
        struct extent_buffer *leaf;
        struct page *page = NULL;
        char *kaddr;
        unsigned long ptr;
        struct btrfs_file_extent_item *ei;
        int ret;
        size_t cur_size = size;
        unsigned long offset;

        if (compressed_size && compressed_pages)
                cur_size = compressed_size;

        inode_add_bytes(inode, size);

        if (!extent_inserted) {
                struct btrfs_key key;
                size_t datasize;

                key.objectid = btrfs_ino(BTRFS_I(inode));
                key.offset = start;
                key.type = BTRFS_EXTENT_DATA_KEY;

                datasize = btrfs_file_extent_calc_inline_size(cur_size);
                path->leave_spinning = 1;
                ret = btrfs_insert_empty_item(trans, root, path, &key,
                                              datasize);
                if (ret)
                        goto fail;
        }
        leaf = path->nodes[0];
        ei = btrfs_item_ptr(leaf, path->slots[0],
                            struct btrfs_file_extent_item);
        btrfs_set_file_extent_generation(leaf, ei, trans->transid);
        btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
        btrfs_set_file_extent_encryption(leaf, ei, 0);
        btrfs_set_file_extent_other_encoding(leaf, ei, 0);
        btrfs_set_file_extent_ram_bytes(leaf, ei, size);
        ptr = btrfs_file_extent_inline_start(ei);

        if (compress_type != BTRFS_COMPRESS_NONE) {
                struct page *cpage;
                int i = 0;
                while (compressed_size > 0) {
                        cpage = compressed_pages[i];
                        cur_size = min_t(unsigned long, compressed_size,
                                       PAGE_SIZE);

                        kaddr = kmap_atomic(cpage);
                        write_extent_buffer(leaf, kaddr, ptr, cur_size);
                        kunmap_atomic(kaddr);

                        i++;
                        ptr += cur_size;
                        compressed_size -= cur_size;
                }
                btrfs_set_file_extent_compression(leaf, ei,
                                                  compress_type);
        } else {
                page = find_get_page(inode->i_mapping,
                                     start >> PAGE_SHIFT);
                btrfs_set_file_extent_compression(leaf, ei, 0);
                kaddr = kmap_atomic(page);
                offset = start & (PAGE_SIZE - 1);
                write_extent_buffer(leaf, kaddr + offset, ptr, size);
                kunmap_atomic(kaddr);
                put_page(page);
        }
        btrfs_mark_buffer_dirty(leaf);
        btrfs_release_path(path);

        /*
         * we're an inline extent, so nobody can
         * extend the file past i_size without locking
         * a page we already have locked.
         *
         * We must do any isize and inode updates
         * before we unlock the pages.  Otherwise we
         * could end up racing with unlink.
         */
        BTRFS_I(inode)->disk_i_size = inode->i_size;
        ret = btrfs_update_inode(trans, root, inode);

fail:
        return ret;
}

/*
 * conditionally insert an inline extent into the file.  This
 * does the checks required to make sure the data is small enough
 * to fit as an inline extent.
 */
static noinline int cow_file_range_inline(struct inode *inode, u64 start,
                                          u64 end, size_t compressed_size,
                                          int compress_type,
                                          struct page **compressed_pages)
{
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_trans_handle *trans;
        u64 isize = i_size_read(inode);
        u64 actual_end = min(end + 1, isize);
        u64 inline_len = actual_end - start;
        u64 aligned_end = ALIGN(end, fs_info->sectorsize);
        u64 data_len = inline_len;
        int ret;
        struct btrfs_path *path;
        int extent_inserted = 0;
        u32 extent_item_size;

        if (compressed_size)
                data_len = compressed_size;

        if (start > 0 ||
            actual_end > fs_info->sectorsize ||
            data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
            (!compressed_size &&
            (actual_end & (fs_info->sectorsize - 1)) == 0) ||
            end + 1 < isize ||
            data_len > fs_info->max_inline) {
                return 1;
        }

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans)) {
                btrfs_free_path(path);
                return PTR_ERR(trans);
        }
        trans->block_rsv = &BTRFS_I(inode)->block_rsv;

        if (compressed_size && compressed_pages)
                extent_item_size = btrfs_file_extent_calc_inline_size(
                   compressed_size);
        else
                extent_item_size = btrfs_file_extent_calc_inline_size(
                    inline_len);

        ret = __btrfs_drop_extents(trans, root, inode, path,
                                   start, aligned_end, NULL,
                                   1, 1, extent_item_size, &extent_inserted);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto out;
        }

        if (isize > actual_end)
                inline_len = min_t(u64, isize, actual_end);
        ret = insert_inline_extent(trans, path, extent_inserted,
                                   root, inode, start,
                                   inline_len, compressed_size,
                                   compress_type, compressed_pages);
        if (ret && ret != -ENOSPC) {
                btrfs_abort_transaction(trans, ret);
                goto out;
        } else if (ret == -ENOSPC) {
                ret = 1;
                goto out;
        }

        set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
        btrfs_drop_extent_cache(BTRFS_I(inode), start, aligned_end - 1, 0);
out:
        /*
         * Don't forget to free the reserved space: an inlined extent does
         * not count as a data extent, so its reservation is freed directly
         * here. At reserve time the space is always aligned to the page
         * size, so just free one page here.
         */
        btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE);
        btrfs_free_path(path);
        btrfs_end_transaction(trans);
        return ret;
}

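/*
 * An async_extent describes one range handed to the compression code;
 * an async_cow tracks all the async_extents queued for one delalloc
 * range along with the work item that processes them.
 */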
struct async_extent {
        u64 start;
        u64 ram_size;
        u64 compressed_size;
        struct page **pages;
        unsigned long nr_pages;
        int compress_type;
        struct list_head list;
};

struct async_cow {
        struct inode *inode;
        struct btrfs_root *root;
        struct page *locked_page;
        u64 start;
        u64 end;
        unsigned int write_flags;
        struct list_head extents;
        struct btrfs_work work;
};

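/*
 * Queue a (possibly compressed) range for phase two of the compressed
 * writeback, appending it to the async_cow's list of extents.
 */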
static noinline int add_async_extent(struct async_cow *cow,
                                     u64 start, u64 ram_size,
                                     u64 compressed_size,
                                     struct page **pages,
                                     unsigned long nr_pages,
                                     int compress_type)
{
        struct async_extent *async_extent;

        async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
        BUG_ON(!async_extent); /* -ENOMEM */
        async_extent->start = start;
        async_extent->ram_size = ram_size;
        async_extent->compressed_size = compressed_size;
        async_extent->pages = pages;
        async_extent->nr_pages = nr_pages;
        async_extent->compress_type = compress_type;
        list_add_tail(&async_extent->list, &cow->extents);
        return 0;
}

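/*
 * Decide whether we have to compress the delalloc range, based on the
 * mount options, per-inode flags and the compression heuristic.
 */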
static inline int inode_need_compress(struct inode *inode, u64 start, u64 end)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);

        /* force compress */
        if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
                return 1;
        /* defrag ioctl */
        if (BTRFS_I(inode)->defrag_compress)
                return 1;
        /* bad compression ratios */
        if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
                return 0;
        if (btrfs_test_opt(fs_info, COMPRESS) ||
            BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
            BTRFS_I(inode)->prop_compress)
                return btrfs_compress_heuristic(inode, start, end);
        return 0;
}

static inline void inode_should_defrag(struct btrfs_inode *inode,
                u64 start, u64 end, u64 num_bytes, u64 small_write)
{
        /* If this is a small write inside eof, kick off a defrag */
        if (num_bytes < small_write &&
            (start > 0 || end + 1 < inode->disk_i_size))
                btrfs_add_inode_defrag(NULL, inode);
}

/*
 * we create compressed extents in two phases.  The first
 * phase compresses a range of pages that have already been
 * locked (both pages and state bits are locked).
 *
 * This is done inside an ordered work queue, and the compression
 * is spread across many cpus.  The actual IO submission is step
 * two, and the ordered work queue takes care of making sure that
 * happens in the same order things were put onto the queue by
 * writepages and friends.
 *
 * If this code finds it can't get good compression, it puts an
 * entry onto the work queue to write the uncompressed bytes.  This
 * makes sure that both compressed inodes and uncompressed inodes
 * are written in the same order that the flusher thread sent them
 * down.
 */
static noinline void compress_file_range(struct inode *inode,
                                        struct page *locked_page,
                                        u64 start, u64 end,
                                        struct async_cow *async_cow,
                                        int *num_added)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        u64 blocksize = fs_info->sectorsize;
        u64 actual_end;
        u64 isize = i_size_read(inode);
        int ret = 0;
        struct page **pages = NULL;
        unsigned long nr_pages;
        unsigned long total_compressed = 0;
        unsigned long total_in = 0;
        int i;
        int will_compress;
        int compress_type = fs_info->compress_type;
        int redirty = 0;

        inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
                        SZ_16K);

        actual_end = min_t(u64, isize, end + 1);
again:
        will_compress = 0;
        nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
        BUILD_BUG_ON((BTRFS_MAX_COMPRESSED % PAGE_SIZE) != 0);
        nr_pages = min_t(unsigned long, nr_pages,
                        BTRFS_MAX_COMPRESSED / PAGE_SIZE);

        /*
         * we don't want to send crud past the end of i_size through
         * compression, that's just a waste of CPU time.  So, if the
         * end of the file is before the start of our current
         * requested range of bytes, we bail out to the uncompressed
         * cleanup code that can deal with all of this.
         *
         * It isn't really the fastest way to fix things, but this is a
         * very uncommon corner.
         */
        if (actual_end <= start)
                goto cleanup_and_bail_uncompressed;

        total_compressed = actual_end - start;

        /*
         * skip compression for a small file range (<= blocksize) that
         * isn't an inline extent, since it doesn't save disk space at all.
         */
        if (total_compressed <= blocksize &&
           (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
                goto cleanup_and_bail_uncompressed;

        total_compressed = min_t(unsigned long, total_compressed,
                        BTRFS_MAX_UNCOMPRESSED);
        total_in = 0;
        ret = 0;

        /*
         * we do compression for mount -o compress and when the
         * inode has not been flagged as nocompress.  This flag can
         * change at any time if we discover bad compression ratios.
         */
        if (inode_need_compress(inode, start, end)) {
                WARN_ON(pages);
                pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
                if (!pages) {
                        /* just bail out to the uncompressed code */
                        goto cont;
                }

                if (BTRFS_I(inode)->defrag_compress)
                        compress_type = BTRFS_I(inode)->defrag_compress;
                else if (BTRFS_I(inode)->prop_compress)
                        compress_type = BTRFS_I(inode)->prop_compress;

                /*
                 * we need to call clear_page_dirty_for_io on each
                 * page in the range.  Otherwise applications with the file
                 * mmap'd can wander in and change the page contents while
                 * we are compressing them.
                 *
                 * If the compression fails for any reason, we set the pages
                 * dirty again later on.
                 *
                 * Note that the remaining part is redirtied, the start pointer
                 * has moved, the end is the original one.
                 */
                if (!redirty) {
                        extent_range_clear_dirty_for_io(inode, start, end);
                        redirty = 1;
                }

                /* Compression level is applied here and only here */
                ret = btrfs_compress_pages(
                        compress_type | (fs_info->compress_level << 4),
                                           inode->i_mapping, start,
                                           pages,
                                           &nr_pages,
                                           &total_in,
                                           &total_compressed);

                if (!ret) {
                        unsigned long offset = total_compressed &
                                (PAGE_SIZE - 1);
                        struct page *page = pages[nr_pages - 1];
                        char *kaddr;

                        /* zero the tail end of the last page, we might be
                         * sending it down to disk
                         */
                        if (offset) {
                                kaddr = kmap_atomic(page);
                                memset(kaddr + offset, 0,
                                       PAGE_SIZE - offset);
                                kunmap_atomic(kaddr);
                        }
                        will_compress = 1;
                }
        }
cont:
        if (start == 0) {
                /* let's try to make an inline extent */
                if (ret || total_in < actual_end) {
                        /* we didn't compress the entire range, try
                         * to make an uncompressed inline extent.
                         */
                        ret = cow_file_range_inline(inode, start, end, 0,
                                                    BTRFS_COMPRESS_NONE, NULL);
                } else {
                        /* try making a compressed inline extent */
                        ret = cow_file_range_inline(inode, start, end,
                                                    total_compressed,
                                                    compress_type, pages);
                }
                if (ret <= 0) {
                        unsigned long clear_flags = EXTENT_DELALLOC |
                                EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
                                EXTENT_DO_ACCOUNTING;
                        unsigned long page_error_op;

                        page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;

                        /*
                         * inline extent creation worked or returned error,
                         * we don't need to create any more async work items.
                         * Unlock and free up our temp pages.
                         *
                         * We use DO_ACCOUNTING here because we need the
                         * delalloc_release_metadata to be done _after_ we drop
                         * our outstanding extent for clearing delalloc for this
                         * range.
                         */
                        extent_clear_unlock_delalloc(inode, start, end, end,
                                                     NULL, clear_flags,
                                                     PAGE_UNLOCK |
                                                     PAGE_CLEAR_DIRTY |
                                                     PAGE_SET_WRITEBACK |
                                                     page_error_op |
                                                     PAGE_END_WRITEBACK);
                        goto free_pages_out;
                }
        }

        if (will_compress) {
                /*
                 * we aren't doing an inline extent, so round the compressed
                 * size up to a block size boundary so the allocator does
                 * sane things
                 */
                total_compressed = ALIGN(total_compressed, blocksize);

                /*
                 * one last check to make sure the compression is really a
                 * win, compare the page count read with the blocks on disk,
                 * compression must free at least one sector size
                 */
                total_in = ALIGN(total_in, PAGE_SIZE);
                if (total_compressed + blocksize <= total_in) {
                        *num_added += 1;

                        /*
                         * The async work queues will take care of doing actual
                         * allocation on disk for these compressed pages, and
                         * will submit them to the elevator.
                         */
                        add_async_extent(async_cow, start, total_in,
                                        total_compressed, pages, nr_pages,
                                        compress_type);

                        if (start + total_in < end) {
                                start += total_in;
                                pages = NULL;
                                cond_resched();
                                goto again;
                        }
                        return;
                }
        }
        if (pages) {
                /*
                 * the compression code ran but failed to make things smaller,
                 * free any pages it allocated and our page pointer array
                 */
                for (i = 0; i < nr_pages; i++) {
                        WARN_ON(pages[i]->mapping);
                        put_page(pages[i]);
                }
                kfree(pages);
                pages = NULL;
                total_compressed = 0;
                nr_pages = 0;

                /* flag the file so we don't compress in the future */
                if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) &&
                    !(BTRFS_I(inode)->prop_compress)) {
                        BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
                }
        }
cleanup_and_bail_uncompressed:
        /*
         * No compression, but we still need to write the pages in the file
         * we've been given so far.  redirty the locked page if it corresponds
         * to our extent and set things up for the async work queue to run
         * cow_file_range to do the normal delalloc dance.
         */
        if (page_offset(locked_page) >= start &&
            page_offset(locked_page) <= end)
                __set_page_dirty_nobuffers(locked_page);
                /* unlocked later on in the async handlers */

        if (redirty)
                extent_range_redirty_for_io(inode, start, end);
        add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0,
                         BTRFS_COMPRESS_NONE);
        *num_added += 1;

        return;

free_pages_out:
        for (i = 0; i < nr_pages; i++) {
                WARN_ON(pages[i]->mapping);
                put_page(pages[i]);
        }
        kfree(pages);
}

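/*
 * Drop the references on the compressed pages of an async_extent and
 * free the page pointer array itself.
 */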
static void free_async_extent_pages(struct async_extent *async_extent)
{
        int i;

        if (!async_extent->pages)
                return;

        for (i = 0; i < async_extent->nr_pages; i++) {
                WARN_ON(async_extent->pages[i]->mapping);
                put_page(async_extent->pages[i]);
        }
        kfree(async_extent->pages);
        async_extent->nr_pages = 0;
        async_extent->pages = NULL;
}

/*
 * phase two of compressed writeback.  This is the ordered portion
 * of the code, which only gets called in the order the work was
 * queued.  We walk all the async extents created by compress_file_range
 * and send them down to the disk.
 */
static noinline void submit_compressed_extents(struct inode *inode,
                                              struct async_cow *async_cow)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct async_extent *async_extent;
        u64 alloc_hint = 0;
        struct btrfs_key ins;
        struct extent_map *em;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct extent_io_tree *io_tree;
        int ret = 0;

again:
        while (!list_empty(&async_cow->extents)) {
                async_extent = list_entry(async_cow->extents.next,
                                          struct async_extent, list);
                list_del(&async_extent->list);

                io_tree = &BTRFS_I(inode)->io_tree;

retry:
                /* did the compression code fall back to uncompressed IO? */
                if (!async_extent->pages) {
                        int page_started = 0;
                        unsigned long nr_written = 0;

                        lock_extent(io_tree, async_extent->start,
                                         async_extent->start +
                                         async_extent->ram_size - 1);

                        /* allocate blocks */
                        ret = cow_file_range(inode, async_cow->locked_page,
                                             async_extent->start,
                                             async_extent->start +
                                             async_extent->ram_size - 1,
                                             async_extent->start +
                                             async_extent->ram_size - 1,
                                             &page_started, &nr_written, 0,
                                             NULL);

                        /* JDM XXX */

                        /*
                         * if page_started, cow_file_range inserted an
                         * inline extent and took care of all the unlocking
                         * and IO for us.  Otherwise, we need to submit
                         * all those pages down to the drive.
                         */
                        if (!page_started && !ret)
                                extent_write_locked_range(inode,
                                                  async_extent->start,
                                                  async_extent->start +
                                                  async_extent->ram_size - 1,
                                                  WB_SYNC_ALL);
                        else if (ret)
                                unlock_page(async_cow->locked_page);
                        kfree(async_extent);
                        cond_resched();
                        continue;
                }

                lock_extent(io_tree, async_extent->start,
                            async_extent->start + async_extent->ram_size - 1);

                ret = btrfs_reserve_extent(root, async_extent->ram_size,
                                           async_extent->compressed_size,
                                           async_extent->compressed_size,
                                           0, alloc_hint, &ins, 1, 1);
                if (ret) {
                        free_async_extent_pages(async_extent);

                        if (ret == -ENOSPC) {
                                unlock_extent(io_tree, async_extent->start,
                                              async_extent->start +
                                              async_extent->ram_size - 1);

                                /*
                                 * we need to redirty the pages if we decide to
                                 * fall back to uncompressed IO, otherwise we
                                 * will not submit these pages down to lower
                                 * layers.
                                 */
                                extent_range_redirty_for_io(inode,
                                                async_extent->start,
                                                async_extent->start +
                                                async_extent->ram_size - 1);

                                goto retry;
                        }
                        goto out_free;
                }
                /*
                 * here we're doing allocation and writeback of the
                 * compressed pages
                 */
                em = create_io_em(inode, async_extent->start,
                                  async_extent->ram_size, /* len */
                                  async_extent->start, /* orig_start */
                                  ins.objectid, /* block_start */
                                  ins.offset, /* block_len */
                                  ins.offset, /* orig_block_len */
                                  async_extent->ram_size, /* ram_bytes */
                                  async_extent->compress_type,
                                  BTRFS_ORDERED_COMPRESSED);
                if (IS_ERR(em))
                        /* ret value is not necessary, this is a void function */
                        goto out_free_reserve;
                free_extent_map(em);

                ret = btrfs_add_ordered_extent_compress(inode,
                                                async_extent->start,
                                                ins.objectid,
                                                async_extent->ram_size,
                                                ins.offset,
                                                BTRFS_ORDERED_COMPRESSED,
                                                async_extent->compress_type);
                if (ret) {
                        btrfs_drop_extent_cache(BTRFS_I(inode),
                                                async_extent->start,
                                                async_extent->start +
                                                async_extent->ram_size - 1, 0);
                        goto out_free_reserve;
                }
                btrfs_dec_block_group_reservations(fs_info, ins.objectid);

                /*
                 * clear dirty, set writeback and unlock the pages.
                 */
                extent_clear_unlock_delalloc(inode, async_extent->start,
                                async_extent->start +
                                async_extent->ram_size - 1,
                                async_extent->start +
                                async_extent->ram_size - 1,
                                NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
                                PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
                                PAGE_SET_WRITEBACK);
                if (btrfs_submit_compressed_write(inode,
                                    async_extent->start,
                                    async_extent->ram_size,
                                    ins.objectid,
                                    ins.offset, async_extent->pages,
                                    async_extent->nr_pages,
                                    async_cow->write_flags)) {
                        struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
                        struct page *p = async_extent->pages[0];
                        const u64 start = async_extent->start;
                        const u64 end = start + async_extent->ram_size - 1;

                        p->mapping = inode->i_mapping;
                        tree->ops->writepage_end_io_hook(p, start, end,
                                                         NULL, 0);
                        p->mapping = NULL;
                        extent_clear_unlock_delalloc(inode, start, end, end,
                                                     NULL, 0,
                                                     PAGE_END_WRITEBACK |
                                                     PAGE_SET_ERROR);
                        free_async_extent_pages(async_extent);
                }
                alloc_hint = ins.objectid + ins.offset;
                kfree(async_extent);
                cond_resched();
        }
        return;
out_free_reserve:
        btrfs_dec_block_group_reservations(fs_info, ins.objectid);
        btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_free:
        extent_clear_unlock_delalloc(inode, async_extent->start,
                                     async_extent->start +
                                     async_extent->ram_size - 1,
                                     async_extent->start +
                                     async_extent->ram_size - 1,
                                     NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
                                     EXTENT_DELALLOC_NEW |
                                     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
                                     PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
                                     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
                                     PAGE_SET_ERROR);
        free_async_extent_pages(async_extent);
        kfree(async_extent);
        goto again;
}

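/*
 * Use an existing extent mapping near @start to derive an allocation
 * hint; fall back to the inode's first mapped block if the mapping we
 * found does not point to a real block number.
 */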
static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
                                      u64 num_bytes)
{
        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
        struct extent_map *em;
        u64 alloc_hint = 0;

        read_lock(&em_tree->lock);
        em = search_extent_mapping(em_tree, start, num_bytes);
        if (em) {
                /*
                 * if block start isn't an actual block number then find the
                 * first block in this inode and use that as a hint.  If that
                 * block is also bogus then just don't worry about it.
                 */
                if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
                        free_extent_map(em);
                        em = search_extent_mapping(em_tree, 0, 0);
                        if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
                                alloc_hint = em->block_start;
                        if (em)
                                free_extent_map(em);
                } else {
                        alloc_hint = em->block_start;
                        free_extent_map(em);
                }
        }
        read_unlock(&em_tree->lock);

        return alloc_hint;
}

/*
 * when extent_io.c finds a delayed allocation range in the file,
 * the callbacks end up in this code.  The basic idea is to
 * allocate extents on disk for the range, and create ordered data structs
 * in ram to track those extents.
 *
 * locked_page is the page that writepage had locked already.  We use
 * it to make sure we don't do extra locks or unlocks.
 *
 * *page_started is set to one if we unlock locked_page and do everything
 * required to start IO on it.  It may be clean and already done with
 * IO when we return.
 */
static noinline int cow_file_range(struct inode *inode,
                                   struct page *locked_page,
                                   u64 start, u64 end, u64 delalloc_end,
                                   int *page_started, unsigned long *nr_written,
                                   int unlock, struct btrfs_dedupe_hash *hash)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        u64 alloc_hint = 0;
        u64 num_bytes;
        unsigned long ram_size;
        u64 cur_alloc_size = 0;
        u64 blocksize = fs_info->sectorsize;
        struct btrfs_key ins;
        struct extent_map *em;
        unsigned clear_bits;
        unsigned long page_ops;
        bool extent_reserved = false;
        int ret = 0;

        if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
                WARN_ON_ONCE(1);
                ret = -EINVAL;
                goto out_unlock;
        }

        num_bytes = ALIGN(end - start + 1, blocksize);
        num_bytes = max(blocksize, num_bytes);
        ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy));

        inode_should_defrag(BTRFS_I(inode), start, end, num_bytes, SZ_64K);

        if (start == 0) {
                /* let's try to make an inline extent */
                ret = cow_file_range_inline(inode, start, end, 0,
                                            BTRFS_COMPRESS_NONE, NULL);
                if (ret == 0) {
                        /*
                         * We use DO_ACCOUNTING here because we need the
                         * delalloc_release_metadata to be run _after_ we drop
                         * our outstanding extent for clearing delalloc for this
                         * range.
                         */
                        extent_clear_unlock_delalloc(inode, start, end,
                                     delalloc_end, NULL,
                                     EXTENT_LOCKED | EXTENT_DELALLOC |
                                     EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
                                     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
                                     PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
                                     PAGE_END_WRITEBACK);
                        *nr_written = *nr_written +
                             (end - start + PAGE_SIZE) / PAGE_SIZE;
                        *page_started = 1;
                        goto out;
                } else if (ret < 0) {
                        goto out_unlock;
                }
        }

        alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
        btrfs_drop_extent_cache(BTRFS_I(inode), start,
                        start + num_bytes - 1, 0);

        while (num_bytes > 0) {
                cur_alloc_size = num_bytes;
                ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
                                           fs_info->sectorsize, 0, alloc_hint,
                                           &ins, 1, 1);
                if (ret < 0)
                        goto out_unlock;
                cur_alloc_size = ins.offset;
                extent_reserved = true;

                ram_size = ins.offset;
                em = create_io_em(inode, start, ins.offset, /* len */
                                  start, /* orig_start */
                                  ins.objectid, /* block_start */
                                  ins.offset, /* block_len */
                                  ins.offset, /* orig_block_len */
                                  ram_size, /* ram_bytes */
                                  BTRFS_COMPRESS_NONE, /* compress_type */
                                  BTRFS_ORDERED_REGULAR /* type */);
                if (IS_ERR(em)) {
                        ret = PTR_ERR(em);
                        goto out_reserve;
                }
                free_extent_map(em);

                ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
                                               ram_size, cur_alloc_size, 0);
                if (ret)
                        goto out_drop_extent_cache;

                if (root->root_key.objectid ==
                    BTRFS_DATA_RELOC_TREE_OBJECTID) {
                        ret = btrfs_reloc_clone_csums(inode, start,
                                                      cur_alloc_size);
                        /*
                         * Only drop cache here, and process as normal.
                         *
                         * We must not allow extent_clear_unlock_delalloc()
                         * at out_unlock label to free meta of this ordered
                         * extent, as its meta should be freed by
                         * btrfs_finish_ordered_io().
                         *
                         * So we must continue until @start is increased to
                         * skip current ordered extent.
                         */
                        if (ret)
                                btrfs_drop_extent_cache(BTRFS_I(inode), start,
                                                start + ram_size - 1, 0);
                }

                btrfs_dec_block_group_reservations(fs_info, ins.objectid);

                /* we're not doing compressed IO, don't unlock the first
                 * page (which the caller expects to stay locked), don't
                 * clear any dirty bits and don't set any writeback bits
                 *
                 * Do set the Private2 bit so we know this page was properly
                 * setup for writepage
                 */
                page_ops = unlock ? PAGE_UNLOCK : 0;
                page_ops |= PAGE_SET_PRIVATE2;

                extent_clear_unlock_delalloc(inode, start,
                                             start + ram_size - 1,
                                             delalloc_end, locked_page,
                                             EXTENT_LOCKED | EXTENT_DELALLOC,
                                             page_ops);
                if (num_bytes < cur_alloc_size)
                        num_bytes = 0;
                else
                        num_bytes -= cur_alloc_size;
                alloc_hint = ins.objectid + ins.offset;
                start += cur_alloc_size;
                extent_reserved = false;

                /*
                 * On a btrfs_reloc_clone_csums() error: since @start has been
                 * increased, extent_clear_unlock_delalloc() at the out_unlock
                 * label won't free the metadata of the current ordered
                 * extent, so we're OK to exit.
                 */
                if (ret)
                        goto out_unlock;
        }
out:
        return ret;

out_drop_extent_cache:
        btrfs_drop_extent_cache(BTRFS_I(inode), start, start + ram_size - 1, 0);
out_reserve:
        btrfs_dec_block_group_reservations(fs_info, ins.objectid);
        btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_unlock:
        clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
                EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
        page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
                PAGE_END_WRITEBACK;
        /*
         * If we reserved an extent for our delalloc range (or a subrange) and
         * failed to create the respective ordered extent, then it means that
         * when we reserved the extent we decremented the extent's size from
         * the data space_info's bytes_may_use counter and incremented the
         * space_info's bytes_reserved counter by the same amount. We must make
         * sure extent_clear_unlock_delalloc() does not try to decrement again
         * the data space_info's bytes_may_use counter, therefore we do not pass
         * it the flag EXTENT_CLEAR_DATA_RESV.
         */
        if (extent_reserved) {
                extent_clear_unlock_delalloc(inode, start,
                                             start + cur_alloc_size,
                                             start + cur_alloc_size,
                                             locked_page,
                                             clear_bits,
                                             page_ops);
                start += cur_alloc_size;
                if (start >= end)
                        goto out;
        }
        extent_clear_unlock_delalloc(inode, start, end, delalloc_end,
                                     locked_page,
                                     clear_bits | EXTENT_CLEAR_DATA_RESV,
                                     page_ops);
        goto out;
}

/*
 * work queue callback to start compression on a file and pages
 */
static noinline void async_cow_start(struct btrfs_work *work)
{
        struct async_cow *async_cow;
        int num_added = 0;
        async_cow = container_of(work, struct async_cow, work);

        compress_file_range(async_cow->inode, async_cow->locked_page,
                            async_cow->start, async_cow->end, async_cow,
                            &num_added);
        if (num_added == 0) {
                btrfs_add_delayed_iput(async_cow->inode);
                async_cow->inode = NULL;
        }
}

/*
 * work queue callback to submit previously compressed pages
 */
static noinline void async_cow_submit(struct btrfs_work *work)
{
        struct btrfs_fs_info *fs_info;
        struct async_cow *async_cow;
        struct btrfs_root *root;
        unsigned long nr_pages;

        async_cow = container_of(work, struct async_cow, work);

        root = async_cow->root;
        fs_info = root->fs_info;
        nr_pages = (async_cow->end - async_cow->start + PAGE_SIZE) >>
                PAGE_SHIFT;

        /*
         * atomic_sub_return implies a barrier for waitqueue_active
         */
        if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
            5 * SZ_1M &&
            waitqueue_active(&fs_info->async_submit_wait))
                wake_up(&fs_info->async_submit_wait);

        if (async_cow->inode)
                submit_compressed_extents(async_cow->inode, async_cow);
}

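/*
 * work queue callback to drop the inode reference and free the async_cow
 * once the work item is done with it
 */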
static noinline void async_cow_free(struct btrfs_work *work)
{
        struct async_cow *async_cow;
        async_cow = container_of(work, struct async_cow, work);
        if (async_cow->inode)
                btrfs_add_delayed_iput(async_cow->inode);
        kfree(async_cow);
}

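/*
 * Split the delalloc range into chunks of at most SZ_512K and queue each
 * chunk as an async work item driving compress_file_range() and
 * submit_compressed_extents() via the delalloc work queue.
 */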
static int cow_file_range_async(struct inode *inode, struct page *locked_page,
                                u64 start, u64 end, int *page_started,
                                unsigned long *nr_written,
                                unsigned int write_flags)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct async_cow *async_cow;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        unsigned long nr_pages;
        u64 cur_end;

        clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
                         1, 0, NULL);
        while (start < end) {
                async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
                BUG_ON(!async_cow); /* -ENOMEM */
                async_cow->inode = igrab(inode);
                async_cow->root = root;
                async_cow->locked_page = locked_page;
                async_cow->start = start;
                async_cow->write_flags = write_flags;

                if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
                    !btrfs_test_opt(fs_info, FORCE_COMPRESS))
                        cur_end = end;
                else
                        cur_end = min(end, start + SZ_512K - 1);

                async_cow->end = cur_end;
                INIT_LIST_HEAD(&async_cow->extents);

                btrfs_init_work(&async_cow->work,
                                btrfs_delalloc_helper,
                                async_cow_start, async_cow_submit,
                                async_cow_free);

                nr_pages = (cur_end - start + PAGE_SIZE) >>
                        PAGE_SHIFT;
                atomic_add(nr_pages, &fs_info->async_delalloc_pages);

                btrfs_queue_work(fs_info->delalloc_workers, &async_cow->work);

                *nr_written += nr_pages;
                start = cur_end + 1;
        }
        *page_started = 1;
        return 0;
}

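/*
 * Check whether any csums exist for the given byte range.  Returns 1 if
 * csums were found, 0 if there are none, or a negative error code.
 */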
static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
                                        u64 bytenr, u64 num_bytes)
{
        int ret;
        struct btrfs_ordered_sum *sums;
        LIST_HEAD(list);

        ret = btrfs_lookup_csums_range(fs_info->csum_root, bytenr,
                                       bytenr + num_bytes - 1, &list, 0);
        if (ret == 0 && list_empty(&list))
                return 0;

        while (!list_empty(&list)) {
                sums = list_entry(list.next, struct btrfs_ordered_sum, list);
                list_del(&sums->list);
                kfree(sums);
        }
        if (ret < 0)
                return ret;
        return 1;
}

/*
 * called for the nocow writeback path.  This checks for snapshots or COW
 * copies of the extents that exist in the file, and COWs the file as
 * required.
 *
 * If no cow copies or snapshots exist, we write directly to the existing
 * blocks on disk
 */
1257 static noinline int run_delalloc_nocow(struct inode *inode,
1258                                        struct page *locked_page,
1259                               u64 start, u64 end, int *page_started, int force,
1260                               unsigned long *nr_written)
1261 {
1262         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1263         struct btrfs_root *root = BTRFS_I(inode)->root;
1264         struct extent_buffer *leaf;
1265         struct btrfs_path *path;
1266         struct btrfs_file_extent_item *fi;
1267         struct btrfs_key found_key;
1268         struct extent_map *em;
1269         u64 cow_start;
1270         u64 cur_offset;
1271         u64 extent_end;
1272         u64 extent_offset;
1273         u64 disk_bytenr;
1274         u64 num_bytes;
1275         u64 disk_num_bytes;
1276         u64 ram_bytes;
1277         int extent_type;
1278         int ret, err;
1279         int type;
1280         int nocow;
1281         int check_prev = 1;
1282         bool nolock;
1283         u64 ino = btrfs_ino(BTRFS_I(inode));
1284
1285         path = btrfs_alloc_path();
1286         if (!path) {
1287                 extent_clear_unlock_delalloc(inode, start, end, end,
1288                                              locked_page,
1289                                              EXTENT_LOCKED | EXTENT_DELALLOC |
1290                                              EXTENT_DO_ACCOUNTING |
1291                                              EXTENT_DEFRAG, PAGE_UNLOCK |
1292                                              PAGE_CLEAR_DIRTY |
1293                                              PAGE_SET_WRITEBACK |
1294                                              PAGE_END_WRITEBACK);
1295                 return -ENOMEM;
1296         }
1297
1298         nolock = btrfs_is_free_space_inode(BTRFS_I(inode));
1299
1300         cow_start = (u64)-1;
1301         cur_offset = start;
1302         while (1) {
1303                 ret = btrfs_lookup_file_extent(NULL, root, path, ino,
1304                                                cur_offset, 0);
1305                 if (ret < 0)
1306                         goto error;
1307                 if (ret > 0 && path->slots[0] > 0 && check_prev) {
1308                         leaf = path->nodes[0];
1309                         btrfs_item_key_to_cpu(leaf, &found_key,
1310                                               path->slots[0] - 1);
1311                         if (found_key.objectid == ino &&
1312                             found_key.type == BTRFS_EXTENT_DATA_KEY)
1313                                 path->slots[0]--;
1314                 }
1315                 check_prev = 0;
1316 next_slot:
1317                 leaf = path->nodes[0];
1318                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1319                         ret = btrfs_next_leaf(root, path);
1320                         if (ret < 0) {
1321                                 if (cow_start != (u64)-1)
1322                                         cur_offset = cow_start;
1323                                 goto error;
1324                         }
1325                         if (ret > 0)
1326                                 break;
1327                         leaf = path->nodes[0];
1328                 }
1329
1330                 nocow = 0;
1331                 disk_bytenr = 0;
1332                 num_bytes = 0;
1333                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1334
1335                 if (found_key.objectid > ino)
1336                         break;
1337                 if (WARN_ON_ONCE(found_key.objectid < ino) ||
1338                     found_key.type < BTRFS_EXTENT_DATA_KEY) {
1339                         path->slots[0]++;
1340                         goto next_slot;
1341                 }
1342                 if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
1343                     found_key.offset > end)
1344                         break;
1345
1346                 if (found_key.offset > cur_offset) {
1347                         extent_end = found_key.offset;
1348                         extent_type = 0;
1349                         goto out_check;
1350                 }
1351
1352                 fi = btrfs_item_ptr(leaf, path->slots[0],
1353                                     struct btrfs_file_extent_item);
1354                 extent_type = btrfs_file_extent_type(leaf, fi);
1355
1356                 ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
1357                 if (extent_type == BTRFS_FILE_EXTENT_REG ||
1358                     extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1359                         disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1360                         extent_offset = btrfs_file_extent_offset(leaf, fi);
1361                         extent_end = found_key.offset +
1362                                 btrfs_file_extent_num_bytes(leaf, fi);
1363                         disk_num_bytes =
1364                                 btrfs_file_extent_disk_num_bytes(leaf, fi);
1365                         if (extent_end <= start) {
1366                                 path->slots[0]++;
1367                                 goto next_slot;
1368                         }
1369                         if (disk_bytenr == 0)
1370                                 goto out_check;
1371                         if (btrfs_file_extent_compression(leaf, fi) ||
1372                             btrfs_file_extent_encryption(leaf, fi) ||
1373                             btrfs_file_extent_other_encoding(leaf, fi))
1374                                 goto out_check;
1375                         if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
1376                                 goto out_check;
1377                         if (btrfs_extent_readonly(fs_info, disk_bytenr))
1378                                 goto out_check;
1379                         ret = btrfs_cross_ref_exist(root, ino,
1380                                                     found_key.offset -
1381                                                     extent_offset, disk_bytenr);
1382                         if (ret) {
1383                                 /*
1384                                  * ret could be -EIO if the above fails to read
1385                                  * metadata.
1386                                  */
1387                                 if (ret < 0) {
1388                                         if (cow_start != (u64)-1)
1389                                                 cur_offset = cow_start;
1390                                         goto error;
1391                                 }
1392
1393                                 WARN_ON_ONCE(nolock);
1394                                 goto out_check;
1395                         }
1396                         disk_bytenr += extent_offset;
1397                         disk_bytenr += cur_offset - found_key.offset;
1398                         num_bytes = min(end + 1, extent_end) - cur_offset;
1399                         /*
1400                          * If there are pending snapshots for this root,
1401                          * we fall back to the common COW path.
1402                          */
1403                         if (!nolock) {
1404                                 err = btrfs_start_write_no_snapshotting(root);
1405                                 if (!err)
1406                                         goto out_check;
1407                         }
1408                         /*
1409                          * Force COW if csums exist in the range.
1410                          * This ensures that the csums for a given extent
1411                          * are either valid or do not exist.
1412                          */
1413                         ret = csum_exist_in_range(fs_info, disk_bytenr,
1414                                                   num_bytes);
1415                         if (ret) {
1416                                 if (!nolock)
1417                                         btrfs_end_write_no_snapshotting(root);
1418
1419                                 /*
1420                                  * ret could be -EIO if the above fails to read
1421                                  * metadata.
1422                                  */
1423                                 if (ret < 0) {
1424                                         if (cow_start != (u64)-1)
1425                                                 cur_offset = cow_start;
1426                                         goto error;
1427                                 }
1428                                 WARN_ON_ONCE(nolock);
1429                                 goto out_check;
1430                         }
1431                         if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr)) {
1432                                 if (!nolock)
1433                                         btrfs_end_write_no_snapshotting(root);
1434                                 goto out_check;
1435                         }
1436                         nocow = 1;
1437                 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1438                         extent_end = found_key.offset +
1439                                 btrfs_file_extent_inline_len(leaf,
1440                                                      path->slots[0], fi);
1441                         extent_end = ALIGN(extent_end,
1442                                            fs_info->sectorsize);
1443                 } else {
1444                         BUG_ON(1);
1445                 }
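                /*
                 * out_check is reached with extent_end and nocow set up for
                 * the current item: skip items that end before our range,
                 * queue non-nocow items for a batched COW via
                 * cow_file_range(), or fall through and write directly into
                 * the existing (prealloc/nocow) extent.
                 */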
1446 out_check:
1447                 if (extent_end <= start) {
1448                         path->slots[0]++;
1449                         if (!nolock && nocow)
1450                                 btrfs_end_write_no_snapshotting(root);
1451                         if (nocow)
1452                                 btrfs_dec_nocow_writers(fs_info, disk_bytenr);
1453                         goto next_slot;
1454                 }
1455                 if (!nocow) {
1456                         if (cow_start == (u64)-1)
1457                                 cow_start = cur_offset;
1458                         cur_offset = extent_end;
1459                         if (cur_offset > end)
1460                                 break;
1461                         path->slots[0]++;
1462                         goto next_slot;
1463                 }
1464
1465                 btrfs_release_path(path);
1466                 if (cow_start != (u64)-1) {
1467                         ret = cow_file_range(inode, locked_page,
1468                                              cow_start, found_key.offset - 1,
1469                                              end, page_started, nr_written, 1,
1470                                              NULL);
1471                         if (ret) {
1472                                 if (!nolock && nocow)
1473                                         btrfs_end_write_no_snapshotting(root);
1474                                 if (nocow)
1475                                         btrfs_dec_nocow_writers(fs_info,
1476                                                                 disk_bytenr);
1477                                 goto error;
1478                         }
1479                         cow_start = (u64)-1;
1480                 }
1481
1482                 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1483                         u64 orig_start = found_key.offset - extent_offset;
1484
1485                         em = create_io_em(inode, cur_offset, num_bytes,
1486                                           orig_start,
1487                                           disk_bytenr, /* block_start */
1488                                           num_bytes, /* block_len */
1489                                           disk_num_bytes, /* orig_block_len */
1490                                           ram_bytes, BTRFS_COMPRESS_NONE,
1491                                           BTRFS_ORDERED_PREALLOC);
1492                         if (IS_ERR(em)) {
1493                                 if (!nolock && nocow)
1494                                         btrfs_end_write_no_snapshotting(root);
1495                                 if (nocow)
1496                                         btrfs_dec_nocow_writers(fs_info,
1497                                                                 disk_bytenr);
1498                                 ret = PTR_ERR(em);
1499                                 goto error;
1500                         }
1501                         free_extent_map(em);
1502                 }
1503
1504                 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1505                         type = BTRFS_ORDERED_PREALLOC;
1506                 } else {
1507                         type = BTRFS_ORDERED_NOCOW;
1508                 }
1509
1510                 ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
1511                                                num_bytes, num_bytes, type);
1512                 if (nocow)
1513                         btrfs_dec_nocow_writers(fs_info, disk_bytenr);
1514                 BUG_ON(ret); /* -ENOMEM */
1515
1516                 if (root->root_key.objectid ==
1517                     BTRFS_DATA_RELOC_TREE_OBJECTID)
1518                         /*
1519                          * Error is handled later, as we must prevent
1520                          * extent_clear_unlock_delalloc() in the error handler
1521                          * from freeing metadata of the created ordered extent.
1522                          */
1523                         ret = btrfs_reloc_clone_csums(inode, cur_offset,
1524                                                       num_bytes);
1525
1526                 extent_clear_unlock_delalloc(inode, cur_offset,
1527                                              cur_offset + num_bytes - 1, end,
1528                                              locked_page, EXTENT_LOCKED |
1529                                              EXTENT_DELALLOC |
1530                                              EXTENT_CLEAR_DATA_RESV,
1531                                              PAGE_UNLOCK | PAGE_SET_PRIVATE2);
1532
1533                 if (!nolock && nocow)
1534                         btrfs_end_write_no_snapshotting(root);
1535                 cur_offset = extent_end;
1536
1537                 /*
1538                  * On btrfs_reloc_clone_csums() error it is now OK to call the
1539                  * error handler, as metadata for the created ordered extent
1540                  * will only be freed by btrfs_finish_ordered_io().
1541                  */
1542                 if (ret)
1543                         goto error;
1544                 if (cur_offset > end)
1545                         break;
1546         }
1547         btrfs_release_path(path);
1548
1549         if (cur_offset <= end && cow_start == (u64)-1) {
1550                 cow_start = cur_offset;
1551                 cur_offset = end;
1552         }
1553
1554         if (cow_start != (u64)-1) {
1555                 ret = cow_file_range(inode, locked_page, cow_start, end, end,
1556                                      page_started, nr_written, 1, NULL);
1557                 if (ret)
1558                         goto error;
1559         }
1560
1561 error:
1562         if (ret && cur_offset < end)
1563                 extent_clear_unlock_delalloc(inode, cur_offset, end, end,
1564                                              locked_page, EXTENT_LOCKED |
1565                                              EXTENT_DELALLOC | EXTENT_DEFRAG |
1566                                              EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
1567                                              PAGE_CLEAR_DIRTY |
1568                                              PAGE_SET_WRITEBACK |
1569                                              PAGE_END_WRITEBACK);
1570         btrfs_free_path(path);
1571         return ret;
1572 }
1573
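/*
 * Decide whether a NODATACOW/PREALLOC inode must COW anyway: returns 1
 * when the inode is being defragged and the range overlaps an extent
 * marked EXTENT_DEFRAG, 0 otherwise.
 */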
1574 static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
1575 {
1576
1577         if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
1578             !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC))
1579                 return 0;
1580
1581         /*
1582          * @defrag_bytes is a hint value read without any spinlock held;
1583          * if it is non-zero, the file is being defragged.
1584          * Force COW if the given extent needs to be defragged.
1585          */
1586         if (BTRFS_I(inode)->defrag_bytes &&
1587             test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
1588                            EXTENT_DEFRAG, 0, NULL))
1589                 return 1;
1590
1591         return 0;
1592 }
1593
1594 /*
1595  * extent_io.c callback to do delayed allocation processing
1596  */
1597 static int run_delalloc_range(void *private_data, struct page *locked_page,
1598                               u64 start, u64 end, int *page_started,
1599                               unsigned long *nr_written,
1600                               struct writeback_control *wbc)
1601 {
1602         struct inode *inode = private_data;
1603         int ret;
1604         int force_cow = need_force_cow(inode, start, end);
1605         unsigned int write_flags = wbc_to_write_flags(wbc);
1606
1607         if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) {
1608                 ret = run_delalloc_nocow(inode, locked_page, start, end,
1609                                          page_started, 1, nr_written);
1610         } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
1611                 ret = run_delalloc_nocow(inode, locked_page, start, end,
1612                                          page_started, 0, nr_written);
1613         } else if (!inode_need_compress(inode, start, end)) {
1614                 ret = cow_file_range(inode, locked_page, start, end, end,
1615                                       page_started, nr_written, 1, NULL);
1616         } else {
1617                 set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1618                         &BTRFS_I(inode)->runtime_flags);
1619                 ret = cow_file_range_async(inode, locked_page, start, end,
1620                                            page_started, nr_written,
1621                                            write_flags);
1622         }
1623         if (ret)
1624                 btrfs_cleanup_ordered_extents(inode, start, end - start + 1);
1625         return ret;
1626 }
1627
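/*
 * extent_io.c split_extent_hook: splitting a delalloc extent may require
 * one more outstanding extent than the original range accounted for, in
 * which case we bump the inode's outstanding extent count.
 */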
1628 static void btrfs_split_extent_hook(void *private_data,
1629                                     struct extent_state *orig, u64 split)
1630 {
1631         struct inode *inode = private_data;
1632         u64 size;
1633
1634         /* not delalloc, ignore it */
1635         if (!(orig->state & EXTENT_DELALLOC))
1636                 return;
1637
1638         size = orig->end - orig->start + 1;
1639         if (size > BTRFS_MAX_EXTENT_SIZE) {
1640                 u32 num_extents;
1641                 u64 new_size;
1642
1643                 /*
1644                  * See the explanation in btrfs_merge_extent_hook, the same
1645                  * applies here, just in reverse.
1646                  */
1647                 new_size = orig->end - split + 1;
1648                 num_extents = count_max_extents(new_size);
1649                 new_size = split - orig->start;
1650                 num_extents += count_max_extents(new_size);
1651                 if (count_max_extents(size) >= num_extents)
1652                         return;
1653         }
1654
1655         spin_lock(&BTRFS_I(inode)->lock);
1656         btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
1657         spin_unlock(&BTRFS_I(inode)->lock);
1658 }
1659
1660 /*
1661  * extent_io.c merge_extent_hook, used to track merged delayed allocation
1662  * extents.  This lets us keep track of new extents that are just merged
1663  * onto old extents, such as when we are doing sequential writes, so we
1664  * can properly account for the metadata space we'll need.
1665  */
1666 static void btrfs_merge_extent_hook(void *private_data,
1667                                     struct extent_state *new,
1668                                     struct extent_state *other)
1669 {
1670         struct inode *inode = private_data;
1671         u64 new_size, old_size;
1672         u32 num_extents;
1673
1674         /* not delalloc, ignore it */
1675         if (!(other->state & EXTENT_DELALLOC))
1676                 return;
1677
1678         if (new->start > other->start)
1679                 new_size = new->end - other->start + 1;
1680         else
1681                 new_size = other->end - new->start + 1;
1682
1683         /* we're not bigger than the max, unreserve the space and go */
1684         if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
1685                 spin_lock(&BTRFS_I(inode)->lock);
1686                 btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
1687                 spin_unlock(&BTRFS_I(inode)->lock);
1688                 return;
1689         }
1690
1691         /*
1692          * We have to add up either side to figure out how many extents were
1693          * accounted for before we merged into one big extent.  If the number of
1694          * extents we accounted for is <= the amount we need for the new range
1695          * then we can return, otherwise drop.  Think of it like this
1696          *
1697          * [ 4k][MAX_SIZE]
1698          *
1699          * So we've grown the extent by a MAX_SIZE extent, which means we
1700          * need 2 outstanding extents; each side accounted for 1, so the
1701          * totals are equal and we can return.  But in this case
1702          *
1703          * [MAX_SIZE+4k][MAX_SIZE+4k]
1704          *
1705          * each range on its own accounts for 2 extents, but merged together
1706          * they are only 3 extents worth of accounting, so we need to drop
1707          * one in this case.
1708          */
1709         old_size = other->end - other->start + 1;
1710         num_extents = count_max_extents(old_size);
1711         old_size = new->end - new->start + 1;
1712         num_extents += count_max_extents(old_size);
1713         if (count_max_extents(new_size) >= num_extents)
1714                 return;
1715
1716         spin_lock(&BTRFS_I(inode)->lock);
1717         btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
1718         spin_unlock(&BTRFS_I(inode)->lock);
1719 }
1720
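/*
 * Put the inode on its root's delalloc list (and the root on the fs-wide
 * delalloc root list when this is its first delalloc inode) so flushing
 * can find every inode with pending delayed allocation.
 */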
1721 static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
1722                                       struct inode *inode)
1723 {
1724         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1725
1726         spin_lock(&root->delalloc_lock);
1727         if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1728                 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1729                               &root->delalloc_inodes);
1730                 set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1731                         &BTRFS_I(inode)->runtime_flags);
1732                 root->nr_delalloc_inodes++;
1733                 if (root->nr_delalloc_inodes == 1) {
1734                         spin_lock(&fs_info->delalloc_root_lock);
1735                         BUG_ON(!list_empty(&root->delalloc_root));
1736                         list_add_tail(&root->delalloc_root,
1737                                       &fs_info->delalloc_roots);
1738                         spin_unlock(&fs_info->delalloc_root_lock);
1739                 }
1740         }
1741         spin_unlock(&root->delalloc_lock);
1742 }
1743
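/*
 * Inverse of btrfs_add_delalloc_inodes: take the inode off the root's
 * delalloc list, and the root off the fs-wide list once it has no
 * delalloc inodes left.
 */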
1744 static void btrfs_del_delalloc_inode(struct btrfs_root *root,
1745                                      struct btrfs_inode *inode)
1746 {
1747         struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
1748
1749         spin_lock(&root->delalloc_lock);
1750         if (!list_empty(&inode->delalloc_inodes)) {
1751                 list_del_init(&inode->delalloc_inodes);
1752                 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1753                           &inode->runtime_flags);
1754                 root->nr_delalloc_inodes--;
1755                 if (!root->nr_delalloc_inodes) {
1756                         spin_lock(&fs_info->delalloc_root_lock);
1757                         BUG_ON(list_empty(&root->delalloc_root));
1758                         list_del_init(&root->delalloc_root);
1759                         spin_unlock(&fs_info->delalloc_root_lock);
1760                 }
1761         }
1762         spin_unlock(&root->delalloc_lock);
1763 }
1764
1765 /*
1766  * extent_io.c set_bit_hook, used to track delayed allocation
1767  * bytes in this file, and to maintain the list of inodes that
1768  * have pending delalloc work to be done.
1769  */
1770 static void btrfs_set_bit_hook(void *private_data,
1771                                struct extent_state *state, unsigned *bits)
1772 {
1773         struct inode *inode = private_data;
1774
1775         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1776
1777         if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
1778                 WARN_ON(1);
1779         /*
1780          * The set_bit and clear_bit hooks normally require _irqsave/restore,
1781          * but in this case we are only testing for the DELALLOC
1782          * bit, which is only set or cleared with irqs on.
1783          */
1784         if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1785                 struct btrfs_root *root = BTRFS_I(inode)->root;
1786                 u64 len = state->end + 1 - state->start;
1787                 u32 num_extents = count_max_extents(len);
1788                 bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode));
1789
1790                 spin_lock(&BTRFS_I(inode)->lock);
1791                 btrfs_mod_outstanding_extents(BTRFS_I(inode), num_extents);
1792                 spin_unlock(&BTRFS_I(inode)->lock);
1793
1794                 /* For sanity tests */
1795                 if (btrfs_is_testing(fs_info))
1796                         return;
1797
1798                 percpu_counter_add_batch(&fs_info->delalloc_bytes, len,
1799                                          fs_info->delalloc_batch);
1800                 spin_lock(&BTRFS_I(inode)->lock);
1801                 BTRFS_I(inode)->delalloc_bytes += len;
1802                 if (*bits & EXTENT_DEFRAG)
1803                         BTRFS_I(inode)->defrag_bytes += len;
1804                 if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1805                                          &BTRFS_I(inode)->runtime_flags))
1806                         btrfs_add_delalloc_inodes(root, inode);
1807                 spin_unlock(&BTRFS_I(inode)->lock);
1808         }
1809
1810         if (!(state->state & EXTENT_DELALLOC_NEW) &&
1811             (*bits & EXTENT_DELALLOC_NEW)) {
1812                 spin_lock(&BTRFS_I(inode)->lock);
1813                 BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 -
1814                         state->start;
1815                 spin_unlock(&BTRFS_I(inode)->lock);
1816         }
1817 }
1818
1819 /*
1820  * extent_io.c clear_bit_hook, see set_bit_hook for why
1821  */
1822 static void btrfs_clear_bit_hook(void *private_data,
1823                                  struct extent_state *state,
1824                                  unsigned *bits)
1825 {
1826         struct btrfs_inode *inode = BTRFS_I((struct inode *)private_data);
1827         struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
1828         u64 len = state->end + 1 - state->start;
1829         u32 num_extents = count_max_extents(len);
1830
1831         if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) {
1832                 spin_lock(&inode->lock);
1833                 inode->defrag_bytes -= len;
1834                 spin_unlock(&inode->lock);
1835         }
1836
1837         /*
1838          * The set_bit and clear_bit hooks normally require _irqsave/restore,
1839          * but in this case we are only testing for the DELALLOC
1840          * bit, which is only set or cleared with irqs on.
1841          */
1842         if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1843                 struct btrfs_root *root = inode->root;
1844                 bool do_list = !btrfs_is_free_space_inode(inode);
1845
1846                 spin_lock(&inode->lock);
1847                 btrfs_mod_outstanding_extents(inode, -num_extents);
1848                 spin_unlock(&inode->lock);
1849
1850                 /*
1851                  * We don't reserve metadata space for space cache inodes so we
1852                  * don't need to call delalloc_release_metadata if there is an
1853                  * error.
1854                  */
1855                 if (*bits & EXTENT_CLEAR_META_RESV &&
1856                     root != fs_info->tree_root)
1857                         btrfs_delalloc_release_metadata(inode, len, false);
1858
1859                 /* For sanity tests. */
1860                 if (btrfs_is_testing(fs_info))
1861                         return;
1862
1863                 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID &&
1864                     do_list && !(state->state & EXTENT_NORESERVE) &&
1865                     (*bits & EXTENT_CLEAR_DATA_RESV))
1866                         btrfs_free_reserved_data_space_noquota(
1867                                         &inode->vfs_inode,
1868                                         state->start, len);
1869
1870                 percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
1871                                          fs_info->delalloc_batch);
1872                 spin_lock(&inode->lock);
1873                 inode->delalloc_bytes -= len;
1874                 if (do_list && inode->delalloc_bytes == 0 &&
1875                     test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1876                                         &inode->runtime_flags))
1877                         btrfs_del_delalloc_inode(root, inode);
1878                 spin_unlock(&inode->lock);
1879         }
1880
1881         if ((state->state & EXTENT_DELALLOC_NEW) &&
1882             (*bits & EXTENT_DELALLOC_NEW)) {
1883                 spin_lock(&inode->lock);
1884                 ASSERT(inode->new_delalloc_bytes >= len);
1885                 inode->new_delalloc_bytes -= len;
1886                 spin_unlock(&inode->lock);
1887         }
1888 }
1889
1890 /*
1891  * extent_io.c merge_bio_hook: this must check the chunk tree to make sure
1892  * we don't create bios that span stripes or chunks.
1893  *
1894  * return 1 if page cannot be merged to bio
1895  * return 0 if page can be merged to bio
1896  * return error otherwise
1897  */
1898 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1899                          size_t size, struct bio *bio,
1900                          unsigned long bio_flags)
1901 {
1902         struct inode *inode = page->mapping->host;
1903         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1904         u64 logical = (u64)bio->bi_iter.bi_sector << 9;
1905         u64 length = 0;
1906         u64 map_length;
1907         int ret;
1908
1909         if (bio_flags & EXTENT_BIO_COMPRESSED)
1910                 return 0;
1911
1912         length = bio->bi_iter.bi_size;
1913         map_length = length;
1914         ret = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
1915                               NULL, 0);
1916         if (ret < 0)
1917                 return ret;
1918         if (map_length < length + size)
1919                 return 1;
1920         return 0;
1921 }
1922
1923 /*
1924  * In order to insert checksums into the metadata in large chunks,
1925  * we wait until bio submission time.  All the pages in the bio are
1926  * checksummed and sums are attached onto the ordered extent record.
1927  *
1928  * At IO completion time the csums attached to the ordered extent record
1929  * are inserted into the btree.
1930  */
1931 static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio,
1932                                     u64 bio_offset)
1933 {
1934         struct inode *inode = private_data;
1935         blk_status_t ret = 0;
1936
1937         ret = btrfs_csum_one_bio(inode, bio, 0, 0);
1938         BUG_ON(ret); /* -ENOMEM */
1939         return 0;
1940 }
1941
1942 /*
1943  * The second half of the async checksum path.  By the time this is
1944  * called, btrfs_submit_bio_start() has attached the checksums for all
1945  * the pages in the bio to the ordered extent record, so all that is
1946  * left is to map the bio to the raid stripes and submit it.
1947  *
1948  * On error the bio is ended here, since the caller won't see the failure.
1949  */
1950 static blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio,
1951                           int mirror_num)
1952 {
1953         struct inode *inode = private_data;
1954         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1955         blk_status_t ret;
1956
1957         ret = btrfs_map_bio(fs_info, bio, mirror_num, 1);
1958         if (ret) {
1959                 bio->bi_status = ret;
1960                 bio_endio(bio);
1961         }
1962         return ret;
1963 }
1964
1965 /*
1966  * extent_io.c submission hook. This does the right thing for csum calculation
1967  * on write, or reading the csums from the tree before a read.
1968  *
1969  * Rules about async/sync submit,
1970  * a) read:                             sync submit
1971  *
1972  * b) write without checksum:           sync submit
1973  *
1974  * c) write with checksum:
1975  *    c-1) if bio is issued by fsync:   sync submit
1976  *         (sync_writers != 0)
1977  *
1978  *    c-2) if root is reloc root:       sync submit
1979  *         (only in case of buffered IO)
1980  *
1981  *    c-3) otherwise:                   async submit
1982  */
1983 static blk_status_t btrfs_submit_bio_hook(void *private_data, struct bio *bio,
1984                                  int mirror_num, unsigned long bio_flags,
1985                                  u64 bio_offset)
1986 {
1987         struct inode *inode = private_data;
1988         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1989         struct btrfs_root *root = BTRFS_I(inode)->root;
1990         enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
1991         blk_status_t ret = 0;
1992         int skip_sum;
1993         int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
1994
1995         skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
1996
1997         if (btrfs_is_free_space_inode(BTRFS_I(inode)))
1998                 metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
1999
2000         if (bio_op(bio) != REQ_OP_WRITE) {
2001                 ret = btrfs_bio_wq_end_io(fs_info, bio, metadata);
2002                 if (ret)
2003                         goto out;
2004
2005                 if (bio_flags & EXTENT_BIO_COMPRESSED) {
2006                         ret = btrfs_submit_compressed_read(inode, bio,
2007                                                            mirror_num,
2008                                                            bio_flags);
2009                         goto out;
2010                 } else if (!skip_sum) {
2011                         ret = btrfs_lookup_bio_sums(inode, bio, NULL);
2012                         if (ret)
2013                                 goto out;
2014                 }
2015                 goto mapit;
2016         } else if (async && !skip_sum) {
2017                 /* csum items have already been cloned */
2018                 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
2019                         goto mapit;
2020                 /* we're doing a write, do the async checksumming */
2021                 ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, bio_flags,
2022                                           bio_offset, inode,
2023                                           btrfs_submit_bio_start,
2024                                           btrfs_submit_bio_done);
2025                 goto out;
2026         } else if (!skip_sum) {
2027                 ret = btrfs_csum_one_bio(inode, bio, 0, 0);
2028                 if (ret)
2029                         goto out;
2030         }
2031
2032 mapit:
2033         ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
2034
2035 out:
2036         if (ret) {
2037                 bio->bi_status = ret;
2038                 bio_endio(bio);
2039         }
2040         return ret;
2041 }
2042
2043 /*
2044  * Given a list of ordered sums, record them in the inode.  This happens
2045  * at IO completion time based on sums calculated at bio submission time.
2046  */
2047 static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
2048                              struct inode *inode, struct list_head *list)
2049 {
2050         struct btrfs_ordered_sum *sum;
2051         int ret;
2052
2053         list_for_each_entry(sum, list, list) {
2054                 trans->adding_csums = true;
2055                 ret = btrfs_csum_file_blocks(trans,
2056                        BTRFS_I(inode)->root->fs_info->csum_root, sum);
2057                 trans->adding_csums = false;
2058                 if (ret)
2059                         return ret;
2060         }
2061         return 0;
2062 }
2063
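/*
 * Mark [start, end] as delalloc in the io tree, plus any extra bits the
 * caller asks for.  @end is the last byte of the range, so it must never
 * be page aligned, hence the WARN_ON.
 */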
2064 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
2065                               unsigned int extra_bits,
2066                               struct extent_state **cached_state, int dedupe)
2067 {
2068         WARN_ON((end & (PAGE_SIZE - 1)) == 0);
2069         return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
2070                                    extra_bits, cached_state);
2071 }
2072
2073 /* see btrfs_writepage_start_hook for details on why this is required */
2074 struct btrfs_writepage_fixup {
2075         struct page *page;
2076         struct btrfs_work work;
2077 };
2078
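/*
 * Worker for btrfs_writepage_start_hook: fix up a page that was dirtied
 * behind the filesystem's back.  Wait out any ordered extent covering the
 * page, reserve delalloc space and set the delalloc bits, then re-dirty
 * the page so it goes through the normal COW/ordered write path.
 */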
2079 static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
2080 {
2081         struct btrfs_writepage_fixup *fixup;
2082         struct btrfs_ordered_extent *ordered;
2083         struct extent_state *cached_state = NULL;
2084         struct extent_changeset *data_reserved = NULL;
2085         struct page *page;
2086         struct inode *inode;
2087         u64 page_start;
2088         u64 page_end;
2089         int ret;
2090
2091         fixup = container_of(work, struct btrfs_writepage_fixup, work);
2092         page = fixup->page;
2093 again:
2094         lock_page(page);
2095         if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
2096                 ClearPageChecked(page);
2097                 goto out_page;
2098         }
2099
2100         inode = page->mapping->host;
2101         page_start = page_offset(page);
2102         page_end = page_offset(page) + PAGE_SIZE - 1;
2103
2104         lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
2105                          &cached_state);
2106
2107         /* already ordered? We're done */
2108         if (PagePrivate2(page))
2109                 goto out;
2110
2111         ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
2112                                         PAGE_SIZE);
2113         if (ordered) {
2114                 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
2115                                      page_end, &cached_state);
2116                 unlock_page(page);
2117                 btrfs_start_ordered_extent(inode, ordered, 1);
2118                 btrfs_put_ordered_extent(ordered);
2119                 goto again;
2120         }
2121
2122         ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
2123                                            PAGE_SIZE);
2124         if (ret) {
2125                 mapping_set_error(page->mapping, ret);
2126                 end_extent_writepage(page, ret, page_start, page_end);
2127                 ClearPageChecked(page);
2128                 goto out;
2129         }
2130
2131         ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
2132                                         &cached_state, 0);
2133         if (ret) {
2134                 mapping_set_error(page->mapping, ret);
2135                 end_extent_writepage(page, ret, page_start, page_end);
2136                 ClearPageChecked(page);
2137                 goto out;
2138         }
2139
2140         ClearPageChecked(page);
2141         set_page_dirty(page);
2142         btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, false);
2143 out:
2144         unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
2145                              &cached_state);
2146 out_page:
2147         unlock_page(page);
2148         put_page(page);
2149         kfree(fixup);
2150         extent_changeset_free(data_reserved);
2151 }
2152
2153 /*
2154  * There are a few paths in the higher layers of the kernel that directly
2155  * set the page dirty bit without asking the filesystem if it is a
2156  * good idea.  This causes problems because we want to make sure COW
2157  * properly happens and the data=ordered rules are followed.
2158  *
2159  * In our case any range that doesn't have the ORDERED bit set
2160  * hasn't been properly setup for IO.  We kick off an async process
2161  * hasn't been properly set up for IO.  We kick off an async process
2162  * the delalloc bit and make it safe to write the page.
2163  */
2164 static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
2165 {
2166         struct inode *inode = page->mapping->host;
2167         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2168         struct btrfs_writepage_fixup *fixup;
2169
2170         /* this page is properly in the ordered list */
2171         if (TestClearPagePrivate2(page))
2172                 return 0;
2173
2174         if (PageChecked(page))
2175                 return -EAGAIN;
2176
2177         fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
2178         if (!fixup)
2179                 return -EAGAIN;
2180
2181         SetPageChecked(page);
2182         get_page(page);
2183         btrfs_init_work(&fixup->work, btrfs_fixup_helper,
2184                         btrfs_writepage_fixup_worker, NULL, NULL);
2185         fixup->page = page;
2186         btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
2187         return -EBUSY;
2188 }
2189
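/*
 * Insert a file extent item for an extent whose data is already on disk.
 * Anything overlapping [file_pos, file_pos + num_bytes) is dropped first
 * (keeping the extent map pinned in cache), then the new item is filled
 * in and the extent is accounted in the extent tree via a delayed ref.
 */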
2190 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
2191                                        struct inode *inode, u64 file_pos,
2192                                        u64 disk_bytenr, u64 disk_num_bytes,
2193                                        u64 num_bytes, u64 ram_bytes,
2194                                        u8 compression, u8 encryption,
2195                                        u16 other_encoding, int extent_type)
2196 {
2197         struct btrfs_root *root = BTRFS_I(inode)->root;
2198         struct btrfs_file_extent_item *fi;
2199         struct btrfs_path *path;
2200         struct extent_buffer *leaf;
2201         struct btrfs_key ins;
2202         u64 qg_released;
2203         int extent_inserted = 0;
2204         int ret;
2205
2206         path = btrfs_alloc_path();
2207         if (!path)
2208                 return -ENOMEM;
2209
2210         /*
2211          * We may be replacing one extent in the tree with another.
2212          * The new extent is pinned in the extent map, and we don't want
2213          * to drop it from the cache until it is completely in the btree.
2214          *
2215          * So, tell btrfs_drop_extents to leave this extent in the cache.
2216          * The caller is expected to unpin it and allow it to be merged
2217          * with the others.
2218          */
2219         ret = __btrfs_drop_extents(trans, root, inode, path, file_pos,
2220                                    file_pos + num_bytes, NULL, 0,
2221                                    1, sizeof(*fi), &extent_inserted);
2222         if (ret)
2223                 goto out;
2224
2225         if (!extent_inserted) {
2226                 ins.objectid = btrfs_ino(BTRFS_I(inode));
2227                 ins.offset = file_pos;
2228                 ins.type = BTRFS_EXTENT_DATA_KEY;
2229
2230                 path->leave_spinning = 1;
2231                 ret = btrfs_insert_empty_item(trans, root, path, &ins,
2232                                               sizeof(*fi));
2233                 if (ret)
2234                         goto out;
2235         }
2236         leaf = path->nodes[0];
2237         fi = btrfs_item_ptr(leaf, path->slots[0],
2238                             struct btrfs_file_extent_item);
2239         btrfs_set_file_extent_generation(leaf, fi, trans->transid);
2240         btrfs_set_file_extent_type(leaf, fi, extent_type);
2241         btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
2242         btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
2243         btrfs_set_file_extent_offset(leaf, fi, 0);
2244         btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2245         btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
2246         btrfs_set_file_extent_compression(leaf, fi, compression);
2247         btrfs_set_file_extent_encryption(leaf, fi, encryption);
2248         btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
2249
2250         btrfs_mark_buffer_dirty(leaf);
2251         btrfs_release_path(path);
2252
2253         inode_add_bytes(inode, num_bytes);
2254
2255         ins.objectid = disk_bytenr;
2256         ins.offset = disk_num_bytes;
2257         ins.type = BTRFS_EXTENT_ITEM_KEY;
2258
2259         /*
2260          * Release the reserved range from the inode dirty range map, as it
2261          * has already been moved into the delayed_ref_head.
2262          */
2263         ret = btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
2264         if (ret < 0)
2265                 goto out;
2266         qg_released = ret;
2267         ret = btrfs_alloc_reserved_file_extent(trans, root,
2268                                                btrfs_ino(BTRFS_I(inode)),
2269                                                file_pos, qg_released, &ins);
2270 out:
2271         btrfs_free_path(path);
2272
2273         return ret;
2274 }
2275
2276 /* snapshot-aware defrag */
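/*
 * During snapshot-aware defrag we walk the backrefs of the extent being
 * defragged, remember every (root, inode, file offset) that pointed at
 * the old copy, and once the new extent is written we relink those file
 * extent items to it so snapshots keep sharing the data.
 */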
2277 struct sa_defrag_extent_backref {
2278         struct rb_node node;
2279         struct old_sa_defrag_extent *old;
2280         u64 root_id;
2281         u64 inum;
2282         u64 file_pos;
2283         u64 extent_offset;
2284         u64 num_bytes;
2285         u64 generation;
2286 };
2287
2288 struct old_sa_defrag_extent {
2289         struct list_head list;
2290         struct new_sa_defrag_extent *new;
2291
2292         u64 extent_offset;
2293         u64 bytenr;
2294         u64 offset;
2295         u64 len;
2296         int count;
2297 };
2298
2299 struct new_sa_defrag_extent {
2300         struct rb_root root;
2301         struct list_head head;
2302         struct btrfs_path *path;
2303         struct inode *inode;
2304         u64 file_pos;
2305         u64 len;
2306         u64 bytenr;
2307         u64 disk_len;
2308         u8 compress_type;
2309 };
2310
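/*
 * Order backrefs by (root_id, inum, file_pos) for the rbtree.  Backrefs
 * that match on all three keys compare equal; see below for why
 * duplicates can legitimately occur.
 */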
2311 static int backref_comp(struct sa_defrag_extent_backref *b1,
2312                         struct sa_defrag_extent_backref *b2)
2313 {
2314         if (b1->root_id < b2->root_id)
2315                 return -1;
2316         else if (b1->root_id > b2->root_id)
2317                 return 1;
2318
2319         if (b1->inum < b2->inum)
2320                 return -1;
2321         else if (b1->inum > b2->inum)
2322                 return 1;
2323
2324         if (b1->file_pos < b2->file_pos)
2325                 return -1;
2326         else if (b1->file_pos > b2->file_pos)
2327                 return 1;
2328
2329         /*
2330          * [------------------------------] ===> (a range of space)
2331          *     |<--->|   |<---->| =============> (fs/file tree A)
2332          * |<---------------------------->| ===> (fs/file tree B)
2333          *
2334  * A range of space can refer to two file extents in one tree while
2335  * referring to only one file extent in another tree.
2336  *
2337  * So we may process a disk offset more than once (two extents in A)
2338  * that lands in the same extent (one extent in B), and then insert two
2339  * identical backrefs (both referring to the extent in B).
2340          */
2341         return 0;
2342 }
2343
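/*
 * Standard rbtree insertion keyed by backref_comp().  Entries that
 * compare equal are sent right, so duplicate backrefs are kept rather
 * than rejected.
 */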
2344 static void backref_insert(struct rb_root *root,
2345                            struct sa_defrag_extent_backref *backref)
2346 {
2347         struct rb_node **p = &root->rb_node;
2348         struct rb_node *parent = NULL;
2349         struct sa_defrag_extent_backref *entry;
2350         int ret;
2351
2352         while (*p) {
2353                 parent = *p;
2354                 entry = rb_entry(parent, struct sa_defrag_extent_backref, node);
2355
2356                 ret = backref_comp(backref, entry);
2357                 if (ret < 0)
2358                         p = &(*p)->rb_left;
2359                 else
2360                         p = &(*p)->rb_right;
2361         }
2362
2363         rb_link_node(&backref->node, parent, p);
2364         rb_insert_color(&backref->node, root);
2365 }
2366
2367 /*
2368  * Note the backref might have changed, in which case we just return 0.
2369  */
2370 static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
2371                                        void *ctx)
2372 {
2373         struct btrfs_file_extent_item *extent;
2374         struct old_sa_defrag_extent *old = ctx;
2375         struct new_sa_defrag_extent *new = old->new;
2376         struct btrfs_path *path = new->path;
2377         struct btrfs_key key;
2378         struct btrfs_root *root;
2379         struct sa_defrag_extent_backref *backref;
2380         struct extent_buffer *leaf;
2381         struct inode *inode = new->inode;
2382         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2383         int slot;
2384         int ret;
2385         u64 extent_offset;
2386         u64 num_bytes;
2387
2388         if (BTRFS_I(inode)->root->root_key.objectid == root_id &&
2389             inum == btrfs_ino(BTRFS_I(inode)))
2390                 return 0;
2391
2392         key.objectid = root_id;
2393         key.type = BTRFS_ROOT_ITEM_KEY;
2394         key.offset = (u64)-1;
2395
2396         root = btrfs_read_fs_root_no_name(fs_info, &key);
2397         if (IS_ERR(root)) {
2398                 if (PTR_ERR(root) == -ENOENT)
2399                         return 0;
2400                 WARN_ON(1);
2401                 btrfs_debug(fs_info, "inum=%llu, offset=%llu, root_id=%llu",
2402                          inum, offset, root_id);
2403                 return PTR_ERR(root);
2404         }
2405
2406         key.objectid = inum;
2407         key.type = BTRFS_EXTENT_DATA_KEY;
2408         if (offset > (u64)-1 << 32)
2409                 key.offset = 0;
2410         else
2411                 key.offset = offset;
2412
2413         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2414         if (WARN_ON(ret < 0))
2415                 return ret;
2416         ret = 0;
2417
2418         while (1) {
2419                 cond_resched();
2420
2421                 leaf = path->nodes[0];
2422                 slot = path->slots[0];
2423
2424                 if (slot >= btrfs_header_nritems(leaf)) {
2425                         ret = btrfs_next_leaf(root, path);
2426                         if (ret < 0) {
2427                                 goto out;
2428                         } else if (ret > 0) {
2429                                 ret = 0;
2430                                 goto out;
2431                         }
2432                         continue;
2433                 }
2434
2435                 path->slots[0]++;
2436
2437                 btrfs_item_key_to_cpu(leaf, &key, slot);
2438
2439                 if (key.objectid > inum)
2440                         goto out;
2441
2442                 if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY)
2443                         continue;
2444
2445                 extent = btrfs_item_ptr(leaf, slot,
2446                                         struct btrfs_file_extent_item);
2447
2448                 if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr)
2449                         continue;
2450
2451                 /*
2452                  * 'offset' refers to the exact key.offset,
2453                  * NOT the 'offset' field in btrfs_extent_data_ref, i.e.
2454                  * (key.offset - extent_offset).
2455                  */
2456                 if (key.offset != offset)
2457                         continue;
2458
2459                 extent_offset = btrfs_file_extent_offset(leaf, extent);
2460                 num_bytes = btrfs_file_extent_num_bytes(leaf, extent);
2461
2462                 if (extent_offset >= old->extent_offset + old->offset +
2463                     old->len || extent_offset + num_bytes <=
2464                     old->extent_offset + old->offset)
2465                         continue;
2466                 break;
2467         }
2468
2469         backref = kmalloc(sizeof(*backref), GFP_NOFS);
2470         if (!backref) {
2471                 ret = -ENOMEM;
2472                 goto out;
2473         }
2474
2475         backref->root_id = root_id;
2476         backref->inum = inum;
2477         backref->file_pos = offset;
2478         backref->num_bytes = num_bytes;
2479         backref->extent_offset = extent_offset;
2480         backref->generation = btrfs_file_extent_generation(leaf, extent);
2481         backref->old = old;
2482         backref_insert(&new->root, backref);
2483         old->count++;
2484 out:
2485         btrfs_release_path(path);
2486         WARN_ON(ret);
2487         return ret;
2488 }
2489
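/*
 * Collect the backrefs of every old extent into new->root.  Returns false
 * when backref iteration fails or when no old extent ends up with any
 * backref to relink, true otherwise.
 */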
2490 static noinline bool record_extent_backrefs(struct btrfs_path *path,
2491                                    struct new_sa_defrag_extent *new)
2492 {
2493         struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
2494         struct old_sa_defrag_extent *old, *tmp;
2495         int ret;
2496
2497         new->path = path;
2498
2499         list_for_each_entry_safe(old, tmp, &new->head, list) {
2500                 ret = iterate_inodes_from_logical(old->bytenr +
2501                                                   old->extent_offset, fs_info,
2502                                                   path, record_one_backref,
2503                                                   old, false);
2504                 if (ret < 0 && ret != -ENOENT)
2505                         return false;
2506
2507                 /* no backref to be processed for this extent */
2508                 if (!old->count) {
2509                         list_del(&old->list);
2510                         kfree(old);
2511                 }
2512         }
2513
2514         if (list_empty(&new->head))
2515                 return false;
2516
2517         return true;
2518 }
2519
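/*
 * A file extent item can be merged with the relinked range only if it
 * points at the same disk bytenr, is a regular (non-inline, non-prealloc)
 * extent, uses the same compression type, and has no encryption or other
 * encoding.
 */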
2520 static int relink_is_mergable(struct extent_buffer *leaf,
2521                               struct btrfs_file_extent_item *fi,
2522                               struct new_sa_defrag_extent *new)
2523 {
2524         if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr)
2525                 return 0;
2526
2527         if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2528                 return 0;
2529
2530         if (btrfs_file_extent_compression(leaf, fi) != new->compress_type)
2531                 return 0;
2532
2533         if (btrfs_file_extent_encryption(leaf, fi) ||
2534             btrfs_file_extent_other_encoding(leaf, fi))
2535                 return 0;
2536
2537         return 1;
2538 }
2539
2540 /*
2541  * Note the backref might have changed, in which case we just return 0.
2542  */
2543 static noinline int relink_extent_backref(struct btrfs_path *path,
2544                                  struct sa_defrag_extent_backref *prev,
2545                                  struct sa_defrag_extent_backref *backref)
2546 {
2547         struct btrfs_file_extent_item *extent;
2548         struct btrfs_file_extent_item *item;
2549         struct btrfs_ordered_extent *ordered;
2550         struct btrfs_trans_handle *trans;
2551         struct btrfs_root *root;
2552         struct btrfs_key key;
2553         struct extent_buffer *leaf;
2554         struct old_sa_defrag_extent *old = backref->old;
2555         struct new_sa_defrag_extent *new = old->new;
2556         struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
2557         struct inode *inode;
2558         struct extent_state *cached = NULL;
2559         int ret = 0;
2560         u64 start;
2561         u64 len;
2562         u64 lock_start;
2563         u64 lock_end;
2564         bool merge = false;
2565         int index;
2566
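        /*
         * If the previous relinked backref ends exactly where this one
         * begins in the same file of the same root, try to extend the
         * previous file extent item instead of inserting a new one.
         */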
2567         if (prev && prev->root_id == backref->root_id &&
2568             prev->inum == backref->inum &&
2569             prev->file_pos + prev->num_bytes == backref->file_pos)
2570                 merge = true;
2571
2572         /* step 1: get root */
2573         key.objectid = backref->root_id;
2574         key.type = BTRFS_ROOT_ITEM_KEY;
2575         key.offset = (u64)-1;
2576
2577         index = srcu_read_lock(&fs_info->subvol_srcu);
2578
2579         root = btrfs_read_fs_root_no_name(fs_info, &key);
2580         if (IS_ERR(root)) {
2581                 srcu_read_unlock(&fs_info->subvol_srcu, index);
2582                 if (PTR_ERR(root) == -ENOENT)
2583                         return 0;
2584                 return PTR_ERR(root);
2585         }
2586
2587         if (btrfs_root_readonly(root)) {
2588                 srcu_read_unlock(&fs_info->subvol_srcu, index);
2589                 return 0;
2590         }
2591
2592         /* step 2: get inode */
2593         key.objectid = backref->inum;
2594         key.type = BTRFS_INODE_ITEM_KEY;
2595         key.offset = 0;
2596
2597         inode = btrfs_iget(fs_info->sb, &key, root, NULL);
2598         if (IS_ERR(inode)) {
2599                 srcu_read_unlock(&fs_info->subvol_srcu, index);
2600                 return 0;
2601         }
2602
2603         srcu_read_unlock(&fs_info->subvol_srcu, index);
2604
2605         /* step 3: relink backref */
2606         lock_start = backref->file_pos;
2607         lock_end = backref->file_pos + backref->num_bytes - 1;
2608         lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2609                          &cached);
2610
2611         ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
2612         if (ordered) {
2613                 btrfs_put_ordered_extent(ordered);
2614                 goto out_unlock;
2615         }
2616
2617         trans = btrfs_join_transaction(root);
2618         if (IS_ERR(trans)) {
2619                 ret = PTR_ERR(trans);
2620                 goto out_unlock;
2621         }
2622
2623         key.objectid = backref->inum;
2624         key.type = BTRFS_EXTENT_DATA_KEY;
2625         key.offset = backref->file_pos;
2626
2627         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2628         if (ret < 0) {
2629                 goto out_free_path;
2630         } else if (ret > 0) {
2631                 ret = 0;
2632                 goto out_free_path;
2633         }
2634
2635         extent = btrfs_item_ptr(path->nodes[0], path->slots[0],
2636                                 struct btrfs_file_extent_item);
2637
2638         if (btrfs_file_extent_generation(path->nodes[0], extent) !=
2639             backref->generation)
2640                 goto out_free_path;
2641
2642         btrfs_release_path(path);
2643
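        /*
         * Clamp the range we relink to the intersection of this backref
         * with the old extent: push start past anything that precedes
         * the old range, and make len the overlap of
         * [extent_offset, extent_offset + num_bytes) with
         * [old->extent_offset + old->offset, ... + old->len).
         */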
2644         start = backref->file_pos;
2645         if (backref->extent_offset < old->extent_offset + old->offset)
2646                 start += old->extent_offset + old->offset -
2647                          backref->extent_offset;
2648
2649         len = min(backref->extent_offset + backref->num_bytes,
2650                   old->extent_offset + old->offset + old->len);
2651         len -= max(backref->extent_offset, old->extent_offset + old->offset);
2652
2653         ret = btrfs_drop_extents(trans, root, inode, start,
2654                                  start + len, 1);
2655         if (ret)
2656                 goto out_free_path;
2657 again:
2658         key.objectid = btrfs_ino(BTRFS_I(inode));
2659         key.type = BTRFS_EXTENT_DATA_KEY;
2660         key.offset = start;
2661
2662         path->leave_spinning = 1;
2663         if (merge) {
2664                 struct btrfs_file_extent_item *fi;
2665                 u64 extent_len;
2666                 struct btrfs_key found_key;
2667
2668                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2669                 if (ret < 0)
2670                         goto out_free_path;
2671
2672                 path->slots[0]--;
2673                 leaf = path->nodes[0];
2674                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2675
2676                 fi = btrfs_item_ptr(leaf, path->slots[0],
2677                                     struct btrfs_file_extent_item);
2678                 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
2679
2680                 if (extent_len + found_key.offset == start &&
2681                     relink_is_mergable(leaf, fi, new)) {
2682                         btrfs_set_file_extent_num_bytes(leaf, fi,
2683                                                         extent_len + len);
2684                         btrfs_mark_buffer_dirty(leaf);
2685                         inode_add_bytes(inode, len);
2686
2687                         ret = 1;
2688                         goto out_free_path;
2689                 } else {
2690                         merge = false;
2691                         btrfs_release_path(path);
2692                         goto again;
2693                 }
2694         }
2695
2696         ret = btrfs_insert_empty_item(trans, root, path, &key,
2697                                         sizeof(*extent));
2698         if (ret) {
2699                 btrfs_abort_transaction(trans, ret);
2700                 goto out_free_path;
2701         }
2702
2703         leaf = path->nodes[0];
2704         item = btrfs_item_ptr(leaf, path->slots[0],
2705                                 struct btrfs_file_extent_item);
2706         btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr);
2707         btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len);
2708         btrfs_set_file_extent_offset(leaf, item, start - new->file_pos);
2709         btrfs_set_file_extent_num_bytes(leaf, item, len);
2710         btrfs_set_file_extent_ram_bytes(leaf, item, new->len);
2711         btrfs_set_file_extent_generation(leaf, item, trans->transid);
2712         btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
2713         btrfs_set_file_extent_compression(leaf, item, new->compress_type);
2714         btrfs_set_file_extent_encryption(leaf, item, 0);
2715         btrfs_set_file_extent_other_encoding(leaf, item, 0);
2716
2717         btrfs_mark_buffer_dirty(leaf);
2718         inode_add_bytes(inode, len);
2719         btrfs_release_path(path);
2720
2721         ret = btrfs_inc_extent_ref(trans, root, new->bytenr,
2722                         new->disk_len, 0,
2723                         backref->root_id, backref->inum,
2724                         new->file_pos); /* start - extent_offset */
2725         if (ret) {
2726                 btrfs_abort_transaction(trans, ret);
2727                 goto out_free_path;
2728         }
2729
2730         ret = 1;
2731 out_free_path:
2732         btrfs_release_path(path);
2733         path->leave_spinning = 0;
2734         btrfs_end_transaction(trans);
2735 out_unlock:
2736         unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2737                              &cached);
2738         iput(inode);
2739         return ret;
2740 }
2741
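/* Free a snapshot-aware defrag extent along with all of its old extents. */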
2742 static void free_sa_defrag_extent(struct new_sa_defrag_extent *new)
2743 {
2744         struct old_sa_defrag_extent *old, *tmp;
2745
2746         if (!new)
2747                 return;
2748
2749         list_for_each_entry_safe(old, tmp, &new->head, list) {
2750                 kfree(old);
2751         }
2752         kfree(new);
2753 }
2754
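/*
 * Relink every backref collected for a snapshot-aware defrag extent,
 * merging adjacent ranges where possible, then drop the defrag
 * bookkeeping and wake up anybody waiting on defrag_running.
 */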
2755 static void relink_file_extents(struct new_sa_defrag_extent *new)
2756 {
2757         struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
2758         struct btrfs_path *path;
2759         struct sa_defrag_extent_backref *backref;
2760         struct sa_defrag_extent_backref *prev = NULL;
2761         struct inode *inode;
2762         struct rb_node *node;
2763         int ret;
2764
2765         inode = new->inode;
2766
2767         path = btrfs_alloc_path();
2768         if (!path)
2769                 return;
2770
2771         if (!record_extent_backrefs(path, new)) {
2772                 btrfs_free_path(path);
2773                 goto out;
2774         }
2775         btrfs_release_path(path);
2776
2777         while (1) {
2778                 node = rb_first(&new->root);
2779                 if (!node)
2780                         break;
2781                 rb_erase(node, &new->root);
2782
2783                 backref = rb_entry(node, struct sa_defrag_extent_backref, node);
2784
2785                 ret = relink_extent_backref(path, prev, backref);
2786                 WARN_ON(ret < 0);
2787
2788                 kfree(prev);
2789
2790                 if (ret == 1)
2791                         prev = backref;
2792                 else
2793                         prev = NULL;
2794                 cond_resched();
2795         }
2796         kfree(prev);
2797
2798         btrfs_free_path(path);
2799 out:
2800         free_sa_defrag_extent(new);
2801
2802         atomic_dec(&fs_info->defrag_running);
2803         wake_up(&fs_info->transaction_wait);
2804 }
2805
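/*
 * For snapshot-aware defrag: record all the old file extents that the
 * ordered extent is about to replace, so that their backrefs can be
 * relinked to the new extent once the ordered IO finishes.  Returns
 * NULL on failure, in which case the caller simply skips the relink.
 */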
2806 static struct new_sa_defrag_extent *
2807 record_old_file_extents(struct inode *inode,
2808                         struct btrfs_ordered_extent *ordered)
2809 {
2810         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2811         struct btrfs_root *root = BTRFS_I(inode)->root;
2812         struct btrfs_path *path;
2813         struct btrfs_key key;
2814         struct old_sa_defrag_extent *old;
2815         struct new_sa_defrag_extent *new;
2816         int ret;
2817
2818         new = kmalloc(sizeof(*new), GFP_NOFS);
2819         if (!new)
2820                 return NULL;
2821
2822         new->inode = inode;
2823         new->file_pos = ordered->file_offset;
2824         new->len = ordered->len;
2825         new->bytenr = ordered->start;
2826         new->disk_len = ordered->disk_len;
2827         new->compress_type = ordered->compress_type;
2828         new->root = RB_ROOT;
2829         INIT_LIST_HEAD(&new->head);
2830
2831         path = btrfs_alloc_path();
2832         if (!path)
2833                 goto out_kfree;
2834
2835         key.objectid = btrfs_ino(BTRFS_I(inode));
2836         key.type = BTRFS_EXTENT_DATA_KEY;
2837         key.offset = new->file_pos;
2838
2839         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2840         if (ret < 0)
2841                 goto out_free_path;
2842         if (ret > 0 && path->slots[0] > 0)
2843                 path->slots[0]--;
2844
2845         /* find all the old extents within the file range */
2846         while (1) {
2847                 struct btrfs_file_extent_item *extent;
2848                 struct extent_buffer *l;
2849                 int slot;
2850                 u64 num_bytes;
2851                 u64 offset;
2852                 u64 end;
2853                 u64 disk_bytenr;
2854                 u64 extent_offset;
2855
2856                 l = path->nodes[0];
2857                 slot = path->slots[0];
2858
2859                 if (slot >= btrfs_header_nritems(l)) {
2860                         ret = btrfs_next_leaf(root, path);
2861                         if (ret < 0)
2862                                 goto out_free_path;
2863                         else if (ret > 0)
2864                                 break;
2865                         continue;
2866                 }
2867
2868                 btrfs_item_key_to_cpu(l, &key, slot);
2869
2870                 if (key.objectid != btrfs_ino(BTRFS_I(inode)))
2871                         break;
2872                 if (key.type != BTRFS_EXTENT_DATA_KEY)
2873                         break;
2874                 if (key.offset >= new->file_pos + new->len)
2875                         break;
2876
2877                 extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item);
2878
2879                 num_bytes = btrfs_file_extent_num_bytes(l, extent);
2880                 if (key.offset + num_bytes < new->file_pos)
2881                         goto next;
2882
2883                 disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent);
2884                 if (!disk_bytenr)
2885                         goto next;
2886
2887                 extent_offset = btrfs_file_extent_offset(l, extent);
2888
2889                 old = kmalloc(sizeof(*old), GFP_NOFS);
2890                 if (!old)
2891                         goto out_free_path;
2892
2893                 offset = max(new->file_pos, key.offset);
2894                 end = min(new->file_pos + new->len, key.offset + num_bytes);
2895
2896                 old->bytenr = disk_bytenr;
2897                 old->extent_offset = extent_offset;
2898                 old->offset = offset - key.offset;
2899                 old->len = end - offset;
2900                 old->new = new;
2901                 old->count = 0;
2902                 list_add_tail(&old->list, &new->head);
2903 next:
2904                 path->slots[0]++;
2905                 cond_resched();
2906         }
2907
2908         btrfs_free_path(path);
2909         atomic_inc(&fs_info->defrag_running);
2910
2911         return new;
2912
2913 out_free_path:
2914         btrfs_free_path(path);
2915 out_kfree:
2916         free_sa_defrag_extent(new);
2917         return NULL;
2918 }
2919
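/*
 * The ordered IO for this range has finished, so the bytes no longer
 * count as delalloc against the block group they were allocated from.
 */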
2920 static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
2921                                          u64 start, u64 len)
2922 {
2923         struct btrfs_block_group_cache *cache;
2924
2925         cache = btrfs_lookup_block_group(fs_info, start);
2926         ASSERT(cache);
2927
2928         spin_lock(&cache->lock);
2929         cache->delalloc_bytes -= len;
2930         spin_unlock(&cache->lock);
2931
2932         btrfs_put_block_group(cache);
2933 }
2934
2935 /* As ordered data IO finishes, this gets called so we can finish
2936  * an ordered extent once the range of bytes in the file it covers is
2937  * fully written.
2938  */
2939 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
2940 {
2941         struct inode *inode = ordered_extent->inode;
2942         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2943         struct btrfs_root *root = BTRFS_I(inode)->root;
2944         struct btrfs_trans_handle *trans = NULL;
2945         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2946         struct extent_state *cached_state = NULL;
2947         struct new_sa_defrag_extent *new = NULL;
2948         int compress_type = 0;
2949         int ret = 0;
2950         u64 logical_len = ordered_extent->len;
2951         bool nolock;
2952         bool truncated = false;
2953         bool range_locked = false;
2954         bool clear_new_delalloc_bytes = false;
2955
2956         if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
2957             !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
2958             !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags))
2959                 clear_new_delalloc_bytes = true;
2960
2961         nolock = btrfs_is_free_space_inode(BTRFS_I(inode));
2962
2963         if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
2964                 ret = -EIO;
2965                 goto out;
2966         }
2967
2968         btrfs_free_io_failure_record(BTRFS_I(inode),
2969                         ordered_extent->file_offset,
2970                         ordered_extent->file_offset +
2971                         ordered_extent->len - 1);
2972
2973         if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
2974                 truncated = true;
2975                 logical_len = ordered_extent->truncated_len;
2976                 /* Truncated the entire extent, don't bother adding */
2977                 if (!logical_len)
2978                         goto out;
2979         }
2980
2981         if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
2982                 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
2983
2984                 /*
2985                  * For the mwrite (mmap + memset to write) case, we still
2986                  * reserve space for the NOCOW range.
2987                  * As NOCOW won't cause a new delayed ref, just free the space.
2988                  */
2989                 btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset,
2990                                        ordered_extent->len);
2991                 btrfs_ordered_update_i_size(inode, 0, ordered_extent);
2992                 if (nolock)
2993                         trans = btrfs_join_transaction_nolock(root);
2994                 else
2995                         trans = btrfs_join_transaction(root);
2996                 if (IS_ERR(trans)) {
2997                         ret = PTR_ERR(trans);
2998                         trans = NULL;
2999                         goto out;
3000                 }
3001                 trans->block_rsv = &BTRFS_I(inode)->block_rsv;
3002                 ret = btrfs_update_inode_fallback(trans, root, inode);
3003                 if (ret) /* -ENOMEM or corruption */
3004                         btrfs_abort_transaction(trans, ret);
3005                 goto out;
3006         }
3007
3008         range_locked = true;
3009         lock_extent_bits(io_tree, ordered_extent->file_offset,
3010                          ordered_extent->file_offset + ordered_extent->len - 1,
3011                          &cached_state);
3012
3013         ret = test_range_bit(io_tree, ordered_extent->file_offset,
3014                         ordered_extent->file_offset + ordered_extent->len - 1,
3015                         EXTENT_DEFRAG, 0, cached_state);
3016         if (ret) {
3017                 u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
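                /*
                 * Snapshot-aware defrag is currently disabled: the
                 * "0 &&" below short-circuits the generation check, so
                 * record_old_file_extents() is never reached from here.
                 */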
3018                 if (0 && last_snapshot >= BTRFS_I(inode)->generation)
3019                         /* the inode is shared */
3020                         new = record_old_file_extents(inode, ordered_extent);
3021
3022                 clear_extent_bit(io_tree, ordered_extent->file_offset,
3023                         ordered_extent->file_offset + ordered_extent->len - 1,
3024                         EXTENT_DEFRAG, 0, 0, &cached_state);
3025         }
3026
3027         if (nolock)
3028                 trans = btrfs_join_transaction_nolock(root);
3029         else
3030                 trans = btrfs_join_transaction(root);
3031         if (IS_ERR(trans)) {
3032                 ret = PTR_ERR(trans);
3033                 trans = NULL;
3034                 goto out;
3035         }
3036
3037         trans->block_rsv = &BTRFS_I(inode)->block_rsv;
3038
3039         if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
3040                 compress_type = ordered_extent->compress_type;
3041         if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
3042                 BUG_ON(compress_type);
3043                 btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset,
3044                                        ordered_extent->len);
3045                 ret = btrfs_mark_extent_written(trans, BTRFS_I(inode),
3046                                                 ordered_extent->file_offset,
3047                                                 ordered_extent->file_offset +
3048                                                 logical_len);
3049         } else {
3050                 BUG_ON(root == fs_info->tree_root);
3051                 ret = insert_reserved_file_extent(trans, inode,
3052                                                 ordered_extent->file_offset,
3053                                                 ordered_extent->start,
3054                                                 ordered_extent->disk_len,
3055                                                 logical_len, logical_len,
3056                                                 compress_type, 0, 0,
3057                                                 BTRFS_FILE_EXTENT_REG);
3058                 if (!ret)
3059                         btrfs_release_delalloc_bytes(fs_info,
3060                                                      ordered_extent->start,
3061                                                      ordered_extent->disk_len);
3062         }
3063         unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
3064                            ordered_extent->file_offset, ordered_extent->len,
3065                            trans->transid);
3066         if (ret < 0) {
3067                 btrfs_abort_transaction(trans, ret);
3068                 goto out;
3069         }
3070
3071         ret = add_pending_csums(trans, inode, &ordered_extent->list);
3072         if (ret) {
3073                 btrfs_abort_transaction(trans, ret);
3074                 goto out;
3075         }
3076
3077         btrfs_ordered_update_i_size(inode, 0, ordered_extent);
3078         ret = btrfs_update_inode_fallback(trans, root, inode);
3079         if (ret) { /* -ENOMEM or corruption */
3080                 btrfs_abort_transaction(trans, ret);
3081                 goto out;
3082         }
3083         ret = 0;
3084 out:
3085         if (range_locked || clear_new_delalloc_bytes) {
3086                 unsigned int clear_bits = 0;
3087
3088                 if (range_locked)
3089                         clear_bits |= EXTENT_LOCKED;
3090                 if (clear_new_delalloc_bytes)
3091                         clear_bits |= EXTENT_DELALLOC_NEW;
3092                 clear_extent_bit(&BTRFS_I(inode)->io_tree,
3093                                  ordered_extent->file_offset,
3094                                  ordered_extent->file_offset +
3095                                  ordered_extent->len - 1,
3096                                  clear_bits,
3097                                  (clear_bits & EXTENT_LOCKED) ? 1 : 0,
3098                                  0, &cached_state);
3099         }
3100
3101         if (trans)
3102                 btrfs_end_transaction(trans);
3103
3104         if (ret || truncated) {
3105                 u64 start, end;
3106
3107                 if (truncated)
3108                         start = ordered_extent->file_offset + logical_len;
3109                 else
3110                         start = ordered_extent->file_offset;
3111                 end = ordered_extent->file_offset + ordered_extent->len - 1;
3112                 clear_extent_uptodate(io_tree, start, end, NULL);
3113
3114                 /* Drop the cache for the part of the extent we didn't write. */
3115                 btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0);
3116
3117                 /*
3118                  * If the ordered extent had an IOERR or something else went
3119                  * wrong we need to return the space for this ordered extent
3120                  * back to the allocator.  We only free the extent in the
3121                  * truncated case if we didn't write out the extent at all.
3122                  */
3123                 if ((ret || !logical_len) &&
3124                     !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
3125                     !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
3126                         btrfs_free_reserved_extent(fs_info,
3127                                                    ordered_extent->start,
3128                                                    ordered_extent->disk_len, 1);
3129         }
3130
3131
3132         /*
3133          * This needs to be done to make sure anybody waiting knows we are done
3134          * updating everything for this ordered extent.
3135          */
3136         btrfs_remove_ordered_extent(inode, ordered_extent);
3137
3138         /* for snapshot-aware defrag */
3139         if (new) {
3140                 if (ret) {
3141                         free_sa_defrag_extent(new);
3142                         atomic_dec(&fs_info->defrag_running);
3143                 } else {
3144                         relink_file_extents(new);
3145                 }
3146         }
3147
3148         /* once for us */
3149         btrfs_put_ordered_extent(ordered_extent);
3150         /* once for the tree */
3151         btrfs_put_ordered_extent(ordered_extent);
3152
3153         return ret;
3154 }
3155
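/* Deferred-work callback: run btrfs_finish_ordered_io() from a workqueue. */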
3156 static void finish_ordered_fn(struct btrfs_work *work)
3157 {
3158         struct btrfs_ordered_extent *ordered_extent;
3159         ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
3160         btrfs_finish_ordered_io(ordered_extent);
3161 }
3162
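/*
 * Writeback completion hook.  Once the last outstanding byte of an
 * ordered extent is written, queue finish_ordered_fn() on the proper
 * endio workqueue rather than finishing the ordered extent directly in
 * the IO completion path.
 */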
3163 static void btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
3164                                 struct extent_state *state, int uptodate)
3165 {
3166         struct inode *inode = page->mapping->host;
3167         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3168         struct btrfs_ordered_extent *ordered_extent = NULL;
3169         struct btrfs_workqueue *wq;
3170         btrfs_work_func_t func;
3171
3172         trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
3173
3174         ClearPagePrivate2(page);
3175         if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
3176                                             end - start + 1, uptodate))
3177                 return;
3178
3179         if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
3180                 wq = fs_info->endio_freespace_worker;
3181                 func = btrfs_freespace_write_helper;
3182         } else {
3183                 wq = fs_info->endio_write_workers;
3184                 func = btrfs_endio_write_helper;
3185         }
3186
3187         btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL,
3188                         NULL);
3189         btrfs_queue_work(wq, &ordered_extent->work);
3190 }
3191
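/*
 * Verify the data checksum for one chunk of a read against the value
 * stashed in io_bio->csum.  On a mismatch the range is poisoned with
 * 0x01 bytes and -EIO is returned so that a good copy can be looked for
 * on another mirror (see the comment below).
 */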
3192 static int __readpage_endio_check(struct inode *inode,
3193                                   struct btrfs_io_bio *io_bio,
3194                                   int icsum, struct page *page,
3195                                   int pgoff, u64 start, size_t len)
3196 {
3197         char *kaddr;
3198         u32 csum_expected;
3199         u32 csum = ~(u32)0;
3200
3201         csum_expected = *(((u32 *)io_bio->csum) + icsum);
3202
3203         kaddr = kmap_atomic(page);
3204         csum = btrfs_csum_data(kaddr + pgoff, csum, len);
3205         btrfs_csum_final(csum, (u8 *)&csum);
3206         if (csum != csum_expected)
3207                 goto zeroit;
3208
3209         kunmap_atomic(kaddr);
3210         return 0;
3211 zeroit:
3212         btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected,
3213                                     io_bio->mirror_num);
3214         memset(kaddr + pgoff, 1, len);
3215         flush_dcache_page(page);
3216         kunmap_atomic(kaddr);
3217         return -EIO;
3218 }
3219
3220 /*
3221  * when reads are done, we need to check csums to verify the data is correct
3222  * if there's a match, we allow the bio to finish.  If not, the code in
3223  * extent_io.c will try to find good copies for us.
3224  */