fs/btrfs/inode.c
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/compat.h>
#include <linux/bit_spinlock.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/falloc.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/mount.h>
#include <linux/btrfs.h>
#include <linux/blkdev.h>
#include <linux/posix_acl_xattr.h>
#include <linux/uio.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "print-tree.h"
#include "ordered-data.h"
#include "xattr.h"
#include "tree-log.h"
#include "volumes.h"
#include "compression.h"
#include "locking.h"
#include "free-space-cache.h"
#include "inode-map.h"
#include "backref.h"
#include "hash.h"
#include "props.h"
#include "qgroup.h"
#include "dedupe.h"

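/*
 * Arguments passed through to the iget callbacks, used to locate or
 * initialize an in-core btrfs inode.
 */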
struct btrfs_iget_args {
        struct btrfs_key *location;
        struct btrfs_root *root;
};

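/*
 * State carried through the direct IO path: outstanding extent and
 * reservation accounting, plus the range covered by ordered extents
 * that have been created but not yet submitted.
 */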
struct btrfs_dio_data {
        u64 outstanding_extents;
        u64 reserve;
        u64 unsubmitted_oe_range_start;
        u64 unsubmitted_oe_range_end;
        int overwrite;
};

static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_dir_ro_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct address_space_operations btrfs_symlink_aops;
static const struct file_operations btrfs_dir_file_operations;
static const struct extent_io_ops btrfs_extent_io_ops;

static struct kmem_cache *btrfs_inode_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_transaction_cachep;
struct kmem_cache *btrfs_path_cachep;
struct kmem_cache *btrfs_free_space_cachep;

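/* Map the S_IFMT bits of i_mode to the file type stored in dir items. */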
#define S_SHIFT 12
static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
        [S_IFREG >> S_SHIFT]    = BTRFS_FT_REG_FILE,
        [S_IFDIR >> S_SHIFT]    = BTRFS_FT_DIR,
        [S_IFCHR >> S_SHIFT]    = BTRFS_FT_CHRDEV,
        [S_IFBLK >> S_SHIFT]    = BTRFS_FT_BLKDEV,
        [S_IFIFO >> S_SHIFT]    = BTRFS_FT_FIFO,
        [S_IFSOCK >> S_SHIFT]   = BTRFS_FT_SOCK,
        [S_IFLNK >> S_SHIFT]    = BTRFS_FT_SYMLINK,
};

static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct inode *inode);
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
static noinline int cow_file_range(struct inode *inode,
                                   struct page *locked_page,
                                   u64 start, u64 end, u64 delalloc_end,
                                   int *page_started, unsigned long *nr_written,
                                   int unlock, struct btrfs_dedupe_hash *hash);
static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
                                       u64 orig_start, u64 block_start,
                                       u64 block_len, u64 orig_block_len,
                                       u64 ram_bytes, int compress_type,
                                       int type);

static int btrfs_dirty_inode(struct inode *inode);

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
void btrfs_test_inode_set_ops(struct inode *inode)
{
        BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
}
#endif

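/*
 * Set up security attributes for a new inode: initialize the ACLs
 * inherited from the parent directory, then the security xattrs.
 */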
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
                                     struct inode *inode, struct inode *dir,
                                     const struct qstr *qstr)
{
        int err;

        err = btrfs_init_acl(trans, inode, dir);
        if (!err)
                err = btrfs_xattr_security_init(trans, inode, dir, qstr);
        return err;
}

/*
 * this does all the hard work for inserting an inline extent into
 * the btree.  The caller should have done a btrfs_drop_extents so that
 * no overlapping inline items exist in the btree
 */
static int insert_inline_extent(struct btrfs_trans_handle *trans,
                                struct btrfs_path *path, int extent_inserted,
                                struct btrfs_root *root, struct inode *inode,
                                u64 start, size_t size, size_t compressed_size,
                                int compress_type,
                                struct page **compressed_pages)
{
        struct extent_buffer *leaf;
        struct page *page = NULL;
        char *kaddr;
        unsigned long ptr;
        struct btrfs_file_extent_item *ei;
        int err = 0;
        int ret;
        size_t cur_size = size;
        unsigned long offset;

        if (compressed_size && compressed_pages)
                cur_size = compressed_size;

        inode_add_bytes(inode, size);

        if (!extent_inserted) {
                struct btrfs_key key;
                size_t datasize;

                key.objectid = btrfs_ino(BTRFS_I(inode));
                key.offset = start;
                key.type = BTRFS_EXTENT_DATA_KEY;

                datasize = btrfs_file_extent_calc_inline_size(cur_size);
                path->leave_spinning = 1;
                ret = btrfs_insert_empty_item(trans, root, path, &key,
                                              datasize);
                if (ret) {
                        err = ret;
                        goto fail;
                }
        }
        leaf = path->nodes[0];
        ei = btrfs_item_ptr(leaf, path->slots[0],
                            struct btrfs_file_extent_item);
        btrfs_set_file_extent_generation(leaf, ei, trans->transid);
        btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
        btrfs_set_file_extent_encryption(leaf, ei, 0);
        btrfs_set_file_extent_other_encoding(leaf, ei, 0);
        btrfs_set_file_extent_ram_bytes(leaf, ei, size);
        ptr = btrfs_file_extent_inline_start(ei);

        if (compress_type != BTRFS_COMPRESS_NONE) {
                struct page *cpage;
                int i = 0;
                while (compressed_size > 0) {
                        cpage = compressed_pages[i];
                        cur_size = min_t(unsigned long, compressed_size,
                                       PAGE_SIZE);

                        kaddr = kmap_atomic(cpage);
                        write_extent_buffer(leaf, kaddr, ptr, cur_size);
                        kunmap_atomic(kaddr);

                        i++;
                        ptr += cur_size;
                        compressed_size -= cur_size;
                }
                btrfs_set_file_extent_compression(leaf, ei,
                                                  compress_type);
        } else {
                page = find_get_page(inode->i_mapping,
                                     start >> PAGE_SHIFT);
                btrfs_set_file_extent_compression(leaf, ei, 0);
                kaddr = kmap_atomic(page);
                offset = start & (PAGE_SIZE - 1);
                write_extent_buffer(leaf, kaddr + offset, ptr, size);
                kunmap_atomic(kaddr);
                put_page(page);
        }
        btrfs_mark_buffer_dirty(leaf);
        btrfs_release_path(path);

        /*
         * we're an inline extent, so nobody can
         * extend the file past i_size without locking
         * a page we already have locked.
         *
         * We must do any isize and inode updates
         * before we unlock the pages.  Otherwise we
         * could end up racing with unlink.
         */
        BTRFS_I(inode)->disk_i_size = inode->i_size;
        ret = btrfs_update_inode(trans, root, inode);

        return ret;
fail:
        return err;
}

/*
 * conditionally insert an inline extent into the file.  This
 * does the checks required to make sure the data is small enough
 * to fit as an inline extent.
 */
static noinline int cow_file_range_inline(struct btrfs_root *root,
                                          struct inode *inode, u64 start,
                                          u64 end, size_t compressed_size,
                                          int compress_type,
                                          struct page **compressed_pages)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_trans_handle *trans;
        u64 isize = i_size_read(inode);
        u64 actual_end = min(end + 1, isize);
        u64 inline_len = actual_end - start;
        u64 aligned_end = ALIGN(end, fs_info->sectorsize);
        u64 data_len = inline_len;
        int ret;
        struct btrfs_path *path;
        int extent_inserted = 0;
        u32 extent_item_size;

        if (compressed_size)
                data_len = compressed_size;

        if (start > 0 ||
            actual_end > fs_info->sectorsize ||
            data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
            (!compressed_size &&
            (actual_end & (fs_info->sectorsize - 1)) == 0) ||
            end + 1 < isize ||
            data_len > fs_info->max_inline) {
                return 1;
        }

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans)) {
                btrfs_free_path(path);
                return PTR_ERR(trans);
        }
        trans->block_rsv = &fs_info->delalloc_block_rsv;

        if (compressed_size && compressed_pages)
                extent_item_size = btrfs_file_extent_calc_inline_size(
                   compressed_size);
        else
                extent_item_size = btrfs_file_extent_calc_inline_size(
                    inline_len);

        ret = __btrfs_drop_extents(trans, root, inode, path,
                                   start, aligned_end, NULL,
                                   1, 1, extent_item_size, &extent_inserted);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto out;
        }

        if (isize > actual_end)
                inline_len = min_t(u64, isize, actual_end);
        ret = insert_inline_extent(trans, path, extent_inserted,
                                   root, inode, start,
                                   inline_len, compressed_size,
                                   compress_type, compressed_pages);
        if (ret && ret != -ENOSPC) {
                btrfs_abort_transaction(trans, ret);
                goto out;
        } else if (ret == -ENOSPC) {
                ret = 1;
                goto out;
        }

        set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
        btrfs_delalloc_release_metadata(BTRFS_I(inode), end + 1 - start);
        btrfs_drop_extent_cache(BTRFS_I(inode), start, aligned_end - 1, 0);
out:
        /*
         * Don't forget to free the reserved space: an inlined extent
         * doesn't count as a data extent, so free the space directly here.
         * At reserve time it is always aligned to the page size, so
         * just free one page here.
         */
        btrfs_qgroup_free_data(inode, 0, PAGE_SIZE);
        btrfs_free_path(path);
        btrfs_end_transaction(trans);
        return ret;
}

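/*
 * A single range produced by the compression phase, queued on
 * async_cow->extents until the submission phase writes it out.
 */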
struct async_extent {
        u64 start;
        u64 ram_size;
        u64 compressed_size;
        struct page **pages;
        unsigned long nr_pages;
        int compress_type;
        struct list_head list;
};

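/*
 * Work item covering one delalloc range; carries the list of
 * async_extents from the compression phase to the submission phase.
 */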
struct async_cow {
        struct inode *inode;
        struct btrfs_root *root;
        struct page *locked_page;
        u64 start;
        u64 end;
        struct list_head extents;
        struct btrfs_work work;
};

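/*
 * Record one range (compressed, or nr_pages == 0 for the uncompressed
 * fallback) on the async_cow list for the submission phase.
 */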
static noinline int add_async_extent(struct async_cow *cow,
                                     u64 start, u64 ram_size,
                                     u64 compressed_size,
                                     struct page **pages,
                                     unsigned long nr_pages,
                                     int compress_type)
{
        struct async_extent *async_extent;

        async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
        BUG_ON(!async_extent); /* -ENOMEM */
        async_extent->start = start;
        async_extent->ram_size = ram_size;
        async_extent->compressed_size = compressed_size;
        async_extent->pages = pages;
        async_extent->nr_pages = nr_pages;
        async_extent->compress_type = compress_type;
        list_add_tail(&async_extent->list, &cow->extents);
        return 0;
}

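/*
 * Decide whether writes to this inode should go through the compression
 * path, based on the mount options and the per-inode flags.
 */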
static inline int inode_need_compress(struct inode *inode)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);

        /* force compress */
        if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
                return 1;
        /* bad compression ratios */
        if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
                return 0;
        if (btrfs_test_opt(fs_info, COMPRESS) ||
            BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
            BTRFS_I(inode)->force_compress)
                return 1;
        return 0;
}

static inline void inode_should_defrag(struct btrfs_inode *inode,
                u64 start, u64 end, u64 num_bytes, u64 small_write)
{
        /* If this is a small write inside eof, kick off a defrag */
        if (num_bytes < small_write &&
            (start > 0 || end + 1 < inode->disk_i_size))
                btrfs_add_inode_defrag(NULL, inode);
}

/*
 * we create compressed extents in two phases.  The first
 * phase compresses a range of pages that have already been
 * locked (both pages and state bits are locked).
 *
 * This is done inside an ordered work queue, and the compression
 * is spread across many cpus.  The actual IO submission is step
 * two, and the ordered work queue takes care of making sure that
 * happens in the same order things were put onto the queue by
 * writepages and friends.
 *
 * If this code finds it can't get good compression, it puts an
 * entry onto the work queue to write the uncompressed bytes.  This
 * makes sure that both compressed inodes and uncompressed inodes
 * are written in the same order that the flusher thread sent them
 * down.
 */
static noinline void compress_file_range(struct inode *inode,
                                        struct page *locked_page,
                                        u64 start, u64 end,
                                        struct async_cow *async_cow,
                                        int *num_added)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        u64 num_bytes;
        u64 blocksize = fs_info->sectorsize;
        u64 actual_end;
        u64 isize = i_size_read(inode);
        int ret = 0;
        struct page **pages = NULL;
        unsigned long nr_pages;
        unsigned long total_compressed = 0;
        unsigned long total_in = 0;
        int i;
        int will_compress;
        int compress_type = fs_info->compress_type;
        int redirty = 0;

        inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
                        SZ_16K);

        actual_end = min_t(u64, isize, end + 1);
again:
        will_compress = 0;
        nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
        BUILD_BUG_ON((BTRFS_MAX_COMPRESSED % PAGE_SIZE) != 0);
        nr_pages = min_t(unsigned long, nr_pages,
                        BTRFS_MAX_COMPRESSED / PAGE_SIZE);

        /*
         * we don't want to send crud past the end of i_size through
         * compression, that's just a waste of CPU time.  So, if the
         * end of the file is before the start of our current
         * requested range of bytes, we bail out to the uncompressed
         * cleanup code that can deal with all of this.
         *
         * It isn't really the fastest way to fix things, but this is a
         * very uncommon corner.
         */
        if (actual_end <= start)
                goto cleanup_and_bail_uncompressed;

        total_compressed = actual_end - start;

        /*
         * skip compression for a small file range (<= blocksize) that
         * isn't an inline extent, since it doesn't save disk space at all.
         */
        if (total_compressed <= blocksize &&
           (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
                goto cleanup_and_bail_uncompressed;

        total_compressed = min_t(unsigned long, total_compressed,
                        BTRFS_MAX_UNCOMPRESSED);
        num_bytes = ALIGN(end - start + 1, blocksize);
        num_bytes = max(blocksize, num_bytes);
        total_in = 0;
        ret = 0;

        /*
         * we do compression for mount -o compress and when the
         * inode has not been flagged as nocompress.  This flag can
         * change at any time if we discover bad compression ratios.
         */
        if (inode_need_compress(inode)) {
                WARN_ON(pages);
                pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
                if (!pages) {
                        /* just bail out to the uncompressed code */
                        goto cont;
                }

                if (BTRFS_I(inode)->force_compress)
                        compress_type = BTRFS_I(inode)->force_compress;

                /*
                 * we need to call clear_page_dirty_for_io on each
                 * page in the range.  Otherwise applications with the file
                 * mmap'd can wander in and change the page contents while
                 * we are compressing them.
                 *
                 * If the compression fails for any reason, we set the pages
                 * dirty again later on.
                 */
                extent_range_clear_dirty_for_io(inode, start, end);
                redirty = 1;
                ret = btrfs_compress_pages(compress_type,
                                           inode->i_mapping, start,
                                           pages,
                                           &nr_pages,
                                           &total_in,
                                           &total_compressed);

                if (!ret) {
                        unsigned long offset = total_compressed &
                                (PAGE_SIZE - 1);
                        struct page *page = pages[nr_pages - 1];
                        char *kaddr;

                        /*
                         * zero the tail end of the last page; we might be
                         * sending it down to disk
                         */
                        if (offset) {
                                kaddr = kmap_atomic(page);
                                memset(kaddr + offset, 0,
                                       PAGE_SIZE - offset);
                                kunmap_atomic(kaddr);
                        }
                        will_compress = 1;
                }
        }
cont:
        if (start == 0) {
                /* let's try to make an inline extent */
                if (ret || total_in < (actual_end - start)) {
                        /*
                         * we didn't compress the entire range, try
                         * to make an uncompressed inline extent.
                         */
                        ret = cow_file_range_inline(root, inode, start, end,
                                            0, BTRFS_COMPRESS_NONE, NULL);
                } else {
                        /* try making a compressed inline extent */
                        ret = cow_file_range_inline(root, inode, start, end,
                                                    total_compressed,
                                                    compress_type, pages);
                }
                if (ret <= 0) {
                        unsigned long clear_flags = EXTENT_DELALLOC |
                                EXTENT_DEFRAG;
                        unsigned long page_error_op;

                        clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0;
                        page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;

                        /*
                         * inline extent creation worked or returned error,
                         * we don't need to create any more async work items.
                         * Unlock and free up our temp pages.
                         */
                        extent_clear_unlock_delalloc(inode, start, end, end,
                                                     NULL, clear_flags,
                                                     PAGE_UNLOCK |
                                                     PAGE_CLEAR_DIRTY |
                                                     PAGE_SET_WRITEBACK |
                                                     page_error_op |
                                                     PAGE_END_WRITEBACK);
                        btrfs_free_reserved_data_space_noquota(inode, start,
                                                end - start + 1);
                        goto free_pages_out;
                }
        }

        if (will_compress) {
                /*
                 * we aren't doing an inline extent, so round the compressed
                 * size up to a block size boundary so the allocator does
                 * sane things
                 */
                total_compressed = ALIGN(total_compressed, blocksize);

                /*
                 * one last check to make sure the compression is really a
                 * win, compare the page count read with the blocks on disk
                 */
                total_in = ALIGN(total_in, PAGE_SIZE);
                if (total_compressed >= total_in) {
                        will_compress = 0;
                } else {
                        num_bytes = total_in;
                        *num_added += 1;

                        /*
                         * The async work queues will take care of doing actual
                         * allocation on disk for these compressed pages, and
                         * will submit them to the elevator.
                         */
                        add_async_extent(async_cow, start, num_bytes,
                                        total_compressed, pages, nr_pages,
                                        compress_type);

                        if (start + num_bytes < end) {
                                start += num_bytes;
                                pages = NULL;
                                cond_resched();
                                goto again;
                        }
                        return;
                }
        }
        if (pages) {
                /*
                 * the compression code ran but failed to make things smaller,
                 * free any pages it allocated and our page pointer array
                 */
                for (i = 0; i < nr_pages; i++) {
                        WARN_ON(pages[i]->mapping);
                        put_page(pages[i]);
                }
                kfree(pages);
                pages = NULL;
                total_compressed = 0;
                nr_pages = 0;

                /* flag the file so we don't compress in the future */
                if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) &&
                    !(BTRFS_I(inode)->force_compress)) {
                        BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
                }
        }
cleanup_and_bail_uncompressed:
        /*
         * No compression, but we still need to write the pages in the file
         * we've been given so far.  redirty the locked page if it corresponds
         * to our extent and set things up for the async work queue to run
         * cow_file_range to do the normal delalloc dance.
         */
        if (page_offset(locked_page) >= start &&
            page_offset(locked_page) <= end)
                __set_page_dirty_nobuffers(locked_page);
                /* unlocked later on in the async handlers */

        if (redirty)
                extent_range_redirty_for_io(inode, start, end);
        add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0,
                         BTRFS_COMPRESS_NONE);
        *num_added += 1;

        return;

free_pages_out:
        for (i = 0; i < nr_pages; i++) {
                WARN_ON(pages[i]->mapping);
                put_page(pages[i]);
        }
        kfree(pages);
}

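/* Drop our references on the compressed pages attached to an async_extent. */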
static void free_async_extent_pages(struct async_extent *async_extent)
{
        int i;

        if (!async_extent->pages)
                return;

        for (i = 0; i < async_extent->nr_pages; i++) {
                WARN_ON(async_extent->pages[i]->mapping);
                put_page(async_extent->pages[i]);
        }
        kfree(async_extent->pages);
        async_extent->nr_pages = 0;
        async_extent->pages = NULL;
}

/*
 * phase two of compressed writeback.  This is the ordered portion
 * of the code, which only gets called in the order the work was
 * queued.  We walk all the async extents created by compress_file_range
 * and send them down to the disk.
 */
static noinline void submit_compressed_extents(struct inode *inode,
                                              struct async_cow *async_cow)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct async_extent *async_extent;
        u64 alloc_hint = 0;
        struct btrfs_key ins;
        struct extent_map *em;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct extent_io_tree *io_tree;
        int ret = 0;

again:
        while (!list_empty(&async_cow->extents)) {
                async_extent = list_entry(async_cow->extents.next,
                                          struct async_extent, list);
                list_del(&async_extent->list);

                io_tree = &BTRFS_I(inode)->io_tree;

retry:
                /* did the compression code fall back to uncompressed IO? */
                if (!async_extent->pages) {
                        int page_started = 0;
                        unsigned long nr_written = 0;

                        lock_extent(io_tree, async_extent->start,
                                         async_extent->start +
                                         async_extent->ram_size - 1);

                        /* allocate blocks */
                        ret = cow_file_range(inode, async_cow->locked_page,
                                             async_extent->start,
                                             async_extent->start +
                                             async_extent->ram_size - 1,
                                             async_extent->start +
                                             async_extent->ram_size - 1,
                                             &page_started, &nr_written, 0,
                                             NULL);

                        /* JDM XXX */

                        /*
                         * if page_started, cow_file_range inserted an
                         * inline extent and took care of all the unlocking
                         * and IO for us.  Otherwise, we need to submit
                         * all those pages down to the drive.
                         */
                        if (!page_started && !ret)
                                extent_write_locked_range(io_tree,
                                                  inode, async_extent->start,
                                                  async_extent->start +
                                                  async_extent->ram_size - 1,
                                                  btrfs_get_extent,
                                                  WB_SYNC_ALL);
                        else if (ret)
                                unlock_page(async_cow->locked_page);
                        kfree(async_extent);
                        cond_resched();
                        continue;
                }

                lock_extent(io_tree, async_extent->start,
                            async_extent->start + async_extent->ram_size - 1);

                ret = btrfs_reserve_extent(root, async_extent->ram_size,
                                           async_extent->compressed_size,
                                           async_extent->compressed_size,
                                           0, alloc_hint, &ins, 1, 1);
                if (ret) {
                        free_async_extent_pages(async_extent);

                        if (ret == -ENOSPC) {
                                unlock_extent(io_tree, async_extent->start,
                                              async_extent->start +
                                              async_extent->ram_size - 1);

                                /*
                                 * we need to redirty the pages if we decide to
                                 * fall back to uncompressed IO; otherwise we
                                 * will not submit these pages down to lower
                                 * layers.
                                 */
                                extent_range_redirty_for_io(inode,
                                                async_extent->start,
                                                async_extent->start +
                                                async_extent->ram_size - 1);

                                goto retry;
                        }
                        goto out_free;
                }
                /*
                 * here we're doing allocation and writeback of the
                 * compressed pages
                 */
                em = create_io_em(inode, async_extent->start,
                                  async_extent->ram_size, /* len */
                                  async_extent->start, /* orig_start */
                                  ins.objectid, /* block_start */
                                  ins.offset, /* block_len */
                                  ins.offset, /* orig_block_len */
                                  async_extent->ram_size, /* ram_bytes */
                                  async_extent->compress_type,
                                  BTRFS_ORDERED_COMPRESSED);
                if (IS_ERR(em))
                        /* ret value is ignored, this function returns void */
                        goto out_free_reserve;
                free_extent_map(em);

                ret = btrfs_add_ordered_extent_compress(inode,
                                                async_extent->start,
                                                ins.objectid,
                                                async_extent->ram_size,
                                                ins.offset,
                                                BTRFS_ORDERED_COMPRESSED,
                                                async_extent->compress_type);
                if (ret) {
                        btrfs_drop_extent_cache(BTRFS_I(inode),
                                                async_extent->start,
                                                async_extent->start +
                                                async_extent->ram_size - 1, 0);
                        goto out_free_reserve;
                }
                btrfs_dec_block_group_reservations(fs_info, ins.objectid);

                /*
                 * clear dirty, set writeback and unlock the pages.
                 */
                extent_clear_unlock_delalloc(inode, async_extent->start,
                                async_extent->start +
                                async_extent->ram_size - 1,
                                async_extent->start +
                                async_extent->ram_size - 1,
                                NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
                                PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
                                PAGE_SET_WRITEBACK);
                ret = btrfs_submit_compressed_write(inode,
                                    async_extent->start,
                                    async_extent->ram_size,
                                    ins.objectid,
                                    ins.offset, async_extent->pages,
                                    async_extent->nr_pages);
                if (ret) {
                        struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
                        struct page *p = async_extent->pages[0];
                        const u64 start = async_extent->start;
                        const u64 end = start + async_extent->ram_size - 1;

                        p->mapping = inode->i_mapping;
                        tree->ops->writepage_end_io_hook(p, start, end,
                                                         NULL, 0);
                        p->mapping = NULL;
                        extent_clear_unlock_delalloc(inode, start, end, end,
                                                     NULL, 0,
                                                     PAGE_END_WRITEBACK |
                                                     PAGE_SET_ERROR);
                        free_async_extent_pages(async_extent);
                }
                alloc_hint = ins.objectid + ins.offset;
                kfree(async_extent);
                cond_resched();
        }
        return;
out_free_reserve:
        btrfs_dec_block_group_reservations(fs_info, ins.objectid);
        btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_free:
        extent_clear_unlock_delalloc(inode, async_extent->start,
                                     async_extent->start +
                                     async_extent->ram_size - 1,
                                     async_extent->start +
                                     async_extent->ram_size - 1,
                                     NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
                                     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
                                     PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
                                     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
                                     PAGE_SET_ERROR);
        free_async_extent_pages(async_extent);
        kfree(async_extent);
        goto again;
}

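/*
 * Look up an extent mapped near the range and use its block number as an
 * allocation hint, falling back to the first mapped block in the inode.
 */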
static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
                                      u64 num_bytes)
{
        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
        struct extent_map *em;
        u64 alloc_hint = 0;

        read_lock(&em_tree->lock);
        em = search_extent_mapping(em_tree, start, num_bytes);
        if (em) {
                /*
                 * if block start isn't an actual block number then find the
                 * first block in this inode and use that as a hint.  If that
                 * block is also bogus then just don't worry about it.
                 */
                if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
                        free_extent_map(em);
                        em = search_extent_mapping(em_tree, 0, 0);
                        if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
                                alloc_hint = em->block_start;
                        if (em)
                                free_extent_map(em);
                } else {
                        alloc_hint = em->block_start;
                        free_extent_map(em);
                }
        }
        read_unlock(&em_tree->lock);

        return alloc_hint;
}

/*
 * when extent_io.c finds a delayed allocation range in the file,
 * the callbacks end up in this code.  The basic idea is to
 * allocate extents on disk for the range, and create ordered data structs
 * in RAM to track those extents.
 *
 * locked_page is the page that writepage had locked already.  We use
 * it to make sure we don't do extra locks or unlocks.
 *
 * *page_started is set to one if we unlock locked_page and do everything
 * required to start IO on it.  It may be clean and already done with
 * IO when we return.
 */
static noinline int cow_file_range(struct inode *inode,
                                   struct page *locked_page,
                                   u64 start, u64 end, u64 delalloc_end,
                                   int *page_started, unsigned long *nr_written,
                                   int unlock, struct btrfs_dedupe_hash *hash)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        u64 alloc_hint = 0;
        u64 num_bytes;
        unsigned long ram_size;
        u64 disk_num_bytes;
        u64 cur_alloc_size;
        u64 blocksize = fs_info->sectorsize;
        struct btrfs_key ins;
        struct extent_map *em;
        int ret = 0;

        if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
                WARN_ON_ONCE(1);
                ret = -EINVAL;
                goto out_unlock;
        }

        num_bytes = ALIGN(end - start + 1, blocksize);
        num_bytes = max(blocksize, num_bytes);
        disk_num_bytes = num_bytes;

        inode_should_defrag(BTRFS_I(inode), start, end, num_bytes, SZ_64K);

        if (start == 0) {
                /* let's try to make an inline extent */
                ret = cow_file_range_inline(root, inode, start, end, 0,
                                        BTRFS_COMPRESS_NONE, NULL);
                if (ret == 0) {
                        extent_clear_unlock_delalloc(inode, start, end,
                                     delalloc_end, NULL,
                                     EXTENT_LOCKED | EXTENT_DELALLOC |
                                     EXTENT_DEFRAG, PAGE_UNLOCK |
                                     PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
                                     PAGE_END_WRITEBACK);
                        btrfs_free_reserved_data_space_noquota(inode, start,
                                                end - start + 1);
                        *nr_written = *nr_written +
                             (end - start + PAGE_SIZE) / PAGE_SIZE;
                        *page_started = 1;
                        goto out;
                } else if (ret < 0) {
                        goto out_unlock;
                }
        }

        BUG_ON(disk_num_bytes >
               btrfs_super_total_bytes(fs_info->super_copy));

        alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
        btrfs_drop_extent_cache(BTRFS_I(inode), start,
                        start + num_bytes - 1, 0);

        while (disk_num_bytes > 0) {
                unsigned long op;

                cur_alloc_size = disk_num_bytes;
                ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
                                           fs_info->sectorsize, 0, alloc_hint,
                                           &ins, 1, 1);
                if (ret < 0)
                        goto out_unlock;

                ram_size = ins.offset;
                em = create_io_em(inode, start, ins.offset, /* len */
                                  start, /* orig_start */
                                  ins.objectid, /* block_start */
                                  ins.offset, /* block_len */
                                  ins.offset, /* orig_block_len */
                                  ram_size, /* ram_bytes */
                                  BTRFS_COMPRESS_NONE, /* compress_type */
                                  BTRFS_ORDERED_REGULAR /* type */);
                if (IS_ERR(em))
                        goto out_reserve;
                free_extent_map(em);

                cur_alloc_size = ins.offset;
                ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
                                               ram_size, cur_alloc_size, 0);
                if (ret)
                        goto out_drop_extent_cache;

                if (root->root_key.objectid ==
                    BTRFS_DATA_RELOC_TREE_OBJECTID) {
                        ret = btrfs_reloc_clone_csums(inode, start,
                                                      cur_alloc_size);
                        if (ret)
                                goto out_drop_extent_cache;
                }

                btrfs_dec_block_group_reservations(fs_info, ins.objectid);

                if (disk_num_bytes < cur_alloc_size)
                        break;

                /*
                 * we're not doing compressed IO, don't unlock the first
                 * page (which the caller expects to stay locked), don't
                 * clear any dirty bits and don't set any writeback bits
                 *
                 * Do set the Private2 bit so we know this page was properly
                 * setup for writepage
                 */
                op = unlock ? PAGE_UNLOCK : 0;
                op |= PAGE_SET_PRIVATE2;

                extent_clear_unlock_delalloc(inode, start,
                                             start + ram_size - 1,
                                             delalloc_end, locked_page,
                                             EXTENT_LOCKED | EXTENT_DELALLOC,
                                             op);
                disk_num_bytes -= cur_alloc_size;
                num_bytes -= cur_alloc_size;
                alloc_hint = ins.objectid + ins.offset;
                start += cur_alloc_size;
        }
out:
        return ret;

out_drop_extent_cache:
        btrfs_drop_extent_cache(BTRFS_I(inode), start, start + ram_size - 1, 0);
out_reserve:
        btrfs_dec_block_group_reservations(fs_info, ins.objectid);
        btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_unlock:
        extent_clear_unlock_delalloc(inode, start, end, delalloc_end,
                                     locked_page,
                                     EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
                                     EXTENT_DELALLOC | EXTENT_DEFRAG,
                                     PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
                                     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK);
        goto out;
}

/*
 * work queue callback that starts compression on a range of file pages
 */
static noinline void async_cow_start(struct btrfs_work *work)
{
        struct async_cow *async_cow;
        int num_added = 0;
        async_cow = container_of(work, struct async_cow, work);

        compress_file_range(async_cow->inode, async_cow->locked_page,
                            async_cow->start, async_cow->end, async_cow,
                            &num_added);
        if (num_added == 0) {
                btrfs_add_delayed_iput(async_cow->inode);
                async_cow->inode = NULL;
        }
}

/*
 * work queue callback that submits previously compressed pages
 */
static noinline void async_cow_submit(struct btrfs_work *work)
{
        struct btrfs_fs_info *fs_info;
        struct async_cow *async_cow;
        struct btrfs_root *root;
        unsigned long nr_pages;

        async_cow = container_of(work, struct async_cow, work);

        root = async_cow->root;
        fs_info = root->fs_info;
        nr_pages = (async_cow->end - async_cow->start + PAGE_SIZE) >>
                PAGE_SHIFT;

        /*
         * atomic_sub_return implies a barrier for waitqueue_active
         */
        if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
            5 * SZ_1M &&
            waitqueue_active(&fs_info->async_submit_wait))
                wake_up(&fs_info->async_submit_wait);

        if (async_cow->inode)
                submit_compressed_extents(async_cow->inode, async_cow);
}

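/* work queue callback that frees the async_cow once both phases have run */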
static noinline void async_cow_free(struct btrfs_work *work)
{
        struct async_cow *async_cow;
        async_cow = container_of(work, struct async_cow, work);
        if (async_cow->inode)
                btrfs_add_delayed_iput(async_cow->inode);
        kfree(async_cow);
}

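/*
 * Split the range into async_cow work items (up to 512K each, unless
 * compression is disabled for the inode) and queue them so compression
 * runs across many CPUs.
 */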
static int cow_file_range_async(struct inode *inode, struct page *locked_page,
                                u64 start, u64 end, int *page_started,
                                unsigned long *nr_written)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct async_cow *async_cow;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        unsigned long nr_pages;
        u64 cur_end;

        clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
                         1, 0, NULL, GFP_NOFS);
        while (start < end) {
                async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
                BUG_ON(!async_cow); /* -ENOMEM */
                async_cow->inode = igrab(inode);
                async_cow->root = root;
                async_cow->locked_page = locked_page;
                async_cow->start = start;

                if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
                    !btrfs_test_opt(fs_info, FORCE_COMPRESS))
                        cur_end = end;
                else
                        cur_end = min(end, start + SZ_512K - 1);

                async_cow->end = cur_end;
                INIT_LIST_HEAD(&async_cow->extents);

                btrfs_init_work(&async_cow->work,
                                btrfs_delalloc_helper,
                                async_cow_start, async_cow_submit,
                                async_cow_free);

                nr_pages = (cur_end - start + PAGE_SIZE) >>
                        PAGE_SHIFT;
                atomic_add(nr_pages, &fs_info->async_delalloc_pages);

                btrfs_queue_work(fs_info->delalloc_workers, &async_cow->work);

                while (atomic_read(&fs_info->async_submit_draining) &&
                       atomic_read(&fs_info->async_delalloc_pages)) {
                        wait_event(fs_info->async_submit_wait,
                                   (atomic_read(&fs_info->async_delalloc_pages) ==
                                    0));
                }

                *nr_written += nr_pages;
                start = cur_end + 1;
        }
        *page_started = 1;
        return 0;
}

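/*
 * Return 1 if any checksums exist for the byte range, freeing the list
 * entries the lookup returned; return 0 if the range has no csums.
 */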
static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
                                        u64 bytenr, u64 num_bytes)
{
        int ret;
        struct btrfs_ordered_sum *sums;
        LIST_HEAD(list);

        ret = btrfs_lookup_csums_range(fs_info->csum_root, bytenr,
                                       bytenr + num_bytes - 1, &list, 0);
        if (ret == 0 && list_empty(&list))
                return 0;

        while (!list_empty(&list)) {
                sums = list_entry(list.next, struct btrfs_ordered_sum, list);
                list_del(&sums->list);
                kfree(sums);
        }
        return 1;
}

/*
 * called back for the nocow writeback path.  This checks for snapshots
 * or COW copies of the extents that exist in the file, and COWs the
 * file as required.
 *
 * If no COW copies or snapshots exist, we write directly to the existing
 * blocks on disk
 */
1185 static noinline int run_delalloc_nocow(struct inode *inode,
1186                                        struct page *locked_page,
1187                               u64 start, u64 end, int *page_started, int force,
1188                               unsigned long *nr_written)
1189 {
1190         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1191         struct btrfs_root *root = BTRFS_I(inode)->root;
1192         struct extent_buffer *leaf;
1193         struct btrfs_path *path;
1194         struct btrfs_file_extent_item *fi;
1195         struct btrfs_key found_key;
1196         struct extent_map *em;
1197         u64 cow_start;
1198         u64 cur_offset;
1199         u64 extent_end;
1200         u64 extent_offset;
1201         u64 disk_bytenr;
1202         u64 num_bytes;
1203         u64 disk_num_bytes;
1204         u64 ram_bytes;
1205         int extent_type;
1206         int ret, err;
1207         int type;
1208         int nocow;
1209         int check_prev = 1;
1210         bool nolock;
1211         u64 ino = btrfs_ino(BTRFS_I(inode));
1212
1213         path = btrfs_alloc_path();
1214         if (!path) {
1215                 extent_clear_unlock_delalloc(inode, start, end, end,
1216                                              locked_page,
1217                                              EXTENT_LOCKED | EXTENT_DELALLOC |
1218                                              EXTENT_DO_ACCOUNTING |
1219                                              EXTENT_DEFRAG, PAGE_UNLOCK |
1220                                              PAGE_CLEAR_DIRTY |
1221                                              PAGE_SET_WRITEBACK |
1222                                              PAGE_END_WRITEBACK);
1223                 return -ENOMEM;
1224         }
1225
1226         nolock = btrfs_is_free_space_inode(BTRFS_I(inode));
1227
1228         cow_start = (u64)-1;
1229         cur_offset = start;
1230         while (1) {
1231                 ret = btrfs_lookup_file_extent(NULL, root, path, ino,
1232                                                cur_offset, 0);
1233                 if (ret < 0)
1234                         goto error;
1235                 if (ret > 0 && path->slots[0] > 0 && check_prev) {
1236                         leaf = path->nodes[0];
1237                         btrfs_item_key_to_cpu(leaf, &found_key,
1238                                               path->slots[0] - 1);
1239                         if (found_key.objectid == ino &&
1240                             found_key.type == BTRFS_EXTENT_DATA_KEY)
1241                                 path->slots[0]--;
1242                 }
1243                 check_prev = 0;
1244 next_slot:
1245                 leaf = path->nodes[0];
1246                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1247                         ret = btrfs_next_leaf(root, path);
1248                         if (ret < 0)
1249                                 goto error;
1250                         if (ret > 0)
1251                                 break;
1252                         leaf = path->nodes[0];
1253                 }
1254
1255                 nocow = 0;
1256                 disk_bytenr = 0;
1257                 num_bytes = 0;
1258                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1259
1260                 if (found_key.objectid > ino)
1261                         break;
1262                 if (WARN_ON_ONCE(found_key.objectid < ino) ||
1263                     found_key.type < BTRFS_EXTENT_DATA_KEY) {
1264                         path->slots[0]++;
1265                         goto next_slot;
1266                 }
1267                 if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
1268                     found_key.offset > end)
1269                         break;
1270
1271                 if (found_key.offset > cur_offset) {
1272                         extent_end = found_key.offset;
1273                         extent_type = 0;
1274                         goto out_check;
1275                 }
1276
1277                 fi = btrfs_item_ptr(leaf, path->slots[0],
1278                                     struct btrfs_file_extent_item);
1279                 extent_type = btrfs_file_extent_type(leaf, fi);
1280
1281                 ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
1282                 if (extent_type == BTRFS_FILE_EXTENT_REG ||
1283                     extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1284                         disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1285                         extent_offset = btrfs_file_extent_offset(leaf, fi);
1286                         extent_end = found_key.offset +
1287                                 btrfs_file_extent_num_bytes(leaf, fi);
1288                         disk_num_bytes =
1289                                 btrfs_file_extent_disk_num_bytes(leaf, fi);
1290                         if (extent_end <= start) {
1291                                 path->slots[0]++;
1292                                 goto next_slot;
1293                         }
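                        /* a zero disk_bytenr is a hole, which must be COWed */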
1294                         if (disk_bytenr == 0)
1295                                 goto out_check;
1296                         if (btrfs_file_extent_compression(leaf, fi) ||
1297                             btrfs_file_extent_encryption(leaf, fi) ||
1298                             btrfs_file_extent_other_encoding(leaf, fi))
1299                                 goto out_check;
1300                         if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
1301                                 goto out_check;
1302                         if (btrfs_extent_readonly(fs_info, disk_bytenr))
1303                                 goto out_check;
1304                         if (btrfs_cross_ref_exist(root, ino,
1305                                                   found_key.offset -
1306                                                   extent_offset, disk_bytenr))
1307                                 goto out_check;
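                        /* point disk_bytenr at the data for cur_offset */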
1308                         disk_bytenr += extent_offset;
1309                         disk_bytenr += cur_offset - found_key.offset;
1310                         num_bytes = min(end + 1, extent_end) - cur_offset;
1311                         /*
1312                          * if there are pending snapshots for this root,
1313                          * we fall back to the common COW path.
1314                          */
1315                         if (!nolock) {
1316                                 err = btrfs_start_write_no_snapshoting(root);
1317                                 if (!err)
1318                                         goto out_check;
1319                         }
1320                         /*
1321                          * force COW if a csum exists in the range.
1322                          * this ensures that the csums for a given extent
1323                          * are either all valid or do not exist.
1324                          */
1325                         if (csum_exist_in_range(fs_info, disk_bytenr,
1326                                                 num_bytes)) {
1327                                 if (!nolock)
1328                                         btrfs_end_write_no_snapshoting(root);
1329                                 goto out_check;
1330                         }
1331                         if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr)) {
1332                                 if (!nolock)
1333                                         btrfs_end_write_no_snapshoting(root);
1334                                 goto out_check;
1335                         }
1336                         nocow = 1;
1337                 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1338                         extent_end = found_key.offset +
1339                                 btrfs_file_extent_inline_len(leaf,
1340                                                      path->slots[0], fi);
1341                         extent_end = ALIGN(extent_end,
1342                                            fs_info->sectorsize);
1343                 } else {
1344                         BUG_ON(1);
1345                 }
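                /*
                 * at out_check, @nocow says whether the current range can be
                 * written in place; ranges that cannot are accumulated from
                 * @cow_start and handed to cow_file_range() below
                 */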
1346 out_check:
1347                 if (extent_end <= start) {
1348                         path->slots[0]++;
1349                         if (!nolock && nocow)
1350                                 btrfs_end_write_no_snapshoting(root);
1351                         if (nocow)
1352                                 btrfs_dec_nocow_writers(fs_info, disk_bytenr);
1353                         goto next_slot;
1354                 }
1355                 if (!nocow) {
1356                         if (cow_start == (u64)-1)
1357                                 cow_start = cur_offset;
1358                         cur_offset = extent_end;
1359                         if (cur_offset > end)
1360                                 break;
1361                         path->slots[0]++;
1362                         goto next_slot;
1363                 }
1364
1365                 btrfs_release_path(path);
1366                 if (cow_start != (u64)-1) {
1367                         ret = cow_file_range(inode, locked_page,
1368                                              cow_start, found_key.offset - 1,
1369                                              end, page_started, nr_written, 1,
1370                                              NULL);
1371                         if (ret) {
1372                                 if (!nolock && nocow)
1373                                         btrfs_end_write_no_snapshoting(root);
1374                                 if (nocow)
1375                                         btrfs_dec_nocow_writers(fs_info,
1376                                                                 disk_bytenr);
1377                                 goto error;
1378                         }
1379                         cow_start = (u64)-1;
1380                 }
1381
1382                 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1383                         u64 orig_start = found_key.offset - extent_offset;
1384
1385                         em = create_io_em(inode, cur_offset, num_bytes,
1386                                           orig_start,
1387                                           disk_bytenr, /* block_start */
1388                                           num_bytes, /* block_len */
1389                                           disk_num_bytes, /* orig_block_len */
1390                                           ram_bytes, BTRFS_COMPRESS_NONE,
1391                                           BTRFS_ORDERED_PREALLOC);
1392                         if (IS_ERR(em)) {
1393                                 if (!nolock && nocow)
1394                                         btrfs_end_write_no_snapshoting(root);
1395                                 if (nocow)
1396                                         btrfs_dec_nocow_writers(fs_info,
1397                                                                 disk_bytenr);
1398                                 ret = PTR_ERR(em);
1399                                 goto error;
1400                         }
1401                         free_extent_map(em);
1402                 }
1403
1404                 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1405                         type = BTRFS_ORDERED_PREALLOC;
1406                 } else {
1407                         type = BTRFS_ORDERED_NOCOW;
1408                 }
1409
1410                 ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
1411                                                num_bytes, num_bytes, type);
1412                 if (nocow)
1413                         btrfs_dec_nocow_writers(fs_info, disk_bytenr);
1414                 BUG_ON(ret); /* -ENOMEM */
1415
1416                 if (root->root_key.objectid ==
1417                     BTRFS_DATA_RELOC_TREE_OBJECTID) {
1418                         ret = btrfs_reloc_clone_csums(inode, cur_offset,
1419                                                       num_bytes);
1420                         if (ret) {
1421                                 if (!nolock && nocow)
1422                                         btrfs_end_write_no_snapshoting(root);
1423                                 goto error;
1424                         }
1425                 }
1426
1427                 extent_clear_unlock_delalloc(inode, cur_offset,
1428                                              cur_offset + num_bytes - 1, end,
1429                                              locked_page, EXTENT_LOCKED |
1430                                              EXTENT_DELALLOC |
1431                                              EXTENT_CLEAR_DATA_RESV,
1432                                              PAGE_UNLOCK | PAGE_SET_PRIVATE2);
1433
1434                 if (!nolock && nocow)
1435                         btrfs_end_write_no_snapshoting(root);
1436                 cur_offset = extent_end;
1437                 if (cur_offset > end)
1438                         break;
1439         }
1440         btrfs_release_path(path);
1441
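        /*
         * the loop may have stopped before reaching @end; make sure the
         * tail of the range is COWed as well
         */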
1442         if (cur_offset <= end && cow_start == (u64)-1) {
1443                 cow_start = cur_offset;
1444                 cur_offset = end;
1445         }
1446
1447         if (cow_start != (u64)-1) {
1448                 ret = cow_file_range(inode, locked_page, cow_start, end, end,
1449                                      page_started, nr_written, 1, NULL);
1450                 if (ret)
1451                         goto error;
1452         }
1453
1454 error:
1455         if (ret && cur_offset < end)
1456                 extent_clear_unlock_delalloc(inode, cur_offset, end, end,
1457                                              locked_page, EXTENT_LOCKED |
1458                                              EXTENT_DELALLOC | EXTENT_DEFRAG |
1459                                              EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
1460                                              PAGE_CLEAR_DIRTY |
1461                                              PAGE_SET_WRITEBACK |
1462                                              PAGE_END_WRITEBACK);
1463         btrfs_free_path(path);
1464         return ret;
1465 }
1466
1467 static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
1468 {
1470         if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
1471             !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC))
1472                 return 0;
1473
1474         /*
1475          * @defrag_bytes is only a hint (no spinlock is held here);
1476          * if it is non-zero, the file is being defragged.
1477          * Force COW if the given range needs to be defragged.
1478          */
1479         if (BTRFS_I(inode)->defrag_bytes &&
1480             test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
1481                            EXTENT_DEFRAG, 0, NULL))
1482                 return 1;
1483
1484         return 0;
1485 }
1486
1487 /*
1488  * extent_io.c callback to do delayed allocation processing
1489  */
1490 static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1491                               u64 start, u64 end, int *page_started,
1492                               unsigned long *nr_written)
1493 {
1494         int ret;
1495         int force_cow = need_force_cow(inode, start, end);
1496
1497         if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) {
1498                 ret = run_delalloc_nocow(inode, locked_page, start, end,
1499                                          page_started, 1, nr_written);
1500         } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
1501                 ret = run_delalloc_nocow(inode, locked_page, start, end,
1502                                          page_started, 0, nr_written);
1503         } else if (!inode_need_compress(inode)) {
1504                 ret = cow_file_range(inode, locked_page, start, end, end,
1505                                       page_started, nr_written, 1, NULL);
1506         } else {
1507                 set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1508                         &BTRFS_I(inode)->runtime_flags);
1509                 ret = cow_file_range_async(inode, locked_page, start, end,
1510                                            page_started, nr_written);
1511         }
1512         return ret;
1513 }
1514
1515 static void btrfs_split_extent_hook(struct inode *inode,
1516                                     struct extent_state *orig, u64 split)
1517 {
1518         u64 size;
1519
1520         /* not delalloc, ignore it */
1521         if (!(orig->state & EXTENT_DELALLOC))
1522                 return;
1523
1524         size = orig->end - orig->start + 1;
1525         if (size > BTRFS_MAX_EXTENT_SIZE) {
1526                 u32 num_extents;
1527                 u64 new_size;
1528
1529                 /*
1530                  * See the explanation in btrfs_merge_extent_hook; the same
1531                  * applies here, just in reverse.
1532                  */
1533                 new_size = orig->end - split + 1;
1534                 num_extents = count_max_extents(new_size);
1535                 new_size = split - orig->start;
1536                 num_extents += count_max_extents(new_size);
1537                 if (count_max_extents(size) >= num_extents)
1538                         return;
1539         }
1540
1541         spin_lock(&BTRFS_I(inode)->lock);
1542         BTRFS_I(inode)->outstanding_extents++;
1543         spin_unlock(&BTRFS_I(inode)->lock);
1544 }
1545
1546 /*
1547  * extent_io.c merge_extent_hook, used to track merged delayed allocation
1548  * extents, so that when new extents are merged onto old ones (as happens
1549  * during sequential writes) we can properly account for the metadata
1550  * space we'll need.
1551  */
1552 static void btrfs_merge_extent_hook(struct inode *inode,
1553                                     struct extent_state *new,
1554                                     struct extent_state *other)
1555 {
1556         u64 new_size, old_size;
1557         u32 num_extents;
1558
1559         /* not delalloc, ignore it */
1560         if (!(other->state & EXTENT_DELALLOC))
1561                 return;
1562
1563         if (new->start > other->start)
1564                 new_size = new->end - other->start + 1;
1565         else
1566                 new_size = other->end - new->start + 1;
1567
1568         /* we're not bigger than the max, unreserve the space and go */
1569         if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
1570                 spin_lock(&BTRFS_I(inode)->lock);
1571                 BTRFS_I(inode)->outstanding_extents--;
1572                 spin_unlock(&BTRFS_I(inode)->lock);
1573                 return;
1574         }
1575
1576         /*
1577          * We have to add up either side to figure out how many extents were
1578          * accounted for before we merged into one big extent.  If the number of
1579          * extents we accounted for is <= the amount we need for the new range
1580          * then we can return, otherwise drop.  Think of it like this
1581          *
1582          * [ 4k][MAX_SIZE]
1583          *
1584          * So we've grown the extent by a MAX_SIZE extent, this would mean we
1585          * need 2 outstanding extents, on one side we have 1 and the other side
1586          * we have 1 so they are == and we can return.  But in this case
1587          *
1588          * [MAX_SIZE+4k][MAX_SIZE+4k]
1589          *
1590          * Each range on their own accounts for 2 extents, but merged together
1591          * they are only 3 extents worth of accounting, so we need to drop in
1592          * this case.
1593          */
1594         old_size = other->end - other->start + 1;
1595         num_extents = count_max_extents(old_size);
1596         old_size = new->end - new->start + 1;
1597         num_extents += count_max_extents(old_size);
1598         if (count_max_extents(new_size) >= num_extents)
1599                 return;
1600
1601         spin_lock(&BTRFS_I(inode)->lock);
1602         BTRFS_I(inode)->outstanding_extents--;
1603         spin_unlock(&BTRFS_I(inode)->lock);
1604 }
1605
1606 static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
1607                                       struct inode *inode)
1608 {
1609         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1610
1611         spin_lock(&root->delalloc_lock);
1612         if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1613                 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1614                               &root->delalloc_inodes);
1615                 set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1616                         &BTRFS_I(inode)->runtime_flags);
1617                 root->nr_delalloc_inodes++;
1618                 if (root->nr_delalloc_inodes == 1) {
1619                         spin_lock(&fs_info->delalloc_root_lock);
1620                         BUG_ON(!list_empty(&root->delalloc_root));
1621                         list_add_tail(&root->delalloc_root,
1622                                       &fs_info->delalloc_roots);
1623                         spin_unlock(&fs_info->delalloc_root_lock);
1624                 }
1625         }
1626         spin_unlock(&root->delalloc_lock);
1627 }
1628
1629 static void btrfs_del_delalloc_inode(struct btrfs_root *root,
1630                                      struct btrfs_inode *inode)
1631 {
1632         struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
1633
1634         spin_lock(&root->delalloc_lock);
1635         if (!list_empty(&inode->delalloc_inodes)) {
1636                 list_del_init(&inode->delalloc_inodes);
1637                 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1638                           &inode->runtime_flags);
1639                 root->nr_delalloc_inodes--;
1640                 if (!root->nr_delalloc_inodes) {
1641                         spin_lock(&fs_info->delalloc_root_lock);
1642                         BUG_ON(list_empty(&root->delalloc_root));
1643                         list_del_init(&root->delalloc_root);
1644                         spin_unlock(&fs_info->delalloc_root_lock);
1645                 }
1646         }
1647         spin_unlock(&root->delalloc_lock);
1648 }
1649
1650 /*
1651  * extent_io.c set_bit_hook, used to track delayed allocation
1652  * bytes in this file, and to maintain the list of inodes that
1653  * have pending delalloc work to be done.
1654  */
1655 static void btrfs_set_bit_hook(struct inode *inode,
1656                                struct extent_state *state, unsigned *bits)
1657 {
1659         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1660
1661         if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
1662                 WARN_ON(1);
1663         /*
1664          * set_bit and clear_bit hooks normally require _irqsave/restore
1665          * but in this case, we are only testing for the DELALLOC
1666          * bit, which is only set or cleared with irqs on
1667          */
1668         if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1669                 struct btrfs_root *root = BTRFS_I(inode)->root;
1670                 u64 len = state->end + 1 - state->start;
1671                 bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode));
1672
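                /*
                 * EXTENT_FIRST_DELALLOC means the metadata reservation for
                 * this range has already counted one outstanding extent, so
                 * it must not be counted again here
                 */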
1673                 if (*bits & EXTENT_FIRST_DELALLOC) {
1674                         *bits &= ~EXTENT_FIRST_DELALLOC;
1675                 } else {
1676                         spin_lock(&BTRFS_I(inode)->lock);
1677                         BTRFS_I(inode)->outstanding_extents++;
1678                         spin_unlock(&BTRFS_I(inode)->lock);
1679                 }
1680
1681                 /* For sanity tests */
1682                 if (btrfs_is_testing(fs_info))
1683                         return;
1684
1685                 __percpu_counter_add(&fs_info->delalloc_bytes, len,
1686                                      fs_info->delalloc_batch);
1687                 spin_lock(&BTRFS_I(inode)->lock);
1688                 BTRFS_I(inode)->delalloc_bytes += len;
1689                 if (*bits & EXTENT_DEFRAG)
1690                         BTRFS_I(inode)->defrag_bytes += len;
1691                 if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1692                                          &BTRFS_I(inode)->runtime_flags))
1693                         btrfs_add_delalloc_inodes(root, inode);
1694                 spin_unlock(&BTRFS_I(inode)->lock);
1695         }
1696 }
1697
1698 /*
1699  * extent_io.c clear_bit_hook, see set_bit_hook for why
1700  */
1701 static void btrfs_clear_bit_hook(struct btrfs_inode *inode,
1702                                  struct extent_state *state,
1703                                  unsigned *bits)
1704 {
1705         struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
1706         u64 len = state->end + 1 - state->start;
1707         u32 num_extents = count_max_extents(len);
1708
1709         spin_lock(&inode->lock);
1710         if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG))
1711                 inode->defrag_bytes -= len;
1712         spin_unlock(&inode->lock);
1713
1714         /*
1715          * set_bit and clear_bit hooks normally require _irqsave/restore
1716          * but in this case, we are only testing for the DELALLOC
1717          * bit, which is only set or cleared with irqs on
1718          */
1719         if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1720                 struct btrfs_root *root = inode->root;
1721                 bool do_list = !btrfs_is_free_space_inode(inode);
1722
1723                 if (*bits & EXTENT_FIRST_DELALLOC) {
1724                         *bits &= ~EXTENT_FIRST_DELALLOC;
1725                 } else if (!(*bits & EXTENT_DO_ACCOUNTING)) {
1726                         spin_lock(&inode->lock);
1727                         inode->outstanding_extents -= num_extents;
1728                         spin_unlock(&inode->lock);
1729                 }
1730
1731                 /*
1732                  * We don't reserve metadata space for space cache inodes so we
1733          * don't need to call delalloc_release_metadata if there is an
1734                  * error.
1735                  */
1736                 if (*bits & EXTENT_DO_ACCOUNTING &&
1737                     root != fs_info->tree_root)
1738                         btrfs_delalloc_release_metadata(inode, len);
1739
1740                 /* For sanity tests. */
1741                 if (btrfs_is_testing(fs_info))
1742                         return;
1743
1744                 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
1745                     && do_list && !(state->state & EXTENT_NORESERVE)
1746                     && (*bits & (EXTENT_DO_ACCOUNTING |
1747                     EXTENT_CLEAR_DATA_RESV)))
1748                         btrfs_free_reserved_data_space_noquota(
1749                                         &inode->vfs_inode,
1750                                         state->start, len);
1751
1752                 __percpu_counter_add(&fs_info->delalloc_bytes, -len,
1753                                      fs_info->delalloc_batch);
1754                 spin_lock(&inode->lock);
1755                 inode->delalloc_bytes -= len;
1756                 if (do_list && inode->delalloc_bytes == 0 &&
1757                     test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1758                                         &inode->runtime_flags))
1759                         btrfs_del_delalloc_inode(root, inode);
1760                 spin_unlock(&inode->lock);
1761         }
1762 }
1763
1764 /*
1765  * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
1766  * we don't create bios that span stripes or chunks
1767  *
1768  * return 1 if the page cannot be merged into the bio
1769  * return 0 if the page can be merged into the bio
1770  * return an error otherwise
1771  */
1772 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1773                          size_t size, struct bio *bio,
1774                          unsigned long bio_flags)
1775 {
1776         struct inode *inode = page->mapping->host;
1777         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1778         u64 logical = (u64)bio->bi_iter.bi_sector << 9;
1779         u64 length = 0;
1780         u64 map_length;
1781         int ret;
1782
1783         if (bio_flags & EXTENT_BIO_COMPRESSED)
1784                 return 0;
1785
1786         length = bio->bi_iter.bi_size;
1787         map_length = length;
1788         ret = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
1789                               NULL, 0);
1790         if (ret < 0)
1791                 return ret;
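        /* refuse pages that would make the bio cross a stripe or chunk */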
1792         if (map_length < length + size)
1793                 return 1;
1794         return 0;
1795 }
1796
1797 /*
1798  * in order to insert checksums into the metadata in large chunks,
1799  * we wait until bio submission time.  All the pages in the bio are
1800  * checksummed and sums are attached onto the ordered extent record.
1801  *
1802  * At IO completion time the csums attached to the ordered extent record
1803  * are inserted into the btree.
1804  */
1805 static int __btrfs_submit_bio_start(struct inode *inode, struct bio *bio,
1806                                     int mirror_num, unsigned long bio_flags,
1807                                     u64 bio_offset)
1808 {
1809         int ret = 0;
1810
1811         ret = btrfs_csum_one_bio(inode, bio, 0, 0);
1812         BUG_ON(ret); /* -ENOMEM */
1813         return 0;
1814 }
1815
1816 /*
1817  * The async checksumming kicked off in __btrfs_submit_bio_start has
1818  * finished; now actually map the bio and submit it to the devices.
1819  */
1824 static int __btrfs_submit_bio_done(struct inode *inode, struct bio *bio,
1825                           int mirror_num, unsigned long bio_flags,
1826                           u64 bio_offset)
1827 {
1828         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1829         int ret;
1830
1831         ret = btrfs_map_bio(fs_info, bio, mirror_num, 1);
1832         if (ret) {
1833                 bio->bi_error = ret;
1834                 bio_endio(bio);
1835         }
1836         return ret;
1837 }
1838
1839 /*
1840  * extent_io.c submission hook. This does the right thing for csum calculation
1841  * on write, or for reading the csums from the tree before a read.
1842  */
1843 static int btrfs_submit_bio_hook(struct inode *inode, struct bio *bio,
1844                           int mirror_num, unsigned long bio_flags,
1845                           u64 bio_offset)
1846 {
1847         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1848         struct btrfs_root *root = BTRFS_I(inode)->root;
1849         enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
1850         int ret = 0;
1851         int skip_sum;
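        /* skip the async csum workers while a sync writer is waiting */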
1852         int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
1853
1854         skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
1855
1856         if (btrfs_is_free_space_inode(BTRFS_I(inode)))
1857                 metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
1858
1859         if (bio_op(bio) != REQ_OP_WRITE) {
1860                 ret = btrfs_bio_wq_end_io(fs_info, bio, metadata);
1861                 if (ret)
1862                         goto out;
1863
1864                 if (bio_flags & EXTENT_BIO_COMPRESSED) {
1865                         ret = btrfs_submit_compressed_read(inode, bio,
1866                                                            mirror_num,
1867                                                            bio_flags);
1868                         goto out;
1869                 } else if (!skip_sum) {
1870                         ret = btrfs_lookup_bio_sums(inode, bio, NULL);
1871                         if (ret)
1872                                 goto out;
1873                 }
1874                 goto mapit;
1875         } else if (async && !skip_sum) {
1876                 /* csum items have already been cloned */
1877                 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
1878                         goto mapit;
1879                 /* we're doing a write, do the async checksumming */
1880                 ret = btrfs_wq_submit_bio(fs_info, inode, bio, mirror_num,
1881                                           bio_flags, bio_offset,
1882                                           __btrfs_submit_bio_start,
1883                                           __btrfs_submit_bio_done);
1884                 goto out;
1885         } else if (!skip_sum) {
1886                 ret = btrfs_csum_one_bio(inode, bio, 0, 0);
1887                 if (ret)
1888                         goto out;
1889         }
1890
1891 mapit:
1892         ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
1893
1894 out:
1895         if (ret < 0) {
1896                 bio->bi_error = ret;
1897                 bio_endio(bio);
1898         }
1899         return ret;
1900 }
1901
1902 /*
1903  * given a list of ordered sums, record them in the inode.  This happens
1904  * at IO completion time based on sums calculated at bio submission time.
1905  */
1906 static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
1907                              struct inode *inode, struct list_head *list)
1908 {
1909         struct btrfs_ordered_sum *sum;
1910
1911         list_for_each_entry(sum, list, list) {
1912                 trans->adding_csums = 1;
1913                 btrfs_csum_file_blocks(trans,
1914                        BTRFS_I(inode)->root->fs_info->csum_root, sum);
1915                 trans->adding_csums = 0;
1916         }
1917         return 0;
1918 }
1919
1920 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
1921                               struct extent_state **cached_state, int dedupe)
1922 {
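        /* @end is the inclusive last byte, so it should never be page aligned */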
1923         WARN_ON((end & (PAGE_SIZE - 1)) == 0);
1924         return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
1925                                    cached_state);
1926 }
1927
1928 /* see btrfs_writepage_start_hook for details on why this is required */
1929 struct btrfs_writepage_fixup {
1930         struct page *page;
1931         struct btrfs_work work;
1932 };
1933
1934 static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
1935 {
1936         struct btrfs_writepage_fixup *fixup;
1937         struct btrfs_ordered_extent *ordered;
1938         struct extent_state *cached_state = NULL;
1939         struct page *page;
1940         struct inode *inode;
1941         u64 page_start;
1942         u64 page_end;
1943         int ret;
1944
1945         fixup = container_of(work, struct btrfs_writepage_fixup, work);
1946         page = fixup->page;
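        /*
         * lock the page and re-check: writeback may have handled it already,
         * and any ordered extent that raced in must complete before we
         * set up delalloc and redirty the page
         */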
1947 again:
1948         lock_page(page);
1949         if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
1950                 ClearPageChecked(page);
1951                 goto out_page;
1952         }
1953
1954         inode = page->mapping->host;
1955         page_start = page_offset(page);
1956         page_end = page_offset(page) + PAGE_SIZE - 1;
1957
1958         lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
1959                          &cached_state);
1960
1961         /* already ordered? We're done */
1962         if (PagePrivate2(page))
1963                 goto out;
1964
1965         ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
1966                                         PAGE_SIZE);
1967         if (ordered) {
1968                 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
1969                                      page_end, &cached_state, GFP_NOFS);
1970                 unlock_page(page);
1971                 btrfs_start_ordered_extent(inode, ordered, 1);
1972                 btrfs_put_ordered_extent(ordered);
1973                 goto again;
1974         }
1975
1976         ret = btrfs_delalloc_reserve_space(inode, page_start,
1977                                            PAGE_SIZE);
1978         if (ret) {
1979                 mapping_set_error(page->mapping, ret);
1980                 end_extent_writepage(page, ret, page_start, page_end);
1981                 ClearPageChecked(page);
1982                 goto out;
1983         }
1984
1985         btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state,
1986                                   0);
1987         ClearPageChecked(page);
1988         set_page_dirty(page);
1989 out:
1990         unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
1991                              &cached_state, GFP_NOFS);
1992 out_page:
1993         unlock_page(page);
1994         put_page(page);
1995         kfree(fixup);
1996 }
1997
1998 /*
1999  * There are a few paths in the higher layers of the kernel that directly
2000  * set the page dirty bit without asking the filesystem if it is a
2001  * good idea.  This causes problems because we want to make sure COW
2002  * properly happens and the data=ordered rules are followed.
2003  *
2004  * In our case any range that doesn't have the ORDERED bit set
2005  * hasn't been properly setup for IO.  We kick off an async process
2006  * to fix it up.  The async helper will wait for ordered extents, set
2007  * the delalloc bit and make it safe to write the page.
2008  */
2009 static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
2010 {
2011         struct inode *inode = page->mapping->host;
2012         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2013         struct btrfs_writepage_fixup *fixup;
2014
2015         /* this page is properly in the ordered list */
2016         if (TestClearPagePrivate2(page))
2017                 return 0;
2018
2019         if (PageChecked(page))
2020                 return -EAGAIN;
2021
2022         fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
2023         if (!fixup)
2024                 return -EAGAIN;
2025
2026         SetPageChecked(page);
2027         get_page(page);
2028         btrfs_init_work(&fixup->work, btrfs_fixup_helper,
2029                         btrfs_writepage_fixup_worker, NULL, NULL);
2030         fixup->page = page;
2031         btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
2032         return -EBUSY;
2033 }
2034
2035 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
2036                                        struct inode *inode, u64 file_pos,
2037                                        u64 disk_bytenr, u64 disk_num_bytes,
2038                                        u64 num_bytes, u64 ram_bytes,
2039                                        u8 compression, u8 encryption,
2040                                        u16 other_encoding, int extent_type)
2041 {
2042         struct btrfs_root *root = BTRFS_I(inode)->root;
2043         struct btrfs_file_extent_item *fi;
2044         struct btrfs_path *path;
2045         struct extent_buffer *leaf;
2046         struct btrfs_key ins;
2047         int extent_inserted = 0;
2048         int ret;
2049
2050         path = btrfs_alloc_path();
2051         if (!path)
2052                 return -ENOMEM;
2053
2054         /*
2055          * we may be replacing one extent in the tree with another.
2056          * The new extent is pinned in the extent map, and we don't want
2057          * to drop it from the cache until it is completely in the btree.
2058          *
2059          * So, tell btrfs_drop_extents to leave this extent in the cache.
2060          * The caller is expected to unpin it and allow it to be merged
2061          * with the others.
2062          */
2063         ret = __btrfs_drop_extents(trans, root, inode, path, file_pos,
2064                                    file_pos + num_bytes, NULL, 0,
2065                                    1, sizeof(*fi), &extent_inserted);
2066         if (ret)
2067                 goto out;
2068
2069         if (!extent_inserted) {
2070                 ins.objectid = btrfs_ino(BTRFS_I(inode));
2071                 ins.offset = file_pos;
2072                 ins.type = BTRFS_EXTENT_DATA_KEY;
2073
2074                 path->leave_spinning = 1;
2075                 ret = btrfs_insert_empty_item(trans, root, path, &ins,
2076                                               sizeof(*fi));
2077                 if (ret)
2078                         goto out;
2079         }
2080         leaf = path->nodes[0];
2081         fi = btrfs_item_ptr(leaf, path->slots[0],
2082                             struct btrfs_file_extent_item);
2083         btrfs_set_file_extent_generation(leaf, fi, trans->transid);
2084         btrfs_set_file_extent_type(leaf, fi, extent_type);
2085         btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
2086         btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
2087         btrfs_set_file_extent_offset(leaf, fi, 0);
2088         btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2089         btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
2090         btrfs_set_file_extent_compression(leaf, fi, compression);
2091         btrfs_set_file_extent_encryption(leaf, fi, encryption);
2092         btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
2093
2094         btrfs_mark_buffer_dirty(leaf);
2095         btrfs_release_path(path);
2096
2097         inode_add_bytes(inode, num_bytes);
2098
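        /*
         * record the allocation in the extent tree, keyed by disk bytenr,
         * and add the backref for this file extent
         */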
2099         ins.objectid = disk_bytenr;
2100         ins.offset = disk_num_bytes;
2101         ins.type = BTRFS_EXTENT_ITEM_KEY;
2102         ret = btrfs_alloc_reserved_file_extent(trans, root->root_key.objectid,
2103                         btrfs_ino(BTRFS_I(inode)), file_pos, ram_bytes, &ins);
2104         /*
2105          * Release the reserved range from the inode's dirty range map, as
2106          * it has already been moved into the delayed_ref_head
2107          */
2108         btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
2109 out:
2110         btrfs_free_path(path);
2111
2112         return ret;
2113 }
2114
2115 /* snapshot-aware defrag */
2116 struct sa_defrag_extent_backref {
2117         struct rb_node node;
2118         struct old_sa_defrag_extent *old;
2119         u64 root_id;
2120         u64 inum;
2121         u64 file_pos;
2122         u64 extent_offset;
2123         u64 num_bytes;
2124         u64 generation;
2125 };
2126
2127 struct old_sa_defrag_extent {
2128         struct list_head list;
2129         struct new_sa_defrag_extent *new;
2130
2131         u64 extent_offset;
2132         u64 bytenr;
2133         u64 offset;
2134         u64 len;
2135         int count;
2136 };
2137
2138 struct new_sa_defrag_extent {
2139         struct rb_root root;
2140         struct list_head head;
2141         struct btrfs_path *path;
2142         struct inode *inode;
2143         u64 file_pos;
2144         u64 len;
2145         u64 bytenr;
2146         u64 disk_len;
2147         u8 compress_type;
2148 };
2149
2150 static int backref_comp(struct sa_defrag_extent_backref *b1,
2151                         struct sa_defrag_extent_backref *b2)
2152 {
2153         if (b1->root_id < b2->root_id)
2154                 return -1;
2155         else if (b1->root_id > b2->root_id)
2156                 return 1;
2157
2158         if (b1->inum < b2->inum)
2159                 return -1;
2160         else if (b1->inum > b2->inum)
2161                 return 1;
2162
2163         if (b1->file_pos < b2->file_pos)
2164                 return -1;
2165         else if (b1->file_pos > b2->file_pos)
2166                 return 1;
2167
2168         /*
2169          * [------------------------------] ===> (a range of space)
2170          *     |<--->|   |<---->| =============> (fs/file tree A)
2171          * |<---------------------------->| ===> (fs/file tree B)
2172          *
2173          * A range of space can refer to two file extents in one tree while
2174          * referring to only one file extent in another tree.
2175          *
2176          * So we may process a disk offset more than once (two extents in A)
2177          * that lands in the same extent (one extent in B), and then insert
2178          * two identical backrefs (both referring to the extent in B).
2179          */
2180         return 0;
2181 }
2182
2183 static void backref_insert(struct rb_root *root,
2184                            struct sa_defrag_extent_backref *backref)
2185 {
2186         struct rb_node **p = &root->rb_node;
2187         struct rb_node *parent = NULL;
2188         struct sa_defrag_extent_backref *entry;
2189         int ret;
2190
2191         while (*p) {
2192                 parent = *p;
2193                 entry = rb_entry(parent, struct sa_defrag_extent_backref, node);
2194
2195                 ret = backref_comp(backref, entry);
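                /* equal backrefs (ret == 0) are expected and go to the right */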
2196                 if (ret < 0)
2197                         p = &(*p)->rb_left;
2198                 else
2199                         p = &(*p)->rb_right;
2200         }
2201
2202         rb_link_node(&backref->node, parent, p);
2203         rb_insert_color(&backref->node, root);
2204 }
2205
2206 /*
2207  * Note the backref might have changed; in that case we just return 0.
2208  */
2209 static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
2210                                        void *ctx)
2211 {
2212         struct btrfs_file_extent_item *extent;
2213         struct old_sa_defrag_extent *old = ctx;
2214         struct new_sa_defrag_extent *new = old->new;
2215         struct btrfs_path *path = new->path;
2216         struct btrfs_key key;
2217         struct btrfs_root *root;
2218         struct sa_defrag_extent_backref *backref;
2219         struct extent_buffer *leaf;
2220         struct inode *inode = new->inode;
2221         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2222         int slot;
2223         int ret;
2224         u64 extent_offset;
2225         u64 num_bytes;
2226
2227         if (BTRFS_I(inode)->root->root_key.objectid == root_id &&
2228             inum == btrfs_ino(BTRFS_I(inode)))
2229                 return 0;
2230
2231         key.objectid = root_id;
2232         key.type = BTRFS_ROOT_ITEM_KEY;
2233         key.offset = (u64)-1;
2234
2235         root = btrfs_read_fs_root_no_name(fs_info, &key);
2236         if (IS_ERR(root)) {
2237                 if (PTR_ERR(root) == -ENOENT)
2238                         return 0;
2239                 WARN_ON(1);
2240                 btrfs_debug(fs_info, "inum=%llu, offset=%llu, root_id=%llu",
2241                          inum, offset, root_id);
2242                 return PTR_ERR(root);
2243         }
2244
2245         key.objectid = inum;
2246         key.type = BTRFS_EXTENT_DATA_KEY;
2247         if (offset > (u64)-1 << 32)
2248                 key.offset = 0;
2249         else
2250                 key.offset = offset;
2251
2252         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2253         if (WARN_ON(ret < 0))
2254                 return ret;
2255         ret = 0;
2256
2257         while (1) {
2258                 cond_resched();
2259
2260                 leaf = path->nodes[0];
2261                 slot = path->slots[0];
2262
2263                 if (slot >= btrfs_header_nritems(leaf)) {
2264                         ret = btrfs_next_leaf(root, path);
2265                         if (ret < 0) {
2266                                 goto out;
2267                         } else if (ret > 0) {
2268                                 ret = 0;
2269                                 goto out;
2270                         }
2271                         continue;
2272                 }
2273
2274                 path->slots[0]++;
2275
2276                 btrfs_item_key_to_cpu(leaf, &key, slot);
2277
2278                 if (key.objectid > inum)
2279                         goto out;
2280
2281                 if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY)
2282                         continue;
2283
2284                 extent = btrfs_item_ptr(leaf, slot,
2285                                         struct btrfs_file_extent_item);
2286
2287                 if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr)
2288                         continue;
2289
2290                 /*
2291                  * 'offset' refers to the exact key.offset,
2292                  * NOT the 'offset' field in btrfs_extent_data_ref, i.e.
2293                  * (key.offset - extent_offset).
2294                  */
2295                 if (key.offset != offset)
2296                         continue;
2297
2298                 extent_offset = btrfs_file_extent_offset(leaf, extent);
2299                 num_bytes = btrfs_file_extent_num_bytes(leaf, extent);
2300
2301                 if (extent_offset >= old->extent_offset + old->offset +
2302                     old->len || extent_offset + num_bytes <=
2303                     old->extent_offset + old->offset)
2304                         continue;
2305                 break;
2306         }
2307
2308         backref = kmalloc(sizeof(*backref), GFP_NOFS);
2309         if (!backref) {
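                /* -ENOENT is treated as non-fatal by record_extent_backrefs() */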
2310                 ret = -ENOENT;
2311                 goto out;
2312         }
2313
2314         backref->root_id = root_id;
2315         backref->inum = inum;
2316         backref->file_pos = offset;
2317         backref->num_bytes = num_bytes;
2318         backref->extent_offset = extent_offset;
2319         backref->generation = btrfs_file_extent_generation(leaf, extent);
2320         backref->old = old;
2321         backref_insert(&new->root, backref);
2322         old->count++;
2323 out:
2324         btrfs_release_path(path);
2325         WARN_ON(ret);
2326         return ret;
2327 }
2328
2329 static noinline bool record_extent_backrefs(struct btrfs_path *path,
2330                                    struct new_sa_defrag_extent *new)
2331 {
2332         struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
2333         struct old_sa_defrag_extent *old, *tmp;
2334         int ret;
2335
2336         new->path = path;
2337
2338         list_for_each_entry_safe(old, tmp, &new->head, list) {
2339                 ret = iterate_inodes_from_logical(old->bytenr +
2340                                                   old->extent_offset, fs_info,
2341                                                   path, record_one_backref,
2342                                                   old);
2343                 if (ret < 0 && ret != -ENOENT)
2344                         return false;
2345
2346                 /* no backref to be processed for this extent */
2347                 if (!old->count) {
2348                         list_del(&old->list);
2349                         kfree(old);
2350                 }
2351         }
2352
2353         if (list_empty(&new->head))
2354                 return false;
2355
2356         return true;
2357 }
2358
2359 static int relink_is_mergable(struct extent_buffer *leaf,
2360                               struct btrfs_file_extent_item *fi,
2361                               struct new_sa_defrag_extent *new)
2362 {
2363         if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr)
2364                 return 0;
2365
2366         if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2367                 return 0;
2368
2369         if (btrfs_file_extent_compression(leaf, fi) != new->compress_type)
2370                 return 0;
2371
2372         if (btrfs_file_extent_encryption(leaf, fi) ||
2373             btrfs_file_extent_other_encoding(leaf, fi))
2374                 return 0;
2375
2376         return 1;
2377 }
2378
2379 /*
2380  * Note the backref might have changed; in that case we just return 0.
2381  */
2382 static noinline int relink_extent_backref(struct btrfs_path *path,
2383                                  struct sa_defrag_extent_backref *prev,
2384                                  struct sa_defrag_extent_backref *backref)
2385 {
2386         struct btrfs_file_extent_item *extent;
2387         struct btrfs_file_extent_item *item;
2388         struct btrfs_ordered_extent *ordered;
2389         struct btrfs_trans_handle *trans;
2390         struct btrfs_root *root;
2391         struct btrfs_key key;
2392         struct extent_buffer *leaf;
2393         struct old_sa_defrag_extent *old = backref->old;
2394         struct new_sa_defrag_extent *new = old->new;
2395         struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
2396         struct inode *inode;
2397         struct extent_state *cached = NULL;
2398         int ret = 0;
2399         u64 start;
2400         u64 len;
2401         u64 lock_start;
2402         u64 lock_end;
2403         bool merge = false;
2404         int index;
2405
2406         if (prev && prev->root_id == backref->root_id &&
2407             prev->inum == backref->inum &&
2408             prev->file_pos + prev->num_bytes == backref->file_pos)
2409                 merge = true;
2410
2411         /* step 1: get root */
2412         key.objectid = backref->root_id;
2413         key.type = BTRFS_ROOT_ITEM_KEY;
2414         key.offset = (u64)-1;
2415
2416         index = srcu_read_lock(&fs_info->subvol_srcu);
2417
2418         root = btrfs_read_fs_root_no_name(fs_info, &key);
2419         if (IS_ERR(root)) {
2420                 srcu_read_unlock(&fs_info->subvol_srcu, index);
2421                 if (PTR_ERR(root) == -ENOENT)
2422                         return 0;
2423                 return PTR_ERR(root);
2424         }
2425
2426         if (btrfs_root_readonly(root)) {
2427                 srcu_read_unlock(&fs_info->subvol_srcu, index);
2428                 return 0;
2429         }
2430
2431         /* step 2: get inode */
2432         key.objectid = backref->inum;
2433         key.type = BTRFS_INODE_ITEM_KEY;
2434         key.offset = 0;
2435
2436         inode = btrfs_iget(fs_info->sb, &key, root, NULL);
2437         if (IS_ERR(inode)) {
2438                 srcu_read_unlock(&fs_info->subvol_srcu, index);
2439                 return 0;
2440         }
2441
2442         srcu_read_unlock(&fs_info->subvol_srcu, index);
2443
2444         /* step 3: relink backref */
2445         lock_start = backref->file_pos;
2446         lock_end = backref->file_pos + backref->num_bytes - 1;
2447         lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2448                          &cached);
2449
2450         ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
2451         if (ordered) {
2452                 btrfs_put_ordered_extent(ordered);
2453                 goto out_unlock;
2454         }
2455
2456         trans = btrfs_join_transaction(root);
2457         if (IS_ERR(trans)) {
2458                 ret = PTR_ERR(trans);
2459                 goto out_unlock;
2460         }
2461
2462         key.objectid = backref->inum;
2463         key.type = BTRFS_EXTENT_DATA_KEY;
2464         key.offset = backref->file_pos;
2465
2466         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2467         if (ret < 0) {
2468                 goto out_free_path;
2469         } else if (ret > 0) {
2470                 ret = 0;
2471                 goto out_free_path;
2472         }
2473
2474         extent = btrfs_item_ptr(path->nodes[0], path->slots[0],
2475                                 struct btrfs_file_extent_item);
2476
2477         if (btrfs_file_extent_generation(path->nodes[0], extent) !=
2478             backref->generation)
2479                 goto out_free_path;
2480
2481         btrfs_release_path(path);
2482
2483         start = backref->file_pos;
2484         if (backref->extent_offset < old->extent_offset + old->offset)
2485                 start += old->extent_offset + old->offset -
2486                          backref->extent_offset;
2487
2488         len = min(backref->extent_offset + backref->num_bytes,
2489                   old->extent_offset + old->offset + old->len);
2490         len -= max(backref->extent_offset, old->extent_offset + old->offset);
2491
2492         ret = btrfs_drop_extents(trans, root, inode, start,
2493                                  start + len, 1);
2494         if (ret)
2495                 goto out_free_path;
2496 again:
2497         key.objectid = btrfs_ino(BTRFS_I(inode));
2498         key.type = BTRFS_EXTENT_DATA_KEY;
2499         key.offset = start;
2500
2501         path->leave_spinning = 1;
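        /*
         * if this backref is contiguous with the previously relinked one,
         * try extending that extent item instead of inserting a new one
         */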
2502         if (merge) {
2503                 struct btrfs_file_extent_item *fi;
2504                 u64 extent_len;
2505                 struct btrfs_key found_key;
2506
2507                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2508                 if (ret < 0)
2509                         goto out_free_path;
2510
2511                 path->slots[0]--;
2512                 leaf = path->nodes[0];
2513                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2514
2515                 fi = btrfs_item_ptr(leaf, path->slots[0],
2516                                     struct btrfs_file_extent_item);
2517                 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
2518
2519                 if (extent_len + found_key.offset == start &&
2520                     relink_is_mergable(leaf, fi, new)) {
2521                         btrfs_set_file_extent_num_bytes(leaf, fi,
2522                                                         extent_len + len);
2523                         btrfs_mark_buffer_dirty(leaf);
2524                         inode_add_bytes(inode, len);
2525
2526                         ret = 1;
2527                         goto out_free_path;
2528                 } else {
2529                         merge = false;
2530                         btrfs_release_path(path);
2531                         goto again;
2532                 }
2533         }
2534
2535         ret = btrfs_insert_empty_item(trans, root, path, &key,
2536                                         sizeof(*extent));
2537         if (ret) {
2538                 btrfs_abort_transaction(trans, ret);
2539                 goto out_free_path;
2540         }
2541
2542         leaf = path->nodes[0];
2543         item = btrfs_item_ptr(leaf, path->slots[0],
2544                                 struct btrfs_file_extent_item);
2545         btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr);
2546         btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len);
2547         btrfs_set_file_extent_offset(leaf, item, start - new->file_pos);
2548         btrfs_set_file_extent_num_bytes(leaf, item, len);
2549         btrfs_set_file_extent_ram_bytes(leaf, item, new->len);
2550         btrfs_set_file_extent_generation(leaf, item, trans->transid);
2551         btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
2552         btrfs_set_file_extent_compression(leaf, item, new->compress_type);
2553         btrfs_set_file_extent_encryption(leaf, item, 0);
2554         btrfs_set_file_extent_other_encoding(leaf, item, 0);
2555
2556         btrfs_mark_buffer_dirty(leaf);
2557         inode_add_bytes(inode, len);
2558         btrfs_release_path(path);
2559
2560         ret = btrfs_inc_extent_ref(trans, fs_info, new->bytenr,
2561                         new->disk_len, 0,
2562                         backref->root_id, backref->inum,
2563                         new->file_pos); /* start - extent_offset */
2564         if (ret) {
2565                 btrfs_abort_transaction(trans, ret);
2566                 goto out_free_path;
2567         }
2568
2569         ret = 1;
2570 out_free_path:
2571         btrfs_release_path(path);
2572         path->leave_spinning = 0;
2573         btrfs_end_transaction(trans);
2574 out_unlock:
2575         unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2576                              &cached, GFP_NOFS);
2577         iput(inode);
2578         return ret;
2579 }
2580
2581 static void free_sa_defrag_extent(struct new_sa_defrag_extent *new)
2582 {
2583         struct old_sa_defrag_extent *old, *tmp;
2584
2585         if (!new)
2586                 return;
2587
2588         list_for_each_entry_safe(old, tmp, &new->head, list) {
2589                 kfree(old);
2590         }
2591         kfree(new);
2592 }

static void relink_file_extents(struct new_sa_defrag_extent *new)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
	struct btrfs_path *path;
	struct sa_defrag_extent_backref *backref;
	struct sa_defrag_extent_backref *prev = NULL;
	struct inode *inode;
	struct btrfs_root *root;
	struct rb_node *node;
	int ret;

	inode = new->inode;
	root = BTRFS_I(inode)->root;

	path = btrfs_alloc_path();
	if (!path)
		return;

	if (!record_extent_backrefs(path, new)) {
		btrfs_free_path(path);
		goto out;
	}
	btrfs_release_path(path);

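	/*
	 * Relink the backrefs in sorted order.  The previous successfully
	 * relinked backref is kept around so relink_extent_backref() can
	 * try to merge a contiguous backref into the file extent it just
	 * wrote.
	 */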
	while (1) {
		node = rb_first(&new->root);
		if (!node)
			break;
		rb_erase(node, &new->root);

		backref = rb_entry(node, struct sa_defrag_extent_backref, node);

		ret = relink_extent_backref(path, prev, backref);
		WARN_ON(ret < 0);

		kfree(prev);

		if (ret == 1)
			prev = backref;
		else
			prev = NULL;
		cond_resched();
	}
	kfree(prev);

	btrfs_free_path(path);
out:
	free_sa_defrag_extent(new);

	atomic_dec(&fs_info->defrag_running);
	wake_up(&fs_info->transaction_wait);
}

static struct new_sa_defrag_extent *
record_old_file_extents(struct inode *inode,
			struct btrfs_ordered_extent *ordered)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct old_sa_defrag_extent *old;
	struct new_sa_defrag_extent *new;
	int ret;

	new = kmalloc(sizeof(*new), GFP_NOFS);
	if (!new)
		return NULL;

	new->inode = inode;
	new->file_pos = ordered->file_offset;
	new->len = ordered->len;
	new->bytenr = ordered->start;
	new->disk_len = ordered->disk_len;
	new->compress_type = ordered->compress_type;
	new->root = RB_ROOT;
	INIT_LIST_HEAD(&new->head);

	path = btrfs_alloc_path();
	if (!path)
		goto out_kfree;

	key.objectid = btrfs_ino(BTRFS_I(inode));
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = new->file_pos;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out_free_path;
	if (ret > 0 && path->slots[0] > 0)
		path->slots[0]--;

	/* find out all the old extents for the file range */
	while (1) {
		struct btrfs_file_extent_item *extent;
		struct extent_buffer *l;
		int slot;
		u64 num_bytes;
		u64 offset;
		u64 end;
		u64 disk_bytenr;
		u64 extent_offset;

		l = path->nodes[0];
		slot = path->slots[0];

		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto out_free_path;
			else if (ret > 0)
				break;
			continue;
		}

		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid != btrfs_ino(BTRFS_I(inode)))
			break;
		if (key.type != BTRFS_EXTENT_DATA_KEY)
			break;
		if (key.offset >= new->file_pos + new->len)
			break;

		extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item);

		num_bytes = btrfs_file_extent_num_bytes(l, extent);
		if (key.offset + num_bytes < new->file_pos)
			goto next;

		disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent);
		if (!disk_bytenr)
			goto next;

		extent_offset = btrfs_file_extent_offset(l, extent);

		old = kmalloc(sizeof(*old), GFP_NOFS);
		if (!old)
			goto out_free_path;

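		/*
		 * Clamp the old extent to the part that overlaps the range
		 * covered by the new (defragged) extent.
		 */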
		offset = max(new->file_pos, key.offset);
		end = min(new->file_pos + new->len, key.offset + num_bytes);

		old->bytenr = disk_bytenr;
		old->extent_offset = extent_offset;
		old->offset = offset - key.offset;
		old->len = end - offset;
		old->new = new;
		old->count = 0;
		list_add_tail(&old->list, &new->head);
next:
		path->slots[0]++;
		cond_resched();
	}

	btrfs_free_path(path);
	atomic_inc(&fs_info->defrag_running);

	return new;

out_free_path:
	btrfs_free_path(path);
out_kfree:
	free_sa_defrag_extent(new);
	return NULL;
}

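/*
 * Undo the per-block-group delalloc accounting that was taken when this
 * extent was reserved, now that the ordered extent has reached disk.
 */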
static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
					 u64 start, u64 len)
{
	struct btrfs_block_group_cache *cache;

	cache = btrfs_lookup_block_group(fs_info, start);
	ASSERT(cache);

	spin_lock(&cache->lock);
	cache->delalloc_bytes -= len;
	spin_unlock(&cache->lock);

	btrfs_put_block_group(cache);
}

/*
 * As ordered data IO finishes, this gets called so we can finish
 * an ordered extent if the range of bytes in the file it covers
 * has been fully written.
 */
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
{
	struct inode *inode = ordered_extent->inode;
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans = NULL;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct extent_state *cached_state = NULL;
	struct new_sa_defrag_extent *new = NULL;
	int compress_type = 0;
	int ret = 0;
	u64 logical_len = ordered_extent->len;
	bool nolock;
	bool truncated = false;

	nolock = btrfs_is_free_space_inode(BTRFS_I(inode));

	if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
		ret = -EIO;
		goto out;
	}

	btrfs_free_io_failure_record(BTRFS_I(inode),
			ordered_extent->file_offset,
			ordered_extent->file_offset +
			ordered_extent->len - 1);

	if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
		truncated = true;
		logical_len = ordered_extent->truncated_len;
		/* Truncated the entire extent, don't bother adding */
		if (!logical_len)
			goto out;
	}

	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
		BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */

		/*
		 * For the mwrite (mmap + memset to write) case, we still
		 * reserve space for the NOCOW range.  As NOCOW won't cause
		 * a new delayed ref, just free the space.
		 */
		btrfs_qgroup_free_data(inode, ordered_extent->file_offset,
				       ordered_extent->len);
		btrfs_ordered_update_i_size(inode, 0, ordered_extent);
		if (nolock)
			trans = btrfs_join_transaction_nolock(root);
		else
			trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			trans = NULL;
			goto out;
		}
		trans->block_rsv = &fs_info->delalloc_block_rsv;
		ret = btrfs_update_inode_fallback(trans, root, inode);
		if (ret) /* -ENOMEM or corruption */
			btrfs_abort_transaction(trans, ret);
		goto out;
	}

	lock_extent_bits(io_tree, ordered_extent->file_offset,
			 ordered_extent->file_offset + ordered_extent->len - 1,
			 &cached_state);

	ret = test_range_bit(io_tree, ordered_extent->file_offset,
			ordered_extent->file_offset + ordered_extent->len - 1,
			EXTENT_DEFRAG, 1, cached_state);
	if (ret) {
		u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
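		/*
		 * Snapshot-aware defrag is currently disabled; the "0 &&"
		 * below keeps this branch dead until the feature is fixed.
		 */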
		if (0 && last_snapshot >= BTRFS_I(inode)->generation)
			/* the inode is shared */
			new = record_old_file_extents(inode, ordered_extent);

		clear_extent_bit(io_tree, ordered_extent->file_offset,
			ordered_extent->file_offset + ordered_extent->len - 1,
			EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS);
	}

	if (nolock)
		trans = btrfs_join_transaction_nolock(root);
	else
		trans = btrfs_join_transaction(root);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		trans = NULL;
		goto out_unlock;
	}

	trans->block_rsv = &fs_info->delalloc_block_rsv;

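	/*
	 * A preallocated extent only needs to be marked written; anything
	 * else gets a new REG file extent item pointing at the space we
	 * reserved for this ordered extent.
	 */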
	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
		compress_type = ordered_extent->compress_type;
	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
		BUG_ON(compress_type);
		ret = btrfs_mark_extent_written(trans, BTRFS_I(inode),
						ordered_extent->file_offset,
						ordered_extent->file_offset +
						logical_len);
	} else {
		BUG_ON(root == fs_info->tree_root);
		ret = insert_reserved_file_extent(trans, inode,
						ordered_extent->file_offset,
						ordered_extent->start,
						ordered_extent->disk_len,
						logical_len, logical_len,
						compress_type, 0, 0,
						BTRFS_FILE_EXTENT_REG);
		if (!ret)
			btrfs_release_delalloc_bytes(fs_info,
						     ordered_extent->start,
						     ordered_extent->disk_len);
	}
	unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
			   ordered_extent->file_offset, ordered_extent->len,
			   trans->transid);
	if (ret < 0) {
		btrfs_abort_transaction(trans, ret);
		goto out_unlock;
	}

	add_pending_csums(trans, inode, &ordered_extent->list);

	btrfs_ordered_update_i_size(inode, 0, ordered_extent);
	ret = btrfs_update_inode_fallback(trans, root, inode);
	if (ret) { /* -ENOMEM or corruption */
		btrfs_abort_transaction(trans, ret);
		goto out_unlock;
	}
	ret = 0;
out_unlock:
	unlock_extent_cached(io_tree, ordered_extent->file_offset,
			     ordered_extent->file_offset +
			     ordered_extent->len - 1, &cached_state, GFP_NOFS);
out:
	if (root != fs_info->tree_root)
		btrfs_delalloc_release_metadata(BTRFS_I(inode),
				ordered_extent->len);
	if (trans)
		btrfs_end_transaction(trans);

	if (ret || truncated) {
		u64 start, end;

		if (truncated)
			start = ordered_extent->file_offset + logical_len;
		else
			start = ordered_extent->file_offset;
		end = ordered_extent->file_offset + ordered_extent->len - 1;
		clear_extent_uptodate(io_tree, start, end, NULL, GFP_NOFS);

		/* Drop the cache for the part of the extent we didn't write. */
		btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0);

		/*
		 * If the ordered extent had an IOERR or something else went
		 * wrong we need to return the space for this ordered extent
		 * back to the allocator.  We only free the extent in the
		 * truncated case if we didn't write out the extent at all.
		 */
		if ((ret || !logical_len) &&
		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
		    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
			btrfs_free_reserved_extent(fs_info,
						   ordered_extent->start,
						   ordered_extent->disk_len, 1);
	}

	/*
	 * This needs to be done to make sure anybody waiting knows we are
	 * done updating everything for this ordered extent.
	 */
	btrfs_remove_ordered_extent(inode, ordered_extent);

	/* for snapshot-aware defrag */
	if (new) {
		if (ret) {
			free_sa_defrag_extent(new);
			atomic_dec(&fs_info->defrag_running);
		} else {
			relink_file_extents(new);
		}
	}

	/* once for us */
	btrfs_put_ordered_extent(ordered_extent);
	/* once for the tree */
	btrfs_put_ordered_extent(ordered_extent);

	return ret;
}

static void finish_ordered_fn(struct btrfs_work *work)
{
	struct btrfs_ordered_extent *ordered_extent;

	ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
	btrfs_finish_ordered_io(ordered_extent);
}

static void btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
				struct extent_state *state, int uptodate)
{
	struct inode *inode = page->mapping->host;
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_ordered_extent *ordered_extent = NULL;
	struct btrfs_workqueue *wq;
	btrfs_work_func_t func;

	trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);

	ClearPagePrivate2(page);
	if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
					    end - start + 1, uptodate))
		return;

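	/*
	 * Free space cache inodes are written out during transaction
	 * commit, so their IO completion runs on its own workqueue to
	 * avoid deadlocking against the regular endio write workers.
	 */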
	if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
		wq = fs_info->endio_freespace_worker;
		func = btrfs_freespace_write_helper;
	} else {
		wq = fs_info->endio_write_workers;
		func = btrfs_endio_write_helper;
	}

	btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL,
			NULL);
	btrfs_queue_work(wq, &ordered_extent->work);
}

static int __readpage_endio_check(struct inode *inode,
				  struct btrfs_io_bio *io_bio,
				  int icsum, struct page *page,
				  int pgoff, u64 start, size_t len)
{
	char *kaddr;
	u32 csum_expected;
	u32 csum = ~(u32)0;

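	/*
	 * Compute the crc32c of the data with the standard ~0 seed and
	 * compare it against the checksum that came with the bio.
	 */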
	csum_expected = *(((u32 *)io_bio->csum) + icsum);

	kaddr = kmap_atomic(page);
	csum = btrfs_csum_data(kaddr + pgoff, csum, len);
	btrfs_csum_final(csum, (u8 *)&csum);
	if (csum != csum_expected)
		goto zeroit;

	kunmap_atomic(kaddr);
	return 0;
zeroit:
	btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected,
				    io_bio->mirror_num);
	memset(kaddr + pgoff, 1, len);
	flush_dcache_page(page);
	kunmap_atomic(kaddr);
	if (csum_expected == 0)
		return 0;
	return -EIO;
}

/*
 * When reads are done, we need to check csums to verify the data is
 * correct.  If there's a match, we allow the bio to finish.  If not,
 * the code in extent_io.c will try to find good copies for us.
 */
static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
				      u64 phy_offset, struct page *page,
				      u64 start, u64 end, int mirror)
{
	size_t offset = start - page_offset(page);
	struct inode *inode = page->mapping->host;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct btrfs_root *root = BTRFS_I(inode)->root;

	if (PageChecked(page)) {
		ClearPageChecked(page);
		return 0;
	}

	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
		return 0;

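	/*
	 * The data reloc tree copies extents that may not have checksums;
	 * those ranges are flagged EXTENT_NODATASUM, so skip verification
	 * for them.
	 */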
	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
	    test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
		clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM);
		return 0;
	}

	phy_offset >>= inode->i_sb->s_blocksize_bits;
	return __readpage_endio_check(inode, io_bio, phy_offset, page, offset,
				      start, (size_t)(end - start + 1));
}

void btrfs_add_delayed_iput(struct inode *inode)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_inode *binode = BTRFS_I(inode);

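	/*
	 * If ours is not the last reference, just drop it.  Otherwise
	 * defer the final iput to btrfs_run_delayed_iputs(), since this
	 * may be called from contexts that must not block on eviction.
	 */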
	if (atomic_add_unless(&inode->i_count, -1, 1))
		return;

	spin_lock(&fs_info->delayed_iput_lock);
	if (binode->delayed_iput_count == 0) {
		ASSERT(list_empty(&binode->delayed_iput));
		list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
	} else {
		binode->delayed_iput_count++;
	}
	spin_unlock(&fs_info->delayed_iput_lock);
}

void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
{
	spin_lock(&fs_info->delayed_iput_lock);
	while (!list_empty(&fs_info->delayed_iputs)) {
		struct btrfs_inode *inode;

		inode = list_first_entry(&fs_info->delayed_iputs,
				struct btrfs_inode, delayed_iput);
		if (inode->delayed_iput_count) {
			inode->delayed_iput_count--;
			list_move_tail(&inode->delayed_iput,
					&fs_info->delayed_iputs);
		} else {
			list_del_init(&inode->delayed_iput);
		}
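		/*
		 * iput() can sleep (the final iput may evict the inode),
		 * so drop the spinlock while calling it.
		 */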
		spin_unlock(&fs_info->delayed_iput_lock);
		iput(&inode->vfs_inode);
		spin_lock(&fs_info->delayed_iput_lock);
	}
	spin_unlock(&fs_info->delayed_iput_lock);
}

/*
 * This is called at transaction commit time.  If there are no orphan
 * files left in the subvolume, it removes the orphan item and frees
 * the block_rsv structure.
 */
void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_block_rsv *block_rsv;
	int ret;

	if (atomic_read(&root->orphan_inodes) ||
	    root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
		return;

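	/*
	 * Recheck under the lock: another task may have added an orphan
	 * inode or changed the cleanup state since the unlocked test.
	 */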
	spin_lock(&root->orphan_lock);
	if (atomic_read(&root->orphan_inodes)) {
		spin_unlock(&root->orphan_lock);
		return;
	}

	if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) {
		spin_unlock(&root->orphan_lock);
		return;
	}

	block_rsv = root->orphan_block_rsv;
	root->orphan_block_rsv = NULL;
	spin_unlock(&root->orphan_lock);

	if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state) &&
	    btrfs_root_refs(&root->root_item) > 0) {
		ret = btrfs_del_orphan_item(trans, fs_info->tree_root,
					    root->root_key.objectid);
		if (ret)
			btrfs_abort_transaction(trans, ret);
		else
			clear_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
				  &root->state);
	}

	if (block_rsv) {
		WARN_ON(block_rsv->size > 0);
		btrfs_free_block_rsv(fs_info, block_rsv);
	}
}

/*
 * This creates an orphan entry for the given inode in case something
 * goes wrong in the middle of an unlink/truncate.
 *
 * NOTE: the caller should reserve 5 units of metadata for this
 *	 function.
 */
int btrfs_orphan_add(struct btrfs_trans_handle *trans,
		struct btrfs_inode *inode)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
	struct btrfs_root *root = inode->root;
	struct btrfs_block_rsv *block_rsv = NULL;
	int reserve = 0;
	int insert = 0;
	int ret;

	if (!root->orphan_block_rsv) {
		block_rsv = btrfs_alloc_block_rsv(fs_info,
						  BTRFS_BLOCK_RSV_TEMP);
		if (!block_rsv)
			return -ENOMEM;
	}

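	/*
	 * Install the block_rsv we allocated above, unless someone else
	 * raced in and installed one first, in which case free ours.
	 */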
	spin_lock(&root->orphan_lock);
	if (!root->orphan_block_rsv) {
		root->orphan_block_rsv = block_rsv;
	} else if (block_rsv) {
		btrfs_free_block_rsv(fs_info, block_rsv);
		block_rsv = NULL;
	}

	if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
			      &inode->runtime_flags)) {
#if 0
		/*
		 * For proper ENOSPC handling, we should do orphan
		 * cleanup when mounting. But this introduces a backward
		 * compatibility issue.
		 */
		if (!xchg(&root->orphan_item_inserted, 1))
			insert = 2;
		else
			insert = 1;
#endif
		insert = 1;
		atomic_inc(&root->orphan_inodes);
	}

	if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
			      &inode->runtime_flags))
		reserve = 1;
	spin_unlock(&root->orphan_lock);

	/* grab metadata reservation from transaction handle */
	if (reserve) {
		ret = btrfs_orphan_reserve_metadata(trans, inode);
		ASSERT(!ret);
		if (ret) {
			atomic_dec(&root->orphan_inodes);
			clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
				  &inode->runtime_flags);
			if (insert)
				clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
					  &inode->runtime_flags);
			return ret;
		}
	}

	/* insert an orphan item to track this unlinked/truncated file */
	if (insert >= 1) {
		ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
		if (ret) {
			atomic_dec(&root->orphan_inodes);
			if (reserve) {
				clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
					  &inode->runtime_flags);
				btrfs_orphan_release_metadata(inode);
			}
			if (ret != -EEXIST) {
				clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
					  &inode->runtime_flags);
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
		}
		ret = 0;