btrfs: Drop inode if inode root is NULL
[muen/linux.git] fs/btrfs/inode.c
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/statfs.h>
#include <linux/compat.h>
#include <linux/bit_spinlock.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/falloc.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/mount.h>
#include <linux/btrfs.h>
#include <linux/blkdev.h>
#include "compat.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "print-tree.h"
#include "ordered-data.h"
#include "xattr.h"
#include "tree-log.h"
#include "volumes.h"
#include "compression.h"
#include "locking.h"
#include "free-space-cache.h"
#include "inode-map.h"
#include "backref.h"

struct btrfs_iget_args {
        u64 ino;
        struct btrfs_root *root;
};

static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_dir_ro_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct address_space_operations btrfs_symlink_aops;
static const struct file_operations btrfs_dir_file_operations;
static struct extent_io_ops btrfs_extent_io_ops;

static struct kmem_cache *btrfs_inode_cachep;
static struct kmem_cache *btrfs_delalloc_work_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_transaction_cachep;
struct kmem_cache *btrfs_path_cachep;
struct kmem_cache *btrfs_free_space_cachep;

#define S_SHIFT 12
static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
        [S_IFREG >> S_SHIFT]    = BTRFS_FT_REG_FILE,
        [S_IFDIR >> S_SHIFT]    = BTRFS_FT_DIR,
        [S_IFCHR >> S_SHIFT]    = BTRFS_FT_CHRDEV,
        [S_IFBLK >> S_SHIFT]    = BTRFS_FT_BLKDEV,
        [S_IFIFO >> S_SHIFT]    = BTRFS_FT_FIFO,
        [S_IFSOCK >> S_SHIFT]   = BTRFS_FT_SOCK,
        [S_IFLNK >> S_SHIFT]    = BTRFS_FT_SYMLINK,
};

static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct inode *inode);
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
static noinline int cow_file_range(struct inode *inode,
                                   struct page *locked_page,
                                   u64 start, u64 end, int *page_started,
                                   unsigned long *nr_written, int unlock);
static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
                                           u64 len, u64 orig_start,
                                           u64 block_start, u64 block_len,
                                           u64 orig_block_len, u64 ram_bytes,
                                           int type);

static int btrfs_dirty_inode(struct inode *inode);

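/*
 * initialize the security attributes (ACLs and security xattrs) that a
 * newly created inode inherits from its parent directory
 */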
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
                                     struct inode *inode,  struct inode *dir,
                                     const struct qstr *qstr)
{
        int err;

        err = btrfs_init_acl(trans, inode, dir);
        if (!err)
                err = btrfs_xattr_security_init(trans, inode, dir, qstr);
        return err;
}

/*
 * this does all the hard work for inserting an inline extent into
 * the btree.  The caller should have done a btrfs_drop_extents so that
 * no overlapping inline items exist in the btree
 */
static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root, struct inode *inode,
                                u64 start, size_t size, size_t compressed_size,
                                int compress_type,
                                struct page **compressed_pages)
{
        struct btrfs_key key;
        struct btrfs_path *path;
        struct extent_buffer *leaf;
        struct page *page = NULL;
        char *kaddr;
        unsigned long ptr;
        struct btrfs_file_extent_item *ei;
        int err = 0;
        int ret;
        size_t cur_size = size;
        size_t datasize;
        unsigned long offset;

        if (compressed_size && compressed_pages)
                cur_size = compressed_size;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        path->leave_spinning = 1;

        key.objectid = btrfs_ino(inode);
        key.offset = start;
        btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
        datasize = btrfs_file_extent_calc_inline_size(cur_size);

        inode_add_bytes(inode, size);
        ret = btrfs_insert_empty_item(trans, root, path, &key,
                                      datasize);
        if (ret) {
                err = ret;
                goto fail;
        }
        leaf = path->nodes[0];
        ei = btrfs_item_ptr(leaf, path->slots[0],
                            struct btrfs_file_extent_item);
        btrfs_set_file_extent_generation(leaf, ei, trans->transid);
        btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
        btrfs_set_file_extent_encryption(leaf, ei, 0);
        btrfs_set_file_extent_other_encoding(leaf, ei, 0);
        btrfs_set_file_extent_ram_bytes(leaf, ei, size);
        ptr = btrfs_file_extent_inline_start(ei);

        if (compress_type != BTRFS_COMPRESS_NONE) {
                struct page *cpage;
                int i = 0;
                while (compressed_size > 0) {
                        cpage = compressed_pages[i];
                        cur_size = min_t(unsigned long, compressed_size,
                                       PAGE_CACHE_SIZE);

                        kaddr = kmap_atomic(cpage);
                        write_extent_buffer(leaf, kaddr, ptr, cur_size);
                        kunmap_atomic(kaddr);

                        i++;
                        ptr += cur_size;
                        compressed_size -= cur_size;
                }
                btrfs_set_file_extent_compression(leaf, ei,
                                                  compress_type);
        } else {
                page = find_get_page(inode->i_mapping,
                                     start >> PAGE_CACHE_SHIFT);
                btrfs_set_file_extent_compression(leaf, ei, 0);
                kaddr = kmap_atomic(page);
                offset = start & (PAGE_CACHE_SIZE - 1);
                write_extent_buffer(leaf, kaddr + offset, ptr, size);
                kunmap_atomic(kaddr);
                page_cache_release(page);
        }
        btrfs_mark_buffer_dirty(leaf);
        btrfs_free_path(path);

        /*
         * we're an inline extent, so nobody can
         * extend the file past i_size without locking
         * a page we already have locked.
         *
         * We must do any isize and inode updates
         * before we unlock the pages.  Otherwise we
         * could end up racing with unlink.
         */
        BTRFS_I(inode)->disk_i_size = inode->i_size;
        ret = btrfs_update_inode(trans, root, inode);

        return ret;
fail:
        btrfs_free_path(path);
        return err;
}


/*
 * conditionally insert an inline extent into the file.  This
 * does the checks required to make sure the data is small enough
 * to fit as an inline extent.
 */
static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
                                 struct inode *inode, u64 start, u64 end,
                                 size_t compressed_size, int compress_type,
                                 struct page **compressed_pages)
{
        u64 isize = i_size_read(inode);
        u64 actual_end = min(end + 1, isize);
        u64 inline_len = actual_end - start;
        u64 aligned_end = ALIGN(end, root->sectorsize);
        u64 data_len = inline_len;
        int ret;

        if (compressed_size)
                data_len = compressed_size;

        if (start > 0 ||
            actual_end >= PAGE_CACHE_SIZE ||
            data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
            (!compressed_size &&
            (actual_end & (root->sectorsize - 1)) == 0) ||
            end + 1 < isize ||
            data_len > root->fs_info->max_inline) {
                return 1;
        }

        ret = btrfs_drop_extents(trans, root, inode, start, aligned_end, 1);
        if (ret)
                return ret;

        if (isize > actual_end)
                inline_len = min_t(u64, isize, actual_end);
        ret = insert_inline_extent(trans, root, inode, start,
                                   inline_len, compressed_size,
                                   compress_type, compressed_pages);
        if (ret && ret != -ENOSPC) {
                btrfs_abort_transaction(trans, root, ret);
                return ret;
        } else if (ret == -ENOSPC) {
                return 1;
        }

        set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
        btrfs_delalloc_release_metadata(inode, end + 1 - start);
        btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
        return 0;
}

struct async_extent {
        u64 start;
        u64 ram_size;
        u64 compressed_size;
        struct page **pages;
        unsigned long nr_pages;
        int compress_type;
        struct list_head list;
};

struct async_cow {
        struct inode *inode;
        struct btrfs_root *root;
        struct page *locked_page;
        u64 start;
        u64 end;
        struct list_head extents;
        struct btrfs_work work;
};

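/*
 * record one (possibly compressed) extent on the async_cow list so the
 * ordered work queue can allocate and write it out in phase two
 */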
static noinline int add_async_extent(struct async_cow *cow,
                                     u64 start, u64 ram_size,
                                     u64 compressed_size,
                                     struct page **pages,
                                     unsigned long nr_pages,
                                     int compress_type)
{
        struct async_extent *async_extent;

        async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
        BUG_ON(!async_extent); /* -ENOMEM */
        async_extent->start = start;
        async_extent->ram_size = ram_size;
        async_extent->compressed_size = compressed_size;
        async_extent->pages = pages;
        async_extent->nr_pages = nr_pages;
        async_extent->compress_type = compress_type;
        list_add_tail(&async_extent->list, &cow->extents);
        return 0;
}

/*
 * we create compressed extents in two phases.  The first
 * phase compresses a range of pages that have already been
 * locked (both pages and state bits are locked).
 *
 * This is done inside an ordered work queue, and the compression
 * is spread across many cpus.  The actual IO submission is step
 * two, and the ordered work queue takes care of making sure that
 * happens in the same order things were put onto the queue by
 * writepages and friends.
 *
 * If this code finds it can't get good compression, it puts an
 * entry onto the work queue to write the uncompressed bytes.  This
 * makes sure that both compressed inodes and uncompressed inodes
 * are written in the same order that the flusher thread sent them
 * down.
 */
static noinline int compress_file_range(struct inode *inode,
                                        struct page *locked_page,
                                        u64 start, u64 end,
                                        struct async_cow *async_cow,
                                        int *num_added)
{
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_trans_handle *trans;
        u64 num_bytes;
        u64 blocksize = root->sectorsize;
        u64 actual_end;
        u64 isize = i_size_read(inode);
        int ret = 0;
        struct page **pages = NULL;
        unsigned long nr_pages;
        unsigned long nr_pages_ret = 0;
        unsigned long total_compressed = 0;
        unsigned long total_in = 0;
        unsigned long max_compressed = 128 * 1024;
        unsigned long max_uncompressed = 128 * 1024;
        int i;
        int will_compress;
        int compress_type = root->fs_info->compress_type;
        int redirty = 0;

        /* if this is a small write inside eof, kick off a defrag */
        if ((end - start + 1) < 16 * 1024 &&
            (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
                btrfs_add_inode_defrag(NULL, inode);

        actual_end = min_t(u64, isize, end + 1);
again:
        will_compress = 0;
        nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
        nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);

        /*
         * we don't want to send crud past the end of i_size through
         * compression, that's just a waste of CPU time.  So, if the
         * end of the file is before the start of our current
         * requested range of bytes, we bail out to the uncompressed
         * cleanup code that can deal with all of this.
         *
         * It isn't really the fastest way to fix things, but this is a
         * very uncommon corner.
         */
        if (actual_end <= start)
                goto cleanup_and_bail_uncompressed;

        total_compressed = actual_end - start;

        /* we want to make sure that amount of ram required to uncompress
         * an extent is reasonable, so we limit the total size in ram
         * of a compressed extent to 128k.  This is a crucial number
         * because it also controls how easily we can spread reads across
         * cpus for decompression.
         *
         * We also want to make sure the amount of IO required to do
         * a random read is reasonably small, so we limit the size of
         * a compressed extent to 128k.
         */
        total_compressed = min(total_compressed, max_uncompressed);
        num_bytes = ALIGN(end - start + 1, blocksize);
        num_bytes = max(blocksize,  num_bytes);
        total_in = 0;
        ret = 0;

        /*
         * we do compression for mount -o compress and when the
         * inode has not been flagged as nocompress.  This flag can
         * change at any time if we discover bad compression ratios.
         */
        if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
            (btrfs_test_opt(root, COMPRESS) ||
             (BTRFS_I(inode)->force_compress) ||
             (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {
                WARN_ON(pages);
                pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
                if (!pages) {
                        /* just bail out to the uncompressed code */
                        goto cont;
                }

                if (BTRFS_I(inode)->force_compress)
                        compress_type = BTRFS_I(inode)->force_compress;

                /*
                 * we need to call clear_page_dirty_for_io on each
                 * page in the range.  Otherwise applications with the file
                 * mmap'd can wander in and change the page contents while
                 * we are compressing them.
                 *
                 * If the compression fails for any reason, we set the pages
                 * dirty again later on.
                 */
                extent_range_clear_dirty_for_io(inode, start, end);
                redirty = 1;
                ret = btrfs_compress_pages(compress_type,
                                           inode->i_mapping, start,
                                           total_compressed, pages,
                                           nr_pages, &nr_pages_ret,
                                           &total_in,
                                           &total_compressed,
                                           max_compressed);

                if (!ret) {
                        unsigned long offset = total_compressed &
                                (PAGE_CACHE_SIZE - 1);
                        struct page *page = pages[nr_pages_ret - 1];
                        char *kaddr;

                        /* zero the tail end of the last page, we might be
                         * sending it down to disk
                         */
                        if (offset) {
                                kaddr = kmap_atomic(page);
                                memset(kaddr + offset, 0,
                                       PAGE_CACHE_SIZE - offset);
                                kunmap_atomic(kaddr);
                        }
                        will_compress = 1;
                }
        }
cont:
        if (start == 0) {
                trans = btrfs_join_transaction(root);
                if (IS_ERR(trans)) {
                        ret = PTR_ERR(trans);
                        trans = NULL;
                        goto cleanup_and_out;
                }
                trans->block_rsv = &root->fs_info->delalloc_block_rsv;

                /* let's try to make an inline extent */
                if (ret || total_in < (actual_end - start)) {
                        /* we didn't compress the entire range, try
                         * to make an uncompressed inline extent.
                         */
                        ret = cow_file_range_inline(trans, root, inode,
                                                    start, end, 0, 0, NULL);
                } else {
                        /* try making a compressed inline extent */
                        ret = cow_file_range_inline(trans, root, inode,
                                                    start, end,
                                                    total_compressed,
                                                    compress_type, pages);
                }
                if (ret <= 0) {
                        /*
                         * inline extent creation worked or returned error,
                         * we don't need to create any more async work items.
                         * Unlock and free up our temp pages.
                         */
                        extent_clear_unlock_delalloc(inode,
                             &BTRFS_I(inode)->io_tree,
                             start, end, NULL,
                             EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
                             EXTENT_CLEAR_DELALLOC |
                             EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);

                        btrfs_end_transaction(trans, root);
                        goto free_pages_out;
                }
                btrfs_end_transaction(trans, root);
        }

        if (will_compress) {
                /*
                 * we aren't doing an inline extent, so round the compressed
                 * size up to a block size boundary so the allocator does
                 * sane things
                 */
                total_compressed = ALIGN(total_compressed, blocksize);

                /*
                 * one last check to make sure the compression is really a
                 * win, compare the page count read with the blocks on disk
                 */
                total_in = ALIGN(total_in, PAGE_CACHE_SIZE);
                if (total_compressed >= total_in) {
                        will_compress = 0;
                } else {
                        num_bytes = total_in;
                }
        }
        if (!will_compress && pages) {
                /*
                 * the compression code ran but failed to make things smaller,
                 * free any pages it allocated and our page pointer array
                 */
                for (i = 0; i < nr_pages_ret; i++) {
                        WARN_ON(pages[i]->mapping);
                        page_cache_release(pages[i]);
                }
                kfree(pages);
                pages = NULL;
                total_compressed = 0;
                nr_pages_ret = 0;

                /* flag the file so we don't compress in the future */
                if (!btrfs_test_opt(root, FORCE_COMPRESS) &&
                    !(BTRFS_I(inode)->force_compress)) {
                        BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
                }
        }
        if (will_compress) {
                *num_added += 1;

                /* the async work queues will take care of doing actual
                 * allocation on disk for these compressed pages,
                 * and will submit them to the elevator.
                 */
                add_async_extent(async_cow, start, num_bytes,
                                 total_compressed, pages, nr_pages_ret,
                                 compress_type);

                if (start + num_bytes < end) {
                        start += num_bytes;
                        pages = NULL;
                        cond_resched();
                        goto again;
                }
        } else {
cleanup_and_bail_uncompressed:
                /*
                 * No compression, but we still need to write the pages in
                 * the file we've been given so far.  redirty the locked
                 * page if it corresponds to our extent and set things up
                 * for the async work queue to run cow_file_range to do
                 * the normal delalloc dance
                 */
                if (page_offset(locked_page) >= start &&
                    page_offset(locked_page) <= end) {
                        __set_page_dirty_nobuffers(locked_page);
                        /* unlocked later on in the async handlers */
                }
                if (redirty)
                        extent_range_redirty_for_io(inode, start, end);
                add_async_extent(async_cow, start, end - start + 1,
                                 0, NULL, 0, BTRFS_COMPRESS_NONE);
                *num_added += 1;
        }

out:
        return ret;

free_pages_out:
        for (i = 0; i < nr_pages_ret; i++) {
                WARN_ON(pages[i]->mapping);
                page_cache_release(pages[i]);
        }
        kfree(pages);

        goto out;

cleanup_and_out:
        extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
                                     start, end, NULL,
                                     EXTENT_CLEAR_UNLOCK_PAGE |
                                     EXTENT_CLEAR_DIRTY |
                                     EXTENT_CLEAR_DELALLOC |
                                     EXTENT_SET_WRITEBACK |
                                     EXTENT_END_WRITEBACK);
        if (!trans || IS_ERR(trans))
                btrfs_error(root->fs_info, ret, "Failed to join transaction");
        else
                btrfs_abort_transaction(trans, root, ret);
        goto free_pages_out;
}

/*
 * phase two of compressed writeback.  This is the ordered portion
 * of the code, which only gets called in the order the work was
 * queued.  We walk all the async extents created by compress_file_range
 * and send them down to the disk.
 */
static noinline int submit_compressed_extents(struct inode *inode,
                                              struct async_cow *async_cow)
{
        struct async_extent *async_extent;
        u64 alloc_hint = 0;
        struct btrfs_trans_handle *trans;
        struct btrfs_key ins;
        struct extent_map *em;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
        struct extent_io_tree *io_tree;
        int ret = 0;

        if (list_empty(&async_cow->extents))
                return 0;

again:
        while (!list_empty(&async_cow->extents)) {
                async_extent = list_entry(async_cow->extents.next,
                                          struct async_extent, list);
                list_del(&async_extent->list);

                io_tree = &BTRFS_I(inode)->io_tree;

retry:
                /* did the compression code fall back to uncompressed IO? */
                if (!async_extent->pages) {
                        int page_started = 0;
                        unsigned long nr_written = 0;

                        lock_extent(io_tree, async_extent->start,
                                         async_extent->start +
                                         async_extent->ram_size - 1);

                        /* allocate blocks */
                        ret = cow_file_range(inode, async_cow->locked_page,
                                             async_extent->start,
                                             async_extent->start +
                                             async_extent->ram_size - 1,
                                             &page_started, &nr_written, 0);

                        /* JDM XXX */

                        /*
                         * if page_started, cow_file_range inserted an
                         * inline extent and took care of all the unlocking
                         * and IO for us.  Otherwise, we need to submit
                         * all those pages down to the drive.
                         */
                        if (!page_started && !ret)
                                extent_write_locked_range(io_tree,
                                                  inode, async_extent->start,
                                                  async_extent->start +
                                                  async_extent->ram_size - 1,
                                                  btrfs_get_extent,
                                                  WB_SYNC_ALL);
                        else if (ret)
                                unlock_page(async_cow->locked_page);
                        kfree(async_extent);
                        cond_resched();
                        continue;
                }

                lock_extent(io_tree, async_extent->start,
                            async_extent->start + async_extent->ram_size - 1);

                trans = btrfs_join_transaction(root);
                if (IS_ERR(trans)) {
                        ret = PTR_ERR(trans);
                } else {
                        trans->block_rsv = &root->fs_info->delalloc_block_rsv;
                        ret = btrfs_reserve_extent(trans, root,
                                           async_extent->compressed_size,
                                           async_extent->compressed_size,
                                           0, alloc_hint, &ins, 1);
                        if (ret && ret != -ENOSPC)
                                btrfs_abort_transaction(trans, root, ret);
                        btrfs_end_transaction(trans, root);
                }

                if (ret) {
                        int i;

                        for (i = 0; i < async_extent->nr_pages; i++) {
                                WARN_ON(async_extent->pages[i]->mapping);
                                page_cache_release(async_extent->pages[i]);
                        }
                        kfree(async_extent->pages);
                        async_extent->nr_pages = 0;
                        async_extent->pages = NULL;

                        if (ret == -ENOSPC)
                                goto retry;
                        goto out_free;
                }

                /*
                 * here we're doing allocation and writeback of the
                 * compressed pages
                 */
                btrfs_drop_extent_cache(inode, async_extent->start,
                                        async_extent->start +
                                        async_extent->ram_size - 1, 0);

                em = alloc_extent_map();
                if (!em) {
                        ret = -ENOMEM;
                        goto out_free_reserve;
                }
                em->start = async_extent->start;
                em->len = async_extent->ram_size;
                em->orig_start = em->start;
                em->mod_start = em->start;
                em->mod_len = em->len;

                em->block_start = ins.objectid;
                em->block_len = ins.offset;
                em->orig_block_len = ins.offset;
                em->ram_bytes = async_extent->ram_size;
                em->bdev = root->fs_info->fs_devices->latest_bdev;
                em->compress_type = async_extent->compress_type;
                set_bit(EXTENT_FLAG_PINNED, &em->flags);
                set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
                em->generation = -1;

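                /*
                 * insert the new mapping; if a stale extent map overlaps,
                 * drop it from the cache and try again
                 */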
                while (1) {
                        write_lock(&em_tree->lock);
                        ret = add_extent_mapping(em_tree, em, 1);
                        write_unlock(&em_tree->lock);
                        if (ret != -EEXIST) {
                                free_extent_map(em);
                                break;
                        }
                        btrfs_drop_extent_cache(inode, async_extent->start,
                                                async_extent->start +
                                                async_extent->ram_size - 1, 0);
                }

                if (ret)
                        goto out_free_reserve;

                ret = btrfs_add_ordered_extent_compress(inode,
                                                async_extent->start,
                                                ins.objectid,
                                                async_extent->ram_size,
                                                ins.offset,
                                                BTRFS_ORDERED_COMPRESSED,
                                                async_extent->compress_type);
                if (ret)
                        goto out_free_reserve;

                /*
                 * clear dirty, set writeback and unlock the pages.
                 */
                extent_clear_unlock_delalloc(inode,
                                &BTRFS_I(inode)->io_tree,
                                async_extent->start,
                                async_extent->start +
                                async_extent->ram_size - 1,
                                NULL, EXTENT_CLEAR_UNLOCK_PAGE |
                                EXTENT_CLEAR_UNLOCK |
                                EXTENT_CLEAR_DELALLOC |
                                EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK);

                ret = btrfs_submit_compressed_write(inode,
                                    async_extent->start,
                                    async_extent->ram_size,
                                    ins.objectid,
                                    ins.offset, async_extent->pages,
                                    async_extent->nr_pages);
                alloc_hint = ins.objectid + ins.offset;
                kfree(async_extent);
                if (ret)
                        goto out;
                cond_resched();
        }
        ret = 0;
out:
        return ret;
out_free_reserve:
        btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
out_free:
        extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
                                     async_extent->start,
                                     async_extent->start +
                                     async_extent->ram_size - 1,
                                     NULL, EXTENT_CLEAR_UNLOCK_PAGE |
                                     EXTENT_CLEAR_UNLOCK |
                                     EXTENT_CLEAR_DELALLOC |
                                     EXTENT_CLEAR_DIRTY |
                                     EXTENT_SET_WRITEBACK |
                                     EXTENT_END_WRITEBACK);
        kfree(async_extent);
        goto again;
}

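/*
 * pick a disk block near an existing mapping for this range (or near the
 * first mapped block in the file) to use as an allocation hint
 */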
static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
                                      u64 num_bytes)
{
        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
        struct extent_map *em;
        u64 alloc_hint = 0;

        read_lock(&em_tree->lock);
        em = search_extent_mapping(em_tree, start, num_bytes);
        if (em) {
                /*
                 * if block start isn't an actual block number then find the
                 * first block in this inode and use that as a hint.  If that
                 * block is also bogus then just don't worry about it.
                 */
                if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
                        free_extent_map(em);
                        em = search_extent_mapping(em_tree, 0, 0);
                        if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
                                alloc_hint = em->block_start;
                        if (em)
                                free_extent_map(em);
                } else {
                        alloc_hint = em->block_start;
                        free_extent_map(em);
                }
        }
        read_unlock(&em_tree->lock);

        return alloc_hint;
}

/*
 * when extent_io.c finds a delayed allocation range in the file,
 * the callbacks end up in this code.  The basic idea is to
 * allocate extents on disk for the range, and create ordered data structs
 * in ram to track those extents.
 *
 * locked_page is the page that writepage had locked already.  We use
 * it to make sure we don't do extra locks or unlocks.
 *
 * *page_started is set to one if we unlock locked_page and do everything
 * required to start IO on it.  It may be clean and already done with
 * IO when we return.
 */
static noinline int __cow_file_range(struct btrfs_trans_handle *trans,
                                     struct inode *inode,
                                     struct btrfs_root *root,
                                     struct page *locked_page,
                                     u64 start, u64 end, int *page_started,
                                     unsigned long *nr_written,
                                     int unlock)
{
        u64 alloc_hint = 0;
        u64 num_bytes;
        unsigned long ram_size;
        u64 disk_num_bytes;
        u64 cur_alloc_size;
        u64 blocksize = root->sectorsize;
        struct btrfs_key ins;
        struct extent_map *em;
        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
        int ret = 0;

        BUG_ON(btrfs_is_free_space_inode(inode));

        num_bytes = ALIGN(end - start + 1, blocksize);
        num_bytes = max(blocksize,  num_bytes);
        disk_num_bytes = num_bytes;

        /* if this is a small write inside eof, kick off defrag */
        if (num_bytes < 64 * 1024 &&
            (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
                btrfs_add_inode_defrag(trans, inode);

        if (start == 0) {
                /* let's try to make an inline extent */
                ret = cow_file_range_inline(trans, root, inode,
                                            start, end, 0, 0, NULL);
                if (ret == 0) {
                        extent_clear_unlock_delalloc(inode,
                                     &BTRFS_I(inode)->io_tree,
                                     start, end, NULL,
                                     EXTENT_CLEAR_UNLOCK_PAGE |
                                     EXTENT_CLEAR_UNLOCK |
                                     EXTENT_CLEAR_DELALLOC |
                                     EXTENT_CLEAR_DIRTY |
                                     EXTENT_SET_WRITEBACK |
                                     EXTENT_END_WRITEBACK);

                        *nr_written = *nr_written +
                             (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
                        *page_started = 1;
                        goto out;
                } else if (ret < 0) {
                        btrfs_abort_transaction(trans, root, ret);
                        goto out_unlock;
                }
        }

        BUG_ON(disk_num_bytes >
               btrfs_super_total_bytes(root->fs_info->super_copy));

        alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
        btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);

        while (disk_num_bytes > 0) {
                unsigned long op;

                cur_alloc_size = disk_num_bytes;
                ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
                                           root->sectorsize, 0, alloc_hint,
                                           &ins, 1);
                if (ret < 0) {
                        btrfs_abort_transaction(trans, root, ret);
                        goto out_unlock;
                }

                em = alloc_extent_map();
                if (!em) {
                        ret = -ENOMEM;
                        goto out_reserve;
                }
                em->start = start;
                em->orig_start = em->start;
                ram_size = ins.offset;
                em->len = ins.offset;
                em->mod_start = em->start;
                em->mod_len = em->len;

                em->block_start = ins.objectid;
                em->block_len = ins.offset;
                em->orig_block_len = ins.offset;
                em->ram_bytes = ram_size;
                em->bdev = root->fs_info->fs_devices->latest_bdev;
                set_bit(EXTENT_FLAG_PINNED, &em->flags);
                em->generation = -1;

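                /*
                 * insert the new mapping; if a stale extent map overlaps,
                 * drop it from the cache and try again
                 */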
                while (1) {
                        write_lock(&em_tree->lock);
                        ret = add_extent_mapping(em_tree, em, 1);
                        write_unlock(&em_tree->lock);
                        if (ret != -EEXIST) {
                                free_extent_map(em);
                                break;
                        }
                        btrfs_drop_extent_cache(inode, start,
                                                start + ram_size - 1, 0);
                }
                if (ret)
                        goto out_reserve;

                cur_alloc_size = ins.offset;
                ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
                                               ram_size, cur_alloc_size, 0);
                if (ret)
                        goto out_reserve;

                if (root->root_key.objectid ==
                    BTRFS_DATA_RELOC_TREE_OBJECTID) {
                        ret = btrfs_reloc_clone_csums(inode, start,
                                                      cur_alloc_size);
                        if (ret) {
                                btrfs_abort_transaction(trans, root, ret);
                                goto out_reserve;
                        }
                }

                if (disk_num_bytes < cur_alloc_size)
                        break;

                /* we're not doing compressed IO, don't unlock the first
                 * page (which the caller expects to stay locked), don't
                 * clear any dirty bits and don't set any writeback bits
                 *
                 * Do set the Private2 bit so we know this page was properly
                 * setup for writepage
                 */
                op = unlock ? EXTENT_CLEAR_UNLOCK_PAGE : 0;
                op |= EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
                        EXTENT_SET_PRIVATE2;

                extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
                                             start, start + ram_size - 1,
                                             locked_page, op);
                disk_num_bytes -= cur_alloc_size;
                num_bytes -= cur_alloc_size;
                alloc_hint = ins.objectid + ins.offset;
                start += cur_alloc_size;
        }
out:
        return ret;

out_reserve:
        btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
out_unlock:
        extent_clear_unlock_delalloc(inode,
                     &BTRFS_I(inode)->io_tree,
                     start, end, locked_page,
                     EXTENT_CLEAR_UNLOCK_PAGE |
                     EXTENT_CLEAR_UNLOCK |
                     EXTENT_CLEAR_DELALLOC |
                     EXTENT_CLEAR_DIRTY |
                     EXTENT_SET_WRITEBACK |
                     EXTENT_END_WRITEBACK);

        goto out;
}

static noinline int cow_file_range(struct inode *inode,
                                   struct page *locked_page,
                                   u64 start, u64 end, int *page_started,
                                   unsigned long *nr_written,
                                   int unlock)
{
        struct btrfs_trans_handle *trans;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        int ret;

        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans)) {
                extent_clear_unlock_delalloc(inode,
                             &BTRFS_I(inode)->io_tree,
                             start, end, locked_page,
                             EXTENT_CLEAR_UNLOCK_PAGE |
                             EXTENT_CLEAR_UNLOCK |
                             EXTENT_CLEAR_DELALLOC |
                             EXTENT_CLEAR_DIRTY |
                             EXTENT_SET_WRITEBACK |
                             EXTENT_END_WRITEBACK);
                return PTR_ERR(trans);
        }
        trans->block_rsv = &root->fs_info->delalloc_block_rsv;

        ret = __cow_file_range(trans, inode, root, locked_page, start, end,
                               page_started, nr_written, unlock);

        btrfs_end_transaction(trans, root);

        return ret;
}

/*
 * work queue callback to start compression on a file and pages
 */
static noinline void async_cow_start(struct btrfs_work *work)
{
        struct async_cow *async_cow;
        int num_added = 0;
        async_cow = container_of(work, struct async_cow, work);

        compress_file_range(async_cow->inode, async_cow->locked_page,
                            async_cow->start, async_cow->end, async_cow,
                            &num_added);
        if (num_added == 0) {
                btrfs_add_delayed_iput(async_cow->inode);
                async_cow->inode = NULL;
        }
}

/*
 * work queue callback to submit previously compressed pages
 */
static noinline void async_cow_submit(struct btrfs_work *work)
{
        struct async_cow *async_cow;
        struct btrfs_root *root;
        unsigned long nr_pages;

        async_cow = container_of(work, struct async_cow, work);

        root = async_cow->root;
        nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
                PAGE_CACHE_SHIFT;

        if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) <
            5 * 1024 * 1024 &&
            waitqueue_active(&root->fs_info->async_submit_wait))
                wake_up(&root->fs_info->async_submit_wait);

        if (async_cow->inode)
                submit_compressed_extents(async_cow->inode, async_cow);
}

static noinline void async_cow_free(struct btrfs_work *work)
{
        struct async_cow *async_cow;
        async_cow = container_of(work, struct async_cow, work);
        if (async_cow->inode)
                btrfs_add_delayed_iput(async_cow->inode);
        kfree(async_cow);
}

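/*
 * split the delalloc range into chunks (at most 512k each unless the
 * inode is flagged nocompress) and queue each one on the delalloc
 * workers, throttling on async_delalloc_pages so the async queue
 * doesn't grow without bound
 */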
static int cow_file_range_async(struct inode *inode, struct page *locked_page,
                                u64 start, u64 end, int *page_started,
                                unsigned long *nr_written)
{
        struct async_cow *async_cow;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        unsigned long nr_pages;
        u64 cur_end;
        int limit = 10 * 1024 * 1024;

        clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
                         1, 0, NULL, GFP_NOFS);
        while (start < end) {
                async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
                BUG_ON(!async_cow); /* -ENOMEM */
                async_cow->inode = igrab(inode);
                async_cow->root = root;
                async_cow->locked_page = locked_page;
                async_cow->start = start;

                if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
                        cur_end = end;
                else
                        cur_end = min(end, start + 512 * 1024 - 1);

                async_cow->end = cur_end;
                INIT_LIST_HEAD(&async_cow->extents);

                async_cow->work.func = async_cow_start;
                async_cow->work.ordered_func = async_cow_submit;
                async_cow->work.ordered_free = async_cow_free;
                async_cow->work.flags = 0;

                nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
                        PAGE_CACHE_SHIFT;
                atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);

                btrfs_queue_worker(&root->fs_info->delalloc_workers,
                                   &async_cow->work);

                if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
                        wait_event(root->fs_info->async_submit_wait,
                           (atomic_read(&root->fs_info->async_delalloc_pages) <
                            limit));
                }

                while (atomic_read(&root->fs_info->async_submit_draining) &&
                      atomic_read(&root->fs_info->async_delalloc_pages)) {
                        wait_event(root->fs_info->async_submit_wait,
                          (atomic_read(&root->fs_info->async_delalloc_pages) ==
                           0));
                }

                *nr_written += nr_pages;
                start = cur_end + 1;
        }
        *page_started = 1;
        return 0;
}

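/*
 * return 1 if any checksums exist for the given byte range, 0 otherwise
 */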
static noinline int csum_exist_in_range(struct btrfs_root *root,
                                        u64 bytenr, u64 num_bytes)
{
        int ret;
        struct btrfs_ordered_sum *sums;
        LIST_HEAD(list);

        ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
                                       bytenr + num_bytes - 1, &list, 0);
        if (ret == 0 && list_empty(&list))
                return 0;

        while (!list_empty(&list)) {
                sums = list_entry(list.next, struct btrfs_ordered_sum, list);
                list_del(&sums->list);
                kfree(sums);
        }
        return 1;
}

/*
 * called for nocow writeback.  This checks for snapshots or COW copies
 * of the extents that exist in the file, and COWs the file as required.
 *
 * If no cow copies or snapshots exist, we write directly to the existing
 * blocks on disk
 */
static noinline int run_delalloc_nocow(struct inode *inode,
                                       struct page *locked_page,
                              u64 start, u64 end, int *page_started, int force,
                              unsigned long *nr_written)
{
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_trans_handle *trans;
        struct extent_buffer *leaf;
        struct btrfs_path *path;
        struct btrfs_file_extent_item *fi;
        struct btrfs_key found_key;
        u64 cow_start;
        u64 cur_offset;
        u64 extent_end;
        u64 extent_offset;
        u64 disk_bytenr;
        u64 num_bytes;
        u64 disk_num_bytes;
        u64 ram_bytes;
        int extent_type;
        int ret, err;
        int type;
        int nocow;
        int check_prev = 1;
        bool nolock;
        u64 ino = btrfs_ino(inode);

        path = btrfs_alloc_path();
        if (!path) {
                extent_clear_unlock_delalloc(inode,
                             &BTRFS_I(inode)->io_tree,
                             start, end, locked_page,
                             EXTENT_CLEAR_UNLOCK_PAGE |
                             EXTENT_CLEAR_UNLOCK |
                             EXTENT_CLEAR_DELALLOC |
                             EXTENT_CLEAR_DIRTY |
                             EXTENT_SET_WRITEBACK |
                             EXTENT_END_WRITEBACK);
                return -ENOMEM;
        }

        nolock = btrfs_is_free_space_inode(inode);

        if (nolock)
                trans = btrfs_join_transaction_nolock(root);
        else
                trans = btrfs_join_transaction(root);

        if (IS_ERR(trans)) {
                extent_clear_unlock_delalloc(inode,
                             &BTRFS_I(inode)->io_tree,
                             start, end, locked_page,
                             EXTENT_CLEAR_UNLOCK_PAGE |
                             EXTENT_CLEAR_UNLOCK |
                             EXTENT_CLEAR_DELALLOC |
                             EXTENT_CLEAR_DIRTY |
                             EXTENT_SET_WRITEBACK |
                             EXTENT_END_WRITEBACK);
                btrfs_free_path(path);
                return PTR_ERR(trans);
        }

        trans->block_rsv = &root->fs_info->delalloc_block_rsv;

        cow_start = (u64)-1;
        cur_offset = start;
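        /*
         * walk the file extent items that cover [start, end] and decide
         * for each one whether it can be written nocow or must be cowed
         */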
1254         while (1) {
1255                 ret = btrfs_lookup_file_extent(trans, root, path, ino,
1256                                                cur_offset, 0);
1257                 if (ret < 0) {
1258                         btrfs_abort_transaction(trans, root, ret);
1259                         goto error;
1260                 }
1261                 if (ret > 0 && path->slots[0] > 0 && check_prev) {
1262                         leaf = path->nodes[0];
1263                         btrfs_item_key_to_cpu(leaf, &found_key,
1264                                               path->slots[0] - 1);
1265                         if (found_key.objectid == ino &&
1266                             found_key.type == BTRFS_EXTENT_DATA_KEY)
1267                                 path->slots[0]--;
1268                 }
1269                 check_prev = 0;
1270 next_slot:
1271                 leaf = path->nodes[0];
1272                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1273                         ret = btrfs_next_leaf(root, path);
1274                         if (ret < 0) {
1275                                 btrfs_abort_transaction(trans, root, ret);
1276                                 goto error;
1277                         }
1278                         if (ret > 0)
1279                                 break;
1280                         leaf = path->nodes[0];
1281                 }
1282
1283                 nocow = 0;
1284                 disk_bytenr = 0;
1285                 num_bytes = 0;
1286                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1287
1288                 if (found_key.objectid > ino ||
1289                     found_key.type > BTRFS_EXTENT_DATA_KEY ||
1290                     found_key.offset > end)
1291                         break;
1292
1293                 if (found_key.offset > cur_offset) {
1294                         extent_end = found_key.offset;
1295                         extent_type = 0;
1296                         goto out_check;
1297                 }
1298
1299                 fi = btrfs_item_ptr(leaf, path->slots[0],
1300                                     struct btrfs_file_extent_item);
1301                 extent_type = btrfs_file_extent_type(leaf, fi);
1302
1303                 ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
1304                 if (extent_type == BTRFS_FILE_EXTENT_REG ||
1305                     extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1306                         disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1307                         extent_offset = btrfs_file_extent_offset(leaf, fi);
1308                         extent_end = found_key.offset +
1309                                 btrfs_file_extent_num_bytes(leaf, fi);
1310                         disk_num_bytes =
1311                                 btrfs_file_extent_disk_num_bytes(leaf, fi);
1312                         if (extent_end <= start) {
1313                                 path->slots[0]++;
1314                                 goto next_slot;
1315                         }
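                        /*
                         * To write into this extent without COW it must be a
                         * real (non-hole), uncompressed, unencrypted extent
                         * that is not read-only, is not shared with another
                         * file, and has no csums in the range; plain REG
                         * extents additionally require the NODATACOW 'force'
                         * mode, while PREALLOC extents always qualify.
                         */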
1316                         if (disk_bytenr == 0)
1317                                 goto out_check;
1318                         if (btrfs_file_extent_compression(leaf, fi) ||
1319                             btrfs_file_extent_encryption(leaf, fi) ||
1320                             btrfs_file_extent_other_encoding(leaf, fi))
1321                                 goto out_check;
1322                         if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
1323                                 goto out_check;
1324                         if (btrfs_extent_readonly(root, disk_bytenr))
1325                                 goto out_check;
1326                         if (btrfs_cross_ref_exist(trans, root, ino,
1327                                                   found_key.offset -
1328                                                   extent_offset, disk_bytenr))
1329                                 goto out_check;
1330                         disk_bytenr += extent_offset;
1331                         disk_bytenr += cur_offset - found_key.offset;
1332                         num_bytes = min(end + 1, extent_end) - cur_offset;
1333                         /*
1334                          * force COW if a csum exists in the range;
1335                          * this ensures that the csums for a given extent
1336                          * are either all valid or all absent.
1337                          */
1338                         if (csum_exist_in_range(root, disk_bytenr, num_bytes))
1339                                 goto out_check;
1340                         nocow = 1;
1341                 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1342                         extent_end = found_key.offset +
1343                                 btrfs_file_extent_inline_len(leaf, fi);
1344                         extent_end = ALIGN(extent_end, root->sectorsize);
1345                 } else {
1346                         BUG_ON(1);
1347                 }
1348 out_check:
1349                 if (extent_end <= start) {
1350                         path->slots[0]++;
1351                         goto next_slot;
1352                 }
1353                 if (!nocow) {
1354                         if (cow_start == (u64)-1)
1355                                 cow_start = cur_offset;
1356                         cur_offset = extent_end;
1357                         if (cur_offset > end)
1358                                 break;
1359                         path->slots[0]++;
1360                         goto next_slot;
1361                 }
1362
1363                 btrfs_release_path(path);
1364                 if (cow_start != (u64)-1) {
1365                         ret = __cow_file_range(trans, inode, root, locked_page,
1366                                                cow_start, found_key.offset - 1,
1367                                                page_started, nr_written, 1);
1368                         if (ret) {
1369                                 btrfs_abort_transaction(trans, root, ret);
1370                                 goto error;
1371                         }
1372                         cow_start = (u64)-1;
1373                 }
1374
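                /*
                 * PREALLOC extents are written in place, so pin an extent
                 * map for the range now; ordered IO completion will later
                 * convert the prealloc item into a regular extent.  NOCOW
                 * extents already have a valid mapping and need no new em.
                 */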
1375                 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1376                         struct extent_map *em;
1377                         struct extent_map_tree *em_tree;
1378                         em_tree = &BTRFS_I(inode)->extent_tree;
1379                         em = alloc_extent_map();
1380                         BUG_ON(!em); /* -ENOMEM */
1381                         em->start = cur_offset;
1382                         em->orig_start = found_key.offset - extent_offset;
1383                         em->len = num_bytes;
1384                         em->block_len = num_bytes;
1385                         em->block_start = disk_bytenr;
1386                         em->orig_block_len = disk_num_bytes;
1387                         em->ram_bytes = ram_bytes;
1388                         em->bdev = root->fs_info->fs_devices->latest_bdev;
1389                         em->mod_start = em->start;
1390                         em->mod_len = em->len;
1391                         set_bit(EXTENT_FLAG_PINNED, &em->flags);
1392                         set_bit(EXTENT_FLAG_FILLING, &em->flags);
1393                         em->generation = -1;
1394                         while (1) {
1395                                 write_lock(&em_tree->lock);
1396                                 ret = add_extent_mapping(em_tree, em, 1);
1397                                 write_unlock(&em_tree->lock);
1398                                 if (ret != -EEXIST) {
1399                                         free_extent_map(em);
1400                                         break;
1401                                 }
1402                                 btrfs_drop_extent_cache(inode, em->start,
1403                                                 em->start + em->len - 1, 0);
1404                         }
1405                         type = BTRFS_ORDERED_PREALLOC;
1406                 } else {
1407                         type = BTRFS_ORDERED_NOCOW;
1408                 }
1409
1410                 ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
1411                                                num_bytes, num_bytes, type);
1412                 BUG_ON(ret); /* -ENOMEM */
1413
1414                 if (root->root_key.objectid ==
1415                     BTRFS_DATA_RELOC_TREE_OBJECTID) {
1416                         ret = btrfs_reloc_clone_csums(inode, cur_offset,
1417                                                       num_bytes);
1418                         if (ret) {
1419                                 btrfs_abort_transaction(trans, root, ret);
1420                                 goto error;
1421                         }
1422                 }
1423
1424                 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
1425                                 cur_offset, cur_offset + num_bytes - 1,
1426                                 locked_page, EXTENT_CLEAR_UNLOCK_PAGE |
1427                                 EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
1428                                 EXTENT_SET_PRIVATE2);
1429                 cur_offset = extent_end;
1430                 if (cur_offset > end)
1431                         break;
1432         }
1433         btrfs_release_path(path);
1434
1435         if (cur_offset <= end && cow_start == (u64)-1) {
1436                 cow_start = cur_offset;
1437                 cur_offset = end;
1438         }
1439
1440         if (cow_start != (u64)-1) {
1441                 ret = __cow_file_range(trans, inode, root, locked_page,
1442                                        cow_start, end,
1443                                        page_started, nr_written, 1);
1444                 if (ret) {
1445                         btrfs_abort_transaction(trans, root, ret);
1446                         goto error;
1447                 }
1448         }
1449
1450 error:
1451         err = btrfs_end_transaction(trans, root);
1452         if (!ret)
1453                 ret = err;
1454
1455         if (ret && cur_offset < end)
1456                 extent_clear_unlock_delalloc(inode,
1457                              &BTRFS_I(inode)->io_tree,
1458                              cur_offset, end, locked_page,
1459                              EXTENT_CLEAR_UNLOCK_PAGE |
1460                              EXTENT_CLEAR_UNLOCK |
1461                              EXTENT_CLEAR_DELALLOC |
1462                              EXTENT_CLEAR_DIRTY |
1463                              EXTENT_SET_WRITEBACK |
1464                              EXTENT_END_WRITEBACK);
1465
1466         btrfs_free_path(path);
1467         return ret;
1468 }
1469
1470 /*
1471  * extent_io.c callback to do delayed allocation processing
1472  */
1473 static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1474                               u64 start, u64 end, int *page_started,
1475                               unsigned long *nr_written)
1476 {
1477         int ret;
1478         struct btrfs_root *root = BTRFS_I(inode)->root;
1479
1480         if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) {
1481                 ret = run_delalloc_nocow(inode, locked_page, start, end,
1482                                          page_started, 1, nr_written);
1483         } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) {
1484                 ret = run_delalloc_nocow(inode, locked_page, start, end,
1485                                          page_started, 0, nr_written);
1486         } else if (!btrfs_test_opt(root, COMPRESS) &&
1487                    !(BTRFS_I(inode)->force_compress) &&
1488                    !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS)) {
1489                 ret = cow_file_range(inode, locked_page, start, end,
1490                                       page_started, nr_written, 1);
1491         } else {
1492                 set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1493                         &BTRFS_I(inode)->runtime_flags);
1494                 ret = cow_file_range_async(inode, locked_page, start, end,
1495                                            page_started, nr_written);
1496         }
1497         return ret;
1498 }
1499
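/*
 * extent_io.c split_extent_hook: when an extent state carrying the
 * DELALLOC bit is split in two, the inode suddenly has one more
 * outstanding extent to reserve metadata for, so bump the counter
 * under the inode lock.
 */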
1500 static void btrfs_split_extent_hook(struct inode *inode,
1501                                     struct extent_state *orig, u64 split)
1502 {
1503         /* not delalloc, ignore it */
1504         if (!(orig->state & EXTENT_DELALLOC))
1505                 return;
1506
1507         spin_lock(&BTRFS_I(inode)->lock);
1508         BTRFS_I(inode)->outstanding_extents++;
1509         spin_unlock(&BTRFS_I(inode)->lock);
1510 }
1511
1512 /*
1513  * extent_io.c merge_extent_hook: called when a new delalloc extent is
1514  * merged onto an existing one, as happens during sequential writes.
1515  * Two outstanding extents become one, so decrement the counter used
1516  * to account for the metadata space we'll need.
1517  */
1518 static void btrfs_merge_extent_hook(struct inode *inode,
1519                                     struct extent_state *new,
1520                                     struct extent_state *other)
1521 {
1522         /* not delalloc, ignore it */
1523         if (!(other->state & EXTENT_DELALLOC))
1524                 return;
1525
1526         spin_lock(&BTRFS_I(inode)->lock);
1527         BTRFS_I(inode)->outstanding_extents--;
1528         spin_unlock(&BTRFS_I(inode)->lock);
1529 }
1530
1531 /*
1532  * extent_io.c set_bit_hook, used to track delayed allocation
1533  * bytes in this file, and to maintain the list of inodes that
1534  * have pending delalloc work to be done.
1535  */
1536 static void btrfs_set_bit_hook(struct inode *inode,
1537                                struct extent_state *state, unsigned long *bits)
1538 {
1539
1540         /*
1541          * set_bit and clear_bit hooks normally require _irqsave/restore
1542          * but in this case, we are only testing for the DELALLOC
1543          * bit, which is only set or cleared with irqs on
1544          */
1545         if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1546                 struct btrfs_root *root = BTRFS_I(inode)->root;
1547                 u64 len = state->end + 1 - state->start;
1548                 bool do_list = !btrfs_is_free_space_inode(inode);
1549
1550                 if (*bits & EXTENT_FIRST_DELALLOC) {
1551                         *bits &= ~EXTENT_FIRST_DELALLOC;
1552                 } else {
1553                         spin_lock(&BTRFS_I(inode)->lock);
1554                         BTRFS_I(inode)->outstanding_extents++;
1555                         spin_unlock(&BTRFS_I(inode)->lock);
1556                 }
1557
1558                 __percpu_counter_add(&root->fs_info->delalloc_bytes, len,
1559                                      root->fs_info->delalloc_batch);
1560                 spin_lock(&BTRFS_I(inode)->lock);
1561                 BTRFS_I(inode)->delalloc_bytes += len;
1562                 if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1563                                          &BTRFS_I(inode)->runtime_flags)) {
1564                         spin_lock(&root->fs_info->delalloc_lock);
1565                         if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1566                                 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1567                                               &root->fs_info->delalloc_inodes);
1568                                 set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1569                                         &BTRFS_I(inode)->runtime_flags);
1570                         }
1571                         spin_unlock(&root->fs_info->delalloc_lock);
1572                 }
1573                 spin_unlock(&BTRFS_I(inode)->lock);
1574         }
1575 }
1576
1577 /*
1578  * extent_io.c clear_bit_hook, see set_bit_hook for why
1579  */
1580 static void btrfs_clear_bit_hook(struct inode *inode,
1581                                  struct extent_state *state,
1582                                  unsigned long *bits)
1583 {
1584         /*
1585          * set_bit and clear_bit hooks normally require _irqsave/restore
1586          * but in this case, we are only testing for the DELALLOC
1587          * bit, which is only set or cleared with irqs on
1588          */
1589         if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1590                 struct btrfs_root *root = BTRFS_I(inode)->root;
1591                 u64 len = state->end + 1 - state->start;
1592                 bool do_list = !btrfs_is_free_space_inode(inode);
1593
1594                 if (*bits & EXTENT_FIRST_DELALLOC) {
1595                         *bits &= ~EXTENT_FIRST_DELALLOC;
1596                 } else if (!(*bits & EXTENT_DO_ACCOUNTING)) {
1597                         spin_lock(&BTRFS_I(inode)->lock);
1598                         BTRFS_I(inode)->outstanding_extents--;
1599                         spin_unlock(&BTRFS_I(inode)->lock);
1600                 }
1601
1602                 if (*bits & EXTENT_DO_ACCOUNTING)
1603                         btrfs_delalloc_release_metadata(inode, len);
1604
1605                 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
1606                     && do_list)
1607                         btrfs_free_reserved_data_space(inode, len);
1608
1609                 __percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
1610                                      root->fs_info->delalloc_batch);
1611                 spin_lock(&BTRFS_I(inode)->lock);
1612                 BTRFS_I(inode)->delalloc_bytes -= len;
1613                 if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
1614                     test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1615                              &BTRFS_I(inode)->runtime_flags)) {
1616                         spin_lock(&root->fs_info->delalloc_lock);
1617                         if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1618                                 list_del_init(&BTRFS_I(inode)->delalloc_inodes);
1619                                 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1620                                           &BTRFS_I(inode)->runtime_flags);
1621                         }
1622                         spin_unlock(&root->fs_info->delalloc_lock);
1623                 }
1624                 spin_unlock(&BTRFS_I(inode)->lock);
1625         }
1626 }
1627
1628 /*
1629  * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
1630  * we don't create bios that span stripes or chunks
1631  */
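/*
 * For example (hypothetical numbers): if appending 'size' bytes would
 * push a bio across a 64K stripe boundary, btrfs_map_block reports
 * map_length < length + size and we return 1 so the caller starts a
 * new bio at the boundary instead.
 */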
1632 int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
1633                          size_t size, struct bio *bio,
1634                          unsigned long bio_flags)
1635 {
1636         struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
1637         u64 logical = (u64)bio->bi_sector << 9;
1638         u64 length = 0;
1639         u64 map_length;
1640         int ret;
1641
1642         if (bio_flags & EXTENT_BIO_COMPRESSED)
1643                 return 0;
1644
1645         length = bio->bi_size;
1646         map_length = length;
1647         ret = btrfs_map_block(root->fs_info, rw, logical,
1648                               &map_length, NULL, 0);
1649         /* Will always return 0 when the bbio return pointer is NULL */
1650         BUG_ON(ret < 0);
1651         if (map_length < length + size)
1652                 return 1;
1653         return 0;
1654 }
1655
1656 /*
1657  * in order to insert checksums into the metadata in large chunks,
1658  * we wait until bio submission time.   All the pages in the bio are
1659  * checksummed and sums are attached onto the ordered extent record.
1660  *
1661  * At IO completion time the csums attached on the ordered extent record
1662  * are inserted into the btree
1663  */
1664 static int __btrfs_submit_bio_start(struct inode *inode, int rw,
1665                                     struct bio *bio, int mirror_num,
1666                                     unsigned long bio_flags,
1667                                     u64 bio_offset)
1668 {
1669         struct btrfs_root *root = BTRFS_I(inode)->root;
1670         int ret = 0;
1671
1672         ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
1673         BUG_ON(ret); /* -ENOMEM */
1674         return 0;
1675 }
1676
1677 /*
1678  * The submission half of the async write path set up in
1679  * btrfs_submit_bio_hook below: by the time this runs,
1680  * __btrfs_submit_bio_start has already computed the checksums and
1681  * attached them to the ordered extent record, so all that is left
1682  * is to map the bio to the right device and submit it, completing
1683  * the bio on error.
1684  */
1685 static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
1686                           int mirror_num, unsigned long bio_flags,
1687                           u64 bio_offset)
1688 {
1689         struct btrfs_root *root = BTRFS_I(inode)->root;
1690         int ret;
1691
1692         ret = btrfs_map_bio(root, rw, bio, mirror_num, 1);
1693         if (ret)
1694                 bio_endio(bio, ret);
1695         return ret;
1696 }
1697
1698 /*
1699  * extent_io.c submission hook. This does the right thing for csum calculation
1700  * on write, or reading the csums from the tree before a read
1701  */
1702 static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1703                           int mirror_num, unsigned long bio_flags,
1704                           u64 bio_offset)
1705 {
1706         struct btrfs_root *root = BTRFS_I(inode)->root;
1707         int ret = 0;
1708         int skip_sum;
1709         int metadata = 0;
1710         int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
1711
1712         skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
1713
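        /*
         * metadata == 2 selects the dedicated free-space end_io workqueue
         * in btrfs_bio_wq_end_io, keeping free space cache writeback off
         * the regular data end_io queue.
         */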
1714         if (btrfs_is_free_space_inode(inode))
1715                 metadata = 2;
1716
1717         if (!(rw & REQ_WRITE)) {
1718                 ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
1719                 if (ret)
1720                         goto out;
1721
1722                 if (bio_flags & EXTENT_BIO_COMPRESSED) {
1723                         ret = btrfs_submit_compressed_read(inode, bio,
1724                                                            mirror_num,
1725                                                            bio_flags);
1726                         goto out;
1727                 } else if (!skip_sum) {
1728                         ret = btrfs_lookup_bio_sums(root, inode, bio, NULL);
1729                         if (ret)
1730                                 goto out;
1731                 }
1732                 goto mapit;
1733         } else if (async && !skip_sum) {
1734                 /* csum items have already been cloned */
1735                 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
1736                         goto mapit;
1737                 /* we're doing a write, do the async checksumming */
1738                 ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
1739                                    inode, rw, bio, mirror_num,
1740                                    bio_flags, bio_offset,
1741                                    __btrfs_submit_bio_start,
1742                                    __btrfs_submit_bio_done);
1743                 goto out;
1744         } else if (!skip_sum) {
1745                 ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
1746                 if (ret)
1747                         goto out;
1748         }
1749
1750 mapit:
1751         ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
1752
1753 out:
1754         if (ret < 0)
1755                 bio_endio(bio, ret);
1756         return ret;
1757 }
1758
1759 /*
1760  * given a list of ordered sums, record them in the inode.  This happens
1761  * at IO completion time based on sums calculated at bio submission time.
1762  */
1763 static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
1764                              struct inode *inode, u64 file_offset,
1765                              struct list_head *list)
1766 {
1767         struct btrfs_ordered_sum *sum;
1768
1769         list_for_each_entry(sum, list, list) {
1770                 trans->adding_csums = 1;
1771                 btrfs_csum_file_blocks(trans,
1772                        BTRFS_I(inode)->root->fs_info->csum_root, sum);
1773                 trans->adding_csums = 0;
1774         }
1775         return 0;
1776 }
1777
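/*
 * Mark the given byte range delalloc in the inode's io_tree.  'end' is
 * inclusive, so a page-aligned value almost certainly means the caller
 * forgot a "- 1" (hence the WARN_ON); btrfs_writepage_fixup_worker
 * below calls this with page_start and page_start + PAGE_CACHE_SIZE - 1.
 */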
1778 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
1779                               struct extent_state **cached_state)
1780 {
1781         WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0);
1782         return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
1783                                    cached_state, GFP_NOFS);
1784 }
1785
1786 /* see btrfs_writepage_start_hook for details on why this is required */
1787 struct btrfs_writepage_fixup {
1788         struct page *page;
1789         struct btrfs_work work;
1790 };
1791
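/*
 * Worker for the fixup queued by btrfs_writepage_start_hook below: wait
 * out any ordered extent covering the page, reserve delalloc space, and
 * redirty the page so a later writepage pass handles it under the
 * normal COW/ordered rules.
 */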
1792 static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
1793 {
1794         struct btrfs_writepage_fixup *fixup;
1795         struct btrfs_ordered_extent *ordered;
1796         struct extent_state *cached_state = NULL;
1797         struct page *page;
1798         struct inode *inode;
1799         u64 page_start;
1800         u64 page_end;
1801         int ret;
1802
1803         fixup = container_of(work, struct btrfs_writepage_fixup, work);
1804         page = fixup->page;
1805 again:
1806         lock_page(page);
1807         if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
1808                 ClearPageChecked(page);
1809                 goto out_page;
1810         }
1811
1812         inode = page->mapping->host;
1813         page_start = page_offset(page);
1814         page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
1815
1816         lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0,
1817                          &cached_state);
1818
1819         /* already ordered? We're done */
1820         if (PagePrivate2(page))
1821                 goto out;
1822
1823         ordered = btrfs_lookup_ordered_extent(inode, page_start);
1824         if (ordered) {
1825                 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
1826                                      page_end, &cached_state, GFP_NOFS);
1827                 unlock_page(page);
1828                 btrfs_start_ordered_extent(inode, ordered, 1);
1829                 btrfs_put_ordered_extent(ordered);
1830                 goto again;
1831         }
1832
1833         ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
1834         if (ret) {
1835                 mapping_set_error(page->mapping, ret);
1836                 end_extent_writepage(page, ret, page_start, page_end);
1837                 ClearPageChecked(page);
1838                 goto out;
1839         }
1840
1841         btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
1842         ClearPageChecked(page);
1843         set_page_dirty(page);
1844 out:
1845         unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
1846                              &cached_state, GFP_NOFS);
1847 out_page:
1848         unlock_page(page);
1849         page_cache_release(page);
1850         kfree(fixup);
1851 }
1852
1853 /*
1854  * There are a few paths in the higher layers of the kernel that directly
1855  * set the page dirty bit without asking the filesystem if it is a
1856  * good idea.  This causes problems because we want to make sure COW
1857  * properly happens and the data=ordered rules are followed.
1858  *
1859  * In our case any range that doesn't have the ORDERED bit set
1860  * hasn't been properly set up for IO.  We kick off an async process
1861  * to fix it up.  The async helper will wait for ordered extents, set
1862  * the delalloc bit and make it safe to write the page.
1863  */
1864 static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
1865 {
1866         struct inode *inode = page->mapping->host;
1867         struct btrfs_writepage_fixup *fixup;
1868         struct btrfs_root *root = BTRFS_I(inode)->root;
1869
1870         /* this page is properly in the ordered list */
1871         if (TestClearPagePrivate2(page))
1872                 return 0;
1873
1874         if (PageChecked(page))
1875                 return -EAGAIN;
1876
1877         fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
1878         if (!fixup)
1879                 return -EAGAIN;
1880
1881         SetPageChecked(page);
1882         page_cache_get(page);
1883         fixup->work.func = btrfs_writepage_fixup_worker;
1884         fixup->page = page;
1885         btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work);
1886         return -EBUSY;
1887 }
1888
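/*
 * Insert the file extent item for an extent that has just been written
 * and add its reference in the extent tree.  Any old extents overlapping
 * [file_pos, file_pos + num_bytes) are dropped first; see the comment
 * below about leaving the new extent in the extent map cache.
 */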
1889 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1890                                        struct inode *inode, u64 file_pos,
1891                                        u64 disk_bytenr, u64 disk_num_bytes,
1892                                        u64 num_bytes, u64 ram_bytes,
1893                                        u8 compression, u8 encryption,
1894                                        u16 other_encoding, int extent_type)
1895 {
1896         struct btrfs_root *root = BTRFS_I(inode)->root;
1897         struct btrfs_file_extent_item *fi;
1898         struct btrfs_path *path;
1899         struct extent_buffer *leaf;
1900         struct btrfs_key ins;
1901         int ret;
1902
1903         path = btrfs_alloc_path();
1904         if (!path)
1905                 return -ENOMEM;
1906
1907         path->leave_spinning = 1;
1908
1909         /*
1910          * we may be replacing one extent in the tree with another.
1911          * The new extent is pinned in the extent map, and we don't want
1912          * to drop it from the cache until it is completely in the btree.
1913          *
1914          * So, tell btrfs_drop_extents to leave this extent in the cache.
1915          * the caller is expected to unpin it and allow it to be merged
1916          * The caller is expected to unpin it and allow it to be merged
1917          */
1918         ret = btrfs_drop_extents(trans, root, inode, file_pos,
1919                                  file_pos + num_bytes, 0);
1920         if (ret)
1921                 goto out;
1922
1923         ins.objectid = btrfs_ino(inode);
1924         ins.offset = file_pos;
1925         ins.type = BTRFS_EXTENT_DATA_KEY;
1926         ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi));
1927         if (ret)
1928                 goto out;
1929         leaf = path->nodes[0];
1930         fi = btrfs_item_ptr(leaf, path->slots[0],
1931                             struct btrfs_file_extent_item);
1932         btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1933         btrfs_set_file_extent_type(leaf, fi, extent_type);
1934         btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
1935         btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
1936         btrfs_set_file_extent_offset(leaf, fi, 0);
1937         btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
1938         btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
1939         btrfs_set_file_extent_compression(leaf, fi, compression);
1940         btrfs_set_file_extent_encryption(leaf, fi, encryption);
1941         btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
1942
1943         btrfs_mark_buffer_dirty(leaf);
1944         btrfs_release_path(path);
1945
1946         inode_add_bytes(inode, num_bytes);
1947
1948         ins.objectid = disk_bytenr;
1949         ins.offset = disk_num_bytes;
1950         ins.type = BTRFS_EXTENT_ITEM_KEY;
1951         ret = btrfs_alloc_reserved_file_extent(trans, root,
1952                                         root->root_key.objectid,
1953                                         btrfs_ino(inode), file_pos, &ins);
1954 out:
1955         btrfs_free_path(path);
1956
1957         return ret;
1958 }
1959
1960 /* snapshot-aware defrag */
1961 struct sa_defrag_extent_backref {
1962         struct rb_node node;
1963         struct old_sa_defrag_extent *old;
1964         u64 root_id;
1965         u64 inum;
1966         u64 file_pos;
1967         u64 extent_offset;
1968         u64 num_bytes;
1969         u64 generation;
1970 };
1971
1972 struct old_sa_defrag_extent {
1973         struct list_head list;
1974         struct new_sa_defrag_extent *new;
1975
1976         u64 extent_offset;
1977         u64 bytenr;
1978         u64 offset;
1979         u64 len;
1980         int count;
1981 };
1982
1983 struct new_sa_defrag_extent {
1984         struct rb_root root;
1985         struct list_head head;
1986         struct btrfs_path *path;
1987         struct inode *inode;
1988         u64 file_pos;
1989         u64 len;
1990         u64 bytenr;
1991         u64 disk_len;
1992         u8 compress_type;
1993 };
1994
1995 static int backref_comp(struct sa_defrag_extent_backref *b1,
1996                         struct sa_defrag_extent_backref *b2)
1997 {
1998         if (b1->root_id < b2->root_id)
1999                 return -1;
2000         else if (b1->root_id > b2->root_id)
2001                 return 1;
2002
2003         if (b1->inum < b2->inum)
2004                 return -1;
2005         else if (b1->inum > b2->inum)
2006                 return 1;
2007
2008         if (b1->file_pos < b2->file_pos)
2009                 return -1;
2010         else if (b1->file_pos > b2->file_pos)
2011                 return 1;
2012
2013         /*
2014          * [------------------------------] ===> (a range of space)
2015          *     |<--->|   |<---->| =============> (fs/file tree A)
2016          * |<---------------------------->| ===> (fs/file tree B)
2017          *
2018          * A range of space can refer to two file extents in one tree
2019          * while referring to only one file extent in another tree.
2020          *
2021          * So we may process the same disk offset more than once (two
2022          * extents in A), land on the same extent (one extent in B), and
2023          * insert two identical backrefs (both referring to the extent in B).
2024          */
2025         return 0;
2026 }
2027
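/*
 * Insert a backref into the rb-tree ordered by backref_comp().  Equal
 * keys (backref_comp() == 0) deliberately go to the right subtree, so
 * the duplicate backrefs described above are kept rather than rejected.
 * For example (hypothetical values), {root_id=5, inum=257, file_pos=0}
 * sorts before {root_id=5, inum=257, file_pos=4096}.
 */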
2028 static void backref_insert(struct rb_root *root,
2029                            struct sa_defrag_extent_backref *backref)
2030 {
2031         struct rb_node **p = &root->rb_node;
2032         struct rb_node *parent = NULL;
2033         struct sa_defrag_extent_backref *entry;
2034         int ret;
2035
2036         while (*p) {
2037                 parent = *p;
2038                 entry = rb_entry(parent, struct sa_defrag_extent_backref, node);
2039
2040                 ret = backref_comp(backref, entry);
2041                 if (ret < 0)
2042                         p = &(*p)->rb_left;
2043                 else
2044                         p = &(*p)->rb_right;
2045         }
2046
2047         rb_link_node(&backref->node, parent, p);
2048         rb_insert_color(&backref->node, root);
2049 }
2050
2051 /*
2052  * Note the backref might have changed, and in this case we just return 0.
2053  */
2054 static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
2055                                        void *ctx)
2056 {
2057         struct btrfs_file_extent_item *extent;
2058         struct btrfs_fs_info *fs_info;
2059         struct old_sa_defrag_extent *old = ctx;
2060         struct new_sa_defrag_extent *new = old->new;
2061         struct btrfs_path *path = new->path;
2062         struct btrfs_key key;
2063         struct btrfs_root *root;
2064         struct sa_defrag_extent_backref *backref;
2065         struct extent_buffer *leaf;
2066         struct inode *inode = new->inode;
2067         int slot;
2068         int ret;
2069         u64 extent_offset;
2070         u64 num_bytes;
2071
2072         if (BTRFS_I(inode)->root->root_key.objectid == root_id &&
2073             inum == btrfs_ino(inode))
2074                 return 0;
2075
2076         key.objectid = root_id;
2077         key.type = BTRFS_ROOT_ITEM_KEY;
2078         key.offset = (u64)-1;
2079
2080         fs_info = BTRFS_I(inode)->root->fs_info;
2081         root = btrfs_read_fs_root_no_name(fs_info, &key);
2082         if (IS_ERR(root)) {
2083                 if (PTR_ERR(root) == -ENOENT)
2084                         return 0;
2085                 WARN_ON(1);
2086                 pr_debug("inum=%llu, offset=%llu, root_id=%llu\n",
2087                          inum, offset, root_id);
2088                 return PTR_ERR(root);
2089         }
2090
2091         key.objectid = inum;
2092         key.type = BTRFS_EXTENT_DATA_KEY;
2093         if (offset > (u64)-1 << 32)
2094                 key.offset = 0;
2095         else
2096                 key.offset = offset;
2097
2098         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2099         if (ret < 0) {
2100                 WARN_ON(1);
2101                 return ret;
2102         }
2103
2104         while (1) {
2105                 cond_resched();
2106
2107                 leaf = path->nodes[0];
2108                 slot = path->slots[0];
2109
2110                 if (slot >= btrfs_header_nritems(leaf)) {
2111                         ret = btrfs_next_leaf(root, path);
2112                         if (ret < 0) {
2113                                 goto out;
2114                         } else if (ret > 0) {
2115                                 ret = 0;
2116                                 goto out;
2117                         }
2118                         continue;
2119                 }
2120
2121                 path->slots[0]++;
2122
2123                 btrfs_item_key_to_cpu(leaf, &key, slot);
2124
2125                 if (key.objectid > inum)
2126                         goto out;
2127
2128                 if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY)
2129                         continue;
2130
2131                 extent = btrfs_item_ptr(leaf, slot,
2132                                         struct btrfs_file_extent_item);
2133
2134                 if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr)
2135                         continue;
2136
2137                 extent_offset = btrfs_file_extent_offset(leaf, extent);
2138                 if (key.offset - extent_offset != offset)
2139                         continue;
2140
2141                 num_bytes = btrfs_file_extent_num_bytes(leaf, extent);
2142                 if (extent_offset >= old->extent_offset + old->offset +
2143                     old->len || extent_offset + num_bytes <=
2144                     old->extent_offset + old->offset)
2145                         continue;
2146
2147                 break;
2148         }
2149
2150         backref = kmalloc(sizeof(*backref), GFP_NOFS);
2151         if (!backref) {
2152                 ret = -ENOMEM;
2153                 goto out;
2154         }
2155
2156         backref->root_id = root_id;
2157         backref->inum = inum;
2158         backref->file_pos = offset + extent_offset;
2159         backref->num_bytes = num_bytes;
2160         backref->extent_offset = extent_offset;
2161         backref->generation = btrfs_file_extent_generation(leaf, extent);
2162         backref->old = old;
2163         backref_insert(&new->root, backref);
2164         old->count++;
2165 out:
2166         btrfs_release_path(path);
2167         WARN_ON(ret);
2168         return ret;
2169 }
2170
2171 static noinline bool record_extent_backrefs(struct btrfs_path *path,
2172                                    struct new_sa_defrag_extent *new)
2173 {
2174         struct btrfs_fs_info *fs_info = BTRFS_I(new->inode)->root->fs_info;
2175         struct old_sa_defrag_extent *old, *tmp;
2176         int ret;
2177
2178         new->path = path;
2179
2180         list_for_each_entry_safe(old, tmp, &new->head, list) {
2181                 ret = iterate_inodes_from_logical(old->bytenr, fs_info,
2182                                                   path, record_one_backref,
2183                                                   old);
2184                 BUG_ON(ret < 0 && ret != -ENOENT);
2185
2186                 /* no backref to be processed for this extent */
2187                 if (!old->count) {
2188                         list_del(&old->list);
2189                         kfree(old);
2190                 }
2191         }
2192
2193         if (list_empty(&new->head))
2194                 return false;
2195
2196         return true;
2197 }
2198
2199 static int relink_is_mergable(struct extent_buffer *leaf,
2200                               struct btrfs_file_extent_item *fi,
2201                               u64 disk_bytenr)
2202 {
2203         if (btrfs_file_extent_disk_bytenr(leaf, fi) != disk_bytenr)
2204                 return 0;
2205
2206         if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2207                 return 0;
2208
2209         if (btrfs_file_extent_compression(leaf, fi) ||
2210             btrfs_file_extent_encryption(leaf, fi) ||
2211             btrfs_file_extent_other_encoding(leaf, fi))
2212                 return 0;
2213
2214         return 1;
2215 }
2216
2217 /*
2218  * Note the backref might have changed, and in this case we just return 0.
2219  */
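/*
 * Steps: resolve the subvolume root and inode from the backref, lock the
 * file range, verify the old file extent item is still there (same
 * generation), drop it, then either merge with a mergable neighbour or
 * insert a fresh file extent item pointing at the defragged extent and
 * add its reference in the extent tree.  Returns 1 on success so the
 * caller can keep this backref around as a merge candidate.
 */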
2220 static noinline int relink_extent_backref(struct btrfs_path *path,
2221                                  struct sa_defrag_extent_backref *prev,
2222                                  struct sa_defrag_extent_backref *backref)
2223 {
2224         struct btrfs_file_extent_item *extent;
2225         struct btrfs_file_extent_item *item;
2226         struct btrfs_ordered_extent *ordered;
2227         struct btrfs_trans_handle *trans;
2228         struct btrfs_fs_info *fs_info;
2229         struct btrfs_root *root;
2230         struct btrfs_key key;
2231         struct extent_buffer *leaf;
2232         struct old_sa_defrag_extent *old = backref->old;
2233         struct new_sa_defrag_extent *new = old->new;
2234         struct inode *src_inode = new->inode;
2235         struct inode *inode;
2236         struct extent_state *cached = NULL;
2237         int ret = 0;
2238         u64 start;
2239         u64 len;
2240         u64 lock_start;
2241         u64 lock_end;
2242         bool merge = false;
2243         int index;
2244
2245         if (prev && prev->root_id == backref->root_id &&
2246             prev->inum == backref->inum &&
2247             prev->file_pos + prev->num_bytes == backref->file_pos)
2248                 merge = true;
2249
2250         /* step 1: get root */
2251         key.objectid = backref->root_id;
2252         key.type = BTRFS_ROOT_ITEM_KEY;
2253         key.offset = (u64)-1;
2254
2255         fs_info = BTRFS_I(src_inode)->root->fs_info;
2256         index = srcu_read_lock(&fs_info->subvol_srcu);
2257
2258         root = btrfs_read_fs_root_no_name(fs_info, &key);
2259         if (IS_ERR(root)) {
2260                 srcu_read_unlock(&fs_info->subvol_srcu, index);
2261                 if (PTR_ERR(root) == -ENOENT)
2262                         return 0;
2263                 return PTR_ERR(root);
2264         }
2265         if (btrfs_root_refs(&root->root_item) == 0) {
2266                 srcu_read_unlock(&fs_info->subvol_srcu, index);
2267                 /* translate ENOENT to 0 */
2268                 return 0;
2269         }
2270
2271         /* step 2: get inode */
2272         key.objectid = backref->inum;
2273         key.type = BTRFS_INODE_ITEM_KEY;
2274         key.offset = 0;
2275
2276         inode = btrfs_iget(fs_info->sb, &key, root, NULL);
2277         if (IS_ERR(inode)) {
2278                 srcu_read_unlock(&fs_info->subvol_srcu, index);
2279                 return 0;
2280         }
2281
2282         srcu_read_unlock(&fs_info->subvol_srcu, index);
2283
2284         /* step 3: relink backref */
2285         lock_start = backref->file_pos;
2286         lock_end = backref->file_pos + backref->num_bytes - 1;
2287         lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2288                          0, &cached);
2289
2290         ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
2291         if (ordered) {
2292                 btrfs_put_ordered_extent(ordered);
2293                 goto out_unlock;
2294         }
2295
2296         trans = btrfs_join_transaction(root);
2297         if (IS_ERR(trans)) {
2298                 ret = PTR_ERR(trans);
2299                 goto out_unlock;
2300         }
2301
2302         key.objectid = backref->inum;
2303         key.type = BTRFS_EXTENT_DATA_KEY;
2304         key.offset = backref->file_pos;
2305
2306         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2307         if (ret < 0) {
2308                 goto out_free_path;
2309         } else if (ret > 0) {
2310                 ret = 0;
2311                 goto out_free_path;
2312         }
2313
2314         extent = btrfs_item_ptr(path->nodes[0], path->slots[0],
2315                                 struct btrfs_file_extent_item);
2316
2317         if (btrfs_file_extent_generation(path->nodes[0], extent) !=
2318             backref->generation)
2319                 goto out_free_path;
2320
2321         btrfs_release_path(path);
2322
2323         start = backref->file_pos;
2324         if (backref->extent_offset < old->extent_offset + old->offset)
2325                 start += old->extent_offset + old->offset -
2326                          backref->extent_offset;
2327
2328         len = min(backref->extent_offset + backref->num_bytes,
2329                   old->extent_offset + old->offset + old->len);
2330         len -= max(backref->extent_offset, old->extent_offset + old->offset);
2331
2332         ret = btrfs_drop_extents(trans, root, inode, start,
2333                                  start + len, 1);
2334         if (ret)
2335                 goto out_free_path;
2336 again:
2337         key.objectid = btrfs_ino(inode);
2338         key.type = BTRFS_EXTENT_DATA_KEY;
2339         key.offset = start;
2340
2341         path->leave_spinning = 1;
2342         if (merge) {
2343                 struct btrfs_file_extent_item *fi;
2344                 u64 extent_len;
2345                 struct btrfs_key found_key;
2346
2347                 ret = btrfs_search_slot(trans, root, &key, path, 1, 1);
2348                 if (ret < 0)
2349                         goto out_free_path;
2350
2351                 path->slots[0]--;
2352                 leaf = path->nodes[0];
2353                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2354
2355                 fi = btrfs_item_ptr(leaf, path->slots[0],
2356                                     struct btrfs_file_extent_item);
2357                 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
2358
2359                 if (relink_is_mergable(leaf, fi, new->bytenr) &&
2360                     extent_len + found_key.offset == start) {
2361                         btrfs_set_file_extent_num_bytes(leaf, fi,
2362                                                         extent_len + len);
2363                         btrfs_mark_buffer_dirty(leaf);
2364                         inode_add_bytes(inode, len);
2365
2366                         ret = 1;
2367                         goto out_free_path;
2368                 } else {
2369                         merge = false;
2370                         btrfs_release_path(path);
2371                         goto again;
2372                 }
2373         }
2374
2375         ret = btrfs_insert_empty_item(trans, root, path, &key,
2376                                         sizeof(*extent));
2377         if (ret) {
2378                 btrfs_abort_transaction(trans, root, ret);
2379                 goto out_free_path;
2380         }
2381
2382         leaf = path->nodes[0];
2383         item = btrfs_item_ptr(leaf, path->slots[0],
2384                                 struct btrfs_file_extent_item);
2385         btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr);
2386         btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len);
2387         btrfs_set_file_extent_offset(leaf, item, start - new->file_pos);
2388         btrfs_set_file_extent_num_bytes(leaf, item, len);
2389         btrfs_set_file_extent_ram_bytes(leaf, item, new->len);
2390         btrfs_set_file_extent_generation(leaf, item, trans->transid);
2391         btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
2392         btrfs_set_file_extent_compression(leaf, item, new->compress_type);
2393         btrfs_set_file_extent_encryption(leaf, item, 0);
2394         btrfs_set_file_extent_other_encoding(leaf, item, 0);
2395
2396         btrfs_mark_buffer_dirty(leaf);
2397         inode_add_bytes(inode, len);
2398         btrfs_release_path(path);
2399
2400         ret = btrfs_inc_extent_ref(trans, root, new->bytenr,
2401                         new->disk_len, 0,
2402                         backref->root_id, backref->inum,
2403                         new->file_pos, 0);      /* start - extent_offset */
2404         if (ret) {
2405                 btrfs_abort_transaction(trans, root, ret);
2406                 goto out_free_path;
2407         }
2408
2409         ret = 1;
2410 out_free_path:
2411         btrfs_release_path(path);
2412         path->leave_spinning = 0;
2413         btrfs_end_transaction(trans, root);
2414 out_unlock:
2415         unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2416                              &cached, GFP_NOFS);
2417         iput(inode);
2418         return ret;
2419 }
2420
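/*
 * Walk all recorded backrefs of the defragged extent in backref_comp()
 * order and relink each one to the new extent.  A backref that was
 * relinked successfully (ret == 1) is carried over as 'prev' so the
 * next backref from the same root/inode can be merged with it.
 */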
2421 static void relink_file_extents(struct new_sa_defrag_extent *new)
2422 {
2423         struct btrfs_path *path;
2424         struct old_sa_defrag_extent *old, *tmp;
2425         struct sa_defrag_extent_backref *backref;
2426         struct sa_defrag_extent_backref *prev = NULL;
2427         struct inode *inode;
2428         struct btrfs_root *root;
2429         struct rb_node *node;
2430         int ret;
2431
2432         inode = new->inode;
2433         root = BTRFS_I(inode)->root;
2434
2435         path = btrfs_alloc_path();
2436         if (!path)
2437                 return;
2438
2439         if (!record_extent_backrefs(path, new)) {
2440                 btrfs_free_path(path);
2441                 goto out;
2442         }
2443         btrfs_release_path(path);
2444
2445         while (1) {
2446                 node = rb_first(&new->root);
2447                 if (!node)
2448                         break;
2449                 rb_erase(node, &new->root);
2450
2451                 backref = rb_entry(node, struct sa_defrag_extent_backref, node);
2452
2453                 ret = relink_extent_backref(path, prev, backref);
2454                 WARN_ON(ret < 0);
2455
2456                 kfree(prev);
2457
2458                 if (ret == 1)
2459                         prev = backref;
2460                 else
2461                         prev = NULL;
2462                 cond_resched();
2463         }
2464         kfree(prev);
2465
2466         btrfs_free_path(path);
2467
2468         list_for_each_entry_safe(old, tmp, &new->head, list) {
2469                 list_del(&old->list);
2470                 kfree(old);
2471         }
2472 out:
2473         atomic_dec(&root->fs_info->defrag_running);
2474         wake_up(&root->fs_info->transaction_wait);
2475
2476         kfree(new);
2477 }
2478
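/*
 * Build a new_sa_defrag_extent for the just-finished ordered extent and
 * collect the old file extents that used to cover the same file range.
 * Their backrefs are resolved and relinked later; returns NULL on any
 * failure, in which case snapshot-aware defrag is simply skipped.
 */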
2479 static struct new_sa_defrag_extent *
2480 record_old_file_extents(struct inode *inode,
2481                         struct btrfs_ordered_extent *ordered)
2482 {
2483         struct btrfs_root *root = BTRFS_I(inode)->root;
2484         struct btrfs_path *path;
2485         struct btrfs_key key;
2486         struct old_sa_defrag_extent *old, *tmp;
2487         struct new_sa_defrag_extent *new;
2488         int ret;
2489
2490         new = kmalloc(sizeof(*new), GFP_NOFS);
2491         if (!new)
2492                 return NULL;
2493
2494         new->inode = inode;
2495         new->file_pos = ordered->file_offset;
2496         new->len = ordered->len;
2497         new->bytenr = ordered->start;
2498         new->disk_len = ordered->disk_len;
2499         new->compress_type = ordered->compress_type;
2500         new->root = RB_ROOT;
2501         INIT_LIST_HEAD(&new->head);
2502
2503         path = btrfs_alloc_path();
2504         if (!path)
2505                 goto out_kfree;
2506
2507         key.objectid = btrfs_ino(inode);
2508         key.type = BTRFS_EXTENT_DATA_KEY;
2509         key.offset = new->file_pos;
2510
2511         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2512         if (ret < 0)
2513                 goto out_free_path;
2514         if (ret > 0 && path->slots[0] > 0)
2515                 path->slots[0]--;
2516
2517         /* find out all the old extents for the file range */
2518         while (1) {
2519                 struct btrfs_file_extent_item *extent;
2520                 struct extent_buffer *l;
2521                 int slot;
2522                 u64 num_bytes;
2523                 u64 offset;
2524                 u64 end;
2525                 u64 disk_bytenr;
2526                 u64 extent_offset;
2527
2528                 l = path->nodes[0];
2529                 slot = path->slots[0];
2530
2531                 if (slot >= btrfs_header_nritems(l)) {
2532                         ret = btrfs_next_leaf(root, path);
2533                         if (ret < 0)
2534                                 goto out_free_list;
2535                         else if (ret > 0)
2536                                 break;
2537                         continue;
2538                 }
2539
2540                 btrfs_item_key_to_cpu(l, &key, slot);
2541
2542                 if (key.objectid != btrfs_ino(inode))
2543                         break;
2544                 if (key.type != BTRFS_EXTENT_DATA_KEY)
2545                         break;
2546                 if (key.offset >= new->file_pos + new->len)
2547                         break;
2548
2549                 extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item);
2550
2551                 num_bytes = btrfs_file_extent_num_bytes(l, extent);
2552                 if (key.offset + num_bytes < new->file_pos)
2553                         goto next;
2554
2555                 disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent);
2556                 if (!disk_bytenr)
2557                         goto next;
2558
2559                 extent_offset = btrfs_file_extent_offset(l, extent);
2560
2561                 old = kmalloc(sizeof(*old), GFP_NOFS);
2562                 if (!old)
2563                         goto out_free_list;
2564
2565                 offset = max(new->file_pos, key.offset);
2566                 end = min(new->file_pos + new->len, key.offset + num_bytes);
2567
2568                 old->bytenr = disk_bytenr;
2569                 old->extent_offset = extent_offset;
2570                 old->offset = offset - key.offset;
2571                 old->len = end - offset;
2572                 old->new = new;
2573                 old->count = 0;
2574                 list_add_tail(&old->list, &new->head);
2575 next:
2576                 path->slots[0]++;
2577                 cond_resched();
2578         }
2579
2580         btrfs_free_path(path);
2581         atomic_inc(&root->fs_info->defrag_running);
2582
2583         return new;
2584
2585 out_free_list:
2586         list_for_each_entry_safe(old, tmp, &new->head, list) {
2587                 list_del(&old->list);
2588                 kfree(old);
2589         }
2590 out_free_path:
2591         btrfs_free_path(path);
2592 out_kfree:
2593         kfree(new);
2594         return NULL;
2595 }
2596
2597 /*
2598  * As ordered data IO finishes, this gets called so we can finish
2599  * an ordered extent if the range of bytes in the file it covers
2600  * has been fully written.
2601  *
2602  * This is where the on-disk metadata catches up with the data:
2603  * file extent items are inserted (or prealloc extents marked
2604  * written), pending checksums are recorded, and the ordered
2605  * i_size update is applied to the inode.
2606  */
2607 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
2608 {
2609         struct inode *inode = ordered_extent->inode;
2610         struct btrfs_root *root = BTRFS_I(inode)->root;
2611         struct btrfs_trans_handle *trans = NULL;
2612         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2613         struct extent_state *cached_state = NULL;
2614         struct new_sa_defrag_extent *new = NULL;
2615         int compress_type = 0;
2616         int ret;
2617         bool nolock;
2618
2619         nolock = btrfs_is_free_space_inode(inode);
2620
2621         if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
2622                 ret = -EIO;
2623                 goto out;
2624         }
2625
2626         if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
2627                 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
2628                 btrfs_ordered_update_i_size(inode, 0, ordered_extent);
2629                 if (nolock)
2630                         trans = btrfs_join_transaction_nolock(root);
2631                 else
2632                         trans = btrfs_join_transaction(root);
2633                 if (IS_ERR(trans)) {
2634                         ret = PTR_ERR(trans);
2635                         trans = NULL;
2636                         goto out;
2637                 }
2638                 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
2639                 ret = btrfs_update_inode_fallback(trans, root, inode);
2640                 if (ret) /* -ENOMEM or corruption */
2641                         btrfs_abort_transaction(trans, root, ret);
2642                 goto out;
2643         }
2644
2645         lock_extent_bits(io_tree, ordered_extent->file_offset,
2646                          ordered_extent->file_offset + ordered_extent->len - 1,
2647                          0, &cached_state);
2648
2649         ret = test_range_bit(io_tree, ordered_extent->file_offset,
2650                         ordered_extent->file_offset + ordered_extent->len - 1,
2651                         EXTENT_DEFRAG, 1, cached_state);
2652         if (ret) {
2653                 u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
2654                 if (last_snapshot >= BTRFS_I(inode)->generation)
2655                         /* the inode is shared */
2656                         new = record_old_file_extents(inode, ordered_extent);
2657
2658                 clear_extent_bit(io_tree, ordered_extent->file_offset,
2659                         ordered_extent->file_offset + ordered_extent->len - 1,
2660                         EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS);
2661         }
2662
2663         if (nolock)
2664                 trans = btrfs_join_transaction_nolock(root);
2665         else
2666                 trans = btrfs_join_transaction(root);
2667         if (IS_ERR(trans)) {
2668                 ret = PTR_ERR(trans);
2669                 trans = NULL;
2670                 goto out_unlock;
2671         }
2672         trans->block_rsv = &root->fs_info->delalloc_block_rsv;
2673
2674         if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
2675                 compress_type = ordered_extent->compress_type;
2676         if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
2677                 BUG_ON(compress_type);
2678                 ret = btrfs_mark_extent_written(trans, inode,
2679                                                 ordered_extent->file_offset,
2680                                                 ordered_extent->file_offset +
2681                                                 ordered_extent->len);
2682         } else {
2683                 BUG_ON(root == root->fs_info->tree_root);
2684                 ret = insert_reserved_file_extent(trans, inode,
2685                                                 ordered_extent->file_offset,
2686                                                 ordered_extent->start,
2687                                                 ordered_extent->disk_len,
2688                                                 ordered_extent->len,
2689                                                 ordered_extent->len,
2690                                                 compress_type, 0, 0,
2691                                                 BTRFS_FILE_EXTENT_REG);
2692         }
2693         unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
2694                            ordered_extent->file_offset, ordered_extent->len,
2695                            trans->transid);
2696         if (ret < 0) {
2697                 btrfs_abort_transaction(trans, root, ret);
2698                 goto out_unlock;
2699         }
2700
2701         add_pending_csums(trans, inode, ordered_extent->file_offset,
2702                           &ordered_extent->list);
2703
2704         btrfs_ordered_update_i_size(inode, 0, ordered_extent);
2705         ret = btrfs_update_inode_fallback(trans, root, inode);
2706         if (ret) { /* -ENOMEM or corruption */
2707                 btrfs_abort_transaction(trans, root, ret);
2708                 goto out_unlock;
2709         }
2710         ret = 0;
2711 out_unlock:
2712         unlock_extent_cached(io_tree, ordered_extent->file_offset,
2713                              ordered_extent->file_offset +
2714                              ordered_extent->len - 1, &cached_state, GFP_NOFS);
2715 out:
2716         if (root != root->fs_info->tree_root)
2717                 btrfs_delalloc_release_metadata(inode, ordered_extent->len);
2718         if (trans)
2719                 btrfs_end_transaction(trans, root);
2720
2721         if (ret) {
2722                 clear_extent_uptodate(io_tree, ordered_extent->file_offset,
2723                                       ordered_extent->file_offset +
2724                                       ordered_extent->len - 1, NULL, GFP_NOFS);
2725
2726                 /*
2727                  * If the ordered extent had an IOERR or something else went
2728                  * wrong we need to return the space for this ordered extent
2729                  * back to the allocator.
2730                  */
2731                 if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
2732                     !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
2733                         btrfs_free_reserved_extent(root, ordered_extent->start,
2734                                                    ordered_extent->disk_len);
2735         }
2736
2738         /*
2739          * This needs to be done to make sure anybody waiting knows we are done
2740          * updating everything for this ordered extent.
2741          */
2742         btrfs_remove_ordered_extent(inode, ordered_extent);
2743
2744         /* for snapshot-aware defrag */
2745         if (new)
2746                 relink_file_extents(new);
2747
2748         /* once for us */
2749         btrfs_put_ordered_extent(ordered_extent);
2750         /* once for the tree */
2751         btrfs_put_ordered_extent(ordered_extent);
2752
2753         return ret;
2754 }
2755
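/*
 * In outline, the write-completion plumbing below is:
 *
 *      end_io -> btrfs_writepage_end_io_hook()
 *             -> finish_ordered_fn() queued on an endio workqueue
 *             -> btrfs_finish_ordered_io() above
 *
 * where the final step joins a transaction, inserts or marks the file
 * extent, adds the pending csums and updates i_size.
 */
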
2756 static void finish_ordered_fn(struct btrfs_work *work)
2757 {
2758         struct btrfs_ordered_extent *ordered_extent;
2759         ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
2760         btrfs_finish_ordered_io(ordered_extent);
2761 }
2762
2763 static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
2764                                 struct extent_state *state, int uptodate)
2765 {
2766         struct inode *inode = page->mapping->host;
2767         struct btrfs_root *root = BTRFS_I(inode)->root;
2768         struct btrfs_ordered_extent *ordered_extent = NULL;
2769         struct btrfs_workers *workers;
2770
2771         trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
2772
2773         ClearPagePrivate2(page);
2774         if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
2775                                             end - start + 1, uptodate))
2776                 return 0;
2777
2778         ordered_extent->work.func = finish_ordered_fn;
2779         ordered_extent->work.flags = 0;
2780
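        /*
         * Free space inodes get a dedicated worker pool: their writeback
         * completes as part of committing a transaction, and sharing
         * endio_write_workers could leave the commit stuck behind regular
         * ordered completions that are themselves waiting on the commit.
         */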
2781         if (btrfs_is_free_space_inode(inode))
2782                 workers = &root->fs_info->endio_freespace_worker;
2783         else
2784                 workers = &root->fs_info->endio_write_workers;
2785         btrfs_queue_worker(workers, &ordered_extent->work);
2786
2787         return 0;
2788 }
2789
2790 /*
2791  * when reads are done, we need to check csums to verify the data is
2792  * correct.  If there's a match, we allow the bio to finish.  If not, the
2793  * code in extent_io.c will try to find good copies for us.
2794  */
2795 static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
2796                                struct extent_state *state, int mirror)
2797 {
2798         size_t offset = start - page_offset(page);
2799         struct inode *inode = page->mapping->host;
2800         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2801         char *kaddr;
2802         u64 private = ~(u32)0;
2803         int ret;
2804         struct btrfs_root *root = BTRFS_I(inode)->root;
2805         u32 csum = ~(u32)0;
2806         static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
2807                                       DEFAULT_RATELIMIT_BURST);
2808
2809         if (PageChecked(page)) {
2810                 ClearPageChecked(page);
2811                 goto good;
2812         }
2813
2814         if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
2815                 goto good;
2816
2817         if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
2818             test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
2819                 clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM,
2820                                   GFP_NOFS);
2821                 return 0;
2822         }
2823
2824         if (state && state->start == start) {
2825                 private = state->private;
2826                 ret = 0;
2827         } else {
2828                 ret = get_state_private(io_tree, start, &private);
2829         }
2830         kaddr = kmap_atomic(page);
2831         if (ret)
2832                 goto zeroit;
2833
2834         csum = btrfs_csum_data(kaddr + offset, csum, end - start + 1);
2835         btrfs_csum_final(csum, (char *)&csum);
2836         if (csum != private)
2837                 goto zeroit;
2838
2839         kunmap_atomic(kaddr);
2840 good:
2841         return 0;
2842
2843 zeroit:
2844         if (__ratelimit(&_rs))
2845                 btrfs_info(root->fs_info, "csum failed ino %llu off %llu csum %u private %llu",
2846                         (unsigned long long)btrfs_ino(page->mapping->host),
2847                         (unsigned long long)start, csum,
2848                         (unsigned long long)private);
2849         memset(kaddr + offset, 1, end - start + 1);
2850         flush_dcache_page(page);
2851         kunmap_atomic(kaddr);
2852         if (private == 0)
2853                 return 0;
2854         return -EIO;
2855 }
2856
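/*
 * In essence the check above is: seed a crc32c with all ones, run it over
 * the page contents, finalize it (essentially a bit inversion) and compare
 * with the stored csum.  As a rough sketch:
 *
 *      u32 csum = ~(u32)0;
 *      csum = btrfs_csum_data(data, csum, len);
 *      btrfs_csum_final(csum, (char *)&csum);
 *      if (csum != stored_csum)
 *              return -EIO;
 */
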
2857 struct delayed_iput {
2858         struct list_head list;
2859         struct inode *inode;
2860 };
2861
2862 /* JDM: If this is fs-wide, why can't we add a pointer to
2863  * btrfs_inode instead and avoid the allocation? */
2864 void btrfs_add_delayed_iput(struct inode *inode)
2865 {
2866         struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
2867         struct delayed_iput *delayed;
2868
2869         if (atomic_add_unless(&inode->i_count, -1, 1))
2870                 return;
2871
2872         delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL);
2873         delayed->inode = inode;
2874
2875         spin_lock(&fs_info->delayed_iput_lock);
2876         list_add_tail(&delayed->list, &fs_info->delayed_iputs);
2877         spin_unlock(&fs_info->delayed_iput_lock);
2878 }
2879
2880 void btrfs_run_delayed_iputs(struct btrfs_root *root)
2881 {
2882         LIST_HEAD(list);
2883         struct btrfs_fs_info *fs_info = root->fs_info;
2884         struct delayed_iput *delayed;
2885         int empty;
2886
2887         spin_lock(&fs_info->delayed_iput_lock);
2888         empty = list_empty(&fs_info->delayed_iputs);
2889         spin_unlock(&fs_info->delayed_iput_lock);
2890         if (empty)
2891                 return;
2892
2893         spin_lock(&fs_info->delayed_iput_lock);
2894         list_splice_init(&fs_info->delayed_iputs, &list);
2895         spin_unlock(&fs_info->delayed_iput_lock);
2896
2897         while (!list_empty(&list)) {
2898                 delayed = list_entry(list.next, struct delayed_iput, list);
2899                 list_del(&delayed->list);
2900                 iput(delayed->inode);
2901                 kfree(delayed);
2902         }
2903 }
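
/*
 * Typical usage: code that may hold the last reference to an inode but
 * cannot afford a full iput() in its current context (e.g. while inside
 * a transaction) calls
 *
 *      btrfs_add_delayed_iput(inode);
 *
 * instead of iput(inode); the final reference is then dropped from
 * btrfs_run_delayed_iputs() at a safe point later on.
 */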
2904
2905 /*
2906  * This is called at transaction commit time.  If there are no orphan
2907  * files in the subvolume, it removes the orphan item and frees the
2908  * block_rsv structure.
2909  */
2910 void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
2911                               struct btrfs_root *root)
2912 {
2913         struct btrfs_block_rsv *block_rsv;
2914         int ret;
2915
2916         if (atomic_read(&root->orphan_inodes) ||
2917             root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
2918                 return;
2919
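        /*
         * Recheck both conditions under orphan_lock: the unlocked test
         * above is only an optimization, and either state may change
         * before the lock is taken.
         */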
2920         spin_lock(&root->orphan_lock);
2921         if (atomic_read(&root->orphan_inodes)) {
2922                 spin_unlock(&root->orphan_lock);
2923                 return;
2924         }
2925
2926         if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) {
2927                 spin_unlock(&root->orphan_lock);
2928                 return;
2929         }
2930
2931         block_rsv = root->orphan_block_rsv;
2932         root->orphan_block_rsv = NULL;
2933         spin_unlock(&root->orphan_lock);
2934
2935         if (root->orphan_item_inserted &&
2936             btrfs_root_refs(&root->root_item) > 0) {
2937                 ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
2938                                             root->root_key.objectid);
2939                 BUG_ON(ret);
2940                 root->orphan_item_inserted = 0;
2941         }
2942
2943         if (block_rsv) {
2944                 WARN_ON(block_rsv->size > 0);
2945                 btrfs_free_block_rsv(root, block_rsv);
2946         }
2947 }
2948
2949 /*
2950  * This creates an orphan entry for the given inode in case something goes
2951  * wrong in the middle of an unlink/truncate.
2952  *
2953  * NOTE: callers of this function should reserve 5 units of metadata
2954  *       before calling it.
2955  */
2956 int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2957 {
2958         struct btrfs_root *root = BTRFS_I(inode)->root;
2959         struct btrfs_block_rsv *block_rsv = NULL;
2960         int reserve = 0;
2961         int insert = 0;
2962         int ret;
2963
2964         if (!root->orphan_block_rsv) {
2965                 block_rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
2966                 if (!block_rsv)
2967                         return -ENOMEM;
2968         }
2969
2970         spin_lock(&root->orphan_lock);
2971         if (!root->orphan_block_rsv) {
2972                 root->orphan_block_rsv = block_rsv;
2973         } else if (block_rsv) {
2974                 btrfs_free_block_rsv(root, block_rsv);
2975                 block_rsv = NULL;
2976         }
2977
2978         if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
2979                               &BTRFS_I(inode)->runtime_flags)) {
2980 #if 0
2981                 /*
2982                  * For proper ENOSPC handling, we should do orphan
2983                  * cleanup when mounting. But this introduces backward
2984                  * compatibility issue.
2985                  */
2986                 if (!xchg(&root->orphan_item_inserted, 1))
2987                         insert = 2;
2988                 else
2989                         insert = 1;
2990 #endif
2991                 insert = 1;
2992                 atomic_inc(&root->orphan_inodes);
2993         }
2994
2995         if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
2996                               &BTRFS_I(inode)->runtime_flags))
2997                 reserve = 1;
2998         spin_unlock(&root->orphan_lock);
2999
3000         /* grab metadata reservation from transaction handle */
3001         if (reserve) {
3002                 ret = btrfs_orphan_reserve_metadata(trans, inode);
3003                 BUG_ON(ret); /* -ENOSPC in reservation; Logic error? JDM */
3004         }
3005
3006         /* insert an orphan item to track this unlinked/truncated file */
3007         if (insert >= 1) {
3008                 ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
3009                 if (ret && ret != -EEXIST) {
3010                         clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3011                                   &BTRFS_I(inode)->runtime_flags);
3012                         btrfs_abort_transaction(trans, root, ret);
3013                         return ret;
3014                 }
3015                 ret = 0;
3016         }
3017
3018         /* insert an orphan item recording that the subvolume contains orphan files */
3019         if (insert >= 2) {
3020                 ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
3021                                                root->root_key.objectid);
3022                 if (ret && ret != -EEXIST) {
3023                         btrfs_abort_transaction(trans, root, ret);
3024                         return ret;
3025                 }
3026         }
3027         return 0;
3028 }
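
/*
 * Lifecycle sketch, mirroring the truncate case in btrfs_orphan_cleanup()
 * below (error handling omitted):
 *
 *      trans = btrfs_start_transaction(root, 1);
 *      ret = btrfs_orphan_add(trans, inode);    (pin inode as orphan)
 *      btrfs_end_transaction(trans, root);
 *      ret = btrfs_truncate(inode);
 *      if (ret)
 *              btrfs_orphan_del(NULL, inode);   (undo on failure)
 */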
3029
3030 /*
3031  * We have done the truncate/delete, so we can go ahead and remove the orphan
3032  * item for this particular inode.
3033  */
3034 static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
3035                             struct inode *inode)
3036 {
3037         struct btrfs_root *root = BTRFS_I(inode)->root;
3038         int delete_item = 0;
3039         int release_rsv = 0;
3040         int ret = 0;
3041
3042         spin_lock(&root->orphan_lock);
3043         if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3044                                &BTRFS_I(inode)->runtime_flags))
3045                 delete_item = 1;
3046
3047         if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
3048                                &BTRFS_I(inode)->runtime_flags))
3049                 release_rsv = 1;
3050         spin_unlock(&root->orphan_lock);
3051
3052         if (trans && delete_item) {
3053                 ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode));
3054                 BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */
3055         }
3056
3057         if (release_rsv) {
3058                 btrfs_orphan_release_metadata(inode);
3059                 atomic_dec(&root->orphan_inodes);
3060         }
3061
3062         return 0;
3063 }
3064
3065 /*
3066  * this cleans up any orphans that may be left on the list from the last use
3067  * of this root.
3068  */
3069 int btrfs_orphan_cleanup(struct btrfs_root *root)
3070 {
3071         struct btrfs_path *path;
3072         struct extent_buffer *leaf;
3073         struct btrfs_key key, found_key;
3074         struct btrfs_trans_handle *trans;
3075         struct inode *inode;
3076         u64 last_objectid = 0;
3077         int ret = 0, nr_unlink = 0, nr_truncate = 0;
3078
3079         if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
3080                 return 0;
3081
3082         path = btrfs_alloc_path();
3083         if (!path) {
3084                 ret = -ENOMEM;
3085                 goto out;
3086         }
3087         path->reada = -1;
3088
3089         key.objectid = BTRFS_ORPHAN_OBJECTID;
3090         btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
3091         key.offset = (u64)-1;
3092
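        /*
         * Orphan items are keyed (BTRFS_ORPHAN_OBJECTID,
         * BTRFS_ORPHAN_ITEM_KEY, inode number).  Starting the search at
         * offset (u64)-1 and stepping the slot back visits orphans from
         * the highest inode number downwards.
         */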
3093         while (1) {
3094                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3095                 if (ret < 0)
3096                         goto out;
3097
3098                 /*
3099                  * ret == 0 means we found the key we were searching for, which
3100                  * is weird but possible.  Only adjust the path if we didn't
3101                  * find the key, then check whether the item there matches.
3102                  */
3103                 if (ret > 0) {
3104                         ret = 0;
3105                         if (path->slots[0] == 0)
3106                                 break;
3107                         path->slots[0]--;
3108                 }
3109
3110                 /* pull out the item */
3111                 leaf = path->nodes[0];
3112                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3113
3114                 /* make sure the item matches what we want */
3115                 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
3116                         break;
3117                 if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY)
3118                         break;
3119
3120                 /* release the path since we're done with it */
3121                 btrfs_release_path(path);
3122
3123                 /*
3124                  * this is basically btrfs_lookup, minus the root-crossing
3125                  * logic.  The inode number is stored in the offset field
3126                  * of the orphan item.
3127                  */
3128
3129                 if (found_key.offset == last_objectid) {
3130                         btrfs_err(root->fs_info,
3131                                 "Error removing orphan entry, stopping orphan cleanup");
3132                         ret = -EINVAL;
3133                         goto out;
3134                 }
3135
3136                 last_objectid = found_key.offset;
3137
3138                 found_key.objectid = found_key.offset;
3139                 found_key.type = BTRFS_INODE_ITEM_KEY;
3140                 found_key.offset = 0;
3141                 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
3142                 ret = PTR_RET(inode);
3143                 if (ret && ret != -ESTALE)
3144                         goto out;
3145
3146                 if (ret == -ESTALE && root == root->fs_info->tree_root) {
3147                         struct btrfs_root *dead_root;
3148                         struct btrfs_fs_info *fs_info = root->fs_info;
3149                         int is_dead_root = 0;
3150
3151                         /*
3152                          * this is an orphan in the tree root. Currently these
3153                          * could come from 2 sources:
3154                          *  a) a snapshot deletion in progress
3155                          *  b) a free space cache inode
3156                          * We need to distinguish those two, as the snapshot
3157                          * orphan must not get deleted.
3158                          * find_dead_roots already ran before us, so if this
3159                          * is a snapshot deletion, we should find the root
3160                          * in the dead_roots list
3161                          */
3162                         spin_lock(&fs_info->trans_lock);
3163                         list_for_each_entry(dead_root, &fs_info->dead_roots,
3164                                             root_list) {
3165                                 if (dead_root->root_key.objectid ==
3166                                     found_key.objectid) {
3167                                         is_dead_root = 1;
3168                                         break;
3169                                 }
3170                         }
3171                         spin_unlock(&fs_info->trans_lock);
3172                         if (is_dead_root) {
3173                                 /* prevent this orphan from being found again */
3174                                 key.offset = found_key.objectid - 1;
3175                                 continue;
3176                         }
3177                 }
3178                 /*
3179                  * Inode is already gone but the orphan item is still there,
3180                  * kill the orphan item.
3181                  */
3182                 if (ret == -ESTALE) {
3183                         trans = btrfs_start_transaction(root, 1);
3184                         if (IS_ERR(trans)) {
3185                                 ret = PTR_ERR(trans);
3186                                 goto out;
3187                         }
3188                         btrfs_debug(root->fs_info, "auto deleting %Lu",
3189                                 found_key.objectid);
3190                         ret = btrfs_del_orphan_item(trans, root,
3191                                                     found_key.objectid);
3192                         BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */
3193                         btrfs_end_transaction(trans, root);
3194                         continue;
3195                 }
3196
3197                 /*
3198                  * add this inode to the orphan list so btrfs_orphan_del does
3199                  * the proper thing when we hit it
3200                  */
3201                 set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3202                         &BTRFS_I(inode)->runtime_flags);
3203                 atomic_inc(&root->orphan_inodes);
3204
3205                 /* if we have links, this was a truncate, let's do that */
3206                 if (inode->i_nlink) {
3207                         if (!S_ISREG(inode->i_mode)) {
3208                                 WARN_ON(1);
3209                                 iput(inode);
3210                                 continue;
3211                         }
3212                         nr_truncate++;
3213
3214                         /* 1 for the orphan item deletion. */
3215                         trans = btrfs_start_transaction(root, 1);
3216                         if (IS_ERR(trans)) {
3217                                 ret = PTR_ERR(trans);
3218                                 goto out;
3219                         }
3220                         ret = btrfs_orphan_add(trans, inode);
3221                         btrfs_end_transaction(trans, root);
3222                         if (ret)
3223                                 goto out;
3224
3225                         ret = btrfs_truncate(inode);
3226                         if (ret)
3227                                 btrfs_orphan_del(NULL, inode);
3228                 } else {
3229                         nr_unlink++;
3230                 }
3231
3232                 /* this will do delete_inode and everything for us */
3233                 iput(inode);
3234                 if (ret)
3235                         goto out;
3236         }
3237         /* release the path since we're done with it */
3238         btrfs_release_path(path);
3239
3240         root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
3241
3242         if (root->orphan_block_rsv)
3243                 btrfs_block_rsv_release(root, root->orphan_block_rsv,
3244                                         (u64)-1);
3245
3246         if (root->orphan_block_rsv || root->orphan_item_inserted) {
3247                 trans = btrfs_join_transaction(root);
3248                 if (!IS_ERR(trans))
3249                         btrfs_end_transaction(trans, root);
3250         }
3251
3252         if (nr_unlink)
3253                 btrfs_debug(root->fs_info, "unlinked %d orphans", nr_unlink);
3254         if (nr_truncate)
3255                 btrfs_debug(root->fs_info, "truncated %d orphans", nr_truncate);
3256
3257 out:
3258         if (ret)
3259                 btrfs_crit(root->fs_info,
3260                         "could not do orphan cleanup %d", ret);
3261         btrfs_free_path(path);
3262         return ret;
3263 }
3264