/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/statfs.h>
#include <linux/compat.h>
#include <linux/bit_spinlock.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/falloc.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/mount.h>
#include <linux/btrfs.h>
#include <linux/blkdev.h>
#include <linux/posix_acl_xattr.h>
#include <linux/uio.h>

#include "transaction.h"
#include "btrfs_inode.h"
#include "print-tree.h"
#include "ordered-data.h"
#include "compression.h"
#include "free-space-cache.h"
#include "inode-map.h"
struct btrfs_iget_args {
	struct btrfs_key *location;
	struct btrfs_root *root;
};

static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_dir_ro_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct address_space_operations btrfs_symlink_aops;
static const struct file_operations btrfs_dir_file_operations;
static struct extent_io_ops btrfs_extent_io_ops;

static struct kmem_cache *btrfs_inode_cachep;
static struct kmem_cache *btrfs_delalloc_work_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_transaction_cachep;
struct kmem_cache *btrfs_path_cachep;
struct kmem_cache *btrfs_free_space_cachep;
static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
	[S_IFREG >> S_SHIFT]	= BTRFS_FT_REG_FILE,
	[S_IFDIR >> S_SHIFT]	= BTRFS_FT_DIR,
	[S_IFCHR >> S_SHIFT]	= BTRFS_FT_CHRDEV,
	[S_IFBLK >> S_SHIFT]	= BTRFS_FT_BLKDEV,
	[S_IFIFO >> S_SHIFT]	= BTRFS_FT_FIFO,
	[S_IFSOCK >> S_SHIFT]	= BTRFS_FT_SOCK,
	[S_IFLNK >> S_SHIFT]	= BTRFS_FT_SYMLINK,
};
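
/*
 * Illustrative note (not in the original): the table above maps the S_IFMT
 * bits of an inode's i_mode to the on-disk directory entry type.  Assuming
 * S_SHIFT is the usual shift of the format bits (12 on mainline kernels),
 * a lookup looks like:
 *
 *	u8 ftype = btrfs_type_by_mode[(mode & S_IFMT) >> S_SHIFT];
 *
 * e.g. S_IFREG yields BTRFS_FT_REG_FILE and S_IFDIR yields BTRFS_FT_DIR.
 */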
static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct inode *inode);
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
static noinline int cow_file_range(struct inode *inode,
				   struct page *locked_page,
				   u64 start, u64 end, int *page_started,
				   unsigned long *nr_written, int unlock);
static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
					   u64 len, u64 orig_start,
					   u64 block_start, u64 block_len,
					   u64 orig_block_len, u64 ram_bytes,
					   int type);
static int btrfs_dirty_inode(struct inode *inode);

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
void btrfs_test_inode_set_ops(struct inode *inode)
{
	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
}
#endif
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
				     struct inode *inode, struct inode *dir,
				     const struct qstr *qstr)
{
	int err;
	err = btrfs_init_acl(trans, inode, dir);
	if (!err)
		err = btrfs_xattr_security_init(trans, inode, dir, qstr);
	return err;
}
/*
 * this does all the hard work for inserting an inline extent into
 * the btree.  The caller should have done a btrfs_drop_extents so that
 * no overlapping inline items exist in the btree
 */
static int insert_inline_extent(struct btrfs_trans_handle *trans,
				struct btrfs_path *path, int extent_inserted,
				struct btrfs_root *root, struct inode *inode,
				u64 start, size_t size, size_t compressed_size,
				struct page **compressed_pages)
{
	struct extent_buffer *leaf;
	struct page *page = NULL;
	struct btrfs_file_extent_item *ei;
	size_t cur_size = size;
	unsigned long offset;

	if (compressed_size && compressed_pages)
		cur_size = compressed_size;

	inode_add_bytes(inode, size);

	if (!extent_inserted) {
		struct btrfs_key key;

		key.objectid = btrfs_ino(inode);
		key.type = BTRFS_EXTENT_DATA_KEY;

		datasize = btrfs_file_extent_calc_inline_size(cur_size);
		path->leave_spinning = 1;
		ret = btrfs_insert_empty_item(trans, root, path, &key,
	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
	btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
	btrfs_set_file_extent_encryption(leaf, ei, 0);
	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
	btrfs_set_file_extent_ram_bytes(leaf, ei, size);
	ptr = btrfs_file_extent_inline_start(ei);

	if (compress_type != BTRFS_COMPRESS_NONE) {
		while (compressed_size > 0) {
			cpage = compressed_pages[i];
			cur_size = min_t(unsigned long, compressed_size,
			kaddr = kmap_atomic(cpage);
			write_extent_buffer(leaf, kaddr, ptr, cur_size);
			kunmap_atomic(kaddr);
			compressed_size -= cur_size;
		btrfs_set_file_extent_compression(leaf, ei,
		page = find_get_page(inode->i_mapping,
				     start >> PAGE_CACHE_SHIFT);
		btrfs_set_file_extent_compression(leaf, ei, 0);
		kaddr = kmap_atomic(page);
		offset = start & (PAGE_CACHE_SIZE - 1);
		write_extent_buffer(leaf, kaddr + offset, ptr, size);
		kunmap_atomic(kaddr);
		page_cache_release(page);
	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(path);

	/*
	 * we're an inline extent, so nobody can
	 * extend the file past i_size without locking
	 * a page we already have locked.
	 *
	 * We must do any isize and inode updates
	 * before we unlock the pages.  Otherwise we
	 * could end up racing with unlink.
	 */
	BTRFS_I(inode)->disk_i_size = inode->i_size;
	ret = btrfs_update_inode(trans, root, inode);
/*
 * conditionally insert an inline extent into the file.  This
 * does the checks required to make sure the data is small enough
 * to fit as an inline extent.
 */
static noinline int cow_file_range_inline(struct btrfs_root *root,
					  struct inode *inode, u64 start,
					  u64 end, size_t compressed_size,
					  struct page **compressed_pages)
{
	struct btrfs_trans_handle *trans;
	u64 isize = i_size_read(inode);
	u64 actual_end = min(end + 1, isize);
	u64 inline_len = actual_end - start;
	u64 aligned_end = ALIGN(end, root->sectorsize);
	u64 data_len = inline_len;
	struct btrfs_path *path;
	int extent_inserted = 0;
	u32 extent_item_size;

		data_len = compressed_size;

	    actual_end > PAGE_CACHE_SIZE ||
	    data_len > BTRFS_MAX_INLINE_DATA_SIZE(root) ||
	    (actual_end & (root->sectorsize - 1)) == 0) ||
	    data_len > root->fs_info->max_inline) {

	path = btrfs_alloc_path();

	trans = btrfs_join_transaction(root);
		btrfs_free_path(path);
		return PTR_ERR(trans);
	trans->block_rsv = &root->fs_info->delalloc_block_rsv;

	if (compressed_size && compressed_pages)
		extent_item_size = btrfs_file_extent_calc_inline_size(
		extent_item_size = btrfs_file_extent_calc_inline_size(

	ret = __btrfs_drop_extents(trans, root, inode, path,
				   start, aligned_end, NULL,
				   1, 1, extent_item_size, &extent_inserted);
		btrfs_abort_transaction(trans, root, ret);

	if (isize > actual_end)
		inline_len = min_t(u64, isize, actual_end);
	ret = insert_inline_extent(trans, path, extent_inserted,
				   inline_len, compressed_size,
				   compress_type, compressed_pages);
	if (ret && ret != -ENOSPC) {
		btrfs_abort_transaction(trans, root, ret);
	} else if (ret == -ENOSPC) {

	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
	btrfs_delalloc_release_metadata(inode, end + 1 - start);
	btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
	btrfs_free_path(path);
	btrfs_end_transaction(trans, root);
struct async_extent {
	unsigned long nr_pages;
	struct list_head list;
};

struct async_cow {
	struct btrfs_root *root;
	struct page *locked_page;
	struct list_head extents;
	struct btrfs_work work;
};

static noinline int add_async_extent(struct async_cow *cow,
				     u64 start, u64 ram_size,
				     unsigned long nr_pages,
	struct async_extent *async_extent;

	async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
	BUG_ON(!async_extent); /* -ENOMEM */
	async_extent->start = start;
	async_extent->ram_size = ram_size;
	async_extent->compressed_size = compressed_size;
	async_extent->pages = pages;
	async_extent->nr_pages = nr_pages;
	async_extent->compress_type = compress_type;
	list_add_tail(&async_extent->list, &cow->extents);
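
/*
 * Illustrative note (not in the original): an async_extent describes one
 * chunk queued on cow->extents.  compress_file_range() produces them and
 * submit_compressed_extents() consumes them in queue order.  A compressed
 * chunk carries its pages; an uncompressed fallback is queued with
 * pages == NULL and compress_type == BTRFS_COMPRESS_NONE.
 */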
static inline int inode_need_compress(struct inode *inode)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;

	/* force compress */
	if (btrfs_test_opt(root, FORCE_COMPRESS))
		return 1;
	/* bad compression ratios */
	if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
		return 0;
	if (btrfs_test_opt(root, COMPRESS) ||
	    BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
	    BTRFS_I(inode)->force_compress)
		return 1;
	return 0;
}
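
/*
 * Illustrative note (not in the original): the checks above mean
 * "compress-force" always wins, an inode flagged NOCOMPRESS (set after bad
 * compression ratios) never compresses, and otherwise compression is used
 * when the mount option, the per-inode COMPRESS flag, or a defrag-requested
 * force_compress asks for it.
 */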
/*
 * we create compressed extents in two phases.  The first
 * phase compresses a range of pages that have already been
 * locked (both pages and state bits are locked).
 *
 * This is done inside an ordered work queue, and the compression
 * is spread across many cpus.  The actual IO submission is step
 * two, and the ordered work queue takes care of making sure that
 * happens in the same order things were put onto the queue by
 * writepages and friends.
 *
 * If this code finds it can't get good compression, it puts an
 * entry onto the work queue to write the uncompressed bytes.  This
 * makes sure that both compressed inodes and uncompressed inodes
 * are written in the same order that the flusher thread sent them
 */
static noinline void compress_file_range(struct inode *inode,
					 struct page *locked_page,
					 struct async_cow *async_cow,
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	u64 blocksize = root->sectorsize;
	u64 isize = i_size_read(inode);
	struct page **pages = NULL;
	unsigned long nr_pages;
	unsigned long nr_pages_ret = 0;
	unsigned long total_compressed = 0;
	unsigned long total_in = 0;
	unsigned long max_compressed = 128 * 1024;
	unsigned long max_uncompressed = 128 * 1024;
	int compress_type = root->fs_info->compress_type;

	/* if this is a small write inside eof, kick off a defrag */
	if ((end - start + 1) < 16 * 1024 &&
	    (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
		btrfs_add_inode_defrag(NULL, inode);

	actual_end = min_t(u64, isize, end + 1);

	nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
	nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);

	/*
	 * we don't want to send crud past the end of i_size through
	 * compression, that's just a waste of CPU time.  So, if the
	 * end of the file is before the start of our current
	 * requested range of bytes, we bail out to the uncompressed
	 * cleanup code that can deal with all of this.
	 *
	 * It isn't really the fastest way to fix things, but this is a
	 * very uncommon corner.
	 */
	if (actual_end <= start)
		goto cleanup_and_bail_uncompressed;

	total_compressed = actual_end - start;

	/*
	 * skip compression for a small file range (<= blocksize) that
	 * isn't an inline extent, since it doesn't save disk space at all.
	 */
	if (total_compressed <= blocksize &&
	    (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
		goto cleanup_and_bail_uncompressed;

	/*
	 * we want to make sure that amount of ram required to uncompress
	 * an extent is reasonable, so we limit the total size in ram
	 * of a compressed extent to 128k.  This is a crucial number
	 * because it also controls how easily we can spread reads across
	 * cpus for decompression.
	 *
	 * We also want to make sure the amount of IO required to do
	 * a random read is reasonably small, so we limit the size of
	 * a compressed extent to 128k.
	 */
	total_compressed = min(total_compressed, max_uncompressed);
	num_bytes = ALIGN(end - start + 1, blocksize);
	num_bytes = max(blocksize, num_bytes);
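
	/*
	 * Worked example (not in the original, assuming 4k blocks): a 1MB
	 * dirty range is handled as eight 128k compression chunks.
	 * total_compressed is clamped to 128k here, num_bytes is the
	 * block-aligned logical length of the current chunk, and the
	 * "start + num_bytes < end" restart near the bottom of this
	 * function queues the next chunk.
	 */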
	/*
	 * we do compression for mount -o compress and when the
	 * inode has not been flagged as nocompress.  This flag can
	 * change at any time if we discover bad compression ratios.
	 */
	if (inode_need_compress(inode)) {
		pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
			/* just bail out to the uncompressed code */

		if (BTRFS_I(inode)->force_compress)
			compress_type = BTRFS_I(inode)->force_compress;

		/*
		 * we need to call clear_page_dirty_for_io on each
		 * page in the range.  Otherwise applications with the file
		 * mmap'd can wander in and change the page contents while
		 * we are compressing them.
		 *
		 * If the compression fails for any reason, we set the pages
		 * dirty again later on.
		 */
		extent_range_clear_dirty_for_io(inode, start, end);
		ret = btrfs_compress_pages(compress_type,
					   inode->i_mapping, start,
					   total_compressed, pages,
					   nr_pages, &nr_pages_ret,
			unsigned long offset = total_compressed &
				(PAGE_CACHE_SIZE - 1);
			struct page *page = pages[nr_pages_ret - 1];

			/* zero the tail end of the last page, we might be
			 * sending it down to disk
			 */
				kaddr = kmap_atomic(page);
				memset(kaddr + offset, 0,
				       PAGE_CACHE_SIZE - offset);
				kunmap_atomic(kaddr);

		/* lets try to make an inline extent */
		if (ret || total_in < (actual_end - start)) {
			/* we didn't compress the entire range, try
			 * to make an uncompressed inline extent.
			 */
			ret = cow_file_range_inline(root, inode, start, end,
			/* try making a compressed inline extent */
			ret = cow_file_range_inline(root, inode, start, end,
						    compress_type, pages);
			unsigned long clear_flags = EXTENT_DELALLOC |
			unsigned long page_error_op;

			clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0;
			page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;

			/*
			 * inline extent creation worked or returned error,
			 * we don't need to create any more async work items.
			 * Unlock and free up our temp pages.
			 */
			extent_clear_unlock_delalloc(inode, start, end, NULL,
						     clear_flags, PAGE_UNLOCK |

		/*
		 * we aren't doing an inline extent, so round the compressed
		 * size up to a block size boundary so the allocator does sane
		 */
		total_compressed = ALIGN(total_compressed, blocksize);

		/*
		 * one last check to make sure the compression is really a
		 * win, compare the page count read with the blocks on disk
		 */
		total_in = ALIGN(total_in, PAGE_CACHE_SIZE);
		if (total_compressed >= total_in) {
			num_bytes = total_in;

	if (!will_compress && pages) {
		/*
		 * the compression code ran but failed to make things smaller,
		 * free any pages it allocated and our page pointer array
		 */
		for (i = 0; i < nr_pages_ret; i++) {
			WARN_ON(pages[i]->mapping);
			page_cache_release(pages[i]);
		total_compressed = 0;

		/* flag the file so we don't compress in the future */
		if (!btrfs_test_opt(root, FORCE_COMPRESS) &&
		    !(BTRFS_I(inode)->force_compress)) {
			BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;

		/* the async work queues will take care of doing actual
		 * allocation on disk for these compressed pages,
		 * and will submit them to the elevator.
		 */
		add_async_extent(async_cow, start, num_bytes,
				 total_compressed, pages, nr_pages_ret,
		if (start + num_bytes < end) {

cleanup_and_bail_uncompressed:
	/*
	 * No compression, but we still need to write the pages in
	 * the file we've been given so far.  Redirty the locked
	 * page if it corresponds to our extent and set things up
	 * for the async work queue to run cow_file_range to do
	 * the normal delalloc dance
	 */
	if (page_offset(locked_page) >= start &&
	    page_offset(locked_page) <= end) {
		__set_page_dirty_nobuffers(locked_page);
		/* unlocked later on in the async handlers */
	extent_range_redirty_for_io(inode, start, end);
	add_async_extent(async_cow, start, end - start + 1,
			 0, NULL, 0, BTRFS_COMPRESS_NONE);

	for (i = 0; i < nr_pages_ret; i++) {
		WARN_ON(pages[i]->mapping);
		page_cache_release(pages[i]);
static void free_async_extent_pages(struct async_extent *async_extent)
{
	if (!async_extent->pages)
		return;

	for (i = 0; i < async_extent->nr_pages; i++) {
		WARN_ON(async_extent->pages[i]->mapping);
		page_cache_release(async_extent->pages[i]);
	}
	kfree(async_extent->pages);
	async_extent->nr_pages = 0;
	async_extent->pages = NULL;
}
/*
 * phase two of compressed writeback.  This is the ordered portion
 * of the code, which only gets called in the order the work was
 * queued.  We walk all the async extents created by compress_file_range
 * and send them down to the disk.
 */
static noinline void submit_compressed_extents(struct inode *inode,
					       struct async_cow *async_cow)
{
	struct async_extent *async_extent;
	struct btrfs_key ins;
	struct extent_map *em;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_io_tree *io_tree;

	while (!list_empty(&async_cow->extents)) {
		async_extent = list_entry(async_cow->extents.next,
					  struct async_extent, list);
		list_del(&async_extent->list);

		io_tree = &BTRFS_I(inode)->io_tree;

		/* did the compression code fall back to uncompressed IO? */
		if (!async_extent->pages) {
			int page_started = 0;
			unsigned long nr_written = 0;

			lock_extent(io_tree, async_extent->start,
				    async_extent->start +
				    async_extent->ram_size - 1);

			/* allocate blocks */
			ret = cow_file_range(inode, async_cow->locked_page,
					     async_extent->start +
					     async_extent->ram_size - 1,
					     &page_started, &nr_written, 0);

			/*
			 * if page_started, cow_file_range inserted an
			 * inline extent and took care of all the unlocking
			 * and IO for us.  Otherwise, we need to submit
			 * all those pages down to the drive.
			 */
			if (!page_started && !ret)
				extent_write_locked_range(io_tree,
						  inode, async_extent->start,
						  async_extent->start +
						  async_extent->ram_size - 1,
				unlock_page(async_cow->locked_page);

		lock_extent(io_tree, async_extent->start,
			    async_extent->start + async_extent->ram_size - 1);

		ret = btrfs_reserve_extent(root,
					   async_extent->compressed_size,
					   async_extent->compressed_size,
					   0, alloc_hint, &ins, 1, 1);
			free_async_extent_pages(async_extent);

			if (ret == -ENOSPC) {
				unlock_extent(io_tree, async_extent->start,
					      async_extent->start +
					      async_extent->ram_size - 1);

				/*
				 * we need to redirty the pages if we decide to
				 * fall back to uncompressed IO, otherwise we
				 * will not submit these pages down to lower
				 */
				extent_range_redirty_for_io(inode,
						async_extent->start +
						async_extent->ram_size - 1);
		/*
		 * here we're doing allocation and writeback of the
		 */
		btrfs_drop_extent_cache(inode, async_extent->start,
					async_extent->start +
					async_extent->ram_size - 1, 0);

		em = alloc_extent_map();
			goto out_free_reserve;
		em->start = async_extent->start;
		em->len = async_extent->ram_size;
		em->orig_start = em->start;
		em->mod_start = em->start;
		em->mod_len = em->len;

		em->block_start = ins.objectid;
		em->block_len = ins.offset;
		em->orig_block_len = ins.offset;
		em->ram_bytes = async_extent->ram_size;
		em->bdev = root->fs_info->fs_devices->latest_bdev;
		em->compress_type = async_extent->compress_type;
		set_bit(EXTENT_FLAG_PINNED, &em->flags);
		set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);

			write_lock(&em_tree->lock);
			ret = add_extent_mapping(em_tree, em, 1);
			write_unlock(&em_tree->lock);
			if (ret != -EEXIST) {
			btrfs_drop_extent_cache(inode, async_extent->start,
						async_extent->start +
						async_extent->ram_size - 1, 0);
			goto out_free_reserve;

		ret = btrfs_add_ordered_extent_compress(inode,
						async_extent->ram_size,
						BTRFS_ORDERED_COMPRESSED,
						async_extent->compress_type);
			btrfs_drop_extent_cache(inode, async_extent->start,
						async_extent->start +
						async_extent->ram_size - 1, 0);
			goto out_free_reserve;

		/*
		 * clear dirty, set writeback and unlock the pages.
		 */
		extent_clear_unlock_delalloc(inode, async_extent->start,
				async_extent->start +
				async_extent->ram_size - 1,
				NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
				PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
		ret = btrfs_submit_compressed_write(inode,
				    async_extent->ram_size,
				    ins.offset, async_extent->pages,
				    async_extent->nr_pages);
			struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
			struct page *p = async_extent->pages[0];
			const u64 start = async_extent->start;
			const u64 end = start + async_extent->ram_size - 1;

			p->mapping = inode->i_mapping;
			tree->ops->writepage_end_io_hook(p, start, end,

			extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
			free_async_extent_pages(async_extent);
		alloc_hint = ins.objectid + ins.offset;
	btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
	extent_clear_unlock_delalloc(inode, async_extent->start,
				     async_extent->start +
				     async_extent->ram_size - 1,
				     NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
				     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
				     PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
				     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
	free_async_extent_pages(async_extent);
static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
{
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_map *em;

	read_lock(&em_tree->lock);
	em = search_extent_mapping(em_tree, start, num_bytes);
		/*
		 * if block start isn't an actual block number then find the
		 * first block in this inode and use that as a hint.  If that
		 * block is also bogus then just don't worry about it.
		 */
		if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
			em = search_extent_mapping(em_tree, 0, 0);
			if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
				alloc_hint = em->block_start;
			alloc_hint = em->block_start;
	read_unlock(&em_tree->lock);
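
/*
 * Illustrative note (not in the original): the hint is only advisory.  If
 * the range already maps to a real block we hint at that block_start, if it
 * maps to a hole or inline extent (block_start >= EXTENT_MAP_LAST_BYTE) we
 * fall back to the first real block of the file, and with no usable mapping
 * the hint is left alone and the allocator picks freely.
 */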
/*
 * when extent_io.c finds a delayed allocation range in the file,
 * the callbacks end up in this code.  The basic idea is to
 * allocate extents on disk for the range, and create ordered data structs
 * in ram to track those extents.
 *
 * locked_page is the page that writepage had locked already.  We use
 * it to make sure we don't do extra locks or unlocks.
 *
 * *page_started is set to one if we unlock locked_page and do everything
 * required to start IO on it.  It may be clean and already done with
 */
static noinline int cow_file_range(struct inode *inode,
				   struct page *locked_page,
				   u64 start, u64 end, int *page_started,
				   unsigned long *nr_written,
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	unsigned long ram_size;
	u64 blocksize = root->sectorsize;
	struct btrfs_key ins;
	struct extent_map *em;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;

	if (btrfs_is_free_space_inode(inode)) {

	num_bytes = ALIGN(end - start + 1, blocksize);
	num_bytes = max(blocksize, num_bytes);
	disk_num_bytes = num_bytes;

	/* if this is a small write inside eof, kick off defrag */
	if (num_bytes < 64 * 1024 &&
	    (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
		btrfs_add_inode_defrag(NULL, inode);

		/* lets try to make an inline extent */
		ret = cow_file_range_inline(root, inode, start, end, 0, 0,
			extent_clear_unlock_delalloc(inode, start, end, NULL,
				     EXTENT_LOCKED | EXTENT_DELALLOC |
				     EXTENT_DEFRAG, PAGE_UNLOCK |
				     PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
			*nr_written = *nr_written +
			     (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
		} else if (ret < 0) {

	BUG_ON(disk_num_bytes >
	       btrfs_super_total_bytes(root->fs_info->super_copy));

	alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);

	while (disk_num_bytes > 0) {
		cur_alloc_size = disk_num_bytes;
		ret = btrfs_reserve_extent(root, cur_alloc_size,
					   root->sectorsize, 0, alloc_hint,

		em = alloc_extent_map();
		em->orig_start = em->start;
		ram_size = ins.offset;
		em->len = ins.offset;
		em->mod_start = em->start;
		em->mod_len = em->len;

		em->block_start = ins.objectid;
		em->block_len = ins.offset;
		em->orig_block_len = ins.offset;
		em->ram_bytes = ram_size;
		em->bdev = root->fs_info->fs_devices->latest_bdev;
		set_bit(EXTENT_FLAG_PINNED, &em->flags);

			write_lock(&em_tree->lock);
			ret = add_extent_mapping(em_tree, em, 1);
			write_unlock(&em_tree->lock);
			if (ret != -EEXIST) {
				free_extent_map(em);
			btrfs_drop_extent_cache(inode, start,
						start + ram_size - 1, 0);

		cur_alloc_size = ins.offset;
		ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
					       ram_size, cur_alloc_size, 0);
			goto out_drop_extent_cache;

		if (root->root_key.objectid ==
		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
			ret = btrfs_reloc_clone_csums(inode, start,
				goto out_drop_extent_cache;

		if (disk_num_bytes < cur_alloc_size)

		/* we're not doing compressed IO, don't unlock the first
		 * page (which the caller expects to stay locked), don't
		 * clear any dirty bits and don't set any writeback bits
		 *
		 * Do set the Private2 bit so we know this page was properly
		 * setup for writepage
		 */
		op = unlock ? PAGE_UNLOCK : 0;
		op |= PAGE_SET_PRIVATE2;

		extent_clear_unlock_delalloc(inode, start,
					     start + ram_size - 1, locked_page,
					     EXTENT_LOCKED | EXTENT_DELALLOC,
		disk_num_bytes -= cur_alloc_size;
		num_bytes -= cur_alloc_size;
		alloc_hint = ins.objectid + ins.offset;
		start += cur_alloc_size;

out_drop_extent_cache:
	btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
	btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
	extent_clear_unlock_delalloc(inode, start, end, locked_page,
				     EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
				     EXTENT_DELALLOC | EXTENT_DEFRAG,
				     PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
				     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK);
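
/*
 * Illustrative note (not in the original): cow_file_range carves the
 * delalloc range into one ordered extent per btrfs_reserve_extent() call:
 * reserve space, insert a pinned extent_map, create the ordered extent,
 * unlock/clean the pages for that slice, then advance start by
 * cur_alloc_size until disk_num_bytes reaches zero.
 */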
/*
 * work queue callback to start compression on a file and its pages
 */
static noinline void async_cow_start(struct btrfs_work *work)
{
	struct async_cow *async_cow;
	async_cow = container_of(work, struct async_cow, work);

	compress_file_range(async_cow->inode, async_cow->locked_page,
			    async_cow->start, async_cow->end, async_cow,
	if (num_added == 0) {
		btrfs_add_delayed_iput(async_cow->inode);
		async_cow->inode = NULL;
/*
 * work queue callback to submit previously compressed pages
 */
static noinline void async_cow_submit(struct btrfs_work *work)
{
	struct async_cow *async_cow;
	struct btrfs_root *root;
	unsigned long nr_pages;

	async_cow = container_of(work, struct async_cow, work);

	root = async_cow->root;
	nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>

	if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) <
	    waitqueue_active(&root->fs_info->async_submit_wait))
		wake_up(&root->fs_info->async_submit_wait);

	if (async_cow->inode)
		submit_compressed_extents(async_cow->inode, async_cow);

static noinline void async_cow_free(struct btrfs_work *work)
{
	struct async_cow *async_cow;
	async_cow = container_of(work, struct async_cow, work);
	if (async_cow->inode)
		btrfs_add_delayed_iput(async_cow->inode);

static int cow_file_range_async(struct inode *inode, struct page *locked_page,
				u64 start, u64 end, int *page_started,
				unsigned long *nr_written)
{
	struct async_cow *async_cow;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	unsigned long nr_pages;
	int limit = 10 * 1024 * 1024;
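
	/*
	 * Illustrative note (not in the original): writeback is throttled
	 * against async_delalloc_pages.  Each async_cow handed to a worker
	 * covers at most 512k of the range (see the min() below), and once
	 * the in-flight page count exceeds the limit above we wait for the
	 * compression workers to drain back below it before queueing more.
	 */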
	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
			 1, 0, NULL, GFP_NOFS);
	while (start < end) {
		async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
		BUG_ON(!async_cow); /* -ENOMEM */
		async_cow->inode = igrab(inode);
		async_cow->root = root;
		async_cow->locked_page = locked_page;
		async_cow->start = start;

		if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
		    !btrfs_test_opt(root, FORCE_COMPRESS))
			cur_end = min(end, start + 512 * 1024 - 1);

		async_cow->end = cur_end;
		INIT_LIST_HEAD(&async_cow->extents);

		btrfs_init_work(&async_cow->work,
				btrfs_delalloc_helper,
				async_cow_start, async_cow_submit,

		nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
		atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);

		btrfs_queue_work(root->fs_info->delalloc_workers,

		if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
			wait_event(root->fs_info->async_submit_wait,
			   (atomic_read(&root->fs_info->async_delalloc_pages) <

		while (atomic_read(&root->fs_info->async_submit_draining) &&
		      atomic_read(&root->fs_info->async_delalloc_pages)) {
			wait_event(root->fs_info->async_submit_wait,
			  (atomic_read(&root->fs_info->async_delalloc_pages) ==

		*nr_written += nr_pages;
		start = cur_end + 1;
static noinline int csum_exist_in_range(struct btrfs_root *root,
					u64 bytenr, u64 num_bytes)
{
	struct btrfs_ordered_sum *sums;

	ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
				       bytenr + num_bytes - 1, &list, 0);
	if (ret == 0 && list_empty(&list))

	while (!list_empty(&list)) {
		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
		list_del(&sums->list);
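
/*
 * Illustrative note (not in the original): run_delalloc_nocow uses this to
 * force COW when checksum items already cover part of the range, so the
 * csums for a given extent stay either fully valid or absent (see the
 * comment at the call site below).
 */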
/*
 * the nocow writeback callback.  This checks for snapshots or COW copies
 * of the extents that exist in the file, and COWs the file as required.
 *
 * If no cow copies or snapshots exist, we write directly to the existing
 */
static noinline int run_delalloc_nocow(struct inode *inode,
				       struct page *locked_page,
			      u64 start, u64 end, int *page_started, int force,
				       unsigned long *nr_written)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
	struct extent_buffer *leaf;
	struct btrfs_path *path;
	struct btrfs_file_extent_item *fi;
	struct btrfs_key found_key;
	u64 ino = btrfs_ino(inode);

	path = btrfs_alloc_path();
		extent_clear_unlock_delalloc(inode, start, end, locked_page,
					     EXTENT_LOCKED | EXTENT_DELALLOC |
					     EXTENT_DO_ACCOUNTING |
					     EXTENT_DEFRAG, PAGE_UNLOCK |
					     PAGE_SET_WRITEBACK |
					     PAGE_END_WRITEBACK);

	nolock = btrfs_is_free_space_inode(inode);

		trans = btrfs_join_transaction_nolock(root);
		trans = btrfs_join_transaction(root);

	if (IS_ERR(trans)) {
		extent_clear_unlock_delalloc(inode, start, end, locked_page,
					     EXTENT_LOCKED | EXTENT_DELALLOC |
					     EXTENT_DO_ACCOUNTING |
					     EXTENT_DEFRAG, PAGE_UNLOCK |
					     PAGE_SET_WRITEBACK |
					     PAGE_END_WRITEBACK);
		btrfs_free_path(path);
		return PTR_ERR(trans);

	trans->block_rsv = &root->fs_info->delalloc_block_rsv;

	cow_start = (u64)-1;
		ret = btrfs_lookup_file_extent(trans, root, path, ino,
		if (ret > 0 && path->slots[0] > 0 && check_prev) {
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &found_key,
					      path->slots[0] - 1);
			if (found_key.objectid == ino &&
			    found_key.type == BTRFS_EXTENT_DATA_KEY)

		leaf = path->nodes[0];
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			leaf = path->nodes[0];

		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

		if (found_key.objectid > ino ||
		    found_key.type > BTRFS_EXTENT_DATA_KEY ||
		    found_key.offset > end)

		if (found_key.offset > cur_offset) {
			extent_end = found_key.offset;

		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_type = btrfs_file_extent_type(leaf, fi);

		ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
		if (extent_type == BTRFS_FILE_EXTENT_REG ||
		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
			extent_offset = btrfs_file_extent_offset(leaf, fi);
			extent_end = found_key.offset +
				btrfs_file_extent_num_bytes(leaf, fi);
				btrfs_file_extent_disk_num_bytes(leaf, fi);
			if (extent_end <= start) {
			if (disk_bytenr == 0)
			if (btrfs_file_extent_compression(leaf, fi) ||
			    btrfs_file_extent_encryption(leaf, fi) ||
			    btrfs_file_extent_other_encoding(leaf, fi))
			if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
			if (btrfs_extent_readonly(root, disk_bytenr))
			if (btrfs_cross_ref_exist(trans, root, ino,
						  extent_offset, disk_bytenr))
			disk_bytenr += extent_offset;
			disk_bytenr += cur_offset - found_key.offset;
			num_bytes = min(end + 1, extent_end) - cur_offset;
			/*
			 * if there are pending snapshots for this root,
			 * we fall back to the common COW path.
			 */
				err = btrfs_start_write_no_snapshoting(root);
			/*
			 * force cow if csum exists in the range.
			 * this ensures that csums for a given extent are
			 * either valid or do not exist.
			 */
			if (csum_exist_in_range(root, disk_bytenr, num_bytes))
		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
			extent_end = found_key.offset +
				btrfs_file_extent_inline_len(leaf,
						     path->slots[0], fi);
			extent_end = ALIGN(extent_end, root->sectorsize);

		if (extent_end <= start) {
			if (!nolock && nocow)
				btrfs_end_write_no_snapshoting(root);
			if (cow_start == (u64)-1)
				cow_start = cur_offset;
			cur_offset = extent_end;
			if (cur_offset > end)

		btrfs_release_path(path);
		if (cow_start != (u64)-1) {
			ret = cow_file_range(inode, locked_page,
					     cow_start, found_key.offset - 1,
					     page_started, nr_written, 1);
				if (!nolock && nocow)
					btrfs_end_write_no_snapshoting(root);
			cow_start = (u64)-1;

		if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			struct extent_map *em;
			struct extent_map_tree *em_tree;
			em_tree = &BTRFS_I(inode)->extent_tree;
			em = alloc_extent_map();
			BUG_ON(!em); /* -ENOMEM */
			em->start = cur_offset;
			em->orig_start = found_key.offset - extent_offset;
			em->len = num_bytes;
			em->block_len = num_bytes;
			em->block_start = disk_bytenr;
			em->orig_block_len = disk_num_bytes;
			em->ram_bytes = ram_bytes;
			em->bdev = root->fs_info->fs_devices->latest_bdev;
			em->mod_start = em->start;
			em->mod_len = em->len;
			set_bit(EXTENT_FLAG_PINNED, &em->flags);
			set_bit(EXTENT_FLAG_FILLING, &em->flags);
			em->generation = -1;
				write_lock(&em_tree->lock);
				ret = add_extent_mapping(em_tree, em, 1);
				write_unlock(&em_tree->lock);
				if (ret != -EEXIST) {
					free_extent_map(em);
				btrfs_drop_extent_cache(inode, em->start,
						em->start + em->len - 1, 0);
			type = BTRFS_ORDERED_PREALLOC;
			type = BTRFS_ORDERED_NOCOW;

		ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
					       num_bytes, num_bytes, type);
		BUG_ON(ret); /* -ENOMEM */

		if (root->root_key.objectid ==
		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
			ret = btrfs_reloc_clone_csums(inode, cur_offset,
				if (!nolock && nocow)
					btrfs_end_write_no_snapshoting(root);

		extent_clear_unlock_delalloc(inode, cur_offset,
					     cur_offset + num_bytes - 1,
					     locked_page, EXTENT_LOCKED |
					     EXTENT_DELALLOC, PAGE_UNLOCK |
		if (!nolock && nocow)
			btrfs_end_write_no_snapshoting(root);
		cur_offset = extent_end;
		if (cur_offset > end)
	btrfs_release_path(path);

	if (cur_offset <= end && cow_start == (u64)-1) {
		cow_start = cur_offset;

	if (cow_start != (u64)-1) {
		ret = cow_file_range(inode, locked_page, cow_start, end,
				     page_started, nr_written, 1);

		err = btrfs_end_transaction(trans, root);

	if (ret && cur_offset < end)
		extent_clear_unlock_delalloc(inode, cur_offset, end,
					     locked_page, EXTENT_LOCKED |
					     EXTENT_DELALLOC | EXTENT_DEFRAG |
					     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
					     PAGE_SET_WRITEBACK |
					     PAGE_END_WRITEBACK);
	btrfs_free_path(path);
static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
{
	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
	    !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC))

	/*
	 * @defrag_bytes is a hint value, no spinlock held here,
	 * if it is not zero, it means the file is defragging.
	 * Force cow if the given extent needs to be defragged.
	 */
	if (BTRFS_I(inode)->defrag_bytes &&
	    test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
			   EXTENT_DEFRAG, 0, NULL))
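
/*
 * Illustrative note (not in the original): even on a NODATACOW/PREALLOC
 * inode we still COW the ranges defrag has marked with EXTENT_DEFRAG.
 * defrag_bytes is only a cheap hint read without the spinlock; the
 * authoritative check is the test_range_bit() above.
 */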
/*
 * extent_io.c callback to do delayed allocation processing
 */
static int run_delalloc_range(struct inode *inode, struct page *locked_page,
			      u64 start, u64 end, int *page_started,
			      unsigned long *nr_written)
{
	int force_cow = need_force_cow(inode, start, end);

	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) {
		ret = run_delalloc_nocow(inode, locked_page, start, end,
					 page_started, 1, nr_written);
	} else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
		ret = run_delalloc_nocow(inode, locked_page, start, end,
					 page_started, 0, nr_written);
	} else if (!inode_need_compress(inode)) {
		ret = cow_file_range(inode, locked_page, start, end,
				     page_started, nr_written, 1);
		set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
			&BTRFS_I(inode)->runtime_flags);
		ret = cow_file_range_async(inode, locked_page, start, end,
					   page_started, nr_written);
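
/*
 * Illustrative note (not in the original): decision order above is
 * NODATACOW inodes try the nocow path with force=1, PREALLOC-flagged inodes
 * try it with force=0 (only prealloc extents are written in place), inodes
 * that won't compress go straight to cow_file_range, and everything else
 * goes through the async compression machinery.
 */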
static void btrfs_split_extent_hook(struct inode *inode,
				    struct extent_state *orig, u64 split)
{
	/* not delalloc, ignore it */
	if (!(orig->state & EXTENT_DELALLOC))

	size = orig->end - orig->start + 1;
	if (size > BTRFS_MAX_EXTENT_SIZE) {

		/*
		 * See the explanation in btrfs_merge_extent_hook, the same
		 * applies here, just in reverse.
		 */
		new_size = orig->end - split + 1;
		num_extents = div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
					BTRFS_MAX_EXTENT_SIZE);
		new_size = split - orig->start;
		num_extents += div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
					 BTRFS_MAX_EXTENT_SIZE);
		if (div64_u64(size + BTRFS_MAX_EXTENT_SIZE - 1,
			      BTRFS_MAX_EXTENT_SIZE) >= num_extents)

	spin_lock(&BTRFS_I(inode)->lock);
	BTRFS_I(inode)->outstanding_extents++;
	spin_unlock(&BTRFS_I(inode)->lock);
/*
 * extent_io.c merge_extent_hook, used to track merged delayed allocation
 * extents so we can keep track of new extents that are just merged onto old
 * extents, such as when we are doing sequential writes, so we can properly
 * account for the metadata space we'll need.
 */
static void btrfs_merge_extent_hook(struct inode *inode,
				    struct extent_state *new,
				    struct extent_state *other)
{
	u64 new_size, old_size;

	/* not delalloc, ignore it */
	if (!(other->state & EXTENT_DELALLOC))

	if (new->start > other->start)
		new_size = new->end - other->start + 1;
		new_size = other->end - new->start + 1;

	/* we're not bigger than the max, unreserve the space and go */
	if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
		spin_lock(&BTRFS_I(inode)->lock);
		BTRFS_I(inode)->outstanding_extents--;
		spin_unlock(&BTRFS_I(inode)->lock);

	/*
	 * We have to add up either side to figure out how many extents were
	 * accounted for before we merged into one big extent.  If the number
	 * of extents we accounted for is <= the amount we need for the new
	 * range then we can return, otherwise drop.  Think of it like this
	 *
	 * So we've grown the extent by a MAX_SIZE extent, this would mean we
	 * need 2 outstanding extents, on one side we have 1 and the other
	 * side we have 1 so they are == and we can return.  But in this case
	 *
	 * [MAX_SIZE+4k][MAX_SIZE+4k]
	 *
	 * Each range on their own accounts for 2 extents, but merged together
	 * they are only 3 extents worth of accounting, so we need to drop in
	 */
	old_size = other->end - other->start + 1;
	num_extents = div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1,
				BTRFS_MAX_EXTENT_SIZE);
	old_size = new->end - new->start + 1;
	num_extents += div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1,
				 BTRFS_MAX_EXTENT_SIZE);

	if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
		      BTRFS_MAX_EXTENT_SIZE) >= num_extents)

	spin_lock(&BTRFS_I(inode)->lock);
	BTRFS_I(inode)->outstanding_extents--;
	spin_unlock(&BTRFS_I(inode)->lock);
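
/*
 * Worked example (not in the original, assuming BTRFS_MAX_EXTENT_SIZE is
 * 128M): merging two 128M+4k ranges means each side was accounted as 2
 * outstanding extents (4 total), but the merged 256M+8k range only needs 3,
 * so exactly one outstanding extent is dropped above.
 */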
static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
				      struct inode *inode)
{
	spin_lock(&root->delalloc_lock);
	if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
		list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
			      &root->delalloc_inodes);
		set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
			&BTRFS_I(inode)->runtime_flags);
		root->nr_delalloc_inodes++;
		if (root->nr_delalloc_inodes == 1) {
			spin_lock(&root->fs_info->delalloc_root_lock);
			BUG_ON(!list_empty(&root->delalloc_root));
			list_add_tail(&root->delalloc_root,
				      &root->fs_info->delalloc_roots);
			spin_unlock(&root->fs_info->delalloc_root_lock);
	spin_unlock(&root->delalloc_lock);

static void btrfs_del_delalloc_inode(struct btrfs_root *root,
				     struct inode *inode)
{
	spin_lock(&root->delalloc_lock);
	if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
		list_del_init(&BTRFS_I(inode)->delalloc_inodes);
		clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
			  &BTRFS_I(inode)->runtime_flags);
		root->nr_delalloc_inodes--;
		if (!root->nr_delalloc_inodes) {
			spin_lock(&root->fs_info->delalloc_root_lock);
			BUG_ON(list_empty(&root->delalloc_root));
			list_del_init(&root->delalloc_root);
			spin_unlock(&root->fs_info->delalloc_root_lock);
	spin_unlock(&root->delalloc_lock);
/*
 * extent_io.c set_bit_hook, used to track delayed allocation
 * bytes in this file, and to maintain the list of inodes that
 * have pending delalloc work to be done.
 */
static void btrfs_set_bit_hook(struct inode *inode,
			       struct extent_state *state, unsigned *bits)
{
	if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))

	/*
	 * set_bit and clear bit hooks normally require _irqsave/restore
	 * but in this case, we are only testing for the DELALLOC
	 * bit, which is only set or cleared with irqs on
	 */
	if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
		struct btrfs_root *root = BTRFS_I(inode)->root;
		u64 len = state->end + 1 - state->start;
		bool do_list = !btrfs_is_free_space_inode(inode);

		if (*bits & EXTENT_FIRST_DELALLOC) {
			*bits &= ~EXTENT_FIRST_DELALLOC;
			spin_lock(&BTRFS_I(inode)->lock);
			BTRFS_I(inode)->outstanding_extents++;
			spin_unlock(&BTRFS_I(inode)->lock);

		/* For sanity tests */
		if (btrfs_test_is_dummy_root(root))

		__percpu_counter_add(&root->fs_info->delalloc_bytes, len,
				     root->fs_info->delalloc_batch);
		spin_lock(&BTRFS_I(inode)->lock);
		BTRFS_I(inode)->delalloc_bytes += len;
		if (*bits & EXTENT_DEFRAG)
			BTRFS_I(inode)->defrag_bytes += len;
		if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
					 &BTRFS_I(inode)->runtime_flags))
			btrfs_add_delalloc_inodes(root, inode);
		spin_unlock(&BTRFS_I(inode)->lock);
/*
 * extent_io.c clear_bit_hook, see set_bit_hook for why
 */
static void btrfs_clear_bit_hook(struct inode *inode,
				 struct extent_state *state,
{
	u64 len = state->end + 1 - state->start;
	u64 num_extents = div64_u64(len + BTRFS_MAX_EXTENT_SIZE - 1,
				    BTRFS_MAX_EXTENT_SIZE);

	spin_lock(&BTRFS_I(inode)->lock);
	if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG))
		BTRFS_I(inode)->defrag_bytes -= len;
	spin_unlock(&BTRFS_I(inode)->lock);

	/*
	 * set_bit and clear bit hooks normally require _irqsave/restore
	 * but in this case, we are only testing for the DELALLOC
	 * bit, which is only set or cleared with irqs on
	 */
	if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
		struct btrfs_root *root = BTRFS_I(inode)->root;
		bool do_list = !btrfs_is_free_space_inode(inode);

		if (*bits & EXTENT_FIRST_DELALLOC) {
			*bits &= ~EXTENT_FIRST_DELALLOC;
		} else if (!(*bits & EXTENT_DO_ACCOUNTING)) {
			spin_lock(&BTRFS_I(inode)->lock);
			BTRFS_I(inode)->outstanding_extents -= num_extents;
			spin_unlock(&BTRFS_I(inode)->lock);

		/*
		 * We don't reserve metadata space for space cache inodes so
		 * we don't need to call delalloc_release_metadata if there is
		 */
		if (*bits & EXTENT_DO_ACCOUNTING &&
		    root != root->fs_info->tree_root)
			btrfs_delalloc_release_metadata(inode, len);

		/* For sanity tests. */
		if (btrfs_test_is_dummy_root(root))

		if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
		    && do_list && !(state->state & EXTENT_NORESERVE))
			btrfs_free_reserved_data_space(inode, len);

		__percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
				     root->fs_info->delalloc_batch);
		spin_lock(&BTRFS_I(inode)->lock);
		BTRFS_I(inode)->delalloc_bytes -= len;
		if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
		    test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
			     &BTRFS_I(inode)->runtime_flags))
			btrfs_del_delalloc_inode(root, inode);
		spin_unlock(&BTRFS_I(inode)->lock);
/*
 * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
 * we don't create bios that span stripes or chunks
 */
int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
			 size_t size, struct bio *bio,
			 unsigned long bio_flags)
{
	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
	u64 logical = (u64)bio->bi_iter.bi_sector << 9;

	if (bio_flags & EXTENT_BIO_COMPRESSED)

	length = bio->bi_iter.bi_size;
	map_length = length;
	ret = btrfs_map_block(root->fs_info, rw, logical,
			      &map_length, NULL, 0);
	/* Will always return 0 with map_multi == NULL */
	if (map_length < length + size)
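
/*
 * Illustrative note (not in the original): btrfs_map_block() reports how
 * many bytes are contiguous at 'logical' for this chunk/RAID layout; if
 * adding 'size' more bytes would cross that boundary (map_length <
 * length + size) the caller has to start a new bio.  Compressed bios are
 * not checked here because the compressed submit path handles its own
 * splitting.
 */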
/*
 * in order to insert checksums into the metadata in large chunks,
 * we wait until bio submission time.  All the pages in the bio are
 * checksummed and sums are attached onto the ordered extent record.
 *
 * At IO completion time the csums attached on the ordered extent record
 * are inserted into the btree
 */
static int __btrfs_submit_bio_start(struct inode *inode, int rw,
				    struct bio *bio, int mirror_num,
				    unsigned long bio_flags,
{
	struct btrfs_root *root = BTRFS_I(inode)->root;

	ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
	BUG_ON(ret); /* -ENOMEM */

/*
 * in order to insert checksums into the metadata in large chunks,
 * we wait until bio submission time.  All the pages in the bio are
 * checksummed and sums are attached onto the ordered extent record.
 *
 * At IO completion time the csums attached on the ordered extent record
 * are inserted into the btree
 */
static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
				   int mirror_num, unsigned long bio_flags,
{
	struct btrfs_root *root = BTRFS_I(inode)->root;

	ret = btrfs_map_bio(root, rw, bio, mirror_num, 1);
		bio->bi_error = ret;
/*
 * extent_io.c submission hook.  This does the right thing for csum
 * calculation on write, or reading the csums from the tree before a read
 */
static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
				 int mirror_num, unsigned long bio_flags,
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int async = !atomic_read(&BTRFS_I(inode)->sync_writers);

	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;

	if (btrfs_is_free_space_inode(inode))

	if (!(rw & REQ_WRITE)) {
		ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);

		if (bio_flags & EXTENT_BIO_COMPRESSED) {
			ret = btrfs_submit_compressed_read(inode, bio,
		} else if (!skip_sum) {
			ret = btrfs_lookup_bio_sums(root, inode, bio, NULL);
	} else if (async && !skip_sum) {
		/* csum items have already been cloned */
		if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
		/* we're doing a write, do the async checksumming */
		ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
					  inode, rw, bio, mirror_num,
					  bio_flags, bio_offset,
					  __btrfs_submit_bio_start,
					  __btrfs_submit_bio_done);
	} else if (!skip_sum) {
		ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);

	ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
		bio->bi_error = ret;
/*
 * given a list of ordered sums, record them in the inode.  This happens
 * at IO completion time based on sums calculated at bio submission time.
 */
static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
				      struct inode *inode, u64 file_offset,
				      struct list_head *list)
{
	struct btrfs_ordered_sum *sum;

	list_for_each_entry(sum, list, list) {
		trans->adding_csums = 1;
		btrfs_csum_file_blocks(trans,
		       BTRFS_I(inode)->root->fs_info->csum_root, sum);
		trans->adding_csums = 0;

int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
			      struct extent_state **cached_state)
{
	WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0);
	return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
				   cached_state, GFP_NOFS);
}
/* see btrfs_writepage_start_hook for details on why this is required */
struct btrfs_writepage_fixup {
	struct btrfs_work work;

static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
{
	struct btrfs_writepage_fixup *fixup;
	struct btrfs_ordered_extent *ordered;
	struct extent_state *cached_state = NULL;
	struct inode *inode;

	fixup = container_of(work, struct btrfs_writepage_fixup, work);
	if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
		ClearPageChecked(page);

	inode = page->mapping->host;
	page_start = page_offset(page);
	page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;

	lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0,

	/* already ordered? We're done */
	if (PagePrivate2(page))

	ordered = btrfs_lookup_ordered_extent(inode, page_start);
		unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
				     page_end, &cached_state, GFP_NOFS);
		btrfs_start_ordered_extent(inode, ordered, 1);
		btrfs_put_ordered_extent(ordered);

	ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
		mapping_set_error(page->mapping, ret);
		end_extent_writepage(page, ret, page_start, page_end);
		ClearPageChecked(page);

	btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
	ClearPageChecked(page);
	set_page_dirty(page);
	unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
			     &cached_state, GFP_NOFS);
	page_cache_release(page);
/*
 * There are a few paths in the higher layers of the kernel that directly
 * set the page dirty bit without asking the filesystem if it is a
 * good idea.  This causes problems because we want to make sure COW
 * properly happens and the data=ordered rules are followed.
 *
 * In our case any range that doesn't have the ORDERED bit set
 * hasn't been properly setup for IO.  We kick off an async process
 * to fix it up.  The async helper will wait for ordered extents, set
 * the delalloc bit and make it safe to write the page.
 */
static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
{
	struct inode *inode = page->mapping->host;
	struct btrfs_writepage_fixup *fixup;
	struct btrfs_root *root = BTRFS_I(inode)->root;

	/* this page is properly in the ordered list */
	if (TestClearPagePrivate2(page))

	if (PageChecked(page))

	fixup = kzalloc(sizeof(*fixup), GFP_NOFS);

	SetPageChecked(page);
	page_cache_get(page);
	btrfs_init_work(&fixup->work, btrfs_fixup_helper,
			btrfs_writepage_fixup_worker, NULL, NULL);
	btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work);
static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
				       struct inode *inode, u64 file_pos,
				       u64 disk_bytenr, u64 disk_num_bytes,
				       u64 num_bytes, u64 ram_bytes,
				       u8 compression, u8 encryption,
				       u16 other_encoding, int extent_type)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_file_extent_item *fi;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key ins;
	int extent_inserted = 0;

	path = btrfs_alloc_path();

	/*
	 * we may be replacing one extent in the tree with another.
	 * The new extent is pinned in the extent map, and we don't want
	 * to drop it from the cache until it is completely in the btree.
	 *
	 * So, tell btrfs_drop_extents to leave this extent in the cache.
	 * The caller is expected to unpin it and allow it to be merged
	 */
	ret = __btrfs_drop_extents(trans, root, inode, path, file_pos,
				   file_pos + num_bytes, NULL, 0,
				   1, sizeof(*fi), &extent_inserted);

	if (!extent_inserted) {
		ins.objectid = btrfs_ino(inode);
		ins.offset = file_pos;
		ins.type = BTRFS_EXTENT_DATA_KEY;

		path->leave_spinning = 1;
		ret = btrfs_insert_empty_item(trans, root, path, &ins,
	leaf = path->nodes[0];
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_generation(leaf, fi, trans->transid);
	btrfs_set_file_extent_type(leaf, fi, extent_type);
	btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
	btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
	btrfs_set_file_extent_offset(leaf, fi, 0);
	btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
	btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
	btrfs_set_file_extent_compression(leaf, fi, compression);
	btrfs_set_file_extent_encryption(leaf, fi, encryption);
	btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);

	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(path);

	inode_add_bytes(inode, num_bytes);

	ins.objectid = disk_bytenr;
	ins.offset = disk_num_bytes;
	ins.type = BTRFS_EXTENT_ITEM_KEY;
	ret = btrfs_alloc_reserved_file_extent(trans, root,
					       root->root_key.objectid,
					       btrfs_ino(inode), file_pos, &ins);
	btrfs_free_path(path);
/* snapshot-aware defrag */
struct sa_defrag_extent_backref {
	struct rb_node node;
	struct old_sa_defrag_extent *old;

struct old_sa_defrag_extent {
	struct list_head list;
	struct new_sa_defrag_extent *new;

struct new_sa_defrag_extent {
	struct rb_root root;
	struct list_head head;
	struct btrfs_path *path;
	struct inode *inode;
static int backref_comp(struct sa_defrag_extent_backref *b1,
			struct sa_defrag_extent_backref *b2)
{
	if (b1->root_id < b2->root_id)
	else if (b1->root_id > b2->root_id)

	if (b1->inum < b2->inum)
	else if (b1->inum > b2->inum)

	if (b1->file_pos < b2->file_pos)
	else if (b1->file_pos > b2->file_pos)

/*
 * [------------------------------] ===> (a range of space)
 * |<--->|   |<---->|  =============> (fs/file tree A)
 * |<---------------------------->| ===> (fs/file tree B)
 *
 * A range of space can refer to two file extents in one tree while
 * referring to only one file extent in another tree.
 *
 * So we may process a disk offset more than once (two extents in A)
 * and land in the same extent (one extent in B), and then insert two
 * identical backrefs (both referring to the extent in B).
 */
static void backref_insert(struct rb_root *root,
			   struct sa_defrag_extent_backref *backref)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct sa_defrag_extent_backref *entry;

		entry = rb_entry(parent, struct sa_defrag_extent_backref, node);

		ret = backref_comp(backref, entry);
			p = &(*p)->rb_right;

	rb_link_node(&backref->node, parent, p);
	rb_insert_color(&backref->node, root);
2217 * Note the backref might has changed, and in this case we just return 0.
2219 static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
2222 struct btrfs_file_extent_item *extent;
2223 struct btrfs_fs_info *fs_info;
2224 struct old_sa_defrag_extent *old = ctx;
2225 struct new_sa_defrag_extent *new = old->new;
2226 struct btrfs_path *path = new->path;
2227 struct btrfs_key key;
2228 struct btrfs_root *root;
2229 struct sa_defrag_extent_backref *backref;
2230 struct extent_buffer *leaf;
2231 struct inode *inode = new->inode;
2237 if (BTRFS_I(inode)->root->root_key.objectid == root_id &&
2238 inum == btrfs_ino(inode))
2241 key.objectid = root_id;
2242 key.type = BTRFS_ROOT_ITEM_KEY;
2243 key.offset = (u64)-1;
2245 fs_info = BTRFS_I(inode)->root->fs_info;
2246 root = btrfs_read_fs_root_no_name(fs_info, &key);
2248 if (PTR_ERR(root) == -ENOENT)
2251 pr_debug("inum=%llu, offset=%llu, root_id=%llu\n",
2252 inum, offset, root_id);
2253 return PTR_ERR(root);
2256 key.objectid = inum;
2257 key.type = BTRFS_EXTENT_DATA_KEY;
2258 if (offset > (u64)-1 << 32)
2261 key.offset = offset;
2263 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2264 if (WARN_ON(ret < 0))
2271 leaf = path->nodes[0];
2272 slot = path->slots[0];
2274 if (slot >= btrfs_header_nritems(leaf)) {
2275 ret = btrfs_next_leaf(root, path);
2278 } else if (ret > 0) {
2287 btrfs_item_key_to_cpu(leaf, &key, slot);
2289 if (key.objectid > inum)
2292 if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY)
2295 extent = btrfs_item_ptr(leaf, slot,
2296 struct btrfs_file_extent_item);
2298 if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr)
2302 * 'offset' refers to the exact key.offset,
2303 * NOT the 'offset' field in btrfs_extent_data_ref, i.e.
2304 * (key.offset - extent_offset).
2306 if (key.offset != offset)
2309 extent_offset = btrfs_file_extent_offset(leaf, extent);
2310 num_bytes = btrfs_file_extent_num_bytes(leaf, extent);
2312 if (extent_offset >= old->extent_offset + old->offset +
2313 old->len || extent_offset + num_bytes <=
2314 old->extent_offset + old->offset)
2319 backref = kmalloc(sizeof(*backref), GFP_NOFS);
2325 backref->root_id = root_id;
2326 backref->inum = inum;
2327 backref->file_pos = offset;
2328 backref->num_bytes = num_bytes;
2329 backref->extent_offset = extent_offset;
2330 backref->generation = btrfs_file_extent_generation(leaf, extent);
2332 backref_insert(&new->root, backref);
2335 btrfs_release_path(path);
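/*
 * Collect the surviving backrefs of every old extent covered by the new
 * (defragged) extent via iterate_inodes_from_logical(); old extents with
 * no backrefs left are dropped from the list.  Returns false on error or
 * when there is nothing left to relink.
 */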
2340 static noinline bool record_extent_backrefs(struct btrfs_path *path,
2341 struct new_sa_defrag_extent *new)
2343 struct btrfs_fs_info *fs_info = BTRFS_I(new->inode)->root->fs_info;
2344 struct old_sa_defrag_extent *old, *tmp;
2349 list_for_each_entry_safe(old, tmp, &new->head, list) {
2350 ret = iterate_inodes_from_logical(old->bytenr +
2351 old->extent_offset, fs_info,
2352 path, record_one_backref,
2354 if (ret < 0 && ret != -ENOENT)
2357 /* no backref to be processed for this extent */
2359 list_del(&old->list);
2364 if (list_empty(&new->head))
2370 static int relink_is_mergable(struct extent_buffer *leaf,
2371 struct btrfs_file_extent_item *fi,
2372 struct new_sa_defrag_extent *new)
2374 if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr)
2377 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2380 if (btrfs_file_extent_compression(leaf, fi) != new->compress_type)
2383 if (btrfs_file_extent_encryption(leaf, fi) ||
2384 btrfs_file_extent_other_encoding(leaf, fi))
2391 * Note that the backref might have changed; in that case we just return 0.
2393 static noinline int relink_extent_backref(struct btrfs_path *path,
2394 struct sa_defrag_extent_backref *prev,
2395 struct sa_defrag_extent_backref *backref)
2397 struct btrfs_file_extent_item *extent;
2398 struct btrfs_file_extent_item *item;
2399 struct btrfs_ordered_extent *ordered;
2400 struct btrfs_trans_handle *trans;
2401 struct btrfs_fs_info *fs_info;
2402 struct btrfs_root *root;
2403 struct btrfs_key key;
2404 struct extent_buffer *leaf;
2405 struct old_sa_defrag_extent *old = backref->old;
2406 struct new_sa_defrag_extent *new = old->new;
2407 struct inode *src_inode = new->inode;
2408 struct inode *inode;
2409 struct extent_state *cached = NULL;
2418 if (prev && prev->root_id == backref->root_id &&
2419 prev->inum == backref->inum &&
2420 prev->file_pos + prev->num_bytes == backref->file_pos)
2423 /* step 1: get root */
2424 key.objectid = backref->root_id;
2425 key.type = BTRFS_ROOT_ITEM_KEY;
2426 key.offset = (u64)-1;
2428 fs_info = BTRFS_I(src_inode)->root->fs_info;
2429 index = srcu_read_lock(&fs_info->subvol_srcu);
2431 root = btrfs_read_fs_root_no_name(fs_info, &key);
2433 srcu_read_unlock(&fs_info->subvol_srcu, index);
2434 if (PTR_ERR(root) == -ENOENT)
2436 return PTR_ERR(root);
2439 if (btrfs_root_readonly(root)) {
2440 srcu_read_unlock(&fs_info->subvol_srcu, index);
2444 /* step 2: get inode */
2445 key.objectid = backref->inum;
2446 key.type = BTRFS_INODE_ITEM_KEY;
2449 inode = btrfs_iget(fs_info->sb, &key, root, NULL);
2450 if (IS_ERR(inode)) {
2451 srcu_read_unlock(&fs_info->subvol_srcu, index);
2455 srcu_read_unlock(&fs_info->subvol_srcu, index);
2457 /* step 3: relink backref */
2458 lock_start = backref->file_pos;
2459 lock_end = backref->file_pos + backref->num_bytes - 1;
2460 lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2463 ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
2465 btrfs_put_ordered_extent(ordered);
2469 trans = btrfs_join_transaction(root);
2470 if (IS_ERR(trans)) {
2471 ret = PTR_ERR(trans);
2475 key.objectid = backref->inum;
2476 key.type = BTRFS_EXTENT_DATA_KEY;
2477 key.offset = backref->file_pos;
2479 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2482 } else if (ret > 0) {
2487 extent = btrfs_item_ptr(path->nodes[0], path->slots[0],
2488 struct btrfs_file_extent_item);
2490 if (btrfs_file_extent_generation(path->nodes[0], extent) !=
2491 backref->generation)
2494 btrfs_release_path(path);
2496 start = backref->file_pos;
2497 if (backref->extent_offset < old->extent_offset + old->offset)
2498 start += old->extent_offset + old->offset -
2499 backref->extent_offset;
2501 len = min(backref->extent_offset + backref->num_bytes,
2502 old->extent_offset + old->offset + old->len);
2503 len -= max(backref->extent_offset, old->extent_offset + old->offset);
2505 ret = btrfs_drop_extents(trans, root, inode, start,
2510 key.objectid = btrfs_ino(inode);
2511 key.type = BTRFS_EXTENT_DATA_KEY;
2514 path->leave_spinning = 1;
2516 struct btrfs_file_extent_item *fi;
2518 struct btrfs_key found_key;
2520 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2525 leaf = path->nodes[0];
2526 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2528 fi = btrfs_item_ptr(leaf, path->slots[0],
2529 struct btrfs_file_extent_item);
2530 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
2532 if (extent_len + found_key.offset == start &&
2533 relink_is_mergable(leaf, fi, new)) {
2534 btrfs_set_file_extent_num_bytes(leaf, fi,
2536 btrfs_mark_buffer_dirty(leaf);
2537 inode_add_bytes(inode, len);
2543 btrfs_release_path(path);
2548 ret = btrfs_insert_empty_item(trans, root, path, &key,
2551 btrfs_abort_transaction(trans, root, ret);
2555 leaf = path->nodes[0];
2556 item = btrfs_item_ptr(leaf, path->slots[0],
2557 struct btrfs_file_extent_item);
2558 btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr);
2559 btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len);
2560 btrfs_set_file_extent_offset(leaf, item, start - new->file_pos);
2561 btrfs_set_file_extent_num_bytes(leaf, item, len);
2562 btrfs_set_file_extent_ram_bytes(leaf, item, new->len);
2563 btrfs_set_file_extent_generation(leaf, item, trans->transid);
2564 btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
2565 btrfs_set_file_extent_compression(leaf, item, new->compress_type);
2566 btrfs_set_file_extent_encryption(leaf, item, 0);
2567 btrfs_set_file_extent_other_encoding(leaf, item, 0);
2569 btrfs_mark_buffer_dirty(leaf);
2570 inode_add_bytes(inode, len);
2571 btrfs_release_path(path);
2573 ret = btrfs_inc_extent_ref(trans, root, new->bytenr,
2575 backref->root_id, backref->inum,
2576 new->file_pos, 0); /* start - extent_offset */
2578 btrfs_abort_transaction(trans, root, ret);
2584 btrfs_release_path(path);
2585 path->leave_spinning = 0;
2586 btrfs_end_transaction(trans, root);
2588 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2594 static void free_sa_defrag_extent(struct new_sa_defrag_extent *new)
2596 struct old_sa_defrag_extent *old, *tmp;
2601 list_for_each_entry_safe(old, tmp, &new->head, list) {
2602 list_del(&old->list);
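/*
 * Walk the recorded backrefs in sorted order and relink each of them to
 * the newly written extent, then drop the defrag_running count and wake
 * up anyone waiting on the transaction.
 */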
2608 static void relink_file_extents(struct new_sa_defrag_extent *new)
2610 struct btrfs_path *path;
2611 struct sa_defrag_extent_backref *backref;
2612 struct sa_defrag_extent_backref *prev = NULL;
2613 struct inode *inode;
2614 struct btrfs_root *root;
2615 struct rb_node *node;
2619 root = BTRFS_I(inode)->root;
2621 path = btrfs_alloc_path();
2625 if (!record_extent_backrefs(path, new)) {
2626 btrfs_free_path(path);
2629 btrfs_release_path(path);
2632 node = rb_first(&new->root);
2635 rb_erase(node, &new->root);
2637 backref = rb_entry(node, struct sa_defrag_extent_backref, node);
2639 ret = relink_extent_backref(path, prev, backref);
2652 btrfs_free_path(path);
2654 free_sa_defrag_extent(new);
2656 atomic_dec(&root->fs_info->defrag_running);
2657 wake_up(&root->fs_info->transaction_wait);
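/*
 * Record the file extents that previously backed the range covered by
 * this ordered extent, so relink_file_extents() can later redirect other
 * references to the newly written extent.  Returns the populated
 * new_sa_defrag_extent on success and NULL otherwise.
 */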
2660 static struct new_sa_defrag_extent *
2661 record_old_file_extents(struct inode *inode,
2662 struct btrfs_ordered_extent *ordered)
2664 struct btrfs_root *root = BTRFS_I(inode)->root;
2665 struct btrfs_path *path;
2666 struct btrfs_key key;
2667 struct old_sa_defrag_extent *old;
2668 struct new_sa_defrag_extent *new;
2671 new = kmalloc(sizeof(*new), GFP_NOFS);
2676 new->file_pos = ordered->file_offset;
2677 new->len = ordered->len;
2678 new->bytenr = ordered->start;
2679 new->disk_len = ordered->disk_len;
2680 new->compress_type = ordered->compress_type;
2681 new->root = RB_ROOT;
2682 INIT_LIST_HEAD(&new->head);
2684 path = btrfs_alloc_path();
2688 key.objectid = btrfs_ino(inode);
2689 key.type = BTRFS_EXTENT_DATA_KEY;
2690 key.offset = new->file_pos;
2692 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2695 if (ret > 0 && path->slots[0] > 0)
2698 /* find out all the old extents for the file range */
2700 struct btrfs_file_extent_item *extent;
2701 struct extent_buffer *l;
2710 slot = path->slots[0];
2712 if (slot >= btrfs_header_nritems(l)) {
2713 ret = btrfs_next_leaf(root, path);
2721 btrfs_item_key_to_cpu(l, &key, slot);
2723 if (key.objectid != btrfs_ino(inode))
2725 if (key.type != BTRFS_EXTENT_DATA_KEY)
2727 if (key.offset >= new->file_pos + new->len)
2730 extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item);
2732 num_bytes = btrfs_file_extent_num_bytes(l, extent);
2733 if (key.offset + num_bytes < new->file_pos)
2736 disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent);
2740 extent_offset = btrfs_file_extent_offset(l, extent);
2742 old = kmalloc(sizeof(*old), GFP_NOFS);
2746 offset = max(new->file_pos, key.offset);
2747 end = min(new->file_pos + new->len, key.offset + num_bytes);
2749 old->bytenr = disk_bytenr;
2750 old->extent_offset = extent_offset;
2751 old->offset = offset - key.offset;
2752 old->len = end - offset;
2755 list_add_tail(&old->list, &new->head);
2761 btrfs_free_path(path);
2762 atomic_inc(&root->fs_info->defrag_running);
2767 btrfs_free_path(path);
2769 free_sa_defrag_extent(new);
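/*
 * A delalloc write to this range has completed; find the block group
 * that contains it and subtract the bytes from its delalloc counter.
 */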
2773 static void btrfs_release_delalloc_bytes(struct btrfs_root *root,
2776 struct btrfs_block_group_cache *cache;
2778 cache = btrfs_lookup_block_group(root->fs_info, start);
2781 spin_lock(&cache->lock);
2782 cache->delalloc_bytes -= len;
2783 spin_unlock(&cache->lock);
2785 btrfs_put_block_group(cache);
2788 /* As ordered data IO finishes, this gets called so we can finish
2789 * an ordered extent once the range of bytes in the file it covers is fully written.
2792 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
2794 struct inode *inode = ordered_extent->inode;
2795 struct btrfs_root *root = BTRFS_I(inode)->root;
2796 struct btrfs_trans_handle *trans = NULL;
2797 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2798 struct extent_state *cached_state = NULL;
2799 struct new_sa_defrag_extent *new = NULL;
2800 int compress_type = 0;
2802 u64 logical_len = ordered_extent->len;
2804 bool truncated = false;
2806 nolock = btrfs_is_free_space_inode(inode);
2808 if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
2813 btrfs_free_io_failure_record(inode, ordered_extent->file_offset,
2814 ordered_extent->file_offset +
2815 ordered_extent->len - 1);
2817 if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
2819 logical_len = ordered_extent->truncated_len;
2820 /* Truncated the entire extent, don't bother adding */
2825 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
2826 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
2827 btrfs_ordered_update_i_size(inode, 0, ordered_extent);
2829 trans = btrfs_join_transaction_nolock(root);
2831 trans = btrfs_join_transaction(root);
2832 if (IS_ERR(trans)) {
2833 ret = PTR_ERR(trans);
2837 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
2838 ret = btrfs_update_inode_fallback(trans, root, inode);
2839 if (ret) /* -ENOMEM or corruption */
2840 btrfs_abort_transaction(trans, root, ret);
2844 lock_extent_bits(io_tree, ordered_extent->file_offset,
2845 ordered_extent->file_offset + ordered_extent->len - 1,
2848 ret = test_range_bit(io_tree, ordered_extent->file_offset,
2849 ordered_extent->file_offset + ordered_extent->len - 1,
2850 EXTENT_DEFRAG, 1, cached_state);
2852 u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
2853 if (0 && last_snapshot >= BTRFS_I(inode)->generation)
2854 /* the inode is shared */
2855 new = record_old_file_extents(inode, ordered_extent);
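/*
 * The "0 &&" above appears to deliberately disable snapshot-aware
 * defrag relinking, so new stays NULL and record_old_file_extents()
 * is never reached.
 */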
2857 clear_extent_bit(io_tree, ordered_extent->file_offset,
2858 ordered_extent->file_offset + ordered_extent->len - 1,
2859 EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS);
2863 trans = btrfs_join_transaction_nolock(root);
2865 trans = btrfs_join_transaction(root);
2866 if (IS_ERR(trans)) {
2867 ret = PTR_ERR(trans);
2872 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
2874 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
2875 compress_type = ordered_extent->compress_type;
2876 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
2877 BUG_ON(compress_type);
2878 ret = btrfs_mark_extent_written(trans, inode,
2879 ordered_extent->file_offset,
2880 ordered_extent->file_offset +
2883 BUG_ON(root == root->fs_info->tree_root);
2884 ret = insert_reserved_file_extent(trans, inode,
2885 ordered_extent->file_offset,
2886 ordered_extent->start,
2887 ordered_extent->disk_len,
2888 logical_len, logical_len,
2889 compress_type, 0, 0,
2890 BTRFS_FILE_EXTENT_REG);
2892 btrfs_release_delalloc_bytes(root,
2893 ordered_extent->start,
2894 ordered_extent->disk_len);
2896 unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
2897 ordered_extent->file_offset, ordered_extent->len,
2900 btrfs_abort_transaction(trans, root, ret);
2904 add_pending_csums(trans, inode, ordered_extent->file_offset,
2905 &ordered_extent->list);
2907 btrfs_ordered_update_i_size(inode, 0, ordered_extent);
2908 ret = btrfs_update_inode_fallback(trans, root, inode);
2909 if (ret) { /* -ENOMEM or corruption */
2910 btrfs_abort_transaction(trans, root, ret);
2915 unlock_extent_cached(io_tree, ordered_extent->file_offset,
2916 ordered_extent->file_offset +
2917 ordered_extent->len - 1, &cached_state, GFP_NOFS);
2919 if (root != root->fs_info->tree_root)
2920 btrfs_delalloc_release_metadata(inode, ordered_extent->len);
2922 btrfs_end_transaction(trans, root);
2924 if (ret || truncated) {
2928 start = ordered_extent->file_offset + logical_len;
2930 start = ordered_extent->file_offset;
2931 end = ordered_extent->file_offset + ordered_extent->len - 1;
2932 clear_extent_uptodate(io_tree, start, end, NULL, GFP_NOFS);
2934 /* Drop the cache for the part of the extent we didn't write. */
2935 btrfs_drop_extent_cache(inode, start, end, 0);
2938 * If the ordered extent had an IOERR or something else went
2939 * wrong, we need to return the space for this ordered extent
2940 * back to the allocator. We only free the extent in the
2941 * truncated case if we didn't write out the extent at all.
2943 if ((ret || !logical_len) &&
2944 !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
2945 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
2946 btrfs_free_reserved_extent(root, ordered_extent->start,
2947 ordered_extent->disk_len, 1);
2952 * This needs to be done to make sure anybody waiting knows we are done
2953 * updating everything for this ordered extent.
2955 btrfs_remove_ordered_extent(inode, ordered_extent);
2957 /* for snapshot-aware defrag */
2960 free_sa_defrag_extent(new);
2961 atomic_dec(&root->fs_info->defrag_running);
2963 relink_file_extents(new);
2968 btrfs_put_ordered_extent(ordered_extent);
2969 /* once for the tree */
2970 btrfs_put_ordered_extent(ordered_extent);
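/*
 * Work function queued by btrfs_writepage_end_io_hook(); it just runs
 * btrfs_finish_ordered_io() for the embedded ordered extent.
 */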
2975 static void finish_ordered_fn(struct btrfs_work *work)
2977 struct btrfs_ordered_extent *ordered_extent;
2978 ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
2979 btrfs_finish_ordered_io(ordered_extent);
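/*
 * Writeback completion hook: once all pages of the ordered extent are
 * written, queue finish_ordered_fn() on either the free-space-inode or
 * the regular endio write workqueue.
 */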
2982 static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
2983 struct extent_state *state, int uptodate)
2985 struct inode *inode = page->mapping->host;
2986 struct btrfs_root *root = BTRFS_I(inode)->root;
2987 struct btrfs_ordered_extent *ordered_extent = NULL;
2988 struct btrfs_workqueue *wq;
2989 btrfs_work_func_t func;
2991 trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
2993 ClearPagePrivate2(page);
2994 if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
2995 end - start + 1, uptodate))
2998 if (btrfs_is_free_space_inode(inode)) {
2999 wq = root->fs_info->endio_freespace_worker;
3000 func = btrfs_freespace_write_helper;
3002 wq = root->fs_info->endio_write_workers;
3003 func = btrfs_endio_write_helper;
3006 btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL,
3008 btrfs_queue_work(wq, &ordered_extent->work);
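/*
 * Verify the data checksum (crc32c) of one block against the value stored
 * at index icsum in io_bio->csum.  On a mismatch, warn (ratelimited), fill
 * the range with 0x01 bytes and return an error unless the expected csum
 * is zero.
 */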
3013 static int __readpage_endio_check(struct inode *inode,
3014 struct btrfs_io_bio *io_bio,
3015 int icsum, struct page *page,
3016 int pgoff, u64 start, size_t len)
3021 static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
3022 DEFAULT_RATELIMIT_BURST);
3024 csum_expected = *(((u32 *)io_bio->csum) + icsum);
3026 kaddr = kmap_atomic(page);
3027 csum = btrfs_csum_data(kaddr + pgoff, csum, len);
3028 btrfs_csum_final(csum, (char *)&csum);
3029 if (csum != csum_expected)
3032 kunmap_atomic(kaddr);
3035 if (__ratelimit(&_rs))
3036 btrfs_warn(BTRFS_I(inode)->root->fs_info,
3037 "csum failed ino %llu off %llu csum %u expected csum %u",
3038 btrfs_ino(inode), start, csum, csum_expected);
3039 memset(kaddr + pgoff, 1, len);
3040 flush_dcache_page(page);
3041 kunmap_atomic(kaddr);
3042 if (csum_expected == 0)
3048 * When reads are done, we need to check csums to verify the data is correct.
3049 * If there's a match, we allow the bio to finish. If not, the code in
3050 * extent_io.c will try to find good copies for us.
3052 static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
3053 u64 phy_offset, struct page *page,
3054 u64 start, u64 end, int mirror)
3056 size_t offset = start - page_offset(page);
3057 struct inode *inode = page->mapping->host;
3058 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3059 struct btrfs_root *root = BTRFS_I(inode)->root;
3061 if (PageChecked(page)) {
3062 ClearPageChecked(page);
3066 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
3069 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
3070 test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
3071 clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM,
3076 phy_offset >>= inode->i_sb->s_blocksize_bits;
3077 return __readpage_endio_check(inode, io_bio, phy_offset, page, offset,
3078 start, (size_t)(end - start + 1));
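/*
 * btrfs_add_delayed_iput() parks the final iput of an inode on a per-fs
 * list instead of dropping the reference directly, presumably because the
 * callers (end-io completion, transaction commit) must not block on inode
 * eviction; btrfs_run_delayed_iputs() processes the list later.
 */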
3081 struct delayed_iput {
3082 struct list_head list;
3083 struct inode *inode;
3086 /* JDM: If this is fs-wide, why can't we add a pointer to
3087 * btrfs_inode instead and avoid the allocation? */
3088 void btrfs_add_delayed_iput(struct inode *inode)
3090 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
3091 struct delayed_iput *delayed;
3093 if (atomic_add_unless(&inode->i_count, -1, 1))
3096 delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL);
3097 delayed->inode = inode;
3099 spin_lock(&fs_info->delayed_iput_lock);
3100 list_add_tail(&delayed->list, &fs_info->delayed_iputs);
3101 spin_unlock(&fs_info->delayed_iput_lock);
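/*
 * Splice the pending delayed iputs onto a local list under the spinlock
 * and iput() each inode; delayed_iput_sem is held for read across the
 * run, presumably so another task can take it for write and wait for
 * in-flight runs to drain.
 */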
3104 void btrfs_run_delayed_iputs(struct btrfs_root *root)
3107 struct btrfs_fs_info *fs_info = root->fs_info;
3108 struct delayed_iput *delayed;
3111 spin_lock(&fs_info->delayed_iput_lock);
3112 empty = list_empty(&fs_info->delayed_iputs);
3113 spin_unlock(&fs_info->delayed_iput_lock);
3117 down_read(&fs_info->delayed_iput_sem);
3119 spin_lock(&fs_info->delayed_iput_lock);
3120 list_splice_init(&fs_info->delayed_iputs, &list);
3121 spin_unlock(&fs_info->delayed_iput_lock);
3123 while (!list_empty(&list)) {
3124 delayed = list_entry(list.next, struct delayed_iput, list);
3125 list_del(&delayed->list);
3126 iput(delayed->inode);
3130 up_read(&root->fs_info->delayed_iput_sem);
3134 * This is called at transaction commit time. If there are no orphan
3135 * files in the subvolume, it removes the orphan item and frees the block_rsv structure.
3138 void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
3139 struct btrfs_root *root)
3141 struct btrfs_block_rsv *block_rsv;
3144 if (atomic_read(&root->orphan_inodes) ||
3145 root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
3148 spin_lock(&root->orphan_lock);
3149 if (atomic_read(&root->orphan_inodes)) {
3150 spin_unlock(&root->orphan_lock);
3154 if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) {
3155 spin_unlock(&root->orphan_lock);
3159 block_rsv = root->orphan_block_rsv;
3160 root->orphan_block_rsv = NULL;
3161 spin_unlock(&root->orphan_lock);
3163 if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state) &&
3164 btrfs_root_refs(&root->root_item) > 0) {
3165 ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
3166 root->root_key.objectid);
3168 btrfs_abort_transaction(trans, root, ret);
3170 clear_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
3175 WARN_ON(block_rsv->size > 0);
3176 btrfs_free_block_rsv(root, block_rsv);
3181 * This creates an orphan entry for the given inode in case something goes
3182 * wrong in the middle of an unlink/truncate.
3184 * NOTE: the caller of this function should reserve 5 units of metadata for this function.
3187 int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
3189 struct btrfs_root *root = BTRFS_I(inode)->root;
3190 struct btrfs_block_rsv *block_rsv = NULL;
3195 if (!root->orphan_block_rsv) {
3196 block_rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
3201 spin_lock(&root->orphan_lock);
3202 if (!root->orphan_block_rsv) {
3203 root->orphan_block_rsv = block_rsv;
3204 } else if (block_rsv) {
3205 btrfs_free_block_rsv(root, block_rsv);
3209 if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3210 &BTRFS_I(inode)->runtime_flags)) {
3213 * For proper ENOSPC handling, we should do orphan
3214 * cleanup when mounting. But this introduces a backward
3215 * compatibility issue.
3217 if (!xchg(&root->orphan_item_inserted, 1))
3223 atomic_inc(&root->orphan_inodes);
3226 if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
3227 &BTRFS_I(inode)->runtime_flags))
3229 spin_unlock(&root->orphan_lock);
3231 /* grab metadata reservation from transaction handle */
3233 ret = btrfs_orphan_reserve_metadata(trans, inode);
3234 BUG_ON(ret); /* -ENOSPC in reservation; Logic error? JDM */
3237 /* insert an orphan item to track this unlinked/truncated file */
3239 ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
3241 atomic_dec(&root->orphan_inodes);
3243 clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
3244 &BTRFS_I(inode)->runtime_flags);
3245 btrfs_orphan_release_metadata(inode);
3247 if (ret != -EEXIST) {
3248 clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3249 &BTRFS_I(inode)->runtime_flags);
3250 btrfs_abort_transaction(trans, root, ret);
3257 /* insert an orphan item to track that the subvolume contains orphan files */
3259 ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
3260 root->root_key.objectid);