1 // SPDX-License-Identifier: GPL-2.0
3 * Copyright (C) 2007 Oracle. All rights reserved.
6 #include <linux/kernel.h>
8 #include <linux/buffer_head.h>
9 #include <linux/file.h>
11 #include <linux/pagemap.h>
12 #include <linux/highmem.h>
13 #include <linux/time.h>
14 #include <linux/init.h>
15 #include <linux/string.h>
16 #include <linux/backing-dev.h>
17 #include <linux/writeback.h>
18 #include <linux/compat.h>
19 #include <linux/xattr.h>
20 #include <linux/posix_acl.h>
21 #include <linux/falloc.h>
22 #include <linux/slab.h>
23 #include <linux/ratelimit.h>
24 #include <linux/btrfs.h>
25 #include <linux/blkdev.h>
26 #include <linux/posix_acl_xattr.h>
27 #include <linux/uio.h>
28 #include <linux/magic.h>
29 #include <linux/iversion.h>
30 #include <asm/unaligned.h>
33 #include "transaction.h"
34 #include "btrfs_inode.h"
35 #include "print-tree.h"
36 #include "ordered-data.h"
40 #include "compression.h"
42 #include "free-space-cache.h"
43 #include "inode-map.h"
49 struct btrfs_iget_args {
50 struct btrfs_key *location;
51 struct btrfs_root *root;
54 struct btrfs_dio_data {
56 u64 unsubmitted_oe_range_start;
57 u64 unsubmitted_oe_range_end;
61 static const struct inode_operations btrfs_dir_inode_operations;
62 static const struct inode_operations btrfs_symlink_inode_operations;
63 static const struct inode_operations btrfs_dir_ro_inode_operations;
64 static const struct inode_operations btrfs_special_inode_operations;
65 static const struct inode_operations btrfs_file_inode_operations;
66 static const struct address_space_operations btrfs_aops;
67 static const struct file_operations btrfs_dir_file_operations;
68 static const struct extent_io_ops btrfs_extent_io_ops;
70 static struct kmem_cache *btrfs_inode_cachep;
71 struct kmem_cache *btrfs_trans_handle_cachep;
72 struct kmem_cache *btrfs_path_cachep;
73 struct kmem_cache *btrfs_free_space_cachep;
76 static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
77 [S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE,
78 [S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR,
79 [S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV,
80 [S_IFBLK >> S_SHIFT] = BTRFS_FT_BLKDEV,
81 [S_IFIFO >> S_SHIFT] = BTRFS_FT_FIFO,
82 [S_IFSOCK >> S_SHIFT] = BTRFS_FT_SOCK,
83 [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK,
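/*
 * Illustrative sketch: the table above maps the S_IFMT bits of i_mode
 * (shifted down by S_SHIFT) to the BTRFS_FT_* type stored in directory
 * items.  A lookup helper along these lines (inode.c provides one as
 * btrfs_inode_type() further down) simply indexes it:
 *
 *	static inline u8 btrfs_inode_type(struct inode *inode)
 *	{
 *		return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
 *	}
 */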
86 static int btrfs_setsize(struct inode *inode, struct iattr *attr);
87 static int btrfs_truncate(struct inode *inode, bool skip_writeback);
88 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
89 static noinline int cow_file_range(struct inode *inode,
90 struct page *locked_page,
91 u64 start, u64 end, u64 delalloc_end,
92 int *page_started, unsigned long *nr_written,
93 int unlock, struct btrfs_dedupe_hash *hash);
94 static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
95 u64 orig_start, u64 block_start,
96 u64 block_len, u64 orig_block_len,
97 u64 ram_bytes, int compress_type,
100 static void __endio_write_update_ordered(struct inode *inode,
101 const u64 offset, const u64 bytes,
102 const bool uptodate);
105 * Clean up all submitted ordered extents in the specified range to handle errors
106 * from the fill_delalloc() callback.
108 * NOTE: caller must ensure that when an error happens, it can not call
109 * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
110 * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
111 * to be released, which we want to happen only when finishing the ordered
112 * extent (btrfs_finish_ordered_io()). Also note that the caller of the
113 * fill_delalloc() callback already does proper cleanup for the first page of
114 * the range, that is, it invokes the callback writepage_end_io_hook() for the
115 * range of the first page.
117 static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
121 unsigned long index = offset >> PAGE_SHIFT;
122 unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
125 while (index <= end_index) {
126 page = find_get_page(inode->i_mapping, index);
130 ClearPagePrivate2(page);
133 return __endio_write_update_ordered(inode, offset + PAGE_SIZE,
134 bytes - PAGE_SIZE, false);
137 static int btrfs_dirty_inode(struct inode *inode);
139 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
140 void btrfs_test_inode_set_ops(struct inode *inode)
142 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
146 static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
147 struct inode *inode, struct inode *dir,
148 const struct qstr *qstr)
152 err = btrfs_init_acl(trans, inode, dir);
154 err = btrfs_xattr_security_init(trans, inode, dir, qstr);
159 * this does all the hard work for inserting an inline extent into
160 * the btree. The caller should have done a btrfs_drop_extents so that
161 * no overlapping inline items exist in the btree
163 static int insert_inline_extent(struct btrfs_trans_handle *trans,
164 struct btrfs_path *path, int extent_inserted,
165 struct btrfs_root *root, struct inode *inode,
166 u64 start, size_t size, size_t compressed_size,
168 struct page **compressed_pages)
170 struct extent_buffer *leaf;
171 struct page *page = NULL;
174 struct btrfs_file_extent_item *ei;
176 size_t cur_size = size;
177 unsigned long offset;
179 if (compressed_size && compressed_pages)
180 cur_size = compressed_size;
182 inode_add_bytes(inode, size);
184 if (!extent_inserted) {
185 struct btrfs_key key;
188 key.objectid = btrfs_ino(BTRFS_I(inode));
190 key.type = BTRFS_EXTENT_DATA_KEY;
192 datasize = btrfs_file_extent_calc_inline_size(cur_size);
193 path->leave_spinning = 1;
194 ret = btrfs_insert_empty_item(trans, root, path, &key,
199 leaf = path->nodes[0];
200 ei = btrfs_item_ptr(leaf, path->slots[0],
201 struct btrfs_file_extent_item);
202 btrfs_set_file_extent_generation(leaf, ei, trans->transid);
203 btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
204 btrfs_set_file_extent_encryption(leaf, ei, 0);
205 btrfs_set_file_extent_other_encoding(leaf, ei, 0);
206 btrfs_set_file_extent_ram_bytes(leaf, ei, size);
207 ptr = btrfs_file_extent_inline_start(ei);
209 if (compress_type != BTRFS_COMPRESS_NONE) {
212 while (compressed_size > 0) {
213 cpage = compressed_pages[i];
214 cur_size = min_t(unsigned long, compressed_size,
217 kaddr = kmap_atomic(cpage);
218 write_extent_buffer(leaf, kaddr, ptr, cur_size);
219 kunmap_atomic(kaddr);
223 compressed_size -= cur_size;
225 btrfs_set_file_extent_compression(leaf, ei,
228 page = find_get_page(inode->i_mapping,
229 start >> PAGE_SHIFT);
230 btrfs_set_file_extent_compression(leaf, ei, 0);
231 kaddr = kmap_atomic(page);
232 offset = start & (PAGE_SIZE - 1);
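		/* copy the uncompressed data for this inline extent straight out of the page cache */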
233 write_extent_buffer(leaf, kaddr + offset, ptr, size);
234 kunmap_atomic(kaddr);
237 btrfs_mark_buffer_dirty(leaf);
238 btrfs_release_path(path);
241 * we're an inline extent, so nobody can
242 * extend the file past i_size without locking
243 * a page we already have locked.
245 * We must do any isize and inode updates
246 * before we unlock the pages. Otherwise we
247 * could end up racing with unlink.
249 BTRFS_I(inode)->disk_i_size = inode->i_size;
250 ret = btrfs_update_inode(trans, root, inode);
258 * conditionally insert an inline extent into the file. This
259 * does the checks required to make sure the data is small enough
260 * to fit as an inline extent.
262 static noinline int cow_file_range_inline(struct inode *inode, u64 start,
263 u64 end, size_t compressed_size,
265 struct page **compressed_pages)
267 struct btrfs_root *root = BTRFS_I(inode)->root;
268 struct btrfs_fs_info *fs_info = root->fs_info;
269 struct btrfs_trans_handle *trans;
270 u64 isize = i_size_read(inode);
271 u64 actual_end = min(end + 1, isize);
272 u64 inline_len = actual_end - start;
273 u64 aligned_end = ALIGN(end, fs_info->sectorsize);
274 u64 data_len = inline_len;
276 struct btrfs_path *path;
277 int extent_inserted = 0;
278 u32 extent_item_size;
281 data_len = compressed_size;
284 actual_end > fs_info->sectorsize ||
285 data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
287 (actual_end & (fs_info->sectorsize - 1)) == 0) ||
289 data_len > fs_info->max_inline) {
293 path = btrfs_alloc_path();
297 trans = btrfs_join_transaction(root);
299 btrfs_free_path(path);
300 return PTR_ERR(trans);
302 trans->block_rsv = &BTRFS_I(inode)->block_rsv;
304 if (compressed_size && compressed_pages)
305 extent_item_size = btrfs_file_extent_calc_inline_size(
308 extent_item_size = btrfs_file_extent_calc_inline_size(
311 ret = __btrfs_drop_extents(trans, root, inode, path,
312 start, aligned_end, NULL,
313 1, 1, extent_item_size, &extent_inserted);
315 btrfs_abort_transaction(trans, ret);
319 if (isize > actual_end)
320 inline_len = min_t(u64, isize, actual_end);
321 ret = insert_inline_extent(trans, path, extent_inserted,
323 inline_len, compressed_size,
324 compress_type, compressed_pages);
325 if (ret && ret != -ENOSPC) {
326 btrfs_abort_transaction(trans, ret);
328 } else if (ret == -ENOSPC) {
333 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
334 btrfs_drop_extent_cache(BTRFS_I(inode), start, aligned_end - 1, 0);
337 * Don't forget to free the reserved space, as an inlined extent
338 * won't count as a data extent, so free it directly here.
339 * And at reserve time, it's always aligned to page size, so
340 * just free one page here.
342 btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE);
343 btrfs_free_path(path);
344 btrfs_end_transaction(trans);
348 struct async_extent {
353 unsigned long nr_pages;
355 struct list_head list;
360 struct btrfs_root *root;
361 struct page *locked_page;
364 unsigned int write_flags;
365 struct list_head extents;
366 struct btrfs_work work;
369 static noinline int add_async_extent(struct async_cow *cow,
370 u64 start, u64 ram_size,
373 unsigned long nr_pages,
376 struct async_extent *async_extent;
378 async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
379 BUG_ON(!async_extent); /* -ENOMEM */
380 async_extent->start = start;
381 async_extent->ram_size = ram_size;
382 async_extent->compressed_size = compressed_size;
383 async_extent->pages = pages;
384 async_extent->nr_pages = nr_pages;
385 async_extent->compress_type = compress_type;
386 list_add_tail(&async_extent->list, &cow->extents);
390 static inline int inode_need_compress(struct inode *inode, u64 start, u64 end)
392 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
395 if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
398 if (BTRFS_I(inode)->defrag_compress)
400 /* bad compression ratios */
401 if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
403 if (btrfs_test_opt(fs_info, COMPRESS) ||
404 BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
405 BTRFS_I(inode)->prop_compress)
406 return btrfs_compress_heuristic(inode, start, end);
410 static inline void inode_should_defrag(struct btrfs_inode *inode,
411 u64 start, u64 end, u64 num_bytes, u64 small_write)
413 /* If this is a small write inside eof, kick off a defrag */
414 if (num_bytes < small_write &&
415 (start > 0 || end + 1 < inode->disk_i_size))
416 btrfs_add_inode_defrag(NULL, inode);
420 * we create compressed extents in two phases. The first
421 * phase compresses a range of pages that have already been
422 * locked (both pages and state bits are locked).
424 * This is done inside an ordered work queue, and the compression
425 * is spread across many cpus. The actual IO submission is step
426 * two, and the ordered work queue takes care of making sure that
427 * happens in the same order things were put onto the queue by
428 * writepages and friends.
430 * If this code finds it can't get good compression, it puts an
431 * entry onto the work queue to write the uncompressed bytes. This
432 * makes sure that both compressed inodes and uncompressed inodes
433 * are written in the same order that the flusher thread sent them down.
436 static noinline void compress_file_range(struct inode *inode,
437 struct page *locked_page,
439 struct async_cow *async_cow,
442 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
443 u64 blocksize = fs_info->sectorsize;
445 u64 isize = i_size_read(inode);
447 struct page **pages = NULL;
448 unsigned long nr_pages;
449 unsigned long total_compressed = 0;
450 unsigned long total_in = 0;
453 int compress_type = fs_info->compress_type;
456 inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
459 actual_end = min_t(u64, isize, end + 1);
462 nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
463 BUILD_BUG_ON((BTRFS_MAX_COMPRESSED % PAGE_SIZE) != 0);
464 nr_pages = min_t(unsigned long, nr_pages,
465 BTRFS_MAX_COMPRESSED / PAGE_SIZE);
468 * we don't want to send crud past the end of i_size through
469 * compression, that's just a waste of CPU time. So, if the
470 * end of the file is before the start of our current
471 * requested range of bytes, we bail out to the uncompressed
472 * cleanup code that can deal with all of this.
474 * It isn't really the fastest way to fix things, but this is a
475 * very uncommon corner.
477 if (actual_end <= start)
478 goto cleanup_and_bail_uncompressed;
480 total_compressed = actual_end - start;
483 * skip compression for a small file range (<= blocksize) that
484 * isn't an inline extent, since it doesn't save disk space at all.
486 if (total_compressed <= blocksize &&
487 (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
488 goto cleanup_and_bail_uncompressed;
490 total_compressed = min_t(unsigned long, total_compressed,
491 BTRFS_MAX_UNCOMPRESSED);
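	/*
	 * BTRFS_MAX_UNCOMPRESSED is 128K, so at most 128K of input is handed
	 * to the compressor per extent; a larger delalloc range is split into
	 * multiple async extents by the surrounding loop.
	 */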
496 * we do compression for mount -o compress and when the
497 * inode has not been flagged as nocompress. This flag can
498 * change at any time if we discover bad compression ratios.
500 if (inode_need_compress(inode, start, end)) {
502 pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
504 /* just bail out to the uncompressed code */
508 if (BTRFS_I(inode)->defrag_compress)
509 compress_type = BTRFS_I(inode)->defrag_compress;
510 else if (BTRFS_I(inode)->prop_compress)
511 compress_type = BTRFS_I(inode)->prop_compress;
514 * we need to call clear_page_dirty_for_io on each
515 * page in the range. Otherwise applications with the file
516 * mmap'd can wander in and change the page contents while
517 * we are compressing them.
519 * If the compression fails for any reason, we set the pages
520 * dirty again later on.
522 * Note that the remaining part is redirtied, the start pointer
523 * has moved, the end is the original one.
526 extent_range_clear_dirty_for_io(inode, start, end);
530 /* Compression level is applied here and only here */
531 ret = btrfs_compress_pages(
532 compress_type | (fs_info->compress_level << 4),
533 inode->i_mapping, start,
540 unsigned long offset = total_compressed &
542 struct page *page = pages[nr_pages - 1];
545 /* zero the tail end of the last page, we might be
546 * sending it down to disk
549 kaddr = kmap_atomic(page);
550 memset(kaddr + offset, 0,
552 kunmap_atomic(kaddr);
559 /* let's try to make an inline extent */
560 if (ret || total_in < actual_end) {
561 /* we didn't compress the entire range, try
562 * to make an uncompressed inline extent.
564 ret = cow_file_range_inline(inode, start, end, 0,
565 BTRFS_COMPRESS_NONE, NULL);
567 /* try making a compressed inline extent */
568 ret = cow_file_range_inline(inode, start, end,
570 compress_type, pages);
573 unsigned long clear_flags = EXTENT_DELALLOC |
574 EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
575 EXTENT_DO_ACCOUNTING;
576 unsigned long page_error_op;
578 page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;
581 * inline extent creation worked or returned error,
582 * we don't need to create any more async work items.
583 * Unlock and free up our temp pages.
585 * We use DO_ACCOUNTING here because we need the
586 * delalloc_release_metadata to be done _after_ we drop
587 * our outstanding extent for clearing delalloc for this range.
590 extent_clear_unlock_delalloc(inode, start, end, end,
603 * we aren't doing an inline extent, so round the compressed size
604 * up to a block size boundary so the allocator does sane things.
607 total_compressed = ALIGN(total_compressed, blocksize);
610 * one last check to make sure the compression is really a
611 * win, compare the page count read with the blocks on disk,
612 * compression must free at least one sector size
614 total_in = ALIGN(total_in, PAGE_SIZE);
615 if (total_compressed + blocksize <= total_in) {
619 * The async work queues will take care of doing actual
620 * allocation on disk for these compressed pages, and
621 * will submit them to the elevator.
623 add_async_extent(async_cow, start, total_in,
624 total_compressed, pages, nr_pages,
627 if (start + total_in < end) {
638 * the compression code ran but failed to make things smaller,
639 * free any pages it allocated and our page pointer array
641 for (i = 0; i < nr_pages; i++) {
642 WARN_ON(pages[i]->mapping);
647 total_compressed = 0;
650 /* flag the file so we don't compress in the future */
651 if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) &&
652 !(BTRFS_I(inode)->prop_compress)) {
653 BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
656 cleanup_and_bail_uncompressed:
658 * No compression, but we still need to write the pages in the file
659 * we've been given so far. Redirty the locked page if it corresponds
660 * to our extent and set things up for the async work queue to run
661 * cow_file_range to do the normal delalloc dance.
663 if (page_offset(locked_page) >= start &&
664 page_offset(locked_page) <= end)
665 __set_page_dirty_nobuffers(locked_page);
666 /* unlocked later on in the async handlers */
669 extent_range_redirty_for_io(inode, start, end);
670 add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0,
671 BTRFS_COMPRESS_NONE);
677 for (i = 0; i < nr_pages; i++) {
678 WARN_ON(pages[i]->mapping);
684 static void free_async_extent_pages(struct async_extent *async_extent)
688 if (!async_extent->pages)
691 for (i = 0; i < async_extent->nr_pages; i++) {
692 WARN_ON(async_extent->pages[i]->mapping);
693 put_page(async_extent->pages[i]);
695 kfree(async_extent->pages);
696 async_extent->nr_pages = 0;
697 async_extent->pages = NULL;
701 * phase two of compressed writeback. This is the ordered portion
702 * of the code, which only gets called in the order the work was
703 * queued. We walk all the async extents created by compress_file_range
704 * and send them down to the disk.
706 static noinline void submit_compressed_extents(struct inode *inode,
707 struct async_cow *async_cow)
709 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
710 struct async_extent *async_extent;
712 struct btrfs_key ins;
713 struct extent_map *em;
714 struct btrfs_root *root = BTRFS_I(inode)->root;
715 struct extent_io_tree *io_tree;
719 while (!list_empty(&async_cow->extents)) {
720 async_extent = list_entry(async_cow->extents.next,
721 struct async_extent, list);
722 list_del(&async_extent->list);
724 io_tree = &BTRFS_I(inode)->io_tree;
727 /* did the compression code fall back to uncompressed IO? */
728 if (!async_extent->pages) {
729 int page_started = 0;
730 unsigned long nr_written = 0;
732 lock_extent(io_tree, async_extent->start,
733 async_extent->start +
734 async_extent->ram_size - 1);
736 /* allocate blocks */
737 ret = cow_file_range(inode, async_cow->locked_page,
739 async_extent->start +
740 async_extent->ram_size - 1,
741 async_extent->start +
742 async_extent->ram_size - 1,
743 &page_started, &nr_written, 0,
749 * if page_started, cow_file_range inserted an
750 * inline extent and took care of all the unlocking
751 * and IO for us. Otherwise, we need to submit
752 * all those pages down to the drive.
754 if (!page_started && !ret)
755 extent_write_locked_range(inode,
757 async_extent->start +
758 async_extent->ram_size - 1,
761 unlock_page(async_cow->locked_page);
767 lock_extent(io_tree, async_extent->start,
768 async_extent->start + async_extent->ram_size - 1);
770 ret = btrfs_reserve_extent(root, async_extent->ram_size,
771 async_extent->compressed_size,
772 async_extent->compressed_size,
773 0, alloc_hint, &ins, 1, 1);
775 free_async_extent_pages(async_extent);
777 if (ret == -ENOSPC) {
778 unlock_extent(io_tree, async_extent->start,
779 async_extent->start +
780 async_extent->ram_size - 1);
783 * we need to redirty the pages if we decide to
784 * fall back to uncompressed IO, otherwise we
785 * will not submit these pages down to the lower layers.
788 extent_range_redirty_for_io(inode,
790 async_extent->start +
791 async_extent->ram_size - 1);
798 * here we're doing allocation and writeback of the compressed pages.
801 em = create_io_em(inode, async_extent->start,
802 async_extent->ram_size, /* len */
803 async_extent->start, /* orig_start */
804 ins.objectid, /* block_start */
805 ins.offset, /* block_len */
806 ins.offset, /* orig_block_len */
807 async_extent->ram_size, /* ram_bytes */
808 async_extent->compress_type,
809 BTRFS_ORDERED_COMPRESSED);
811 /* ret value is not necessary due to void function */
812 goto out_free_reserve;
815 ret = btrfs_add_ordered_extent_compress(inode,
818 async_extent->ram_size,
820 BTRFS_ORDERED_COMPRESSED,
821 async_extent->compress_type);
823 btrfs_drop_extent_cache(BTRFS_I(inode),
825 async_extent->start +
826 async_extent->ram_size - 1, 0);
827 goto out_free_reserve;
829 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
832 * clear dirty, set writeback and unlock the pages.
834 extent_clear_unlock_delalloc(inode, async_extent->start,
835 async_extent->start +
836 async_extent->ram_size - 1,
837 async_extent->start +
838 async_extent->ram_size - 1,
839 NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
840 PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
842 if (btrfs_submit_compressed_write(inode,
844 async_extent->ram_size,
846 ins.offset, async_extent->pages,
847 async_extent->nr_pages,
848 async_cow->write_flags)) {
849 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
850 struct page *p = async_extent->pages[0];
851 const u64 start = async_extent->start;
852 const u64 end = start + async_extent->ram_size - 1;
854 p->mapping = inode->i_mapping;
855 tree->ops->writepage_end_io_hook(p, start, end,
858 extent_clear_unlock_delalloc(inode, start, end, end,
862 free_async_extent_pages(async_extent);
864 alloc_hint = ins.objectid + ins.offset;
870 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
871 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
873 extent_clear_unlock_delalloc(inode, async_extent->start,
874 async_extent->start +
875 async_extent->ram_size - 1,
876 async_extent->start +
877 async_extent->ram_size - 1,
878 NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
879 EXTENT_DELALLOC_NEW |
880 EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
881 PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
882 PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
884 free_async_extent_pages(async_extent);
889 static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
892 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
893 struct extent_map *em;
896 read_lock(&em_tree->lock);
897 em = search_extent_mapping(em_tree, start, num_bytes);
900 * if block start isn't an actual block number then find the
901 * first block in this inode and use that as a hint. If that
902 * block is also bogus then just don't worry about it.
904 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
906 em = search_extent_mapping(em_tree, 0, 0);
907 if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
908 alloc_hint = em->block_start;
912 alloc_hint = em->block_start;
916 read_unlock(&em_tree->lock);
922 * when extent_io.c finds a delayed allocation range in the file,
923 * the callbacks end up in this code. The basic idea is to
924 * allocate extents on disk for the range, and create ordered data structs
925 * in ram to track those extents.
927 * locked_page is the page that writepage had locked already. We use
928 * it to make sure we don't do extra locks or unlocks.
930 * *page_started is set to one if we unlock locked_page and do everything
931 * required to start IO on it. It may be clean and already done with IO when we return.
934 static noinline int cow_file_range(struct inode *inode,
935 struct page *locked_page,
936 u64 start, u64 end, u64 delalloc_end,
937 int *page_started, unsigned long *nr_written,
938 int unlock, struct btrfs_dedupe_hash *hash)
940 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
941 struct btrfs_root *root = BTRFS_I(inode)->root;
944 unsigned long ram_size;
945 u64 cur_alloc_size = 0;
946 u64 blocksize = fs_info->sectorsize;
947 struct btrfs_key ins;
948 struct extent_map *em;
950 unsigned long page_ops;
951 bool extent_reserved = false;
954 if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
960 num_bytes = ALIGN(end - start + 1, blocksize);
961 num_bytes = max(blocksize, num_bytes);
962 ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy));
964 inode_should_defrag(BTRFS_I(inode), start, end, num_bytes, SZ_64K);
967 /* let's try to make an inline extent */
968 ret = cow_file_range_inline(inode, start, end, 0,
969 BTRFS_COMPRESS_NONE, NULL);
972 * We use DO_ACCOUNTING here because we need the
973 * delalloc_release_metadata to be run _after_ we drop
974 * our outstanding extent for clearing delalloc for this range.
977 extent_clear_unlock_delalloc(inode, start, end,
979 EXTENT_LOCKED | EXTENT_DELALLOC |
980 EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
981 EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
982 PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
984 *nr_written = *nr_written +
985 (end - start + PAGE_SIZE) / PAGE_SIZE;
988 } else if (ret < 0) {
993 alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
994 btrfs_drop_extent_cache(BTRFS_I(inode), start,
995 start + num_bytes - 1, 0);
997 while (num_bytes > 0) {
998 cur_alloc_size = num_bytes;
999 ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
1000 fs_info->sectorsize, 0, alloc_hint,
1004 cur_alloc_size = ins.offset;
1005 extent_reserved = true;
1007 ram_size = ins.offset;
1008 em = create_io_em(inode, start, ins.offset, /* len */
1009 start, /* orig_start */
1010 ins.objectid, /* block_start */
1011 ins.offset, /* block_len */
1012 ins.offset, /* orig_block_len */
1013 ram_size, /* ram_bytes */
1014 BTRFS_COMPRESS_NONE, /* compress_type */
1015 BTRFS_ORDERED_REGULAR /* type */);
1020 free_extent_map(em);
1022 ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
1023 ram_size, cur_alloc_size, 0);
1025 goto out_drop_extent_cache;
1027 if (root->root_key.objectid ==
1028 BTRFS_DATA_RELOC_TREE_OBJECTID) {
1029 ret = btrfs_reloc_clone_csums(inode, start,
1032 * Only drop cache here, and process as normal.
1034 * We must not allow extent_clear_unlock_delalloc()
1035 * at out_unlock label to free meta of this ordered
1036 * extent, as its meta should be freed by
1037 * btrfs_finish_ordered_io().
1039 * So we must continue until @start is increased to
1040 * skip current ordered extent.
1043 btrfs_drop_extent_cache(BTRFS_I(inode), start,
1044 start + ram_size - 1, 0);
1047 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1049 /* we're not doing compressed IO, don't unlock the first
1050 * page (which the caller expects to stay locked), don't
1051 * clear any dirty bits and don't set any writeback bits
1053 * Do set the Private2 bit so we know this page was properly
1054 * set up for writepage
1056 page_ops = unlock ? PAGE_UNLOCK : 0;
1057 page_ops |= PAGE_SET_PRIVATE2;
1059 extent_clear_unlock_delalloc(inode, start,
1060 start + ram_size - 1,
1061 delalloc_end, locked_page,
1062 EXTENT_LOCKED | EXTENT_DELALLOC,
1064 if (num_bytes < cur_alloc_size)
1067 num_bytes -= cur_alloc_size;
1068 alloc_hint = ins.objectid + ins.offset;
1069 start += cur_alloc_size;
1070 extent_reserved = false;
1073 * btrfs_reloc_clone_csums() error, since start is increased
1074 * extent_clear_unlock_delalloc() at out_unlock label won't
1075 * free metadata of current ordered extent, we're OK to exit.
1083 out_drop_extent_cache:
1084 btrfs_drop_extent_cache(BTRFS_I(inode), start, start + ram_size - 1, 0);
1086 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1087 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
1089 clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
1090 EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
1091 page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
1094 * If we reserved an extent for our delalloc range (or a subrange) and
1095 * failed to create the respective ordered extent, then it means that
1096 * when we reserved the extent we decremented the extent's size from
1097 * the data space_info's bytes_may_use counter and incremented the
1098 * space_info's bytes_reserved counter by the same amount. We must make
1099 * sure extent_clear_unlock_delalloc() does not try to decrement again
1100 * the data space_info's bytes_may_use counter, therefore we do not pass
1101 * it the flag EXTENT_CLEAR_DATA_RESV.
1103 if (extent_reserved) {
1104 extent_clear_unlock_delalloc(inode, start,
1105 start + cur_alloc_size,
1106 start + cur_alloc_size,
1110 start += cur_alloc_size;
1114 extent_clear_unlock_delalloc(inode, start, end, delalloc_end,
1116 clear_bits | EXTENT_CLEAR_DATA_RESV,
1122 * work queue callback to start compression on a file and pages
1124 static noinline void async_cow_start(struct btrfs_work *work)
1126 struct async_cow *async_cow;
1128 async_cow = container_of(work, struct async_cow, work);
1130 compress_file_range(async_cow->inode, async_cow->locked_page,
1131 async_cow->start, async_cow->end, async_cow,
1133 if (num_added == 0) {
1134 btrfs_add_delayed_iput(async_cow->inode);
1135 async_cow->inode = NULL;
1140 * work queue callback to submit previously compressed pages
1142 static noinline void async_cow_submit(struct btrfs_work *work)
1144 struct btrfs_fs_info *fs_info;
1145 struct async_cow *async_cow;
1146 struct btrfs_root *root;
1147 unsigned long nr_pages;
1149 async_cow = container_of(work, struct async_cow, work);
1151 root = async_cow->root;
1152 fs_info = root->fs_info;
1153 nr_pages = (async_cow->end - async_cow->start + PAGE_SIZE) >>
1156 /* atomic_sub_return implies a barrier */
1157 if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
1159 cond_wake_up_nomb(&fs_info->async_submit_wait);
1161 if (async_cow->inode)
1162 submit_compressed_extents(async_cow->inode, async_cow);
1165 static noinline void async_cow_free(struct btrfs_work *work)
1167 struct async_cow *async_cow;
1168 async_cow = container_of(work, struct async_cow, work);
1169 if (async_cow->inode)
1170 btrfs_add_delayed_iput(async_cow->inode);
1174 static int cow_file_range_async(struct inode *inode, struct page *locked_page,
1175 u64 start, u64 end, int *page_started,
1176 unsigned long *nr_written,
1177 unsigned int write_flags)
1179 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1180 struct async_cow *async_cow;
1181 struct btrfs_root *root = BTRFS_I(inode)->root;
1182 unsigned long nr_pages;
1185 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
1187 while (start < end) {
1188 async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
1189 BUG_ON(!async_cow); /* -ENOMEM */
1190 async_cow->inode = igrab(inode);
1191 async_cow->root = root;
1192 async_cow->locked_page = locked_page;
1193 async_cow->start = start;
1194 async_cow->write_flags = write_flags;
1196 if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
1197 !btrfs_test_opt(fs_info, FORCE_COMPRESS))
1200 cur_end = min(end, start + SZ_512K - 1);
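		/*
		 * Each async_cow unit covers at most 512K of the range, so the
		 * compression work can be spread across several worker threads.
		 */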
1202 async_cow->end = cur_end;
1203 INIT_LIST_HEAD(&async_cow->extents);
1205 btrfs_init_work(&async_cow->work,
1206 btrfs_delalloc_helper,
1207 async_cow_start, async_cow_submit,
1210 nr_pages = (cur_end - start + PAGE_SIZE) >>
1212 atomic_add(nr_pages, &fs_info->async_delalloc_pages);
1214 btrfs_queue_work(fs_info->delalloc_workers, &async_cow->work);
1216 *nr_written += nr_pages;
1217 start = cur_end + 1;
1223 static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
1224 u64 bytenr, u64 num_bytes)
1227 struct btrfs_ordered_sum *sums;
1230 ret = btrfs_lookup_csums_range(fs_info->csum_root, bytenr,
1231 bytenr + num_bytes - 1, &list, 0);
1232 if (ret == 0 && list_empty(&list))
1235 while (!list_empty(&list)) {
1236 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
1237 list_del(&sums->list);
1246 * when the nocow writeback callback runs. This checks for snapshots or COW copies
1247 * of the extents that exist in the file, and COWs the file as required.
1249 * If no cow copies or snapshots exist, we write directly to the existing blocks on disk.
1252 static noinline int run_delalloc_nocow(struct inode *inode,
1253 struct page *locked_page,
1254 u64 start, u64 end, int *page_started, int force,
1255 unsigned long *nr_written)
1257 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1258 struct btrfs_root *root = BTRFS_I(inode)->root;
1259 struct extent_buffer *leaf;
1260 struct btrfs_path *path;
1261 struct btrfs_file_extent_item *fi;
1262 struct btrfs_key found_key;
1263 struct extent_map *em;
1278 u64 ino = btrfs_ino(BTRFS_I(inode));
1280 path = btrfs_alloc_path();
1282 extent_clear_unlock_delalloc(inode, start, end, end,
1284 EXTENT_LOCKED | EXTENT_DELALLOC |
1285 EXTENT_DO_ACCOUNTING |
1286 EXTENT_DEFRAG, PAGE_UNLOCK |
1288 PAGE_SET_WRITEBACK |
1289 PAGE_END_WRITEBACK);
1293 nolock = btrfs_is_free_space_inode(BTRFS_I(inode));
1295 cow_start = (u64)-1;
1298 ret = btrfs_lookup_file_extent(NULL, root, path, ino,
1302 if (ret > 0 && path->slots[0] > 0 && check_prev) {
1303 leaf = path->nodes[0];
1304 btrfs_item_key_to_cpu(leaf, &found_key,
1305 path->slots[0] - 1);
1306 if (found_key.objectid == ino &&
1307 found_key.type == BTRFS_EXTENT_DATA_KEY)
1312 leaf = path->nodes[0];
1313 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1314 ret = btrfs_next_leaf(root, path);
1316 if (cow_start != (u64)-1)
1317 cur_offset = cow_start;
1322 leaf = path->nodes[0];
1328 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1330 if (found_key.objectid > ino)
1332 if (WARN_ON_ONCE(found_key.objectid < ino) ||
1333 found_key.type < BTRFS_EXTENT_DATA_KEY) {
1337 if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
1338 found_key.offset > end)
1341 if (found_key.offset > cur_offset) {
1342 extent_end = found_key.offset;
1347 fi = btrfs_item_ptr(leaf, path->slots[0],
1348 struct btrfs_file_extent_item);
1349 extent_type = btrfs_file_extent_type(leaf, fi);
1351 ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
1352 if (extent_type == BTRFS_FILE_EXTENT_REG ||
1353 extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1354 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1355 extent_offset = btrfs_file_extent_offset(leaf, fi);
1356 extent_end = found_key.offset +
1357 btrfs_file_extent_num_bytes(leaf, fi);
1359 btrfs_file_extent_disk_num_bytes(leaf, fi);
1360 if (extent_end <= start) {
1364 if (disk_bytenr == 0)
1366 if (btrfs_file_extent_compression(leaf, fi) ||
1367 btrfs_file_extent_encryption(leaf, fi) ||
1368 btrfs_file_extent_other_encoding(leaf, fi))
1371 * Do the same check as in btrfs_cross_ref_exist but
1372 * without the unnecessary search.
1374 if (btrfs_file_extent_generation(leaf, fi) <=
1375 btrfs_root_last_snapshot(&root->root_item))
1377 if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
1379 if (btrfs_extent_readonly(fs_info, disk_bytenr))
1381 ret = btrfs_cross_ref_exist(root, ino,
1383 extent_offset, disk_bytenr);
1386 * ret could be -EIO if the above fails to read metadata.
1390 if (cow_start != (u64)-1)
1391 cur_offset = cow_start;
1395 WARN_ON_ONCE(nolock);
1398 disk_bytenr += extent_offset;
1399 disk_bytenr += cur_offset - found_key.offset;
1400 num_bytes = min(end + 1, extent_end) - cur_offset;
1402 * if there are pending snapshots for this root,
1403 * we fall back to the common COW path.
1405 if (!nolock && atomic_read(&root->snapshot_force_cow))
1408 * force cow if a csum exists in the range.
1409 * this ensures that the csums for a given extent are
1410 * either valid or do not exist.
1412 ret = csum_exist_in_range(fs_info, disk_bytenr,
1416 * ret could be -EIO if the above fails to read metadata.
1420 if (cow_start != (u64)-1)
1421 cur_offset = cow_start;
1424 WARN_ON_ONCE(nolock);
1427 if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr))
1430 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1431 extent_end = found_key.offset +
1432 btrfs_file_extent_ram_bytes(leaf, fi);
1433 extent_end = ALIGN(extent_end,
1434 fs_info->sectorsize);
1439 if (extent_end <= start) {
1442 btrfs_dec_nocow_writers(fs_info, disk_bytenr);
1446 if (cow_start == (u64)-1)
1447 cow_start = cur_offset;
1448 cur_offset = extent_end;
1449 if (cur_offset > end)
1455 btrfs_release_path(path);
1456 if (cow_start != (u64)-1) {
1457 ret = cow_file_range(inode, locked_page,
1458 cow_start, found_key.offset - 1,
1459 end, page_started, nr_written, 1,
1463 btrfs_dec_nocow_writers(fs_info,
1467 cow_start = (u64)-1;
1470 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1471 u64 orig_start = found_key.offset - extent_offset;
1473 em = create_io_em(inode, cur_offset, num_bytes,
1475 disk_bytenr, /* block_start */
1476 num_bytes, /* block_len */
1477 disk_num_bytes, /* orig_block_len */
1478 ram_bytes, BTRFS_COMPRESS_NONE,
1479 BTRFS_ORDERED_PREALLOC);
1482 btrfs_dec_nocow_writers(fs_info,
1487 free_extent_map(em);
1490 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1491 type = BTRFS_ORDERED_PREALLOC;
1493 type = BTRFS_ORDERED_NOCOW;
1496 ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
1497 num_bytes, num_bytes, type);
1499 btrfs_dec_nocow_writers(fs_info, disk_bytenr);
1500 BUG_ON(ret); /* -ENOMEM */
1502 if (root->root_key.objectid ==
1503 BTRFS_DATA_RELOC_TREE_OBJECTID)
1505 * Error handled later, as we must prevent
1506 * extent_clear_unlock_delalloc() in error handler
1507 * from freeing metadata of created ordered extent.
1509 ret = btrfs_reloc_clone_csums(inode, cur_offset,
1512 extent_clear_unlock_delalloc(inode, cur_offset,
1513 cur_offset + num_bytes - 1, end,
1514 locked_page, EXTENT_LOCKED |
1516 EXTENT_CLEAR_DATA_RESV,
1517 PAGE_UNLOCK | PAGE_SET_PRIVATE2);
1519 cur_offset = extent_end;
1522 * btrfs_reloc_clone_csums() error, now we're OK to call error
1523 * handler, as metadata for created ordered extent will only
1524 * be freed by btrfs_finish_ordered_io().
1528 if (cur_offset > end)
1531 btrfs_release_path(path);
1533 if (cur_offset <= end && cow_start == (u64)-1) {
1534 cow_start = cur_offset;
1538 if (cow_start != (u64)-1) {
1539 ret = cow_file_range(inode, locked_page, cow_start, end, end,
1540 page_started, nr_written, 1, NULL);
1546 if (ret && cur_offset < end)
1547 extent_clear_unlock_delalloc(inode, cur_offset, end, end,
1548 locked_page, EXTENT_LOCKED |
1549 EXTENT_DELALLOC | EXTENT_DEFRAG |
1550 EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
1552 PAGE_SET_WRITEBACK |
1553 PAGE_END_WRITEBACK);
1554 btrfs_free_path(path);
1558 static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
1561 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
1562 !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC))
1566 * @defrag_bytes is a hint value with no spinlock held here;
1567 * if it is not zero, it means the file is being defragged.
1568 * Force cow if the given extent needs to be defragged.
1570 if (BTRFS_I(inode)->defrag_bytes &&
1571 test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
1572 EXTENT_DEFRAG, 0, NULL))
1579 * extent_io.c callback to do delayed allocation processing
1581 static int run_delalloc_range(void *private_data, struct page *locked_page,
1582 u64 start, u64 end, int *page_started,
1583 unsigned long *nr_written,
1584 struct writeback_control *wbc)
1586 struct inode *inode = private_data;
1588 int force_cow = need_force_cow(inode, start, end);
1589 unsigned int write_flags = wbc_to_write_flags(wbc);
1591 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) {
1592 ret = run_delalloc_nocow(inode, locked_page, start, end,
1593 page_started, 1, nr_written);
1594 } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
1595 ret = run_delalloc_nocow(inode, locked_page, start, end,
1596 page_started, 0, nr_written);
1597 } else if (!inode_need_compress(inode, start, end)) {
1598 ret = cow_file_range(inode, locked_page, start, end, end,
1599 page_started, nr_written, 1, NULL);
1601 set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1602 &BTRFS_I(inode)->runtime_flags);
1603 ret = cow_file_range_async(inode, locked_page, start, end,
1604 page_started, nr_written,
1608 btrfs_cleanup_ordered_extents(inode, start, end - start + 1);
1612 static void btrfs_split_extent_hook(void *private_data,
1613 struct extent_state *orig, u64 split)
1615 struct inode *inode = private_data;
1618 /* not delalloc, ignore it */
1619 if (!(orig->state & EXTENT_DELALLOC))
1622 size = orig->end - orig->start + 1;
1623 if (size > BTRFS_MAX_EXTENT_SIZE) {
1628 * See the explanation in btrfs_merge_extent_hook, the same
1629 * applies here, just in reverse.
1631 new_size = orig->end - split + 1;
1632 num_extents = count_max_extents(new_size);
1633 new_size = split - orig->start;
1634 num_extents += count_max_extents(new_size);
1635 if (count_max_extents(size) >= num_extents)
1639 spin_lock(&BTRFS_I(inode)->lock);
1640 btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
1641 spin_unlock(&BTRFS_I(inode)->lock);
1645 * extent_io.c merge_extent_hook, used to track merged delayed allocation
1646 * extents so we can keep track of new extents that are just merged onto old
1647 * extents, such as when we are doing sequential writes, so we can properly
1648 * account for the metadata space we'll need.
1650 static void btrfs_merge_extent_hook(void *private_data,
1651 struct extent_state *new,
1652 struct extent_state *other)
1654 struct inode *inode = private_data;
1655 u64 new_size, old_size;
1658 /* not delalloc, ignore it */
1659 if (!(other->state & EXTENT_DELALLOC))
1662 if (new->start > other->start)
1663 new_size = new->end - other->start + 1;
1665 new_size = other->end - new->start + 1;
1667 /* we're not bigger than the max, unreserve the space and go */
1668 if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
1669 spin_lock(&BTRFS_I(inode)->lock);
1670 btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
1671 spin_unlock(&BTRFS_I(inode)->lock);
1676 * We have to add up either side to figure out how many extents were
1677 * accounted for before we merged into one big extent. If the number of
1678 * extents we accounted for is <= the amount we need for the new range
1679 * then we can return, otherwise drop. Think of it like this
1683 * So we've grown the extent by a MAX_SIZE extent, this would mean we
1684 * need 2 outstanding extents, on one side we have 1 and the other side
1685 * we have 1 so they are == and we can return. But in this case
1687 * [MAX_SIZE+4k][MAX_SIZE+4k]
1689 * Each range on their own accounts for 2 extents, but merged together
1690 * they are only 3 extents worth of accounting, so we need to drop in this case.
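	 /*
	  * Worked example, assuming count_max_extents() divides the length by
	  * BTRFS_MAX_EXTENT_SIZE rounding up: each MAX_SIZE+4K range accounts
	  * for 2 extents, 4 in total, while the merged 2*MAX_SIZE+8K range
	  * only needs 3, so one outstanding extent is dropped below.
	  */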
1693 old_size = other->end - other->start + 1;
1694 num_extents = count_max_extents(old_size);
1695 old_size = new->end - new->start + 1;
1696 num_extents += count_max_extents(old_size);
1697 if (count_max_extents(new_size) >= num_extents)
1700 spin_lock(&BTRFS_I(inode)->lock);
1701 btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
1702 spin_unlock(&BTRFS_I(inode)->lock);
1705 static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
1706 struct inode *inode)
1708 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1710 spin_lock(&root->delalloc_lock);
1711 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1712 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1713 &root->delalloc_inodes);
1714 set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1715 &BTRFS_I(inode)->runtime_flags);
1716 root->nr_delalloc_inodes++;
1717 if (root->nr_delalloc_inodes == 1) {
1718 spin_lock(&fs_info->delalloc_root_lock);
1719 BUG_ON(!list_empty(&root->delalloc_root));
1720 list_add_tail(&root->delalloc_root,
1721 &fs_info->delalloc_roots);
1722 spin_unlock(&fs_info->delalloc_root_lock);
1725 spin_unlock(&root->delalloc_lock);
1729 void __btrfs_del_delalloc_inode(struct btrfs_root *root,
1730 struct btrfs_inode *inode)
1732 struct btrfs_fs_info *fs_info = root->fs_info;
1734 if (!list_empty(&inode->delalloc_inodes)) {
1735 list_del_init(&inode->delalloc_inodes);
1736 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1737 &inode->runtime_flags);
1738 root->nr_delalloc_inodes--;
1739 if (!root->nr_delalloc_inodes) {
1740 ASSERT(list_empty(&root->delalloc_inodes));
1741 spin_lock(&fs_info->delalloc_root_lock);
1742 BUG_ON(list_empty(&root->delalloc_root));
1743 list_del_init(&root->delalloc_root);
1744 spin_unlock(&fs_info->delalloc_root_lock);
1749 static void btrfs_del_delalloc_inode(struct btrfs_root *root,
1750 struct btrfs_inode *inode)
1752 spin_lock(&root->delalloc_lock);
1753 __btrfs_del_delalloc_inode(root, inode);
1754 spin_unlock(&root->delalloc_lock);
1758 * extent_io.c set_bit_hook, used to track delayed allocation
1759 * bytes in this file, and to maintain the list of inodes that
1760 * have pending delalloc work to be done.
1762 static void btrfs_set_bit_hook(void *private_data,
1763 struct extent_state *state, unsigned *bits)
1765 struct inode *inode = private_data;
1767 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1769 if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
1772 * set_bit and clear bit hooks normally require _irqsave/restore
1773 * but in this case, we are only testing for the DELALLOC
1774 * bit, which is only set or cleared with irqs on
1776 if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1777 struct btrfs_root *root = BTRFS_I(inode)->root;
1778 u64 len = state->end + 1 - state->start;
1779 u32 num_extents = count_max_extents(len);
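		/* one outstanding extent is reserved per BTRFS_MAX_EXTENT_SIZE worth of range */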
1780 bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode));
1782 spin_lock(&BTRFS_I(inode)->lock);
1783 btrfs_mod_outstanding_extents(BTRFS_I(inode), num_extents);
1784 spin_unlock(&BTRFS_I(inode)->lock);
1786 /* For sanity tests */
1787 if (btrfs_is_testing(fs_info))
1790 percpu_counter_add_batch(&fs_info->delalloc_bytes, len,
1791 fs_info->delalloc_batch);
1792 spin_lock(&BTRFS_I(inode)->lock);
1793 BTRFS_I(inode)->delalloc_bytes += len;
1794 if (*bits & EXTENT_DEFRAG)
1795 BTRFS_I(inode)->defrag_bytes += len;
1796 if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1797 &BTRFS_I(inode)->runtime_flags))
1798 btrfs_add_delalloc_inodes(root, inode);
1799 spin_unlock(&BTRFS_I(inode)->lock);
1802 if (!(state->state & EXTENT_DELALLOC_NEW) &&
1803 (*bits & EXTENT_DELALLOC_NEW)) {
1804 spin_lock(&BTRFS_I(inode)->lock);
1805 BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 -
1807 spin_unlock(&BTRFS_I(inode)->lock);
1812 * extent_io.c clear_bit_hook, see set_bit_hook for why
1814 static void btrfs_clear_bit_hook(void *private_data,
1815 struct extent_state *state,
1818 struct btrfs_inode *inode = BTRFS_I((struct inode *)private_data);
1819 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
1820 u64 len = state->end + 1 - state->start;
1821 u32 num_extents = count_max_extents(len);
1823 if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) {
1824 spin_lock(&inode->lock);
1825 inode->defrag_bytes -= len;
1826 spin_unlock(&inode->lock);
1830 * set_bit and clear bit hooks normally require _irqsave/restore
1831 * but in this case, we are only testing for the DELALLOC
1832 * bit, which is only set or cleared with irqs on
1834 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1835 struct btrfs_root *root = inode->root;
1836 bool do_list = !btrfs_is_free_space_inode(inode);
1838 spin_lock(&inode->lock);
1839 btrfs_mod_outstanding_extents(inode, -num_extents);
1840 spin_unlock(&inode->lock);
1843 * We don't reserve metadata space for space cache inodes so we
1844 * don't need to call delalloc_release_metadata if there is an error.
1847 if (*bits & EXTENT_CLEAR_META_RESV &&
1848 root != fs_info->tree_root)
1849 btrfs_delalloc_release_metadata(inode, len, false);
1851 /* For sanity tests. */
1852 if (btrfs_is_testing(fs_info))
1855 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID &&
1856 do_list && !(state->state & EXTENT_NORESERVE) &&
1857 (*bits & EXTENT_CLEAR_DATA_RESV))
1858 btrfs_free_reserved_data_space_noquota(
1862 percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
1863 fs_info->delalloc_batch);
1864 spin_lock(&inode->lock);
1865 inode->delalloc_bytes -= len;
1866 if (do_list && inode->delalloc_bytes == 0 &&
1867 test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1868 &inode->runtime_flags))
1869 btrfs_del_delalloc_inode(root, inode);
1870 spin_unlock(&inode->lock);
1873 if ((state->state & EXTENT_DELALLOC_NEW) &&
1874 (*bits & EXTENT_DELALLOC_NEW)) {
1875 spin_lock(&inode->lock);
1876 ASSERT(inode->new_delalloc_bytes >= len);
1877 inode->new_delalloc_bytes -= len;
1878 spin_unlock(&inode->lock);
1883 * Merge bio hook, this must check the chunk tree to make sure we don't create
1884 * bios that span stripes or chunks
1886 * return 1 if page cannot be merged to bio
1887 * return 0 if page can be merged to bio
1888 * return error otherwise
1890 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1891 size_t size, struct bio *bio,
1892 unsigned long bio_flags)
1894 struct inode *inode = page->mapping->host;
1895 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1896 u64 logical = (u64)bio->bi_iter.bi_sector << 9;
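	/* bi_sector is in 512-byte sectors; shift by 9 to get the byte offset */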
1901 if (bio_flags & EXTENT_BIO_COMPRESSED)
1904 length = bio->bi_iter.bi_size;
1905 map_length = length;
1906 ret = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
1910 if (map_length < length + size)
1916 * in order to insert checksums into the metadata in large chunks,
1917 * we wait until bio submission time. All the pages in the bio are
1918 * checksummed and sums are attached onto the ordered extent record.
1920 * At IO completion time the csums attached to the ordered extent record
1921 * are inserted into the btree
1923 static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio,
1926 struct inode *inode = private_data;
1927 blk_status_t ret = 0;
1929 ret = btrfs_csum_one_bio(inode, bio, 0, 0);
1930 BUG_ON(ret); /* -ENOMEM */
1935 * in order to insert checksums into the metadata in large chunks,
1936 * we wait until bio submission time. All the pages in the bio are
1937 * checksummed and sums are attached onto the ordered extent record.
1939 * At IO completion time the cums attached on the ordered extent record
1940 * are inserted into the btree
1942 blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio,
1945 struct inode *inode = private_data;
1946 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1949 ret = btrfs_map_bio(fs_info, bio, mirror_num, 1);
1951 bio->bi_status = ret;
1958 * extent_io.c submission hook. This does the right thing for csum calculation
1959 * on write, or reading the csums from the tree before a read.
1961 * Rules about async/sync submit,
1962 * a) read: sync submit
1964 * b) write without checksum: sync submit
1966 * c) write with checksum:
1967 * c-1) if bio is issued by fsync: sync submit
1968 * (sync_writers != 0)
1970 * c-2) if root is reloc root: sync submit
1971 * (only in case of buffered IO)
1973 * c-3) otherwise: async submit
1975 static blk_status_t btrfs_submit_bio_hook(void *private_data, struct bio *bio,
1976 int mirror_num, unsigned long bio_flags,
1979 struct inode *inode = private_data;
1980 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1981 struct btrfs_root *root = BTRFS_I(inode)->root;
1982 enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
1983 blk_status_t ret = 0;
1985 int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
1987 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
1989 if (btrfs_is_free_space_inode(BTRFS_I(inode)))
1990 metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
1992 if (bio_op(bio) != REQ_OP_WRITE) {
1993 ret = btrfs_bio_wq_end_io(fs_info, bio, metadata);
1997 if (bio_flags & EXTENT_BIO_COMPRESSED) {
1998 ret = btrfs_submit_compressed_read(inode, bio,
2002 } else if (!skip_sum) {
2003 ret = btrfs_lookup_bio_sums(inode, bio, NULL);
2008 } else if (async && !skip_sum) {
2009 /* csum items have already been cloned */
2010 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
2012 /* we're doing a write, do the async checksumming */
2013 ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, bio_flags,
2015 btrfs_submit_bio_start);
2017 } else if (!skip_sum) {
2018 ret = btrfs_csum_one_bio(inode, bio, 0, 0);
2024 ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
2028 bio->bi_status = ret;
2035 * given a list of ordered sums, record them in the inode. This happens
2036 * at IO completion time based on sums calculated at bio submission time.
2038 static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
2039 struct inode *inode, struct list_head *list)
2041 struct btrfs_ordered_sum *sum;
2044 list_for_each_entry(sum, list, list) {
2045 trans->adding_csums = true;
2046 ret = btrfs_csum_file_blocks(trans,
2047 BTRFS_I(inode)->root->fs_info->csum_root, sum);
2048 trans->adding_csums = false;
2055 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
2056 unsigned int extra_bits,
2057 struct extent_state **cached_state, int dedupe)
2059 WARN_ON((end & (PAGE_SIZE - 1)) == 0);
2060 return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
2061 extra_bits, cached_state);
2064 /* see btrfs_writepage_start_hook for details on why this is required */
2065 struct btrfs_writepage_fixup {
2067 struct btrfs_work work;
2070 static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
2072 struct btrfs_writepage_fixup *fixup;
2073 struct btrfs_ordered_extent *ordered;
2074 struct extent_state *cached_state = NULL;
2075 struct extent_changeset *data_reserved = NULL;
2077 struct inode *inode;
2082 fixup = container_of(work, struct btrfs_writepage_fixup, work);
2086 if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
2087 ClearPageChecked(page);
2091 inode = page->mapping->host;
2092 page_start = page_offset(page);
2093 page_end = page_offset(page) + PAGE_SIZE - 1;
2095 lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
2098 /* already ordered? We're done */
2099 if (PagePrivate2(page))
2102 ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
2105 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
2106 page_end, &cached_state);
2108 btrfs_start_ordered_extent(inode, ordered, 1);
2109 btrfs_put_ordered_extent(ordered);
2113 ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
2116 mapping_set_error(page->mapping, ret);
2117 end_extent_writepage(page, ret, page_start, page_end);
2118 ClearPageChecked(page);
2122 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
2125 mapping_set_error(page->mapping, ret);
2126 end_extent_writepage(page, ret, page_start, page_end);
2127 ClearPageChecked(page);
2131 ClearPageChecked(page);
2132 set_page_dirty(page);
2133 btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, false);
2135 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
2141 extent_changeset_free(data_reserved);
2145 * There are a few paths in the higher layers of the kernel that directly
2146 * set the page dirty bit without asking the filesystem if it is a
2147 * good idea. This causes problems because we want to make sure COW
2148 * properly happens and the data=ordered rules are followed.
2150 * In our case any range that doesn't have the ORDERED bit set
2151 * hasn't been properly setup for IO. We kick off an async process
2152 * to fix it up. The async helper will wait for ordered extents, set
2153 * the delalloc bit and make it safe to write the page.
2155 static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
2157 struct inode *inode = page->mapping->host;
2158 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2159 struct btrfs_writepage_fixup *fixup;
2161 /* this page is properly in the ordered list */
2162 if (TestClearPagePrivate2(page))
2165 if (PageChecked(page))
2168 fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
2172 SetPageChecked(page);
2174 btrfs_init_work(&fixup->work, btrfs_fixup_helper,
2175 btrfs_writepage_fixup_worker, NULL, NULL);
2177 btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
2181 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
2182 struct inode *inode, u64 file_pos,
2183 u64 disk_bytenr, u64 disk_num_bytes,
2184 u64 num_bytes, u64 ram_bytes,
2185 u8 compression, u8 encryption,
2186 u16 other_encoding, int extent_type)
2188 struct btrfs_root *root = BTRFS_I(inode)->root;
2189 struct btrfs_file_extent_item *fi;
2190 struct btrfs_path *path;
2191 struct extent_buffer *leaf;
2192 struct btrfs_key ins;
2194 int extent_inserted = 0;
2197 path = btrfs_alloc_path();
2202 * we may be replacing one extent in the tree with another.
2203 * The new extent is pinned in the extent map, and we don't want
2204 * to drop it from the cache until it is completely in the btree.
2206 * So, tell btrfs_drop_extents to leave this extent in the cache.
2207 * The caller is expected to unpin it and allow it to be merged with the rest of the extent map tree.
2210 ret = __btrfs_drop_extents(trans, root, inode, path, file_pos,
2211 file_pos + num_bytes, NULL, 0,
2212 1, sizeof(*fi), &extent_inserted);
2216 if (!extent_inserted) {
2217 ins.objectid = btrfs_ino(BTRFS_I(inode));
2218 ins.offset = file_pos;
2219 ins.type = BTRFS_EXTENT_DATA_KEY;
2221 path->leave_spinning = 1;
2222 ret = btrfs_insert_empty_item(trans, root, path, &ins,
2227 leaf = path->nodes[0];
2228 fi = btrfs_item_ptr(leaf, path->slots[0],
2229 struct btrfs_file_extent_item);
2230 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
2231 btrfs_set_file_extent_type(leaf, fi, extent_type);
2232 btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
2233 btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
2234 btrfs_set_file_extent_offset(leaf, fi, 0);
2235 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2236 btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
2237 btrfs_set_file_extent_compression(leaf, fi, compression);
2238 btrfs_set_file_extent_encryption(leaf, fi, encryption);
2239 btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
2241 btrfs_mark_buffer_dirty(leaf);
2242 btrfs_release_path(path);
2244 inode_add_bytes(inode, num_bytes);
2246 ins.objectid = disk_bytenr;
2247 ins.offset = disk_num_bytes;
2248 ins.type = BTRFS_EXTENT_ITEM_KEY;
2251 * Release the reserved range from the inode's dirty range map, as it
2252 * has already been moved into the delayed_ref_head.
2254 ret = btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
2258 ret = btrfs_alloc_reserved_file_extent(trans, root,
2259 btrfs_ino(BTRFS_I(inode)),
2260 file_pos, qg_released, &ins);
2262 btrfs_free_path(path);
2267 /* snapshot-aware defrag */
2268 struct sa_defrag_extent_backref {
2269 struct rb_node node;
2270 struct old_sa_defrag_extent *old;
2279 struct old_sa_defrag_extent {
2280 struct list_head list;
2281 struct new_sa_defrag_extent *new;
2290 struct new_sa_defrag_extent {
2291 struct rb_root root;
2292 struct list_head head;
2293 struct btrfs_path *path;
2294 struct inode *inode;
2302 static int backref_comp(struct sa_defrag_extent_backref *b1,
2303 struct sa_defrag_extent_backref *b2)
2305 if (b1->root_id < b2->root_id)
2307 else if (b1->root_id > b2->root_id)
2310 if (b1->inum < b2->inum)
2312 else if (b1->inum > b2->inum)
2315 if (b1->file_pos < b2->file_pos)
2317 else if (b1->file_pos > b2->file_pos)
2321 * [------------------------------] ===> (a range of space)
2322 * |<--->| |<---->| =============> (fs/file tree A)
2323 * |<---------------------------->| ===> (fs/file tree B)
2325 * A range of space can refer to two file extents in one tree while
2326 * referring to only one file extent in another tree.
2328 * So we may process a disk offset more than once (two extents in A)
2329 * and land on the same extent (one extent in B), then insert two identical
2330 * backrefs (both referring to the extent in B).
2335 static void backref_insert(struct rb_root *root,
2336 struct sa_defrag_extent_backref *backref)
2338 struct rb_node **p = &root->rb_node;
2339 struct rb_node *parent = NULL;
2340 struct sa_defrag_extent_backref *entry;
2345 entry = rb_entry(parent, struct sa_defrag_extent_backref, node);
2347 ret = backref_comp(backref, entry);
2351 p = &(*p)->rb_right;
2354 rb_link_node(&backref->node, parent, p);
2355 rb_insert_color(&backref->node, root);
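/*
 * backref_comp() orders backrefs by (root_id, inum, file_pos) and equal
 * keys fall through to the right-hand branch, so the duplicates described
 * above simply end up next to each other in the tree; relinking tolerates
 * them because a backref that no longer matches the on-disk extent is
 * skipped (see the "backref might have changed" note further down).
 */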
2359 * Note the backref might have changed; in that case we just return 0.
2361 static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
2364 struct btrfs_file_extent_item *extent;
2365 struct old_sa_defrag_extent *old = ctx;
2366 struct new_sa_defrag_extent *new = old->new;
2367 struct btrfs_path *path = new->path;
2368 struct btrfs_key key;
2369 struct btrfs_root *root;
2370 struct sa_defrag_extent_backref *backref;
2371 struct extent_buffer *leaf;
2372 struct inode *inode = new->inode;
2373 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2379 if (BTRFS_I(inode)->root->root_key.objectid == root_id &&
2380 inum == btrfs_ino(BTRFS_I(inode)))
2383 key.objectid = root_id;
2384 key.type = BTRFS_ROOT_ITEM_KEY;
2385 key.offset = (u64)-1;
2387 root = btrfs_read_fs_root_no_name(fs_info, &key);
2389 if (PTR_ERR(root) == -ENOENT)
2392 btrfs_debug(fs_info, "inum=%llu, offset=%llu, root_id=%llu",
2393 inum, offset, root_id);
2394 return PTR_ERR(root);
2397 key.objectid = inum;
2398 key.type = BTRFS_EXTENT_DATA_KEY;
2399 if (offset > (u64)-1 << 32)
2402 key.offset = offset;
2404 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2405 if (WARN_ON(ret < 0))
2412 leaf = path->nodes[0];
2413 slot = path->slots[0];
2415 if (slot >= btrfs_header_nritems(leaf)) {
2416 ret = btrfs_next_leaf(root, path);
2419 } else if (ret > 0) {
2428 btrfs_item_key_to_cpu(leaf, &key, slot);
2430 if (key.objectid > inum)
2433 if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY)
2436 extent = btrfs_item_ptr(leaf, slot,
2437 struct btrfs_file_extent_item);
2439 if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr)
2443 * 'offset' refers to the exact key.offset,
2444 * NOT the 'offset' field in btrfs_extent_data_ref, i.e.
2445 * (key.offset - extent_offset).
2447 if (key.offset != offset)
2450 extent_offset = btrfs_file_extent_offset(leaf, extent);
2451 num_bytes = btrfs_file_extent_num_bytes(leaf, extent);
2453 if (extent_offset >= old->extent_offset + old->offset +
2454 old->len || extent_offset + num_bytes <=
2455 old->extent_offset + old->offset)
2460 backref = kmalloc(sizeof(*backref), GFP_NOFS);
2466 backref->root_id = root_id;
2467 backref->inum = inum;
2468 backref->file_pos = offset;
2469 backref->num_bytes = num_bytes;
2470 backref->extent_offset = extent_offset;
2471 backref->generation = btrfs_file_extent_generation(leaf, extent);
2473 backref_insert(&new->root, backref);
2476 btrfs_release_path(path);
2481 static noinline bool record_extent_backrefs(struct btrfs_path *path,
2482 struct new_sa_defrag_extent *new)
2484 struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
2485 struct old_sa_defrag_extent *old, *tmp;
2490 list_for_each_entry_safe(old, tmp, &new->head, list) {
2491 ret = iterate_inodes_from_logical(old->bytenr +
2492 old->extent_offset, fs_info,
2493 path, record_one_backref,
2495 if (ret < 0 && ret != -ENOENT)
2498 /* no backref to be processed for this extent */
2500 list_del(&old->list);
2505 if (list_empty(&new->head))
2511 static int relink_is_mergable(struct extent_buffer *leaf,
2512 struct btrfs_file_extent_item *fi,
2513 struct new_sa_defrag_extent *new)
2515 if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr)
2518 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2521 if (btrfs_file_extent_compression(leaf, fi) != new->compress_type)
2524 if (btrfs_file_extent_encryption(leaf, fi) ||
2525 btrfs_file_extent_other_encoding(leaf, fi))
2532 * Note the backref might have changed; in that case we just return 0.
2534 static noinline int relink_extent_backref(struct btrfs_path *path,
2535 struct sa_defrag_extent_backref *prev,
2536 struct sa_defrag_extent_backref *backref)
2538 struct btrfs_file_extent_item *extent;
2539 struct btrfs_file_extent_item *item;
2540 struct btrfs_ordered_extent *ordered;
2541 struct btrfs_trans_handle *trans;
2542 struct btrfs_root *root;
2543 struct btrfs_key key;
2544 struct extent_buffer *leaf;
2545 struct old_sa_defrag_extent *old = backref->old;
2546 struct new_sa_defrag_extent *new = old->new;
2547 struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
2548 struct inode *inode;
2549 struct extent_state *cached = NULL;
2558 if (prev && prev->root_id == backref->root_id &&
2559 prev->inum == backref->inum &&
2560 prev->file_pos + prev->num_bytes == backref->file_pos)
2563 /* step 1: get root */
2564 key.objectid = backref->root_id;
2565 key.type = BTRFS_ROOT_ITEM_KEY;
2566 key.offset = (u64)-1;
2568 index = srcu_read_lock(&fs_info->subvol_srcu);
2570 root = btrfs_read_fs_root_no_name(fs_info, &key);
2572 srcu_read_unlock(&fs_info->subvol_srcu, index);
2573 if (PTR_ERR(root) == -ENOENT)
2575 return PTR_ERR(root);
2578 if (btrfs_root_readonly(root)) {
2579 srcu_read_unlock(&fs_info->subvol_srcu, index);
2583 /* step 2: get inode */
2584 key.objectid = backref->inum;
2585 key.type = BTRFS_INODE_ITEM_KEY;
2588 inode = btrfs_iget(fs_info->sb, &key, root, NULL);
2589 if (IS_ERR(inode)) {
2590 srcu_read_unlock(&fs_info->subvol_srcu, index);
2594 srcu_read_unlock(&fs_info->subvol_srcu, index);
2596 /* step 3: relink backref */
2597 lock_start = backref->file_pos;
2598 lock_end = backref->file_pos + backref->num_bytes - 1;
2599 lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2602 ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
2604 btrfs_put_ordered_extent(ordered);
2608 trans = btrfs_join_transaction(root);
2609 if (IS_ERR(trans)) {
2610 ret = PTR_ERR(trans);
2614 key.objectid = backref->inum;
2615 key.type = BTRFS_EXTENT_DATA_KEY;
2616 key.offset = backref->file_pos;
2618 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2621 } else if (ret > 0) {
2626 extent = btrfs_item_ptr(path->nodes[0], path->slots[0],
2627 struct btrfs_file_extent_item);
2629 if (btrfs_file_extent_generation(path->nodes[0], extent) !=
2630 backref->generation)
2633 btrfs_release_path(path);
2635 start = backref->file_pos;
2636 if (backref->extent_offset < old->extent_offset + old->offset)
2637 start += old->extent_offset + old->offset -
2638 backref->extent_offset;
2640 len = min(backref->extent_offset + backref->num_bytes,
2641 old->extent_offset + old->offset + old->len);
2642 len -= max(backref->extent_offset, old->extent_offset + old->offset);
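/*
 * Clamp the range to relink to the overlap between this backref's file
 * extent and the defragged part of the old extent: 'start' is the file
 * offset in the backref's inode, 'len' the number of overlapping bytes.
 */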
2644 ret = btrfs_drop_extents(trans, root, inode, start,
2649 key.objectid = btrfs_ino(BTRFS_I(inode));
2650 key.type = BTRFS_EXTENT_DATA_KEY;
2653 path->leave_spinning = 1;
2655 struct btrfs_file_extent_item *fi;
2657 struct btrfs_key found_key;
2659 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2664 leaf = path->nodes[0];
2665 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2667 fi = btrfs_item_ptr(leaf, path->slots[0],
2668 struct btrfs_file_extent_item);
2669 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
2671 if (extent_len + found_key.offset == start &&
2672 relink_is_mergable(leaf, fi, new)) {
2673 btrfs_set_file_extent_num_bytes(leaf, fi,
2675 btrfs_mark_buffer_dirty(leaf);
2676 inode_add_bytes(inode, len);
2682 btrfs_release_path(path);
2687 ret = btrfs_insert_empty_item(trans, root, path, &key,
2690 btrfs_abort_transaction(trans, ret);
2694 leaf = path->nodes[0];
2695 item = btrfs_item_ptr(leaf, path->slots[0],
2696 struct btrfs_file_extent_item);
2697 btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr);
2698 btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len);
2699 btrfs_set_file_extent_offset(leaf, item, start - new->file_pos);
2700 btrfs_set_file_extent_num_bytes(leaf, item, len);
2701 btrfs_set_file_extent_ram_bytes(leaf, item, new->len);
2702 btrfs_set_file_extent_generation(leaf, item, trans->transid);
2703 btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
2704 btrfs_set_file_extent_compression(leaf, item, new->compress_type);
2705 btrfs_set_file_extent_encryption(leaf, item, 0);
2706 btrfs_set_file_extent_other_encoding(leaf, item, 0);
2708 btrfs_mark_buffer_dirty(leaf);
2709 inode_add_bytes(inode, len);
2710 btrfs_release_path(path);
2712 ret = btrfs_inc_extent_ref(trans, root, new->bytenr,
2714 backref->root_id, backref->inum,
2715 new->file_pos); /* start - extent_offset */
2717 btrfs_abort_transaction(trans, ret);
2723 btrfs_release_path(path);
2724 path->leave_spinning = 0;
2725 btrfs_end_transaction(trans);
2727 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2733 static void free_sa_defrag_extent(struct new_sa_defrag_extent *new)
2735 struct old_sa_defrag_extent *old, *tmp;
2740 list_for_each_entry_safe(old, tmp, &new->head, list) {
2746 static void relink_file_extents(struct new_sa_defrag_extent *new)
2748 struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
2749 struct btrfs_path *path;
2750 struct sa_defrag_extent_backref *backref;
2751 struct sa_defrag_extent_backref *prev = NULL;
2752 struct rb_node *node;
2755 path = btrfs_alloc_path();
2759 if (!record_extent_backrefs(path, new)) {
2760 btrfs_free_path(path);
2763 btrfs_release_path(path);
2766 node = rb_first(&new->root);
2769 rb_erase(node, &new->root);
2771 backref = rb_entry(node, struct sa_defrag_extent_backref, node);
2773 ret = relink_extent_backref(path, prev, backref);
2786 btrfs_free_path(path);
2788 free_sa_defrag_extent(new);
2790 atomic_dec(&fs_info->defrag_running);
2791 wake_up(&fs_info->transaction_wait);
2794 static struct new_sa_defrag_extent *
2795 record_old_file_extents(struct inode *inode,
2796 struct btrfs_ordered_extent *ordered)
2798 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2799 struct btrfs_root *root = BTRFS_I(inode)->root;
2800 struct btrfs_path *path;
2801 struct btrfs_key key;
2802 struct old_sa_defrag_extent *old;
2803 struct new_sa_defrag_extent *new;
2806 new = kmalloc(sizeof(*new), GFP_NOFS);
2811 new->file_pos = ordered->file_offset;
2812 new->len = ordered->len;
2813 new->bytenr = ordered->start;
2814 new->disk_len = ordered->disk_len;
2815 new->compress_type = ordered->compress_type;
2816 new->root = RB_ROOT;
2817 INIT_LIST_HEAD(&new->head);
2819 path = btrfs_alloc_path();
2823 key.objectid = btrfs_ino(BTRFS_I(inode));
2824 key.type = BTRFS_EXTENT_DATA_KEY;
2825 key.offset = new->file_pos;
2827 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2830 if (ret > 0 && path->slots[0] > 0)
2833 /* find out all the old extents for the file range */
2835 struct btrfs_file_extent_item *extent;
2836 struct extent_buffer *l;
2845 slot = path->slots[0];
2847 if (slot >= btrfs_header_nritems(l)) {
2848 ret = btrfs_next_leaf(root, path);
2856 btrfs_item_key_to_cpu(l, &key, slot);
2858 if (key.objectid != btrfs_ino(BTRFS_I(inode)))
2860 if (key.type != BTRFS_EXTENT_DATA_KEY)
2862 if (key.offset >= new->file_pos + new->len)
2865 extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item);
2867 num_bytes = btrfs_file_extent_num_bytes(l, extent);
2868 if (key.offset + num_bytes < new->file_pos)
2871 disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent);
2875 extent_offset = btrfs_file_extent_offset(l, extent);
2877 old = kmalloc(sizeof(*old), GFP_NOFS);
2881 offset = max(new->file_pos, key.offset);
2882 end = min(new->file_pos + new->len, key.offset + num_bytes);
2884 old->bytenr = disk_bytenr;
2885 old->extent_offset = extent_offset;
2886 old->offset = offset - key.offset;
2887 old->len = end - offset;
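/*
 * old->offset and old->len describe the slice of this pre-existing
 * extent that overlaps the freshly written range; old->extent_offset
 * remembers where the extent item pointed inside the on-disk extent.
 */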
2890 list_add_tail(&old->list, &new->head);
2896 btrfs_free_path(path);
2897 atomic_inc(&fs_info->defrag_running);
2902 btrfs_free_path(path);
2904 free_sa_defrag_extent(new);
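/*
 * Once the ordered extent for a delalloc (COW) write has completed,
 * subtract its length from the owning block group's delalloc_bytes
 * counter, which was presumably bumped when the extent was reserved
 * for writeback.
 */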
2908 static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
2911 struct btrfs_block_group_cache *cache;
2913 cache = btrfs_lookup_block_group(fs_info, start);
2916 spin_lock(&cache->lock);
2917 cache->delalloc_bytes -= len;
2918 spin_unlock(&cache->lock);
2920 btrfs_put_block_group(cache);
2923 /* as ordered data IO finishes, this gets called so we can finish
2924 * an ordered extent if the range of bytes in the file it covers is
 * fully written.
2927 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
2929 struct inode *inode = ordered_extent->inode;
2930 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2931 struct btrfs_root *root = BTRFS_I(inode)->root;
2932 struct btrfs_trans_handle *trans = NULL;
2933 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2934 struct extent_state *cached_state = NULL;
2935 struct new_sa_defrag_extent *new = NULL;
2936 int compress_type = 0;
2938 u64 logical_len = ordered_extent->len;
2940 bool truncated = false;
2941 bool range_locked = false;
2942 bool clear_new_delalloc_bytes = false;
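/*
 * Only ordinary buffered COW writes need the EXTENT_DELALLOC_NEW bit
 * cleared when the ordered extent finishes (presumably because only
 * those went through the new-delalloc accounting); NOCOW, PREALLOC and
 * DIRECT ordered extents are left alone.  The bit is cleared by the
 * clear_extent_bit() call near the end of this function.
 */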
2944 if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
2945 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
2946 !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags))
2947 clear_new_delalloc_bytes = true;
2949 nolock = btrfs_is_free_space_inode(BTRFS_I(inode));
2951 if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
2956 btrfs_free_io_failure_record(BTRFS_I(inode),
2957 ordered_extent->file_offset,
2958 ordered_extent->file_offset +
2959 ordered_extent->len - 1);
2961 if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
2963 logical_len = ordered_extent->truncated_len;
2964 /* Truncated the entire extent, don't bother adding */
2969 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
2970 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
2973 * For the mwrite (mmap + memset to write) case, we still reserve
2974 * space for NOCOW range.
2975 * As NOCOW won't cause a new delayed ref, just free the space
2977 btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset,
2978 ordered_extent->len);
2979 btrfs_ordered_update_i_size(inode, 0, ordered_extent);
2981 trans = btrfs_join_transaction_nolock(root);
2983 trans = btrfs_join_transaction(root);
2984 if (IS_ERR(trans)) {
2985 ret = PTR_ERR(trans);
2989 trans->block_rsv = &BTRFS_I(inode)->block_rsv;
2990 ret = btrfs_update_inode_fallback(trans, root, inode);
2991 if (ret) /* -ENOMEM or corruption */
2992 btrfs_abort_transaction(trans, ret);
2996 range_locked = true;
2997 lock_extent_bits(io_tree, ordered_extent->file_offset,
2998 ordered_extent->file_offset + ordered_extent->len - 1,
3001 ret = test_range_bit(io_tree, ordered_extent->file_offset,
3002 ordered_extent->file_offset + ordered_extent->len - 1,
3003 EXTENT_DEFRAG, 0, cached_state);
3005 u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
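/*
 * Snapshot-aware defrag appears to be intentionally disabled here
 * (note the "0 &&" below): record_old_file_extents() is never called,
 * 'new' stays NULL and the relink machinery above goes unused.
 */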
3006 if (0 && last_snapshot >= BTRFS_I(inode)->generation)
3007 /* the inode is shared */
3008 new = record_old_file_extents(inode, ordered_extent);
3010 clear_extent_bit(io_tree, ordered_extent->file_offset,
3011 ordered_extent->file_offset + ordered_extent->len - 1,
3012 EXTENT_DEFRAG, 0, 0, &cached_state);
3016 trans = btrfs_join_transaction_nolock(root);
3018 trans = btrfs_join_transaction(root);
3019 if (IS_ERR(trans)) {
3020 ret = PTR_ERR(trans);
3025 trans->block_rsv = &BTRFS_I(inode)->block_rsv;
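/*
 * Two ways to record the finished write: IO into a preallocated extent
 * just flips the existing extent item to "written" via
 * btrfs_mark_extent_written(), while plain or compressed COW writes get
 * a brand new file extent item via insert_reserved_file_extent().
 */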
3027 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
3028 compress_type = ordered_extent->compress_type;
3029 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
3030 BUG_ON(compress_type);
3031 btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset,
3032 ordered_extent->len);
3033 ret = btrfs_mark_extent_written(trans, BTRFS_I(inode),
3034 ordered_extent->file_offset,
3035 ordered_extent->file_offset +
3038 BUG_ON(root == fs_info->tree_root);
3039 ret = insert_reserved_file_extent(trans, inode,
3040 ordered_extent->file_offset,
3041 ordered_extent->start,
3042 ordered_extent->disk_len,
3043 logical_len, logical_len,
3044 compress_type, 0, 0,
3045 BTRFS_FILE_EXTENT_REG);
3047 btrfs_release_delalloc_bytes(fs_info,
3048 ordered_extent->start,
3049 ordered_extent->disk_len);
3051 unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
3052 ordered_extent->file_offset, ordered_extent->len,
3055 btrfs_abort_transaction(trans, ret);
3059 ret = add_pending_csums(trans, inode, &ordered_extent->list);
3061 btrfs_abort_transaction(trans, ret);
3065 btrfs_ordered_update_i_size(inode, 0, ordered_extent);
3066 ret = btrfs_update_inode_fallback(trans, root, inode);
3067 if (ret) { /* -ENOMEM or corruption */
3068 btrfs_abort_transaction(trans, ret);
3073 if (range_locked || clear_new_delalloc_bytes) {
3074 unsigned int clear_bits = 0;
3077 clear_bits |= EXTENT_LOCKED;
3078 if (clear_new_delalloc_bytes)
3079 clear_bits |= EXTENT_DELALLOC_NEW;
3080 clear_extent_bit(&BTRFS_I(inode)->io_tree,
3081 ordered_extent->file_offset,
3082 ordered_extent->file_offset +
3083 ordered_extent->len - 1,
3085 (clear_bits & EXTENT_LOCKED) ? 1 : 0,
3090 btrfs_end_transaction(trans);
3092 if (ret || truncated) {
3096 start = ordered_extent->file_offset + logical_len;
3098 start = ordered_extent->file_offset;
3099 end = ordered_extent->file_offset + ordered_extent->len - 1;
3100 clear_extent_uptodate(io_tree, start, end, NULL);
3102 /* Drop the cache for the part of the extent we didn't write. */
3103 btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0);
3106 * If the ordered extent had an IOERR or something else went
3107 * wrong, we need to return the space for this ordered extent
3108 * back to the allocator. We only free the extent in the
3109 * truncated case if we didn't write out the extent at all.
3111 if ((ret || !logical_len) &&
3112 !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
3113 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
3114 btrfs_free_reserved_extent(fs_info,
3115 ordered_extent->start,
3116 ordered_extent->disk_len, 1);
3121 * This needs to be done to make sure anybody waiting knows we are done
3122 * updating everything for this ordered extent.
3124 btrfs_remove_ordered_extent(inode, ordered_extent);
3126 /* for snapshot-aware defrag */
3129 free_sa_defrag_extent(new);
3130 atomic_dec(&fs_info->defrag_running);
3132 relink_file_extents(new);
3137 btrfs_put_ordered_extent(ordered_extent);
3138 /* once for the tree */
3139 btrfs_put_ordered_extent(ordered_extent);
3141 /* Try to release some metadata so we don't get an OOM but don't wait */
3142 btrfs_btree_balance_dirty_nodelay(fs_info);
3147 static void finish_ordered_fn(struct btrfs_work *work)
3149 struct btrfs_ordered_extent *ordered_extent;
3150 ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
3151 btrfs_finish_ordered_io(ordered_extent);
3154 static void btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
3155 struct extent_state *state, int uptodate)
3157 struct inode *inode = page->mapping->host;
3158 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3159 struct btrfs_ordered_extent *ordered_extent = NULL;
3160 struct btrfs_workqueue *wq;
3161 btrfs_work_func_t func;
3163 trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
3165 ClearPagePrivate2(page);
3166 if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
3167 end - start + 1, uptodate))
3170 if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
3171 wq = fs_info->endio_freespace_worker;
3172 func = btrfs_freespace_write_helper;
3174 wq = fs_info->endio_write_workers;
3175 func = btrfs_endio_write_helper;
3178 btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL,
3180 btrfs_queue_work(wq, &ordered_extent->work);
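/*
 * Verify one chunk of read data against the csum stashed in
 * io_bio->csum at submit time: map the page, checksum the range,
 * compare, and on a mismatch report the error, overwrite the bad range
 * (presumably so stale contents are never exposed) and return non-zero
 * so the read path can try another mirror.
 */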
3183 static int __readpage_endio_check(struct inode *inode,
3184 struct btrfs_io_bio *io_bio,
3185 int icsum, struct page *page,
3186 int pgoff, u64 start, size_t len)
3192 csum_expected = *(((u32 *)io_bio->csum) + icsum);
3194 kaddr = kmap_atomic(page);
3195 csum = btrfs_csum_data(kaddr + pgoff, csum, len);
3196 btrfs_csum_final(csum, (u8 *)&csum);
3197 if (csum != csum_expected)
3200 kunmap_atomic(kaddr);
3203 btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected,
3204 io_bio->mirror_num);
3205 memset(kaddr + pgoff, 1, len);
3206 flush_dcache_page(page);
3207 kunmap_atomic(kaddr);
3212 * When reads are done, we need to check csums to verify the data is correct.
3213 * If there's a match, we allow the bio to finish. If not, the code in
3214 * extent_io.c will try to find good copies for us.
3216 static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
3217 u64 phy_offset, struct page *page,
3218 u64 start, u64 end, int mirror)
3220 size_t offset = start - page_offset(page);
3221 struct inode *inode = page->mapping->host;
3222 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3223 struct btrfs_root *root = BTRFS_I(inode)->root;
3225 if (PageChecked(page)) {
3226 ClearPageChecked(page);