1 // SPDX-License-Identifier: GPL-2.0
3 * Copyright (C) 2007 Oracle. All rights reserved.
6 #include <linux/kernel.h>
8 #include <linux/buffer_head.h>
9 #include <linux/file.h>
11 #include <linux/pagemap.h>
12 #include <linux/highmem.h>
13 #include <linux/time.h>
14 #include <linux/init.h>
15 #include <linux/string.h>
16 #include <linux/backing-dev.h>
17 #include <linux/mpage.h>
18 #include <linux/swap.h>
19 #include <linux/writeback.h>
20 #include <linux/compat.h>
21 #include <linux/bit_spinlock.h>
22 #include <linux/xattr.h>
23 #include <linux/posix_acl.h>
24 #include <linux/falloc.h>
25 #include <linux/slab.h>
26 #include <linux/ratelimit.h>
27 #include <linux/mount.h>
28 #include <linux/btrfs.h>
29 #include <linux/blkdev.h>
30 #include <linux/posix_acl_xattr.h>
31 #include <linux/uio.h>
32 #include <linux/magic.h>
33 #include <linux/iversion.h>
36 #include "transaction.h"
37 #include "btrfs_inode.h"
38 #include "print-tree.h"
39 #include "ordered-data.h"
43 #include "compression.h"
45 #include "free-space-cache.h"
46 #include "inode-map.h"
52 struct btrfs_iget_args {
53 struct btrfs_key *location;
54 struct btrfs_root *root;
57 struct btrfs_dio_data {
59 u64 unsubmitted_oe_range_start;
60 u64 unsubmitted_oe_range_end;
64 static const struct inode_operations btrfs_dir_inode_operations;
65 static const struct inode_operations btrfs_symlink_inode_operations;
66 static const struct inode_operations btrfs_dir_ro_inode_operations;
67 static const struct inode_operations btrfs_special_inode_operations;
68 static const struct inode_operations btrfs_file_inode_operations;
69 static const struct address_space_operations btrfs_aops;
70 static const struct address_space_operations btrfs_symlink_aops;
71 static const struct file_operations btrfs_dir_file_operations;
72 static const struct extent_io_ops btrfs_extent_io_ops;
74 static struct kmem_cache *btrfs_inode_cachep;
75 struct kmem_cache *btrfs_trans_handle_cachep;
76 struct kmem_cache *btrfs_path_cachep;
77 struct kmem_cache *btrfs_free_space_cachep;
80 static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
81 [S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE,
82 [S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR,
83 [S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV,
84 [S_IFBLK >> S_SHIFT] = BTRFS_FT_BLKDEV,
85 [S_IFIFO >> S_SHIFT] = BTRFS_FT_FIFO,
86 [S_IFSOCK >> S_SHIFT] = BTRFS_FT_SOCK,
87 [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK,
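/*
 * Illustrative note (not part of the original source): the table above is
 * indexed by the file type bits of i_mode shifted down by S_SHIFT, so a
 * hypothetical lookup would look like:
 *
 *	u8 ftype = btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
 *
 * e.g. a regular file (S_IFREG) maps to BTRFS_FT_REG_FILE and a symlink
 * (S_IFLNK) maps to BTRFS_FT_SYMLINK.
 */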
90 static int btrfs_setsize(struct inode *inode, struct iattr *attr);
91 static int btrfs_truncate(struct inode *inode, bool skip_writeback);
92 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
93 static noinline int cow_file_range(struct inode *inode,
94 struct page *locked_page,
95 u64 start, u64 end, u64 delalloc_end,
96 int *page_started, unsigned long *nr_written,
97 int unlock, struct btrfs_dedupe_hash *hash);
98 static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
99 u64 orig_start, u64 block_start,
100 u64 block_len, u64 orig_block_len,
101 u64 ram_bytes, int compress_type,
104 static void __endio_write_update_ordered(struct inode *inode,
105 const u64 offset, const u64 bytes,
106 const bool uptodate);
109 * Cleanup all submitted ordered extents in the specified range to handle
110 * errors from the fill_delalloc() callback.
112 * NOTE: the caller must ensure that when an error happens, it does not call
113 * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
114 * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
115 * to be released, which we want to happen only when finishing the ordered
116 * extent (btrfs_finish_ordered_io()). Also note that the caller of the
117 * fill_delalloc() callback already does proper cleanup for the first page of
118 * the range, that is, it invokes the callback writepage_end_io_hook() for the
119 * range of the first page.
121 static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
125 unsigned long index = offset >> PAGE_SHIFT;
126 unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
129 while (index <= end_index) {
130 page = find_get_page(inode->i_mapping, index);
134 ClearPagePrivate2(page);
137 return __endio_write_update_ordered(inode, offset + PAGE_SIZE,
138 bytes - PAGE_SIZE, false);
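/*
 * Worked example (illustrative, not from the original source): for a failed
 * 16K delalloc range at offset 0 with 4K pages, the caller's
 * writepage_end_io_hook() has already cleaned up page 0, so the call above
 * only updates the ordered extent for offset + PAGE_SIZE (4K) through
 * offset + bytes - 1, i.e. the remaining bytes - PAGE_SIZE == 12K.
 */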
141 static int btrfs_dirty_inode(struct inode *inode);
143 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
144 void btrfs_test_inode_set_ops(struct inode *inode)
146 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
150 static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
151 struct inode *inode, struct inode *dir,
152 const struct qstr *qstr)
156 err = btrfs_init_acl(trans, inode, dir);
158 err = btrfs_xattr_security_init(trans, inode, dir, qstr);
163 * this does all the hard work for inserting an inline extent into
164 * the btree. The caller should have done a btrfs_drop_extents so that
165 * no overlapping inline items exist in the btree
167 static int insert_inline_extent(struct btrfs_trans_handle *trans,
168 struct btrfs_path *path, int extent_inserted,
169 struct btrfs_root *root, struct inode *inode,
170 u64 start, size_t size, size_t compressed_size,
172 struct page **compressed_pages)
174 struct extent_buffer *leaf;
175 struct page *page = NULL;
178 struct btrfs_file_extent_item *ei;
180 size_t cur_size = size;
181 unsigned long offset;
183 if (compressed_size && compressed_pages)
184 cur_size = compressed_size;
186 inode_add_bytes(inode, size);
188 if (!extent_inserted) {
189 struct btrfs_key key;
192 key.objectid = btrfs_ino(BTRFS_I(inode));
194 key.type = BTRFS_EXTENT_DATA_KEY;
196 datasize = btrfs_file_extent_calc_inline_size(cur_size);
197 path->leave_spinning = 1;
198 ret = btrfs_insert_empty_item(trans, root, path, &key,
203 leaf = path->nodes[0];
204 ei = btrfs_item_ptr(leaf, path->slots[0],
205 struct btrfs_file_extent_item);
206 btrfs_set_file_extent_generation(leaf, ei, trans->transid);
207 btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
208 btrfs_set_file_extent_encryption(leaf, ei, 0);
209 btrfs_set_file_extent_other_encoding(leaf, ei, 0);
210 btrfs_set_file_extent_ram_bytes(leaf, ei, size);
211 ptr = btrfs_file_extent_inline_start(ei);
213 if (compress_type != BTRFS_COMPRESS_NONE) {
216 while (compressed_size > 0) {
217 cpage = compressed_pages[i];
218 cur_size = min_t(unsigned long, compressed_size,
221 kaddr = kmap_atomic(cpage);
222 write_extent_buffer(leaf, kaddr, ptr, cur_size);
223 kunmap_atomic(kaddr);
227 compressed_size -= cur_size;
229 btrfs_set_file_extent_compression(leaf, ei,
232 page = find_get_page(inode->i_mapping,
233 start >> PAGE_SHIFT);
234 btrfs_set_file_extent_compression(leaf, ei, 0);
235 kaddr = kmap_atomic(page);
236 offset = start & (PAGE_SIZE - 1);
237 write_extent_buffer(leaf, kaddr + offset, ptr, size);
238 kunmap_atomic(kaddr);
241 btrfs_mark_buffer_dirty(leaf);
242 btrfs_release_path(path);
245 * we're an inline extent, so nobody can
246 * extend the file past i_size without locking
247 * a page we already have locked.
249 * We must do any isize and inode updates
250 * before we unlock the pages. Otherwise we
251 * could end up racing with unlink.
253 BTRFS_I(inode)->disk_i_size = inode->i_size;
254 ret = btrfs_update_inode(trans, root, inode);
262 * conditionally insert an inline extent into the file. This
263 * does the checks required to make sure the data is small enough
264 * to fit as an inline extent.
266 static noinline int cow_file_range_inline(struct inode *inode, u64 start,
267 u64 end, size_t compressed_size,
269 struct page **compressed_pages)
271 struct btrfs_root *root = BTRFS_I(inode)->root;
272 struct btrfs_fs_info *fs_info = root->fs_info;
273 struct btrfs_trans_handle *trans;
274 u64 isize = i_size_read(inode);
275 u64 actual_end = min(end + 1, isize);
276 u64 inline_len = actual_end - start;
277 u64 aligned_end = ALIGN(end, fs_info->sectorsize);
278 u64 data_len = inline_len;
280 struct btrfs_path *path;
281 int extent_inserted = 0;
282 u32 extent_item_size;
285 data_len = compressed_size;
288 actual_end > fs_info->sectorsize ||
289 data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
291 (actual_end & (fs_info->sectorsize - 1)) == 0) ||
293 data_len > fs_info->max_inline) {
297 path = btrfs_alloc_path();
301 trans = btrfs_join_transaction(root);
303 btrfs_free_path(path);
304 return PTR_ERR(trans);
306 trans->block_rsv = &BTRFS_I(inode)->block_rsv;
308 if (compressed_size && compressed_pages)
309 extent_item_size = btrfs_file_extent_calc_inline_size(
312 extent_item_size = btrfs_file_extent_calc_inline_size(
315 ret = __btrfs_drop_extents(trans, root, inode, path,
316 start, aligned_end, NULL,
317 1, 1, extent_item_size, &extent_inserted);
319 btrfs_abort_transaction(trans, ret);
323 if (isize > actual_end)
324 inline_len = min_t(u64, isize, actual_end);
325 ret = insert_inline_extent(trans, path, extent_inserted,
327 inline_len, compressed_size,
328 compress_type, compressed_pages);
329 if (ret && ret != -ENOSPC) {
330 btrfs_abort_transaction(trans, ret);
332 } else if (ret == -ENOSPC) {
337 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
338 btrfs_drop_extent_cache(BTRFS_I(inode), start, aligned_end - 1, 0);
341 * Don't forget to free the reserved space, as for an inlined extent
342 * it won't count as a data extent; free it directly here.
343 * And at reserve time, it's always aligned to page size, so
344 * just free one page here.
346 btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE);
347 btrfs_free_path(path);
348 btrfs_end_transaction(trans);
352 struct async_extent {
357 unsigned long nr_pages;
359 struct list_head list;
364 struct btrfs_root *root;
365 struct page *locked_page;
368 unsigned int write_flags;
369 struct list_head extents;
370 struct btrfs_work work;
373 static noinline int add_async_extent(struct async_cow *cow,
374 u64 start, u64 ram_size,
377 unsigned long nr_pages,
380 struct async_extent *async_extent;
382 async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
383 BUG_ON(!async_extent); /* -ENOMEM */
384 async_extent->start = start;
385 async_extent->ram_size = ram_size;
386 async_extent->compressed_size = compressed_size;
387 async_extent->pages = pages;
388 async_extent->nr_pages = nr_pages;
389 async_extent->compress_type = compress_type;
390 list_add_tail(&async_extent->list, &cow->extents);
394 static inline int inode_need_compress(struct inode *inode, u64 start, u64 end)
396 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
399 if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
402 if (BTRFS_I(inode)->defrag_compress)
404 /* bad compression ratios */
405 if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
407 if (btrfs_test_opt(fs_info, COMPRESS) ||
408 BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
409 BTRFS_I(inode)->prop_compress)
410 return btrfs_compress_heuristic(inode, start, end);
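/*
 * Illustrative summary (not from the original source) of the checks above,
 * in order: the force-compress mount option wins outright, then a pending
 * per-inode defrag compression request, then the NOCOMPRESS inode flag
 * vetoes, and finally the compress mount option / inode flag / property
 * defer to the heuristic. E.g. with "-o compress" and no NOCOMPRESS flag,
 * a range is only compressed if btrfs_compress_heuristic() judges the data
 * compressible.
 */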
414 static inline void inode_should_defrag(struct btrfs_inode *inode,
415 u64 start, u64 end, u64 num_bytes, u64 small_write)
417 /* If this is a small write inside eof, kick off a defrag */
418 if (num_bytes < small_write &&
419 (start > 0 || end + 1 < inode->disk_i_size))
420 btrfs_add_inode_defrag(NULL, inode);
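/*
 * Worked example (illustrative): with small_write == SZ_64K, a 4K write at
 * offset 128K of a 1M file satisfies both conditions above (num_bytes <
 * small_write, and start > 0), so the inode is queued for defrag; a 4K
 * write starting at offset 0 of an empty file is not (start == 0 and the
 * write does not end before disk_i_size).
 */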
424 * we create compressed extents in two phases. The first
425 * phase compresses a range of pages that have already been
426 * locked (both pages and state bits are locked).
428 * This is done inside an ordered work queue, and the compression
429 * is spread across many cpus. The actual IO submission is step
430 * two, and the ordered work queue takes care of making sure that
431 * happens in the same order things were put onto the queue by
432 * writepages and friends.
434 * If this code finds it can't get good compression, it puts an
435 * entry onto the work queue to write the uncompressed bytes. This
436 * makes sure that both compressed inodes and uncompressed inodes
437 * are written in the same order that the flusher thread sent them down.
440 static noinline void compress_file_range(struct inode *inode,
441 struct page *locked_page,
443 struct async_cow *async_cow,
446 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
447 u64 blocksize = fs_info->sectorsize;
449 u64 isize = i_size_read(inode);
451 struct page **pages = NULL;
452 unsigned long nr_pages;
453 unsigned long total_compressed = 0;
454 unsigned long total_in = 0;
457 int compress_type = fs_info->compress_type;
460 inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
463 actual_end = min_t(u64, isize, end + 1);
466 nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
467 BUILD_BUG_ON((BTRFS_MAX_COMPRESSED % PAGE_SIZE) != 0);
468 nr_pages = min_t(unsigned long, nr_pages,
469 BTRFS_MAX_COMPRESSED / PAGE_SIZE);
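/*
 * Illustrative example (assuming 4K pages and the usual 128K
 * BTRFS_MAX_COMPRESSED): a 512K delalloc range initially yields
 * nr_pages == 128, which the clamp above reduces to 32, so each
 * compression pass handles at most 128K of input.
 */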
472 * we don't want to send crud past the end of i_size through
473 * compression, that's just a waste of CPU time. So, if the
474 * end of the file is before the start of our current
475 * requested range of bytes, we bail out to the uncompressed
476 * cleanup code that can deal with all of this.
478 * It isn't really the fastest way to fix things, but this is a
479 * very uncommon corner.
481 if (actual_end <= start)
482 goto cleanup_and_bail_uncompressed;
484 total_compressed = actual_end - start;
487 * skip compression for a small file range (<= blocksize) that
488 * isn't an inline extent, since it doesn't save disk space at all.
490 if (total_compressed <= blocksize &&
491 (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
492 goto cleanup_and_bail_uncompressed;
494 total_compressed = min_t(unsigned long, total_compressed,
495 BTRFS_MAX_UNCOMPRESSED);
500 * we do compression for mount -o compress and when the
501 * inode has not been flagged as nocompress. This flag can
502 * change at any time if we discover bad compression ratios.
504 if (inode_need_compress(inode, start, end)) {
506 pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
508 /* just bail out to the uncompressed code */
512 if (BTRFS_I(inode)->defrag_compress)
513 compress_type = BTRFS_I(inode)->defrag_compress;
514 else if (BTRFS_I(inode)->prop_compress)
515 compress_type = BTRFS_I(inode)->prop_compress;
518 * we need to call clear_page_dirty_for_io on each
519 * page in the range. Otherwise applications with the file
520 * mmap'd can wander in and change the page contents while
521 * we are compressing them.
523 * If the compression fails for any reason, we set the pages
524 * dirty again later on.
526 * Note that the remaining part is redirtied, the start pointer
527 * has moved, the end is the original one.
530 extent_range_clear_dirty_for_io(inode, start, end);
534 /* Compression level is applied here and only here */
535 ret = btrfs_compress_pages(
536 compress_type | (fs_info->compress_level << 4),
537 inode->i_mapping, start,
544 unsigned long offset = total_compressed &
546 struct page *page = pages[nr_pages - 1];
549 /* zero the tail end of the last page, we might be
550 * sending it down to disk
553 kaddr = kmap_atomic(page);
554 memset(kaddr + offset, 0,
556 kunmap_atomic(kaddr);
563 /* let's try to make an inline extent */
564 if (ret || total_in < actual_end) {
565 /* we didn't compress the entire range, try
566 * to make an uncompressed inline extent.
568 ret = cow_file_range_inline(inode, start, end, 0,
569 BTRFS_COMPRESS_NONE, NULL);
571 /* try making a compressed inline extent */
572 ret = cow_file_range_inline(inode, start, end,
574 compress_type, pages);
577 unsigned long clear_flags = EXTENT_DELALLOC |
578 EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
579 EXTENT_DO_ACCOUNTING;
580 unsigned long page_error_op;
582 page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;
585 * inline extent creation worked or returned error,
586 * we don't need to create any more async work items.
587 * Unlock and free up our temp pages.
589 * We use DO_ACCOUNTING here because we need the
590 * delalloc_release_metadata to be done _after_ we drop
591 * our outstanding extent for clearing delalloc for this range.
594 extent_clear_unlock_delalloc(inode, start, end, end,
607 * we aren't doing an inline extent, so round the compressed size
608 * up to a block size boundary so that the allocator does sane things
611 total_compressed = ALIGN(total_compressed, blocksize);
614 * one last check to make sure the compression is really a
615 * win, compare the page count read with the blocks on disk,
616 * compression must free at least one sector size
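/*
 * Illustrative example (not from the original source): with a 4K
 * sectorsize, 64K of input that compresses to 56K passes the check below
 * (56K + 4K <= 64K), while output that only shrinks to 61K (aligned up to
 * 64K) does not, and the range falls back to the uncompressed path.
 */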
618 total_in = ALIGN(total_in, PAGE_SIZE);
619 if (total_compressed + blocksize <= total_in) {
623 * The async work queues will take care of doing actual
624 * allocation on disk for these compressed pages, and
625 * will submit them to the elevator.
627 add_async_extent(async_cow, start, total_in,
628 total_compressed, pages, nr_pages,
631 if (start + total_in < end) {
642 * the compression code ran but failed to make things smaller,
643 * free any pages it allocated and our page pointer array
645 for (i = 0; i < nr_pages; i++) {
646 WARN_ON(pages[i]->mapping);
651 total_compressed = 0;
654 /* flag the file so we don't compress in the future */
655 if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) &&
656 !(BTRFS_I(inode)->prop_compress)) {
657 BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
660 cleanup_and_bail_uncompressed:
662 * No compression, but we still need to write the pages in the file
663 * we've been given so far. redirty the locked page if it corresponds
664 * to our extent and set things up for the async work queue to run
665 * cow_file_range to do the normal delalloc dance.
667 if (page_offset(locked_page) >= start &&
668 page_offset(locked_page) <= end)
669 __set_page_dirty_nobuffers(locked_page);
670 /* unlocked later on in the async handlers */
673 extent_range_redirty_for_io(inode, start, end);
674 add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0,
675 BTRFS_COMPRESS_NONE);
681 for (i = 0; i < nr_pages; i++) {
682 WARN_ON(pages[i]->mapping);
688 static void free_async_extent_pages(struct async_extent *async_extent)
692 if (!async_extent->pages)
695 for (i = 0; i < async_extent->nr_pages; i++) {
696 WARN_ON(async_extent->pages[i]->mapping);
697 put_page(async_extent->pages[i]);
699 kfree(async_extent->pages);
700 async_extent->nr_pages = 0;
701 async_extent->pages = NULL;
705 * phase two of compressed writeback. This is the ordered portion
706 * of the code, which only gets called in the order the work was
707 * queued. We walk all the async extents created by compress_file_range
708 * and send them down to the disk.
710 static noinline void submit_compressed_extents(struct inode *inode,
711 struct async_cow *async_cow)
713 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
714 struct async_extent *async_extent;
716 struct btrfs_key ins;
717 struct extent_map *em;
718 struct btrfs_root *root = BTRFS_I(inode)->root;
719 struct extent_io_tree *io_tree;
723 while (!list_empty(&async_cow->extents)) {
724 async_extent = list_entry(async_cow->extents.next,
725 struct async_extent, list);
726 list_del(&async_extent->list);
728 io_tree = &BTRFS_I(inode)->io_tree;
731 /* did the compression code fall back to uncompressed IO? */
732 if (!async_extent->pages) {
733 int page_started = 0;
734 unsigned long nr_written = 0;
736 lock_extent(io_tree, async_extent->start,
737 async_extent->start +
738 async_extent->ram_size - 1);
740 /* allocate blocks */
741 ret = cow_file_range(inode, async_cow->locked_page,
743 async_extent->start +
744 async_extent->ram_size - 1,
745 async_extent->start +
746 async_extent->ram_size - 1,
747 &page_started, &nr_written, 0,
753 * if page_started, cow_file_range inserted an
754 * inline extent and took care of all the unlocking
755 * and IO for us. Otherwise, we need to submit
756 * all those pages down to the drive.
758 if (!page_started && !ret)
759 extent_write_locked_range(inode,
761 async_extent->start +
762 async_extent->ram_size - 1,
765 unlock_page(async_cow->locked_page);
771 lock_extent(io_tree, async_extent->start,
772 async_extent->start + async_extent->ram_size - 1);
774 ret = btrfs_reserve_extent(root, async_extent->ram_size,
775 async_extent->compressed_size,
776 async_extent->compressed_size,
777 0, alloc_hint, &ins, 1, 1);
779 free_async_extent_pages(async_extent);
781 if (ret == -ENOSPC) {
782 unlock_extent(io_tree, async_extent->start,
783 async_extent->start +
784 async_extent->ram_size - 1);
787 * we need to redirty the pages if we decide to
788 * fall back to uncompressed IO, otherwise we
789 * will not submit these pages down to the lower layers.
792 extent_range_redirty_for_io(inode,
794 async_extent->start +
795 async_extent->ram_size - 1);
802 * here we're doing allocation and writeback of the compressed pages
805 em = create_io_em(inode, async_extent->start,
806 async_extent->ram_size, /* len */
807 async_extent->start, /* orig_start */
808 ins.objectid, /* block_start */
809 ins.offset, /* block_len */
810 ins.offset, /* orig_block_len */
811 async_extent->ram_size, /* ram_bytes */
812 async_extent->compress_type,
813 BTRFS_ORDERED_COMPRESSED);
815 /* ret value is not needed because this is a void function */
816 goto out_free_reserve;
819 ret = btrfs_add_ordered_extent_compress(inode,
822 async_extent->ram_size,
824 BTRFS_ORDERED_COMPRESSED,
825 async_extent->compress_type);
827 btrfs_drop_extent_cache(BTRFS_I(inode),
829 async_extent->start +
830 async_extent->ram_size - 1, 0);
831 goto out_free_reserve;
833 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
836 * clear dirty, set writeback and unlock the pages.
838 extent_clear_unlock_delalloc(inode, async_extent->start,
839 async_extent->start +
840 async_extent->ram_size - 1,
841 async_extent->start +
842 async_extent->ram_size - 1,
843 NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
844 PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
846 if (btrfs_submit_compressed_write(inode,
848 async_extent->ram_size,
850 ins.offset, async_extent->pages,
851 async_extent->nr_pages,
852 async_cow->write_flags)) {
853 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
854 struct page *p = async_extent->pages[0];
855 const u64 start = async_extent->start;
856 const u64 end = start + async_extent->ram_size - 1;
858 p->mapping = inode->i_mapping;
859 tree->ops->writepage_end_io_hook(p, start, end,
862 extent_clear_unlock_delalloc(inode, start, end, end,
866 free_async_extent_pages(async_extent);
868 alloc_hint = ins.objectid + ins.offset;
874 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
875 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
877 extent_clear_unlock_delalloc(inode, async_extent->start,
878 async_extent->start +
879 async_extent->ram_size - 1,
880 async_extent->start +
881 async_extent->ram_size - 1,
882 NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
883 EXTENT_DELALLOC_NEW |
884 EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
885 PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
886 PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
888 free_async_extent_pages(async_extent);
893 static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
896 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
897 struct extent_map *em;
900 read_lock(&em_tree->lock);
901 em = search_extent_mapping(em_tree, start, num_bytes);
904 * if block start isn't an actual block number then find the
905 * first block in this inode and use that as a hint. If that
906 * block is also bogus then just don't worry about it.
908 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
910 em = search_extent_mapping(em_tree, 0, 0);
911 if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
912 alloc_hint = em->block_start;
916 alloc_hint = em->block_start;
920 read_unlock(&em_tree->lock);
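/*
 * Illustrative example (assumption): if the file already has a mapped
 * extent covering "start" whose block_start is a real disk bytenr, that
 * bytenr is returned as the hint; for holes or inline extents
 * (block_start >= EXTENT_MAP_LAST_BYTE) the first mapped extent of the
 * inode is tried instead, and if that is also bogus the hint stays 0.
 */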
926 * when extent_io.c finds a delayed allocation range in the file,
927 * the callbacks end up in this code. The basic idea is to
928 * allocate extents on disk for the range, and create ordered data structs
929 * in ram to track those extents.
931 * locked_page is the page that writepage had locked already. We use
932 * it to make sure we don't do extra locks or unlocks.
934 * *page_started is set to one if we unlock locked_page and do everything
935 * required to start IO on it. It may be clean and already done with IO when we return.
938 static noinline int cow_file_range(struct inode *inode,
939 struct page *locked_page,
940 u64 start, u64 end, u64 delalloc_end,
941 int *page_started, unsigned long *nr_written,
942 int unlock, struct btrfs_dedupe_hash *hash)
944 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
945 struct btrfs_root *root = BTRFS_I(inode)->root;
948 unsigned long ram_size;
949 u64 cur_alloc_size = 0;
950 u64 blocksize = fs_info->sectorsize;
951 struct btrfs_key ins;
952 struct extent_map *em;
954 unsigned long page_ops;
955 bool extent_reserved = false;
958 if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
964 num_bytes = ALIGN(end - start + 1, blocksize);
965 num_bytes = max(blocksize, num_bytes);
966 ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy));
968 inode_should_defrag(BTRFS_I(inode), start, end, num_bytes, SZ_64K);
971 /* let's try to make an inline extent */
972 ret = cow_file_range_inline(inode, start, end, 0,
973 BTRFS_COMPRESS_NONE, NULL);
976 * We use DO_ACCOUNTING here because we need the
977 * delalloc_release_metadata to be run _after_ we drop
978 * our outstanding extent for clearing delalloc for this range.
981 extent_clear_unlock_delalloc(inode, start, end,
983 EXTENT_LOCKED | EXTENT_DELALLOC |
984 EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
985 EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
986 PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
988 *nr_written = *nr_written +
989 (end - start + PAGE_SIZE) / PAGE_SIZE;
992 } else if (ret < 0) {
997 alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
998 btrfs_drop_extent_cache(BTRFS_I(inode), start,
999 start + num_bytes - 1, 0);
1001 while (num_bytes > 0) {
1002 cur_alloc_size = num_bytes;
1003 ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
1004 fs_info->sectorsize, 0, alloc_hint,
1008 cur_alloc_size = ins.offset;
1009 extent_reserved = true;
1011 ram_size = ins.offset;
1012 em = create_io_em(inode, start, ins.offset, /* len */
1013 start, /* orig_start */
1014 ins.objectid, /* block_start */
1015 ins.offset, /* block_len */
1016 ins.offset, /* orig_block_len */
1017 ram_size, /* ram_bytes */
1018 BTRFS_COMPRESS_NONE, /* compress_type */
1019 BTRFS_ORDERED_REGULAR /* type */);
1022 free_extent_map(em);
1024 ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
1025 ram_size, cur_alloc_size, 0);
1027 goto out_drop_extent_cache;
1029 if (root->root_key.objectid ==
1030 BTRFS_DATA_RELOC_TREE_OBJECTID) {
1031 ret = btrfs_reloc_clone_csums(inode, start,
1034 * Only drop cache here, and process as normal.
1036 * We must not allow extent_clear_unlock_delalloc()
1037 * at the out_unlock label to free the metadata of this ordered
1038 * extent, as its metadata should be freed by
1039 * btrfs_finish_ordered_io().
1041 * So we must continue until @start is increased to
1042 * skip the current ordered extent.
1045 btrfs_drop_extent_cache(BTRFS_I(inode), start,
1046 start + ram_size - 1, 0);
1049 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1051 /* we're not doing compressed IO, don't unlock the first
1052 * page (which the caller expects to stay locked), don't
1053 * clear any dirty bits and don't set any writeback bits
1055 * Do set the Private2 bit so we know this page was properly
1056 * set up for writepage
1058 page_ops = unlock ? PAGE_UNLOCK : 0;
1059 page_ops |= PAGE_SET_PRIVATE2;
1061 extent_clear_unlock_delalloc(inode, start,
1062 start + ram_size - 1,
1063 delalloc_end, locked_page,
1064 EXTENT_LOCKED | EXTENT_DELALLOC,
1066 if (num_bytes < cur_alloc_size)
1069 num_bytes -= cur_alloc_size;
1070 alloc_hint = ins.objectid + ins.offset;
1071 start += cur_alloc_size;
1072 extent_reserved = false;
1075 * btrfs_reloc_clone_csums() error: since start has been increased,
1076 * extent_clear_unlock_delalloc() at the out_unlock label won't
1077 * free the metadata of the current ordered extent, so we're OK to exit.
1085 out_drop_extent_cache:
1086 btrfs_drop_extent_cache(BTRFS_I(inode), start, start + ram_size - 1, 0);
1088 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1089 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
1091 clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
1092 EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
1093 page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
1096 * If we reserved an extent for our delalloc range (or a subrange) and
1097 * failed to create the respective ordered extent, then it means that
1098 * when we reserved the extent we decremented the extent's size from
1099 * the data space_info's bytes_may_use counter and incremented the
1100 * space_info's bytes_reserved counter by the same amount. We must make
1101 * sure extent_clear_unlock_delalloc() does not try to decrement again
1102 * the data space_info's bytes_may_use counter, therefore we do not pass
1103 * it the flag EXTENT_CLEAR_DATA_RESV.
1105 if (extent_reserved) {
1106 extent_clear_unlock_delalloc(inode, start,
1107 start + cur_alloc_size,
1108 start + cur_alloc_size,
1112 start += cur_alloc_size;
1116 extent_clear_unlock_delalloc(inode, start, end, delalloc_end,
1118 clear_bits | EXTENT_CLEAR_DATA_RESV,
1124 * work queue callback to start compression on a file and its pages
1126 static noinline void async_cow_start(struct btrfs_work *work)
1128 struct async_cow *async_cow;
1130 async_cow = container_of(work, struct async_cow, work);
1132 compress_file_range(async_cow->inode, async_cow->locked_page,
1133 async_cow->start, async_cow->end, async_cow,
1135 if (num_added == 0) {
1136 btrfs_add_delayed_iput(async_cow->inode);
1137 async_cow->inode = NULL;
1142 * work queue callback to submit previously compressed pages
1144 static noinline void async_cow_submit(struct btrfs_work *work)
1146 struct btrfs_fs_info *fs_info;
1147 struct async_cow *async_cow;
1148 struct btrfs_root *root;
1149 unsigned long nr_pages;
1151 async_cow = container_of(work, struct async_cow, work);
1153 root = async_cow->root;
1154 fs_info = root->fs_info;
1155 nr_pages = (async_cow->end - async_cow->start + PAGE_SIZE) >>
1159 * atomic_sub_return implies a barrier for waitqueue_active
1161 if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
1163 waitqueue_active(&fs_info->async_submit_wait))
1164 wake_up(&fs_info->async_submit_wait);
1166 if (async_cow->inode)
1167 submit_compressed_extents(async_cow->inode, async_cow);
1170 static noinline void async_cow_free(struct btrfs_work *work)
1172 struct async_cow *async_cow;
1173 async_cow = container_of(work, struct async_cow, work);
1174 if (async_cow->inode)
1175 btrfs_add_delayed_iput(async_cow->inode);
1179 static int cow_file_range_async(struct inode *inode, struct page *locked_page,
1180 u64 start, u64 end, int *page_started,
1181 unsigned long *nr_written,
1182 unsigned int write_flags)
1184 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1185 struct async_cow *async_cow;
1186 struct btrfs_root *root = BTRFS_I(inode)->root;
1187 unsigned long nr_pages;
1190 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
1192 while (start < end) {
1193 async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
1194 BUG_ON(!async_cow); /* -ENOMEM */
1195 async_cow->inode = igrab(inode);
1196 async_cow->root = root;
1197 async_cow->locked_page = locked_page;
1198 async_cow->start = start;
1199 async_cow->write_flags = write_flags;
1201 if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
1202 !btrfs_test_opt(fs_info, FORCE_COMPRESS))
1205 cur_end = min(end, start + SZ_512K - 1);
1207 async_cow->end = cur_end;
1208 INIT_LIST_HEAD(&async_cow->extents);
1210 btrfs_init_work(&async_cow->work,
1211 btrfs_delalloc_helper,
1212 async_cow_start, async_cow_submit,
1215 nr_pages = (cur_end - start + PAGE_SIZE) >>
1217 atomic_add(nr_pages, &fs_info->async_delalloc_pages);
1219 btrfs_queue_work(fs_info->delalloc_workers, &async_cow->work);
1221 *nr_written += nr_pages;
1222 start = cur_end + 1;
1228 static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
1229 u64 bytenr, u64 num_bytes)
1232 struct btrfs_ordered_sum *sums;
1235 ret = btrfs_lookup_csums_range(fs_info->csum_root, bytenr,
1236 bytenr + num_bytes - 1, &list, 0);
1237 if (ret == 0 && list_empty(&list))
1240 while (!list_empty(&list)) {
1241 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
1242 list_del(&sums->list);
1251 * called for nocow writeback. This checks for snapshots or COW copies
1252 * of the extents that exist in the file, and COWs the file as required.
1254 * If no cow copies or snapshots exist, we write directly to the existing blocks on disk.
1257 static noinline int run_delalloc_nocow(struct inode *inode,
1258 struct page *locked_page,
1259 u64 start, u64 end, int *page_started, int force,
1260 unsigned long *nr_written)
1262 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1263 struct btrfs_root *root = BTRFS_I(inode)->root;
1264 struct extent_buffer *leaf;
1265 struct btrfs_path *path;
1266 struct btrfs_file_extent_item *fi;
1267 struct btrfs_key found_key;
1268 struct extent_map *em;
1283 u64 ino = btrfs_ino(BTRFS_I(inode));
1285 path = btrfs_alloc_path();
1287 extent_clear_unlock_delalloc(inode, start, end, end,
1289 EXTENT_LOCKED | EXTENT_DELALLOC |
1290 EXTENT_DO_ACCOUNTING |
1291 EXTENT_DEFRAG, PAGE_UNLOCK |
1293 PAGE_SET_WRITEBACK |
1294 PAGE_END_WRITEBACK);
1298 nolock = btrfs_is_free_space_inode(BTRFS_I(inode));
1300 cow_start = (u64)-1;
1303 ret = btrfs_lookup_file_extent(NULL, root, path, ino,
1307 if (ret > 0 && path->slots[0] > 0 && check_prev) {
1308 leaf = path->nodes[0];
1309 btrfs_item_key_to_cpu(leaf, &found_key,
1310 path->slots[0] - 1);
1311 if (found_key.objectid == ino &&
1312 found_key.type == BTRFS_EXTENT_DATA_KEY)
1317 leaf = path->nodes[0];
1318 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1319 ret = btrfs_next_leaf(root, path);
1321 if (cow_start != (u64)-1)
1322 cur_offset = cow_start;
1327 leaf = path->nodes[0];
1333 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1335 if (found_key.objectid > ino)
1337 if (WARN_ON_ONCE(found_key.objectid < ino) ||
1338 found_key.type < BTRFS_EXTENT_DATA_KEY) {
1342 if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
1343 found_key.offset > end)
1346 if (found_key.offset > cur_offset) {
1347 extent_end = found_key.offset;
1352 fi = btrfs_item_ptr(leaf, path->slots[0],
1353 struct btrfs_file_extent_item);
1354 extent_type = btrfs_file_extent_type(leaf, fi);
1356 ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
1357 if (extent_type == BTRFS_FILE_EXTENT_REG ||
1358 extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1359 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1360 extent_offset = btrfs_file_extent_offset(leaf, fi);
1361 extent_end = found_key.offset +
1362 btrfs_file_extent_num_bytes(leaf, fi);
1364 btrfs_file_extent_disk_num_bytes(leaf, fi);
1365 if (extent_end <= start) {
1369 if (disk_bytenr == 0)
1371 if (btrfs_file_extent_compression(leaf, fi) ||
1372 btrfs_file_extent_encryption(leaf, fi) ||
1373 btrfs_file_extent_other_encoding(leaf, fi))
1375 if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
1377 if (btrfs_extent_readonly(fs_info, disk_bytenr))
1379 ret = btrfs_cross_ref_exist(root, ino,
1381 extent_offset, disk_bytenr);
1384 * ret could be -EIO if the above fails to read metadata.
1388 if (cow_start != (u64)-1)
1389 cur_offset = cow_start;
1393 WARN_ON_ONCE(nolock);
1396 disk_bytenr += extent_offset;
1397 disk_bytenr += cur_offset - found_key.offset;
1398 num_bytes = min(end + 1, extent_end) - cur_offset;
1400 * if there are pending snapshots for this root,
1401 * we fall back to the common COW path.
1404 err = btrfs_start_write_no_snapshotting(root);
1409 * force COW if csums exist in the range.
1410 * This ensures that the csums for a given extent are
1411 * either valid or do not exist.
1413 ret = csum_exist_in_range(fs_info, disk_bytenr,
1417 btrfs_end_write_no_snapshotting(root);
1420 * ret could be -EIO if the above fails to read metadata.
1424 if (cow_start != (u64)-1)
1425 cur_offset = cow_start;
1428 WARN_ON_ONCE(nolock);
1431 if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr)) {
1433 btrfs_end_write_no_snapshotting(root);
1437 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1438 extent_end = found_key.offset +
1439 btrfs_file_extent_inline_len(leaf,
1440 path->slots[0], fi);
1441 extent_end = ALIGN(extent_end,
1442 fs_info->sectorsize);
1447 if (extent_end <= start) {
1449 if (!nolock && nocow)
1450 btrfs_end_write_no_snapshotting(root);
1452 btrfs_dec_nocow_writers(fs_info, disk_bytenr);
1456 if (cow_start == (u64)-1)
1457 cow_start = cur_offset;
1458 cur_offset = extent_end;
1459 if (cur_offset > end)
1465 btrfs_release_path(path);
1466 if (cow_start != (u64)-1) {
1467 ret = cow_file_range(inode, locked_page,
1468 cow_start, found_key.offset - 1,
1469 end, page_started, nr_written, 1,
1472 if (!nolock && nocow)
1473 btrfs_end_write_no_snapshotting(root);
1475 btrfs_dec_nocow_writers(fs_info,
1479 cow_start = (u64)-1;
1482 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1483 u64 orig_start = found_key.offset - extent_offset;
1485 em = create_io_em(inode, cur_offset, num_bytes,
1487 disk_bytenr, /* block_start */
1488 num_bytes, /* block_len */
1489 disk_num_bytes, /* orig_block_len */
1490 ram_bytes, BTRFS_COMPRESS_NONE,
1491 BTRFS_ORDERED_PREALLOC);
1493 if (!nolock && nocow)
1494 btrfs_end_write_no_snapshotting(root);
1496 btrfs_dec_nocow_writers(fs_info,
1501 free_extent_map(em);
1504 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1505 type = BTRFS_ORDERED_PREALLOC;
1507 type = BTRFS_ORDERED_NOCOW;
1510 ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
1511 num_bytes, num_bytes, type);
1513 btrfs_dec_nocow_writers(fs_info, disk_bytenr);
1514 BUG_ON(ret); /* -ENOMEM */
1516 if (root->root_key.objectid ==
1517 BTRFS_DATA_RELOC_TREE_OBJECTID)
1519 * Errors are handled later, as we must prevent
1520 * extent_clear_unlock_delalloc() in the error handler
1521 * from freeing the metadata of the created ordered extent.
1523 ret = btrfs_reloc_clone_csums(inode, cur_offset,
1526 extent_clear_unlock_delalloc(inode, cur_offset,
1527 cur_offset + num_bytes - 1, end,
1528 locked_page, EXTENT_LOCKED |
1530 EXTENT_CLEAR_DATA_RESV,
1531 PAGE_UNLOCK | PAGE_SET_PRIVATE2);
1533 if (!nolock && nocow)
1534 btrfs_end_write_no_snapshotting(root);
1535 cur_offset = extent_end;
1538 * btrfs_reloc_clone_csums() error: now we're OK to call the error
1539 * handler, as the metadata for the created ordered extent will only
1540 * be freed by btrfs_finish_ordered_io().
1544 if (cur_offset > end)
1547 btrfs_release_path(path);
1549 if (cur_offset <= end && cow_start == (u64)-1) {
1550 cow_start = cur_offset;
1554 if (cow_start != (u64)-1) {
1555 ret = cow_file_range(inode, locked_page, cow_start, end, end,
1556 page_started, nr_written, 1, NULL);
1562 if (ret && cur_offset < end)
1563 extent_clear_unlock_delalloc(inode, cur_offset, end, end,
1564 locked_page, EXTENT_LOCKED |
1565 EXTENT_DELALLOC | EXTENT_DEFRAG |
1566 EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
1568 PAGE_SET_WRITEBACK |
1569 PAGE_END_WRITEBACK);
1570 btrfs_free_path(path);
1574 static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
1577 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
1578 !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC))
1582 * @defrag_bytes is a hint value, no spinlock is held here;
1583 * if it is not zero, it means the file is being defragged.
1584 * Force COW if the given extent needs to be defragged.
1586 if (BTRFS_I(inode)->defrag_bytes &&
1587 test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
1588 EXTENT_DEFRAG, 0, NULL))
1595 * extent_io.c callback to do delayed allocation processing
1597 static int run_delalloc_range(void *private_data, struct page *locked_page,
1598 u64 start, u64 end, int *page_started,
1599 unsigned long *nr_written,
1600 struct writeback_control *wbc)
1602 struct inode *inode = private_data;
1604 int force_cow = need_force_cow(inode, start, end);
1605 unsigned int write_flags = wbc_to_write_flags(wbc);
1607 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) {
1608 ret = run_delalloc_nocow(inode, locked_page, start, end,
1609 page_started, 1, nr_written);
1610 } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
1611 ret = run_delalloc_nocow(inode, locked_page, start, end,
1612 page_started, 0, nr_written);
1613 } else if (!inode_need_compress(inode, start, end)) {
1614 ret = cow_file_range(inode, locked_page, start, end, end,
1615 page_started, nr_written, 1, NULL);
1617 set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1618 &BTRFS_I(inode)->runtime_flags);
1619 ret = cow_file_range_async(inode, locked_page, start, end,
1620 page_started, nr_written,
1624 btrfs_cleanup_ordered_extents(inode, start, end - start + 1);
1628 static void btrfs_split_extent_hook(void *private_data,
1629 struct extent_state *orig, u64 split)
1631 struct inode *inode = private_data;
1634 /* not delalloc, ignore it */
1635 if (!(orig->state & EXTENT_DELALLOC))
1638 size = orig->end - orig->start + 1;
1639 if (size > BTRFS_MAX_EXTENT_SIZE) {
1644 * See the explanation in btrfs_merge_extent_hook, the same
1645 * applies here, just in reverse.
1647 new_size = orig->end - split + 1;
1648 num_extents = count_max_extents(new_size);
1649 new_size = split - orig->start;
1650 num_extents += count_max_extents(new_size);
1651 if (count_max_extents(size) >= num_extents)
1655 spin_lock(&BTRFS_I(inode)->lock);
1656 btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
1657 spin_unlock(&BTRFS_I(inode)->lock);
1661 * extent_io.c merge_extent_hook, used to track merged delayed allocation
1662 * extents so we can keep track of new extents that are just merged onto old
1663 * extents, such as when we are doing sequential writes, so we can properly
1664 * account for the metadata space we'll need.
1666 static void btrfs_merge_extent_hook(void *private_data,
1667 struct extent_state *new,
1668 struct extent_state *other)
1670 struct inode *inode = private_data;
1671 u64 new_size, old_size;
1674 /* not delalloc, ignore it */
1675 if (!(other->state & EXTENT_DELALLOC))
1678 if (new->start > other->start)
1679 new_size = new->end - other->start + 1;
1681 new_size = other->end - new->start + 1;
1683 /* we're not bigger than the max, unreserve the space and go */
1684 if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
1685 spin_lock(&BTRFS_I(inode)->lock);
1686 btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
1687 spin_unlock(&BTRFS_I(inode)->lock);
1692 * We have to add up either side to figure out how many extents were
1693 * accounted for before we merged into one big extent. If the number of
1694 * extents we accounted for is <= the amount we need for the new range
1695 * then we can return, otherwise drop. Think of it like this
1699 * So we've grown the extent by a MAX_SIZE extent, this would mean we
1700 * need 2 outstanding extents, on one side we have 1 and the other side
1701 * we have 1 so they are == and we can return. But in this case
1703 * [MAX_SIZE+4k][MAX_SIZE+4k]
1705 * Each range on their own accounts for 2 extents, but merged together
1706 * they are only 3 extents worth of accounting, so we need to drop in
1709 old_size = other->end - other->start + 1;
1710 num_extents = count_max_extents(old_size);
1711 old_size = new->end - new->start + 1;
1712 num_extents += count_max_extents(old_size);
1713 if (count_max_extents(new_size) >= num_extents)
1716 spin_lock(&BTRFS_I(inode)->lock);
1717 btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
1718 spin_unlock(&BTRFS_I(inode)->lock);
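/*
 * Worked example (illustrative, assuming the usual 128M
 * BTRFS_MAX_EXTENT_SIZE): merging two (128M + 4K) delalloc ranges gives an
 * old accounting of 2 + 2 = 4 outstanding extents, while the merged range
 * only needs count_max_extents() == 3, so one outstanding extent is
 * dropped above. Merging two 96M ranges gives 1 + 1 == 2, which equals
 * count_max_extents(192M), so nothing is dropped.
 */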
1721 static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
1722 struct inode *inode)
1724 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1726 spin_lock(&root->delalloc_lock);
1727 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1728 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1729 &root->delalloc_inodes);
1730 set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1731 &BTRFS_I(inode)->runtime_flags);
1732 root->nr_delalloc_inodes++;
1733 if (root->nr_delalloc_inodes == 1) {
1734 spin_lock(&fs_info->delalloc_root_lock);
1735 BUG_ON(!list_empty(&root->delalloc_root));
1736 list_add_tail(&root->delalloc_root,
1737 &fs_info->delalloc_roots);
1738 spin_unlock(&fs_info->delalloc_root_lock);
1741 spin_unlock(&root->delalloc_lock);
1744 static void btrfs_del_delalloc_inode(struct btrfs_root *root,
1745 struct btrfs_inode *inode)
1747 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
1749 spin_lock(&root->delalloc_lock);
1750 if (!list_empty(&inode->delalloc_inodes)) {
1751 list_del_init(&inode->delalloc_inodes);
1752 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1753 &inode->runtime_flags);
1754 root->nr_delalloc_inodes--;
1755 if (!root->nr_delalloc_inodes) {
1756 spin_lock(&fs_info->delalloc_root_lock);
1757 BUG_ON(list_empty(&root->delalloc_root));
1758 list_del_init(&root->delalloc_root);
1759 spin_unlock(&fs_info->delalloc_root_lock);
1762 spin_unlock(&root->delalloc_lock);
1766 * extent_io.c set_bit_hook, used to track delayed allocation
1767 * bytes in this file, and to maintain the list of inodes that
1768 * have pending delalloc work to be done.
1770 static void btrfs_set_bit_hook(void *private_data,
1771 struct extent_state *state, unsigned *bits)
1773 struct inode *inode = private_data;
1775 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1777 if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
1780 * set_bit and clear bit hooks normally require _irqsave/restore
1781 * but in this case, we are only testing for the DELALLOC
1782 * bit, which is only set or cleared with irqs on
1784 if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1785 struct btrfs_root *root = BTRFS_I(inode)->root;
1786 u64 len = state->end + 1 - state->start;
1787 u32 num_extents = count_max_extents(len);
1788 bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode));
1790 spin_lock(&BTRFS_I(inode)->lock);
1791 btrfs_mod_outstanding_extents(BTRFS_I(inode), num_extents);
1792 spin_unlock(&BTRFS_I(inode)->lock);
1794 /* For sanity tests */
1795 if (btrfs_is_testing(fs_info))
1798 percpu_counter_add_batch(&fs_info->delalloc_bytes, len,
1799 fs_info->delalloc_batch);
1800 spin_lock(&BTRFS_I(inode)->lock);
1801 BTRFS_I(inode)->delalloc_bytes += len;
1802 if (*bits & EXTENT_DEFRAG)
1803 BTRFS_I(inode)->defrag_bytes += len;
1804 if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1805 &BTRFS_I(inode)->runtime_flags))
1806 btrfs_add_delalloc_inodes(root, inode);
1807 spin_unlock(&BTRFS_I(inode)->lock);
1810 if (!(state->state & EXTENT_DELALLOC_NEW) &&
1811 (*bits & EXTENT_DELALLOC_NEW)) {
1812 spin_lock(&BTRFS_I(inode)->lock);
1813 BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 -
1815 spin_unlock(&BTRFS_I(inode)->lock);
1820 * extent_io.c clear_bit_hook, see set_bit_hook for why
1822 static void btrfs_clear_bit_hook(void *private_data,
1823 struct extent_state *state,
1826 struct btrfs_inode *inode = BTRFS_I((struct inode *)private_data);
1827 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
1828 u64 len = state->end + 1 - state->start;
1829 u32 num_extents = count_max_extents(len);
1831 if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) {
1832 spin_lock(&inode->lock);
1833 inode->defrag_bytes -= len;
1834 spin_unlock(&inode->lock);
1838 * set_bit and clear bit hooks normally require _irqsave/restore
1839 * but in this case, we are only testing for the DELALLOC
1840 * bit, which is only set or cleared with irqs on
1842 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1843 struct btrfs_root *root = inode->root;
1844 bool do_list = !btrfs_is_free_space_inode(inode);
1846 spin_lock(&inode->lock);
1847 btrfs_mod_outstanding_extents(inode, -num_extents);
1848 spin_unlock(&inode->lock);
1851 * We don't reserve metadata space for space cache inodes so we
1852 * don't need to call delalloc_release_metadata if there is an error.
1855 if (*bits & EXTENT_CLEAR_META_RESV &&
1856 root != fs_info->tree_root)
1857 btrfs_delalloc_release_metadata(inode, len, false);
1859 /* For sanity tests. */
1860 if (btrfs_is_testing(fs_info))
1863 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID &&
1864 do_list && !(state->state & EXTENT_NORESERVE) &&
1865 (*bits & EXTENT_CLEAR_DATA_RESV))
1866 btrfs_free_reserved_data_space_noquota(
1870 percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
1871 fs_info->delalloc_batch);
1872 spin_lock(&inode->lock);
1873 inode->delalloc_bytes -= len;
1874 if (do_list && inode->delalloc_bytes == 0 &&
1875 test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1876 &inode->runtime_flags))
1877 btrfs_del_delalloc_inode(root, inode);
1878 spin_unlock(&inode->lock);
1881 if ((state->state & EXTENT_DELALLOC_NEW) &&
1882 (*bits & EXTENT_DELALLOC_NEW)) {
1883 spin_lock(&inode->lock);
1884 ASSERT(inode->new_delalloc_bytes >= len);
1885 inode->new_delalloc_bytes -= len;
1886 spin_unlock(&inode->lock);
1891 * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
1892 * we don't create bios that span stripes or chunks
1894 * return 1 if page cannot be merged to bio
1895 * return 0 if page can be merged to bio
1896 * return error otherwise
1898 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1899 size_t size, struct bio *bio,
1900 unsigned long bio_flags)
1902 struct inode *inode = page->mapping->host;
1903 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1904 u64 logical = (u64)bio->bi_iter.bi_sector << 9;
1909 if (bio_flags & EXTENT_BIO_COMPRESSED)
1912 length = bio->bi_iter.bi_size;
1913 map_length = length;
1914 ret = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
1918 if (map_length < length + size)
1924 * in order to insert checksums into the metadata in large chunks,
1925 * we wait until bio submission time. All the pages in the bio are
1926 * checksummed and sums are attached onto the ordered extent record.
1928 * At IO completion time the csums attached to the ordered extent record
1929 * are inserted into the btree
1931 static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio,
1934 struct inode *inode = private_data;
1935 blk_status_t ret = 0;
1937 ret = btrfs_csum_one_bio(inode, bio, 0, 0);
1938 BUG_ON(ret); /* -ENOMEM */
1943 * in order to insert checksums into the metadata in large chunks,
1944 * we wait until bio submission time. All the pages in the bio are
1945 * checksummed and sums are attached onto the ordered extent record.
1947 * At IO completion time the csums attached to the ordered extent record
1948 * are inserted into the btree
1950 static blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio,
1953 struct inode *inode = private_data;
1954 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1957 ret = btrfs_map_bio(fs_info, bio, mirror_num, 1);
1959 bio->bi_status = ret;
1966 * extent_io.c submission hook. This does the right thing for csum calculation
1967 * on write, or reading the csums from the tree before a read.
1969 * Rules about async/sync submit,
1970 * a) read: sync submit
1972 * b) write without checksum: sync submit
1974 * c) write with checksum:
1975 * c-1) if bio is issued by fsync: sync submit
1976 * (sync_writers != 0)
1978 * c-2) if root is reloc root: sync submit
1979 * (only in case of buffered IO)
1981 * c-3) otherwise: async submit
1983 static blk_status_t btrfs_submit_bio_hook(void *private_data, struct bio *bio,
1984 int mirror_num, unsigned long bio_flags,
1987 struct inode *inode = private_data;
1988 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1989 struct btrfs_root *root = BTRFS_I(inode)->root;
1990 enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
1991 blk_status_t ret = 0;
1993 int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
1995 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
1997 if (btrfs_is_free_space_inode(BTRFS_I(inode)))
1998 metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
2000 if (bio_op(bio) != REQ_OP_WRITE) {
2001 ret = btrfs_bio_wq_end_io(fs_info, bio, metadata);
2005 if (bio_flags & EXTENT_BIO_COMPRESSED) {
2006 ret = btrfs_submit_compressed_read(inode, bio,
2010 } else if (!skip_sum) {
2011 ret = btrfs_lookup_bio_sums(inode, bio, NULL);
2016 } else if (async && !skip_sum) {
2017 /* csum items have already been cloned */
2018 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
2020 /* we're doing a write, do the async checksumming */
2021 ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, bio_flags,
2023 btrfs_submit_bio_start,
2024 btrfs_submit_bio_done);
2026 } else if (!skip_sum) {
2027 ret = btrfs_csum_one_bio(inode, bio, 0, 0);
2033 ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
2037 bio->bi_status = ret;
2044 * given a list of ordered sums, record them in the inode. This happens
2045 * at IO completion time based on sums calculated at bio submission time.
2047 static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
2048 struct inode *inode, struct list_head *list)
2050 struct btrfs_ordered_sum *sum;
2053 list_for_each_entry(sum, list, list) {
2054 trans->adding_csums = true;
2055 ret = btrfs_csum_file_blocks(trans,
2056 BTRFS_I(inode)->root->fs_info->csum_root, sum);
2057 trans->adding_csums = false;
2064 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
2065 unsigned int extra_bits,
2066 struct extent_state **cached_state, int dedupe)
2068 WARN_ON((end & (PAGE_SIZE - 1)) == 0);
2069 return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
2070 extra_bits, cached_state);
2073 /* see btrfs_writepage_start_hook for details on why this is required */
2074 struct btrfs_writepage_fixup {
2076 struct btrfs_work work;
2079 static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
2081 struct btrfs_writepage_fixup *fixup;
2082 struct btrfs_ordered_extent *ordered;
2083 struct extent_state *cached_state = NULL;
2084 struct extent_changeset *data_reserved = NULL;
2086 struct inode *inode;
2091 fixup = container_of(work, struct btrfs_writepage_fixup, work);
2095 if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
2096 ClearPageChecked(page);
2100 inode = page->mapping->host;
2101 page_start = page_offset(page);
2102 page_end = page_offset(page) + PAGE_SIZE - 1;
2104 lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
2107 /* already ordered? We're done */
2108 if (PagePrivate2(page))
2111 ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
2114 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
2115 page_end, &cached_state);
2117 btrfs_start_ordered_extent(inode, ordered, 1);
2118 btrfs_put_ordered_extent(ordered);
2122 ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
2125 mapping_set_error(page->mapping, ret);
2126 end_extent_writepage(page, ret, page_start, page_end);
2127 ClearPageChecked(page);
2131 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
2134 mapping_set_error(page->mapping, ret);
2135 end_extent_writepage(page, ret, page_start, page_end);
2136 ClearPageChecked(page);
2140 ClearPageChecked(page);
2141 set_page_dirty(page);
2142 btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, false);
2144 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
2150 extent_changeset_free(data_reserved);
2154 * There are a few paths in the higher layers of the kernel that directly
2155 * set the page dirty bit without asking the filesystem if it is a
2156 * good idea. This causes problems because we want to make sure COW
2157 * properly happens and the data=ordered rules are followed.
2159 * In our case any range that doesn't have the ORDERED bit set
2160 * hasn't been properly set up for IO. We kick off an async process
2161 * to fix it up. The async helper will wait for ordered extents, set
2162 * the delalloc bit and make it safe to write the page.
2164 static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
2166 struct inode *inode = page->mapping->host;
2167 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2168 struct btrfs_writepage_fixup *fixup;
2170 /* this page is properly in the ordered list */
2171 if (TestClearPagePrivate2(page))
2174 if (PageChecked(page))
2177 fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
2181 SetPageChecked(page);
2183 btrfs_init_work(&fixup->work, btrfs_fixup_helper,
2184 btrfs_writepage_fixup_worker, NULL, NULL);
2186 btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
2190 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
2191 struct inode *inode, u64 file_pos,
2192 u64 disk_bytenr, u64 disk_num_bytes,
2193 u64 num_bytes, u64 ram_bytes,
2194 u8 compression, u8 encryption,
2195 u16 other_encoding, int extent_type)
2197 struct btrfs_root *root = BTRFS_I(inode)->root;
2198 struct btrfs_file_extent_item *fi;
2199 struct btrfs_path *path;
2200 struct extent_buffer *leaf;
2201 struct btrfs_key ins;
2203 int extent_inserted = 0;
2206 path = btrfs_alloc_path();
2211 * we may be replacing one extent in the tree with another.
2212 * The new extent is pinned in the extent map, and we don't want
2213 * to drop it from the cache until it is completely in the btree.
2215 * So, tell btrfs_drop_extents to leave this extent in the cache.
2216 * The caller is expected to unpin it and allow it to be merged with the others.
2219 ret = __btrfs_drop_extents(trans, root, inode, path, file_pos,
2220 file_pos + num_bytes, NULL, 0,
2221 1, sizeof(*fi), &extent_inserted);
2225 if (!extent_inserted) {
2226 ins.objectid = btrfs_ino(BTRFS_I(inode));
2227 ins.offset = file_pos;
2228 ins.type = BTRFS_EXTENT_DATA_KEY;
2230 path->leave_spinning = 1;
2231 ret = btrfs_insert_empty_item(trans, root, path, &ins,
2236 leaf = path->nodes[0];
2237 fi = btrfs_item_ptr(leaf, path->slots[0],
2238 struct btrfs_file_extent_item);
2239 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
2240 btrfs_set_file_extent_type(leaf, fi, extent_type);
2241 btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
2242 btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
2243 btrfs_set_file_extent_offset(leaf, fi, 0);
2244 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2245 btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
2246 btrfs_set_file_extent_compression(leaf, fi, compression);
2247 btrfs_set_file_extent_encryption(leaf, fi, encryption);
2248 btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
2250 btrfs_mark_buffer_dirty(leaf);
2251 btrfs_release_path(path);
2253 inode_add_bytes(inode, num_bytes);
2255 ins.objectid = disk_bytenr;
2256 ins.offset = disk_num_bytes;
2257 ins.type = BTRFS_EXTENT_ITEM_KEY;
2260 * Release the reserved range from the inode's dirty range map, as it
2261 * has already been moved into the delayed_ref_head.
2263 ret = btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
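/*
 * The return value is the number of bytes actually released from the
 * qgroup reservation; it is handed to btrfs_alloc_reserved_file_extent()
 * below as qg_released.
 */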
2267 ret = btrfs_alloc_reserved_file_extent(trans, root,
2268 btrfs_ino(BTRFS_I(inode)),
2269 file_pos, qg_released, &ins);
2271 btrfs_free_path(path);
2276 /* snapshot-aware defrag */
2277 struct sa_defrag_extent_backref {
2278 struct rb_node node;
2279 struct old_sa_defrag_extent *old;
2288 struct old_sa_defrag_extent {
2289 struct list_head list;
2290 struct new_sa_defrag_extent *new;
2299 struct new_sa_defrag_extent {
2300 struct rb_root root;
2301 struct list_head head;
2302 struct btrfs_path *path;
2303 struct inode *inode;
2311 static int backref_comp(struct sa_defrag_extent_backref *b1,
2312 struct sa_defrag_extent_backref *b2)
2314 if (b1->root_id < b2->root_id)
2316 else if (b1->root_id > b2->root_id)
2319 if (b1->inum < b2->inum)
2321 else if (b1->inum > b2->inum)
2324 if (b1->file_pos < b2->file_pos)
2326 else if (b1->file_pos > b2->file_pos)
2330 * [------------------------------] ===> (a range of space)
2331 * |<--->| |<---->| =============> (fs/file tree A)
2332 * |<---------------------------->| ===> (fs/file tree B)
2334 * A range of space can refer to two file extents in one tree while
2335 * referring to only one file extent in another tree.
2337 * So we may process a disk offset more than once (two extents in A)
2338 * and land on the same extent (one extent in B), then insert two
2339 * identical backrefs (both referring to the extent in B).
2344 static void backref_insert(struct rb_root *root,
2345 struct sa_defrag_extent_backref *backref)
2347 struct rb_node **p = &root->rb_node;
2348 struct rb_node *parent = NULL;
2349 struct sa_defrag_extent_backref *entry;
2354 entry = rb_entry(parent, struct sa_defrag_extent_backref, node);
2356 ret = backref_comp(backref, entry);
2360 p = &(*p)->rb_right;
2363 rb_link_node(&backref->node, parent, p);
2364 rb_insert_color(&backref->node, root);
2368 * Note the backref might have changed; in that case we just return 0.
2370 static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
2373 struct btrfs_file_extent_item *extent;
2374 struct old_sa_defrag_extent *old = ctx;
2375 struct new_sa_defrag_extent *new = old->new;
2376 struct btrfs_path *path = new->path;
2377 struct btrfs_key key;
2378 struct btrfs_root *root;
2379 struct sa_defrag_extent_backref *backref;
2380 struct extent_buffer *leaf;
2381 struct inode *inode = new->inode;
2382 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2388 if (BTRFS_I(inode)->root->root_key.objectid == root_id &&
2389 inum == btrfs_ino(BTRFS_I(inode)))
2392 key.objectid = root_id;
2393 key.type = BTRFS_ROOT_ITEM_KEY;
2394 key.offset = (u64)-1;
2396 root = btrfs_read_fs_root_no_name(fs_info, &key);
2398 if (PTR_ERR(root) == -ENOENT)
2401 btrfs_debug(fs_info, "inum=%llu, offset=%llu, root_id=%llu",
2402 inum, offset, root_id);
2403 return PTR_ERR(root);
2406 key.objectid = inum;
2407 key.type = BTRFS_EXTENT_DATA_KEY;
2408 if (offset > (u64)-1 << 32)
2411 key.offset = offset;
2413 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2414 if (WARN_ON(ret < 0))
2421 leaf = path->nodes[0];
2422 slot = path->slots[0];
2424 if (slot >= btrfs_header_nritems(leaf)) {
2425 ret = btrfs_next_leaf(root, path);
2428 } else if (ret > 0) {
2437 btrfs_item_key_to_cpu(leaf, &key, slot);
2439 if (key.objectid > inum)
2442 if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY)
2445 extent = btrfs_item_ptr(leaf, slot,
2446 struct btrfs_file_extent_item);
2448 if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr)
2452 * 'offset' refers to the exact key.offset,
2453 * NOT the 'offset' field in btrfs_extent_data_ref, i.e.
2454 * (key.offset - extent_offset).
2456 if (key.offset != offset)
2459 extent_offset = btrfs_file_extent_offset(leaf, extent);
2460 num_bytes = btrfs_file_extent_num_bytes(leaf, extent);
2462 if (extent_offset >= old->extent_offset + old->offset +
2463 old->len || extent_offset + num_bytes <=
2464 old->extent_offset + old->offset)
2469 backref = kmalloc(sizeof(*backref), GFP_NOFS);
2475 backref->root_id = root_id;
2476 backref->inum = inum;
2477 backref->file_pos = offset;
2478 backref->num_bytes = num_bytes;
2479 backref->extent_offset = extent_offset;
2480 backref->generation = btrfs_file_extent_generation(leaf, extent);
2482 backref_insert(&new->root, backref);
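/* relink_file_extents() walks this rbtree later and re-links each backref. */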
2485 btrfs_release_path(path);
2490 static noinline bool record_extent_backrefs(struct btrfs_path *path,
2491 struct new_sa_defrag_extent *new)
2493 struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
2494 struct old_sa_defrag_extent *old, *tmp;
2499 list_for_each_entry_safe(old, tmp, &new->head, list) {
2500 ret = iterate_inodes_from_logical(old->bytenr +
2501 old->extent_offset, fs_info,
2502 path, record_one_backref,
2504 if (ret < 0 && ret != -ENOENT)
2507 /* no backref to be processed for this extent */
2509 list_del(&old->list);
2514 if (list_empty(&new->head))
2520 static int relink_is_mergable(struct extent_buffer *leaf,
2521 struct btrfs_file_extent_item *fi,
2522 struct new_sa_defrag_extent *new)
2524 if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr)
2527 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2530 if (btrfs_file_extent_compression(leaf, fi) != new->compress_type)
2533 if (btrfs_file_extent_encryption(leaf, fi) ||
2534 btrfs_file_extent_other_encoding(leaf, fi))
2541 * Note the backref might have changed; in that case we just return 0.
2543 static noinline int relink_extent_backref(struct btrfs_path *path,
2544 struct sa_defrag_extent_backref *prev,
2545 struct sa_defrag_extent_backref *backref)
2547 struct btrfs_file_extent_item *extent;
2548 struct btrfs_file_extent_item *item;
2549 struct btrfs_ordered_extent *ordered;
2550 struct btrfs_trans_handle *trans;
2551 struct btrfs_root *root;
2552 struct btrfs_key key;
2553 struct extent_buffer *leaf;
2554 struct old_sa_defrag_extent *old = backref->old;
2555 struct new_sa_defrag_extent *new = old->new;
2556 struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
2557 struct inode *inode;
2558 struct extent_state *cached = NULL;
2567 if (prev && prev->root_id == backref->root_id &&
2568 prev->inum == backref->inum &&
2569 prev->file_pos + prev->num_bytes == backref->file_pos)
2572 /* step 1: get root */
2573 key.objectid = backref->root_id;
2574 key.type = BTRFS_ROOT_ITEM_KEY;
2575 key.offset = (u64)-1;
2577 index = srcu_read_lock(&fs_info->subvol_srcu);
2579 root = btrfs_read_fs_root_no_name(fs_info, &key);
2581 srcu_read_unlock(&fs_info->subvol_srcu, index);
2582 if (PTR_ERR(root) == -ENOENT)
2584 return PTR_ERR(root);
2587 if (btrfs_root_readonly(root)) {
2588 srcu_read_unlock(&fs_info->subvol_srcu, index);
2592 /* step 2: get inode */
2593 key.objectid = backref->inum;
2594 key.type = BTRFS_INODE_ITEM_KEY;
2597 inode = btrfs_iget(fs_info->sb, &key, root, NULL);
2598 if (IS_ERR(inode)) {
2599 srcu_read_unlock(&fs_info->subvol_srcu, index);
2603 srcu_read_unlock(&fs_info->subvol_srcu, index);
2605 /* step 3: relink backref */
2606 lock_start = backref->file_pos;
2607 lock_end = backref->file_pos + backref->num_bytes - 1;
2608 lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2611 ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
2613 btrfs_put_ordered_extent(ordered);
2617 trans = btrfs_join_transaction(root);
2618 if (IS_ERR(trans)) {
2619 ret = PTR_ERR(trans);
2623 key.objectid = backref->inum;
2624 key.type = BTRFS_EXTENT_DATA_KEY;
2625 key.offset = backref->file_pos;
2627 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2630 } else if (ret > 0) {
2635 extent = btrfs_item_ptr(path->nodes[0], path->slots[0],
2636 struct btrfs_file_extent_item);
2638 if (btrfs_file_extent_generation(path->nodes[0], extent) !=
2639 backref->generation)
2642 btrfs_release_path(path);
2644 start = backref->file_pos;
2645 if (backref->extent_offset < old->extent_offset + old->offset)
2646 start += old->extent_offset + old->offset -
2647 backref->extent_offset;
2649 len = min(backref->extent_offset + backref->num_bytes,
2650 old->extent_offset + old->offset + old->len);
2651 len -= max(backref->extent_offset, old->extent_offset + old->offset);
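/*
 * [start, start + len) is the part of this backref's file extent that
 * overlaps the defragged range; that is the piece we re-link below.
 */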
2653 ret = btrfs_drop_extents(trans, root, inode, start,
2658 key.objectid = btrfs_ino(BTRFS_I(inode));
2659 key.type = BTRFS_EXTENT_DATA_KEY;
2662 path->leave_spinning = 1;
2664 struct btrfs_file_extent_item *fi;
2666 struct btrfs_key found_key;
2668 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2673 leaf = path->nodes[0];
2674 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2676 fi = btrfs_item_ptr(leaf, path->slots[0],
2677 struct btrfs_file_extent_item);
2678 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
2680 if (extent_len + found_key.offset == start &&
2681 relink_is_mergable(leaf, fi, new)) {
2682 btrfs_set_file_extent_num_bytes(leaf, fi,
2684 btrfs_mark_buffer_dirty(leaf);
2685 inode_add_bytes(inode, len);
2691 btrfs_release_path(path);
2696 ret = btrfs_insert_empty_item(trans, root, path, &key,
2699 btrfs_abort_transaction(trans, ret);
2703 leaf = path->nodes[0];
2704 item = btrfs_item_ptr(leaf, path->slots[0],
2705 struct btrfs_file_extent_item);
2706 btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr);
2707 btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len);
2708 btrfs_set_file_extent_offset(leaf, item, start - new->file_pos);
2709 btrfs_set_file_extent_num_bytes(leaf, item, len);
2710 btrfs_set_file_extent_ram_bytes(leaf, item, new->len);
2711 btrfs_set_file_extent_generation(leaf, item, trans->transid);
2712 btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
2713 btrfs_set_file_extent_compression(leaf, item, new->compress_type);
2714 btrfs_set_file_extent_encryption(leaf, item, 0);
2715 btrfs_set_file_extent_other_encoding(leaf, item, 0);
2717 btrfs_mark_buffer_dirty(leaf);
2718 inode_add_bytes(inode, len);
2719 btrfs_release_path(path);
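/*
 * Add a reference from the new (defragged) extent to this
 * root/inode/offset to match the file extent item inserted above.
 */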
2721 ret = btrfs_inc_extent_ref(trans, root, new->bytenr,
2723 backref->root_id, backref->inum,
2724 new->file_pos); /* start - extent_offset */
2726 btrfs_abort_transaction(trans, ret);
2732 btrfs_release_path(path);
2733 path->leave_spinning = 0;
2734 btrfs_end_transaction(trans);
2736 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2742 static void free_sa_defrag_extent(struct new_sa_defrag_extent *new)
2744 struct old_sa_defrag_extent *old, *tmp;
2749 list_for_each_entry_safe(old, tmp, &new->head, list) {
2755 static void relink_file_extents(struct new_sa_defrag_extent *new)
2757 struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
2758 struct btrfs_path *path;
2759 struct sa_defrag_extent_backref *backref;
2760 struct sa_defrag_extent_backref *prev = NULL;
2761 struct inode *inode;
2762 struct rb_node *node;
2767 path = btrfs_alloc_path();
2771 if (!record_extent_backrefs(path, new)) {
2772 btrfs_free_path(path);
2775 btrfs_release_path(path);
2778 node = rb_first(&new->root);
2781 rb_erase(node, &new->root);
2783 backref = rb_entry(node, struct sa_defrag_extent_backref, node);
2785 ret = relink_extent_backref(path, prev, backref);
2798 btrfs_free_path(path);
2800 free_sa_defrag_extent(new);
2802 atomic_dec(&fs_info->defrag_running);
2803 wake_up(&fs_info->transaction_wait);
2806 static struct new_sa_defrag_extent *
2807 record_old_file_extents(struct inode *inode,
2808 struct btrfs_ordered_extent *ordered)
2810 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2811 struct btrfs_root *root = BTRFS_I(inode)->root;
2812 struct btrfs_path *path;
2813 struct btrfs_key key;
2814 struct old_sa_defrag_extent *old;
2815 struct new_sa_defrag_extent *new;
2818 new = kmalloc(sizeof(*new), GFP_NOFS);
2823 new->file_pos = ordered->file_offset;
2824 new->len = ordered->len;
2825 new->bytenr = ordered->start;
2826 new->disk_len = ordered->disk_len;
2827 new->compress_type = ordered->compress_type;
2828 new->root = RB_ROOT;
2829 INIT_LIST_HEAD(&new->head);
2831 path = btrfs_alloc_path();
2835 key.objectid = btrfs_ino(BTRFS_I(inode));
2836 key.type = BTRFS_EXTENT_DATA_KEY;
2837 key.offset = new->file_pos;
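/*
 * Find the first file extent item at or before the start of the
 * ordered range, then walk forward over every old extent overlapping it.
 */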
2839 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2842 if (ret > 0 && path->slots[0] > 0)
2845 /* find out all the old extents for the file range */
2847 struct btrfs_file_extent_item *extent;
2848 struct extent_buffer *l;
2857 slot = path->slots[0];
2859 if (slot >= btrfs_header_nritems(l)) {
2860 ret = btrfs_next_leaf(root, path);
2868 btrfs_item_key_to_cpu(l, &key, slot);
2870 if (key.objectid != btrfs_ino(BTRFS_I(inode)))
2872 if (key.type != BTRFS_EXTENT_DATA_KEY)
2874 if (key.offset >= new->file_pos + new->len)
2877 extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item);
2879 num_bytes = btrfs_file_extent_num_bytes(l, extent);
2880 if (key.offset + num_bytes < new->file_pos)
2883 disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent);
2887 extent_offset = btrfs_file_extent_offset(l, extent);
2889 old = kmalloc(sizeof(*old), GFP_NOFS);
2893 offset = max(new->file_pos, key.offset);
2894 end = min(new->file_pos + new->len, key.offset + num_bytes);
2896 old->bytenr = disk_bytenr;
2897 old->extent_offset = extent_offset;
2898 old->offset = offset - key.offset;
2899 old->len = end - offset;
2902 list_add_tail(&old->list, &new->head);
2908 btrfs_free_path(path);
2909 atomic_inc(&fs_info->defrag_running);
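/*
 * The matching atomic_dec() happens in relink_file_extents(), or in
 * btrfs_finish_ordered_io()'s error path if the relink never runs.
 */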
2914 btrfs_free_path(path);
2916 free_sa_defrag_extent(new);
2920 static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
2923 struct btrfs_block_group_cache *cache;
2925 cache = btrfs_lookup_block_group(fs_info, start);
2928 spin_lock(&cache->lock);
2929 cache->delalloc_bytes -= len;
2930 spin_unlock(&cache->lock);
2932 btrfs_put_block_group(cache);
2935 /* as ordered data IO finishes, this gets called so we can finish
2936 * an ordered extent once the range of bytes in the file it covers has been fully written.
2939 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
2941 struct inode *inode = ordered_extent->inode;
2942 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2943 struct btrfs_root *root = BTRFS_I(inode)->root;
2944 struct btrfs_trans_handle *trans = NULL;
2945 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2946 struct extent_state *cached_state = NULL;
2947 struct new_sa_defrag_extent *new = NULL;
2948 int compress_type = 0;
2950 u64 logical_len = ordered_extent->len;
2952 bool truncated = false;
2953 bool range_locked = false;
2954 bool clear_new_delalloc_bytes = false;
2956 if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
2957 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
2958 !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags))
2959 clear_new_delalloc_bytes = true;
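/*
 * Regular COW writes tag their delalloc range with EXTENT_DELALLOC_NEW;
 * arrange to clear that bit below once this ordered extent is finished.
 */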
2961 nolock = btrfs_is_free_space_inode(BTRFS_I(inode));
2963 if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
2968 btrfs_free_io_failure_record(BTRFS_I(inode),
2969 ordered_extent->file_offset,
2970 ordered_extent->file_offset +
2971 ordered_extent->len - 1);
2973 if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
2975 logical_len = ordered_extent->truncated_len;
2976 /* Truncated the entire extent, don't bother adding */
2981 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
2982 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
2985 * For the mwrite (mmap + memset to write) case, we still reserve
2986 * space for the NOCOW range.
2987 * As NOCOW won't cause a new delayed ref, just free the reserved space.
2989 btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset,
2990 ordered_extent->len);
2991 btrfs_ordered_update_i_size(inode, 0, ordered_extent);
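/*
 * A NOCOW write only needs the inode item brought up to date; no new
 * file extent item or csums have to be inserted.
 */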
2993 trans = btrfs_join_transaction_nolock(root);
2995 trans = btrfs_join_transaction(root);
2996 if (IS_ERR(trans)) {
2997 ret = PTR_ERR(trans);
3001 trans->block_rsv = &BTRFS_I(inode)->block_rsv;
3002 ret = btrfs_update_inode_fallback(trans, root, inode);
3003 if (ret) /* -ENOMEM or corruption */
3004 btrfs_abort_transaction(trans, ret);
3008 range_locked = true;
3009 lock_extent_bits(io_tree, ordered_extent->file_offset,
3010 ordered_extent->file_offset + ordered_extent->len - 1,
3013 ret = test_range_bit(io_tree, ordered_extent->file_offset,
3014 ordered_extent->file_offset + ordered_extent->len - 1,
3015 EXTENT_DEFRAG, 0, cached_state);
3017 u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
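/*
 * The "0 &&" below makes this branch dead code: snapshot-aware defrag
 * is effectively disabled, so record_old_file_extents() is never called
 * from here.
 */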
3018 if (0 && last_snapshot >= BTRFS_I(inode)->generation)
3019 /* the inode is shared */
3020 new = record_old_file_extents(inode, ordered_extent);
3022 clear_extent_bit(io_tree, ordered_extent->file_offset,
3023 ordered_extent->file_offset + ordered_extent->len - 1,
3024 EXTENT_DEFRAG, 0, 0, &cached_state);
3028 trans = btrfs_join_transaction_nolock(root);
3030 trans = btrfs_join_transaction(root);
3031 if (IS_ERR(trans)) {
3032 ret = PTR_ERR(trans);
3037 trans->block_rsv = &BTRFS_I(inode)->block_rsv;
3039 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
3040 compress_type = ordered_extent->compress_type;
3041 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
3042 BUG_ON(compress_type);
3043 btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset,
3044 ordered_extent->len);
3045 ret = btrfs_mark_extent_written(trans, BTRFS_I(inode),
3046 ordered_extent->file_offset,
3047 ordered_extent->file_offset +
3050 BUG_ON(root == fs_info->tree_root);
3051 ret = insert_reserved_file_extent(trans, inode,
3052 ordered_extent->file_offset,
3053 ordered_extent->start,
3054 ordered_extent->disk_len,
3055 logical_len, logical_len,
3056 compress_type, 0, 0,
3057 BTRFS_FILE_EXTENT_REG);
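/* On success, drop the per-block-group delalloc byte accounting for this extent. */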
3059 btrfs_release_delalloc_bytes(fs_info,
3060 ordered_extent->start,
3061 ordered_extent->disk_len);
3063 unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
3064 ordered_extent->file_offset, ordered_extent->len,
3067 btrfs_abort_transaction(trans, ret);
3071 ret = add_pending_csums(trans, inode, &ordered_extent->list);
3073 btrfs_abort_transaction(trans, ret);
3077 btrfs_ordered_update_i_size(inode, 0, ordered_extent);
3078 ret = btrfs_update_inode_fallback(trans, root, inode);
3079 if (ret) { /* -ENOMEM or corruption */
3080 btrfs_abort_transaction(trans, ret);
3085 if (range_locked || clear_new_delalloc_bytes) {
3086 unsigned int clear_bits = 0;
3089 clear_bits |= EXTENT_LOCKED;
3090 if (clear_new_delalloc_bytes)
3091 clear_bits |= EXTENT_DELALLOC_NEW;
3092 clear_extent_bit(&BTRFS_I(inode)->io_tree,
3093 ordered_extent->file_offset,
3094 ordered_extent->file_offset +
3095 ordered_extent->len - 1,
3097 (clear_bits & EXTENT_LOCKED) ? 1 : 0,
3102 btrfs_end_transaction(trans);
3104 if (ret || truncated) {
3108 start = ordered_extent->file_offset + logical_len;
3110 start = ordered_extent->file_offset;
3111 end = ordered_extent->file_offset + ordered_extent->len - 1;
3112 clear_extent_uptodate(io_tree, start, end, NULL);
3114 /* Drop the cache for the part of the extent we didn't write. */
3115 btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0);
3118 * If the ordered extent had an IOERR or something else went
3119 * wrong we need to return the space for this ordered extent
3120 * back to the allocator. We only free the extent in the
3121 * truncated case if we didn't write out the extent at all.
3123 if ((ret || !logical_len) &&
3124 !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
3125 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
3126 btrfs_free_reserved_extent(fs_info,
3127 ordered_extent->start,
3128 ordered_extent->disk_len, 1);
3133 * This needs to be done to make sure anybody waiting knows we are done
3134 * updating everything for this ordered extent.
3136 btrfs_remove_ordered_extent(inode, ordered_extent);
3138 /* for snapshot-aware defrag */
3141 free_sa_defrag_extent(new);
3142 atomic_dec(&fs_info->defrag_running);
3144 relink_file_extents(new);
3149 btrfs_put_ordered_extent(ordered_extent);
3150 /* once for the tree */
3151 btrfs_put_ordered_extent(ordered_extent);
3156 static void finish_ordered_fn(struct btrfs_work *work)
3158 struct btrfs_ordered_extent *ordered_extent;
3159 ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
3160 btrfs_finish_ordered_io(ordered_extent);
3163 static void btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
3164 struct extent_state *state, int uptodate)
3166 struct inode *inode = page->mapping->host;
3167 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3168 struct btrfs_ordered_extent *ordered_extent = NULL;
3169 struct btrfs_workqueue *wq;
3170 btrfs_work_func_t func;
3172 trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
3174 ClearPagePrivate2(page);
3175 if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
3176 end - start + 1, uptodate))
3179 if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
3180 wq = fs_info->endio_freespace_worker;
3181 func = btrfs_freespace_write_helper;
3183 wq = fs_info->endio_write_workers;
3184 func = btrfs_endio_write_helper;
3187 btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL,
3189 btrfs_queue_work(wq, &ordered_extent->work);
3192 static int __readpage_endio_check(struct inode *inode,
3193 struct btrfs_io_bio *io_bio,
3194 int icsum, struct page *page,
3195 int pgoff, u64 start, size_t len)
3201 csum_expected = *(((u32 *)io_bio->csum) + icsum);
3203 kaddr = kmap_atomic(page);
3204 csum = btrfs_csum_data(kaddr + pgoff, csum, len);
3205 btrfs_csum_final(csum, (u8 *)&csum);
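/*
 * Compare the checksum computed over this block with the expected
 * value from io_bio->csum; a mismatch is reported below via
 * btrfs_print_data_csum_error() and the page contents are poisoned.
 */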
3206 if (csum != csum_expected)
3209 kunmap_atomic(kaddr);
3212 btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected,
3213 io_bio->mirror_num);
3214 memset(kaddr + pgoff, 1, len);
3215 flush_dcache_page(page);
3216 kunmap_atomic(kaddr);
3221 * When reads are done, we need to check csums to verify the data is correct.
3222 * If there's a match, we allow the bio to finish. If not, the code in
3223 * extent_io.c will try to find good copies for us.