Linux-libre 5.3.12-gnu: fs/btrfs/space-info.c (librecmc/linux-libre.git)
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "ctree.h"
4 #include "space-info.h"
5 #include "sysfs.h"
6 #include "volumes.h"
7 #include "free-space-cache.h"
8 #include "ordered-data.h"
9 #include "transaction.h"
10 #include "math.h"
11
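/*
 * Return the number of bytes charged against @s_info: used, reserved, pinned
 * and readonly bytes, plus bytes_may_use when @may_use_included is true.
 */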
12 u64 btrfs_space_info_used(struct btrfs_space_info *s_info,
13                           bool may_use_included)
14 {
15         ASSERT(s_info);
16         return s_info->bytes_used + s_info->bytes_reserved +
17                 s_info->bytes_pinned + s_info->bytes_readonly +
18                 (may_use_included ? s_info->bytes_may_use : 0);
19 }
20
21 /*
22  * After adding space to the filesystem, we need to clear the full flags
23  * on all the space infos so allocations can be attempted again.
24  */
25 void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
26 {
27         struct list_head *head = &info->space_info;
28         struct btrfs_space_info *found;
29
30         rcu_read_lock();
31         list_for_each_entry_rcu(found, head, list)
32                 found->full = 0;
33         rcu_read_unlock();
34 }
35
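/*
 * Map a block group type to the name used for the space_info's sysfs kobject
 * (added under info->space_info_kobj by create_space_info()): "data",
 * "metadata", "system", or "mixed" for combined data+metadata groups.
 */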
36 static const char *alloc_name(u64 flags)
37 {
38         switch (flags) {
39         case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA:
40                 return "mixed";
41         case BTRFS_BLOCK_GROUP_METADATA:
42                 return "metadata";
43         case BTRFS_BLOCK_GROUP_DATA:
44                 return "data";
45         case BTRFS_BLOCK_GROUP_SYSTEM:
46                 return "system";
47         default:
48                 WARN_ON(1);
49                 return "invalid-combination";
50         }
51 }
52
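/*
 * Allocate and initialize the space_info for the block group type given in
 * @flags, register its sysfs kobject and add it to fs_info->space_info.  The
 * data space_info is also cached in fs_info->data_sinfo.
 */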
53 static int create_space_info(struct btrfs_fs_info *info, u64 flags)
54 {
55
56         struct btrfs_space_info *space_info;
57         int i;
58         int ret;
59
60         space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
61         if (!space_info)
62                 return -ENOMEM;
63
64         ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
65                                  GFP_KERNEL);
66         if (ret) {
67                 kfree(space_info);
68                 return ret;
69         }
70
71         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
72                 INIT_LIST_HEAD(&space_info->block_groups[i]);
73         init_rwsem(&space_info->groups_sem);
74         spin_lock_init(&space_info->lock);
75         space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
76         space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
77         init_waitqueue_head(&space_info->wait);
78         INIT_LIST_HEAD(&space_info->ro_bgs);
79         INIT_LIST_HEAD(&space_info->tickets);
80         INIT_LIST_HEAD(&space_info->priority_tickets);
81
82         ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype,
83                                     info->space_info_kobj, "%s",
84                                     alloc_name(space_info->flags));
85         if (ret) {
86                 kobject_put(&space_info->kobj);
87                 return ret;
88         }
89
90         list_add_rcu(&space_info->list, &info->space_info);
91         if (flags & BTRFS_BLOCK_GROUP_DATA)
92                 info->data_sinfo = space_info;
93
94         return ret;
95 }
96
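/*
 * Create the initial space_infos at mount time: SYSTEM always gets its own,
 * and depending on the MIXED_GROUPS incompat flag we either create one
 * combined METADATA|DATA space_info or separate METADATA and DATA ones.
 */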
97 int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
98 {
99         struct btrfs_super_block *disk_super;
100         u64 features;
101         u64 flags;
102         int mixed = 0;
103         int ret;
104
105         disk_super = fs_info->super_copy;
106         if (!btrfs_super_root(disk_super))
107                 return -EINVAL;
108
109         features = btrfs_super_incompat_flags(disk_super);
110         if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
111                 mixed = 1;
112
113         flags = BTRFS_BLOCK_GROUP_SYSTEM;
114         ret = create_space_info(fs_info, flags);
115         if (ret)
116                 goto out;
117
118         if (mixed) {
119                 flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
120                 ret = create_space_info(fs_info, flags);
121         } else {
122                 flags = BTRFS_BLOCK_GROUP_METADATA;
123                 ret = create_space_info(fs_info, flags);
124                 if (ret)
125                         goto out;
126
127                 flags = BTRFS_BLOCK_GROUP_DATA;
128                 ret = create_space_info(fs_info, flags);
129         }
130 out:
131         return ret;
132 }
133
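/*
 * Account a new (or newly read) block group against its space_info: bump the
 * logical and on-disk totals (the latter scaled by the raid factor), record
 * the used and readonly bytes, clear the full flag and hand the new free
 * space to any waiting reservation tickets.
 */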
134 void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
135                              u64 total_bytes, u64 bytes_used,
136                              u64 bytes_readonly,
137                              struct btrfs_space_info **space_info)
138 {
139         struct btrfs_space_info *found;
140         int factor;
141
142         factor = btrfs_bg_type_to_factor(flags);
143
144         found = btrfs_find_space_info(info, flags);
145         ASSERT(found);
146         spin_lock(&found->lock);
147         found->total_bytes += total_bytes;
148         found->disk_total += total_bytes * factor;
149         found->bytes_used += bytes_used;
150         found->disk_used += bytes_used * factor;
151         found->bytes_readonly += bytes_readonly;
152         if (total_bytes > 0)
153                 found->full = 0;
154         btrfs_space_info_add_new_bytes(info, found,
155                                        total_bytes - bytes_used -
156                                        bytes_readonly);
157         spin_unlock(&found->lock);
158         *space_info = found;
159 }
160
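/*
 * Find the space_info matching the block group type bits in @flags, or NULL
 * if no such space_info has been created yet.
 */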
161 struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
162                                                u64 flags)
163 {
164         struct list_head *head = &info->space_info;
165         struct btrfs_space_info *found;
166
167         flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
168
169         rcu_read_lock();
170         list_for_each_entry_rcu(found, head, list) {
171                 if (found->flags & flags) {
172                         rcu_read_unlock();
173                         return found;
174                 }
175         }
176         rcu_read_unlock();
177         return NULL;
178 }
179
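/*
 * How much headroom we want to keep for the global block reserve when
 * deciding whether to overcommit: twice its current size, so reservations
 * can't eat into space the global reserve may still need.
 */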
180 static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
181 {
182         return (global->size << 1);
183 }
184
185 static int can_overcommit(struct btrfs_fs_info *fs_info,
186                           struct btrfs_space_info *space_info, u64 bytes,
187                           enum btrfs_reserve_flush_enum flush,
188                           bool system_chunk)
189 {
190         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
191         u64 profile;
192         u64 space_size;
193         u64 avail;
194         u64 used;
195         int factor;
196
197         /* Don't overcommit when in mixed mode. */
198         if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
199                 return 0;
200
201         if (system_chunk)
202                 profile = btrfs_system_alloc_profile(fs_info);
203         else
204                 profile = btrfs_metadata_alloc_profile(fs_info);
205
206         used = btrfs_space_info_used(space_info, false);
207
208         /*
209          * We only want to allow over committing if we have lots of actual space
210          * free, but if we don't have enough space to handle the global reserve
211          * space then we could end up having a real enospc problem when trying
212          * to allocate a chunk or some other such important allocation.
213          */
214         spin_lock(&global_rsv->lock);
215         space_size = calc_global_rsv_need_space(global_rsv);
216         spin_unlock(&global_rsv->lock);
217         if (used + space_size >= space_info->total_bytes)
218                 return 0;
219
220         used += space_info->bytes_may_use;
221
222         avail = atomic64_read(&fs_info->free_chunk_space);
223
224         /*
225          * If we have dup, raid1 or raid10 then only half of the free
226          * space is actually usable.  For raid56, the space info used
227          * doesn't include the parity drive, so we don't have to
228          * change the math
229          */
230         factor = btrfs_bg_type_to_factor(profile);
231         avail = div_u64(avail, factor);
232
233         /*
234          * If we aren't flushing all things, let us overcommit up to
235          * half of the space. If we can flush, don't let us overcommit
236          * too much, let it overcommit up to 1/8 of the space.
237          */
238         if (flush == BTRFS_RESERVE_FLUSH_ALL)
239                 avail >>= 3;
240         else
241                 avail >>= 1;
242
243         if (used + bytes < space_info->total_bytes + avail)
244                 return 1;
245         return 0;
246 }
247
248 /*
249  * This is for space we already have accounted in space_info->bytes_may_use, so
250  * basically when we're returning space from block_rsv's.
251  */
252 void btrfs_space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
253                                     struct btrfs_space_info *space_info,
254                                     u64 num_bytes)
255 {
256         struct reserve_ticket *ticket;
257         struct list_head *head;
258         u64 used;
259         enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
260         bool check_overcommit = false;
261
262         spin_lock(&space_info->lock);
263         head = &space_info->priority_tickets;
264
265         /*
266          * If we are over our limit then we need to check and see if we can
267          * overcommit, and if we can't then we just need to free up our space
268          * and not satisfy any requests.
269          */
270         used = btrfs_space_info_used(space_info, true);
271         if (used - num_bytes >= space_info->total_bytes)
272                 check_overcommit = true;
273 again:
274         while (!list_empty(head) && num_bytes) {
275                 ticket = list_first_entry(head, struct reserve_ticket,
276                                           list);
277                 /*
278                  * We use 0 bytes because this space is already reserved, so
279                  * adding the ticket space would be a double count.
280                  */
281                 if (check_overcommit &&
282                     !can_overcommit(fs_info, space_info, 0, flush, false))
283                         break;
284                 if (num_bytes >= ticket->bytes) {
285                         list_del_init(&ticket->list);
286                         num_bytes -= ticket->bytes;
287                         ticket->bytes = 0;
288                         space_info->tickets_id++;
289                         wake_up(&ticket->wait);
290                 } else {
291                         ticket->bytes -= num_bytes;
292                         num_bytes = 0;
293                 }
294         }
295
296         if (num_bytes && head == &space_info->priority_tickets) {
297                 head = &space_info->tickets;
298                 flush = BTRFS_RESERVE_FLUSH_ALL;
299                 goto again;
300         }
301         btrfs_space_info_update_bytes_may_use(fs_info, space_info, -num_bytes);
302         trace_btrfs_space_reservation(fs_info, "space_info",
303                                       space_info->flags, num_bytes, 0);
304         spin_unlock(&space_info->lock);
305 }
306
307 /*
308  * This is for newly allocated space that isn't accounted in
309  * space_info->bytes_may_use yet.  So if we allocate a chunk or unpin an extent
310  * we use this helper.
311  */
312 void btrfs_space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
313                                     struct btrfs_space_info *space_info,
314                                     u64 num_bytes)
315 {
316         struct reserve_ticket *ticket;
317         struct list_head *head = &space_info->priority_tickets;
318
319 again:
320         while (!list_empty(head) && num_bytes) {
321                 ticket = list_first_entry(head, struct reserve_ticket,
322                                           list);
323                 if (num_bytes >= ticket->bytes) {
324                         trace_btrfs_space_reservation(fs_info, "space_info",
325                                                       space_info->flags,
326                                                       ticket->bytes, 1);
327                         list_del_init(&ticket->list);
328                         num_bytes -= ticket->bytes;
329                         btrfs_space_info_update_bytes_may_use(fs_info,
330                                                               space_info,
331                                                               ticket->bytes);
332                         ticket->bytes = 0;
333                         space_info->tickets_id++;
334                         wake_up(&ticket->wait);
335                 } else {
336                         trace_btrfs_space_reservation(fs_info, "space_info",
337                                                       space_info->flags,
338                                                       num_bytes, 1);
339                         btrfs_space_info_update_bytes_may_use(fs_info,
340                                                               space_info,
341                                                               num_bytes);
342                         ticket->bytes -= num_bytes;
343                         num_bytes = 0;
344                 }
345         }
346
347         if (num_bytes && head == &space_info->priority_tickets) {
348                 head = &space_info->tickets;
349                 goto again;
350         }
351 }
352
353 #define DUMP_BLOCK_RSV(fs_info, rsv_name)                               \
354 do {                                                                    \
355         struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name;           \
356         spin_lock(&__rsv->lock);                                        \
357         btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu",      \
358                    __rsv->size, __rsv->reserved);                       \
359         spin_unlock(&__rsv->lock);                                      \
360 } while (0)
361
362 void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
363                            struct btrfs_space_info *info, u64 bytes,
364                            int dump_block_groups)
365 {
366         struct btrfs_block_group_cache *cache;
367         int index = 0;
368
369         spin_lock(&info->lock);
370         btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
371                    info->flags,
372                    info->total_bytes - btrfs_space_info_used(info, true),
373                    info->full ? "" : "not ");
374         btrfs_info(fs_info,
375                 "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
376                 info->total_bytes, info->bytes_used, info->bytes_pinned,
377                 info->bytes_reserved, info->bytes_may_use,
378                 info->bytes_readonly);
379         spin_unlock(&info->lock);
380
381         DUMP_BLOCK_RSV(fs_info, global_block_rsv);
382         DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
383         DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
384         DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
385         DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
386
387         if (!dump_block_groups)
388                 return;
389
390         down_read(&info->groups_sem);
391 again:
392         list_for_each_entry(cache, &info->block_groups[index], list) {
393                 spin_lock(&cache->lock);
394                 btrfs_info(fs_info,
395                         "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
396                         cache->key.objectid, cache->key.offset,
397                         btrfs_block_group_used(&cache->item), cache->pinned,
398                         cache->reserved, cache->ro ? "[readonly]" : "");
399                 btrfs_dump_free_space(cache, bytes);
400                 spin_unlock(&cache->lock);
401         }
402         if (++index < BTRFS_NR_RAID_TYPES)
403                 goto again;
404         up_read(&info->groups_sem);
405 }
406
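/*
 * Kick writeback of up to @nr_pages of dirty pages so delalloc reservations
 * can be freed.  If we can't take ->s_umount (e.g. a remount is in progress)
 * fall back to btrfs_start_delalloc_roots() and, when we're not already
 * inside a transaction, wait for the resulting ordered extents.
 */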
407 static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
408                                          unsigned long nr_pages, int nr_items)
409 {
410         struct super_block *sb = fs_info->sb;
411
412         if (down_read_trylock(&sb->s_umount)) {
413                 writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
414                 up_read(&sb->s_umount);
415         } else {
416                 /*
417                  * We needn't worry about the filesystem going from r/w to
418                  * r/o even though we don't take ->s_umount here, because
419                  * the filesystem should guarantee that its delalloc inode
420                  * list is empty once it is read-only (all dirty pages have
421                  * been written to disk).
422                  */
423                 btrfs_start_delalloc_roots(fs_info, nr_items);
424                 if (!current->journal_info)
425                         btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
426         }
427 }
428
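/*
 * Estimate how many metadata items we need to flush to reclaim @to_reclaim
 * bytes, based on the transaction metadata size charged for a single item;
 * the result is always at least 1.
 */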
429 static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
430                                         u64 to_reclaim)
431 {
432         u64 bytes;
433         u64 nr;
434
435         bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
436         nr = div64_u64(to_reclaim, bytes);
437         if (!nr)
438                 nr = 1;
439         return nr;
440 }
441
442 #define EXTENT_SIZE_PER_ITEM    SZ_256K
443
444 /*
445  * shrink metadata reservation for delalloc
446  */
447 static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
448                             u64 orig, bool wait_ordered)
449 {
450         struct btrfs_space_info *space_info;
451         struct btrfs_trans_handle *trans;
452         u64 delalloc_bytes;
453         u64 dio_bytes;
454         u64 async_pages;
455         u64 items;
456         long time_left;
457         unsigned long nr_pages;
458         int loops;
459
460         /* Calc the number of items we need to flush for this space reservation */
461         items = calc_reclaim_items_nr(fs_info, to_reclaim);
462         to_reclaim = items * EXTENT_SIZE_PER_ITEM;
463
464         trans = (struct btrfs_trans_handle *)current->journal_info;
465         space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
466
467         delalloc_bytes = percpu_counter_sum_positive(
468                                                 &fs_info->delalloc_bytes);
469         dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
470         if (delalloc_bytes == 0 && dio_bytes == 0) {
471                 if (trans)
472                         return;
473                 if (wait_ordered)
474                         btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
475                 return;
476         }
477
478         /*
479          * If we are doing more ordered than delalloc we need to just wait on
480          * ordered extents, otherwise we'll waste time trying to flush delalloc
481          * that likely won't give us the space back we need.
482          */
483         if (dio_bytes > delalloc_bytes)
484                 wait_ordered = true;
485
486         loops = 0;
487         while ((delalloc_bytes || dio_bytes) && loops < 3) {
488                 nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
489
490                 /*
491                  * Triggers inode writeback for up to nr_pages. This will invoke
492                  * ->writepages callback and trigger delalloc filling
493                  *  (btrfs_run_delalloc_range()).
494                  */
495                 btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
496
497                 /*
498                  * We need to wait for the compressed pages to start before
499                  * we continue.
500                  */
501                 async_pages = atomic_read(&fs_info->async_delalloc_pages);
502                 if (!async_pages)
503                         goto skip_async;
504
505                 /*
506                  * Calculate how many compressed pages we want to be written
507                  * before we continue, i.e. if there are more async pages than we
508                  * require, wait_event will wait until nr_pages are written.
509                  */
510                 if (async_pages <= nr_pages)
511                         async_pages = 0;
512                 else
513                         async_pages -= nr_pages;
514
515                 wait_event(fs_info->async_submit_wait,
516                            atomic_read(&fs_info->async_delalloc_pages) <=
517                            (int)async_pages);
518 skip_async:
519                 spin_lock(&space_info->lock);
520                 if (list_empty(&space_info->tickets) &&
521                     list_empty(&space_info->priority_tickets)) {
522                         spin_unlock(&space_info->lock);
523                         break;
524                 }
525                 spin_unlock(&space_info->lock);
526
527                 loops++;
528                 if (wait_ordered && !trans) {
529                         btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
530                 } else {
531                         time_left = schedule_timeout_killable(1);
532                         if (time_left)
533                                 break;
534                 }
535                 delalloc_bytes = percpu_counter_sum_positive(
536                                                 &fs_info->delalloc_bytes);
537                 dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
538         }
539 }
540
541 /**
542  * may_commit_transaction - possibly commit the transaction if it's ok to
543  * @fs_info - the filesystem we're allocating in
544  * @space_info - the space_info we're trying to satisfy a reservation for
545  *
546  * This will check to make sure that committing the transaction will actually
547  * get us somewhere and then commit the transaction if it does.  Otherwise it
548  * will return -ENOSPC if no progress can be made, or -EAGAIN if we are
549  * already running inside a transaction.
550  */
551 static int may_commit_transaction(struct btrfs_fs_info *fs_info,
552                                   struct btrfs_space_info *space_info)
553 {
554         struct reserve_ticket *ticket = NULL;
555         struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
556         struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
557         struct btrfs_trans_handle *trans;
558         u64 bytes_needed;
559         u64 reclaim_bytes = 0;
560
561         trans = (struct btrfs_trans_handle *)current->journal_info;
562         if (trans)
563                 return -EAGAIN;
564
565         spin_lock(&space_info->lock);
566         if (!list_empty(&space_info->priority_tickets))
567                 ticket = list_first_entry(&space_info->priority_tickets,
568                                           struct reserve_ticket, list);
569         else if (!list_empty(&space_info->tickets))
570                 ticket = list_first_entry(&space_info->tickets,
571                                           struct reserve_ticket, list);
572         bytes_needed = (ticket) ? ticket->bytes : 0;
573         spin_unlock(&space_info->lock);
574
575         if (!bytes_needed)
576                 return 0;
577
578         trans = btrfs_join_transaction(fs_info->extent_root);
579         if (IS_ERR(trans))
580                 return PTR_ERR(trans);
581
582         /*
583          * See if there is enough pinned space to make this reservation, or if
584          * we have block groups that are going to be freed, allowing us to
585          * possibly do a chunk allocation the next loop through.
586          */
587         if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
588             __percpu_counter_compare(&space_info->total_bytes_pinned,
589                                      bytes_needed,
590                                      BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
591                 goto commit;
592
593         /*
594          * See if there is some space in the delayed insertion reservation for
595          * this reservation.
596          */
597         if (space_info != delayed_rsv->space_info)
598                 goto enospc;
599
600         spin_lock(&delayed_rsv->lock);
601         reclaim_bytes += delayed_rsv->reserved;
602         spin_unlock(&delayed_rsv->lock);
603
604         spin_lock(&delayed_refs_rsv->lock);
605         reclaim_bytes += delayed_refs_rsv->reserved;
606         spin_unlock(&delayed_refs_rsv->lock);
607         if (reclaim_bytes >= bytes_needed)
608                 goto commit;
609         bytes_needed -= reclaim_bytes;
610
611         if (__percpu_counter_compare(&space_info->total_bytes_pinned,
612                                    bytes_needed,
613                                    BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
614                 goto enospc;
615
616 commit:
617         return btrfs_commit_transaction(trans);
618 enospc:
619         btrfs_end_transaction(trans);
620         return -ENOSPC;
621 }
622
623 /*
624  * Try to flush some data based on policy set by @state. This is only advisory
625  * and may fail for various reasons. The caller is supposed to examine the
626  * state of @space_info to detect the outcome.
627  */
628 static void flush_space(struct btrfs_fs_info *fs_info,
629                        struct btrfs_space_info *space_info, u64 num_bytes,
630                        int state)
631 {
632         struct btrfs_root *root = fs_info->extent_root;
633         struct btrfs_trans_handle *trans;
634         int nr;
635         int ret = 0;
636
637         switch (state) {
638         case FLUSH_DELAYED_ITEMS_NR:
639         case FLUSH_DELAYED_ITEMS:
640                 if (state == FLUSH_DELAYED_ITEMS_NR)
641                         nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
642                 else
643                         nr = -1;
644
645                 trans = btrfs_join_transaction(root);
646                 if (IS_ERR(trans)) {
647                         ret = PTR_ERR(trans);
648                         break;
649                 }
650                 ret = btrfs_run_delayed_items_nr(trans, nr);
651                 btrfs_end_transaction(trans);
652                 break;
653         case FLUSH_DELALLOC:
654         case FLUSH_DELALLOC_WAIT:
655                 shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
656                                 state == FLUSH_DELALLOC_WAIT);
657                 break;
658         case FLUSH_DELAYED_REFS_NR:
659         case FLUSH_DELAYED_REFS:
660                 trans = btrfs_join_transaction(root);
661                 if (IS_ERR(trans)) {
662                         ret = PTR_ERR(trans);
663                         break;
664                 }
665                 if (state == FLUSH_DELAYED_REFS_NR)
666                         nr = calc_reclaim_items_nr(fs_info, num_bytes);
667                 else
668                         nr = 0;
669                 btrfs_run_delayed_refs(trans, nr);
670                 btrfs_end_transaction(trans);
671                 break;
672         case ALLOC_CHUNK:
673         case ALLOC_CHUNK_FORCE:
674                 trans = btrfs_join_transaction(root);
675                 if (IS_ERR(trans)) {
676                         ret = PTR_ERR(trans);
677                         break;
678                 }
679                 ret = btrfs_chunk_alloc(trans,
680                                 btrfs_metadata_alloc_profile(fs_info),
681                                 (state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
682                                         CHUNK_ALLOC_FORCE);
683                 btrfs_end_transaction(trans);
684                 if (ret > 0 || ret == -ENOSPC)
685                         ret = 0;
686                 break;
687         case COMMIT_TRANS:
688                 /*
689                  * If we have pending delayed iputs then we could free up a
690                  * bunch of pinned space, so make sure we run the iputs before
691                  * we do our pinned bytes check below.
692                  */
693                 btrfs_run_delayed_iputs(fs_info);
694                 btrfs_wait_on_delayed_iputs(fs_info);
695
696                 ret = may_commit_transaction(fs_info, space_info);
697                 break;
698         default:
699                 ret = -ENOSPC;
700                 break;
701         }
702
703         trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
704                                 ret);
705         return;
706 }
707
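/*
 * Work out how many bytes the flushers should try to reclaim.  If there are
 * outstanding tickets their total is the target; otherwise aim to get usage
 * back under roughly 90-95% of the space_info, but reclaim nothing while we
 * can still overcommit and never more than bytes_may_use + bytes_reserved.
 */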
708 static inline u64
709 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
710                                  struct btrfs_space_info *space_info,
711                                  bool system_chunk)
712 {
713         struct reserve_ticket *ticket;
714         u64 used;
715         u64 expected;
716         u64 to_reclaim = 0;
717
718         list_for_each_entry(ticket, &space_info->tickets, list)
719                 to_reclaim += ticket->bytes;
720         list_for_each_entry(ticket, &space_info->priority_tickets, list)
721                 to_reclaim += ticket->bytes;
722         if (to_reclaim)
723                 return to_reclaim;
724
725         to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
726         if (can_overcommit(fs_info, space_info, to_reclaim,
727                            BTRFS_RESERVE_FLUSH_ALL, system_chunk))
728                 return 0;
729
730         used = btrfs_space_info_used(space_info, true);
731
732         if (can_overcommit(fs_info, space_info, SZ_1M,
733                            BTRFS_RESERVE_FLUSH_ALL, system_chunk))
734                 expected = div_factor_fine(space_info->total_bytes, 95);
735         else
736                 expected = div_factor_fine(space_info->total_bytes, 90);
737
738         if (used > expected)
739                 to_reclaim = used - expected;
740         else
741                 to_reclaim = 0;
742         to_reclaim = min(to_reclaim, space_info->bytes_may_use +
743                                      space_info->bytes_reserved);
744         return to_reclaim;
745 }
746
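/*
 * Decide whether the preemptive async flusher should be kicked: only once
 * usage crosses 98% of the space_info, there is actually something to
 * reclaim, we aren't simply full of used/reserved bytes, and the filesystem
 * isn't closing or being remounted.
 */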
747 static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
748                                         struct btrfs_space_info *space_info,
749                                         u64 used, bool system_chunk)
750 {
751         u64 thresh = div_factor_fine(space_info->total_bytes, 98);
752
753         /* If we're just plain full then async reclaim just slows us down. */
754         if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
755                 return 0;
756
757         if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
758                                               system_chunk))
759                 return 0;
760
761         return (used >= thresh && !btrfs_fs_closing(fs_info) &&
762                 !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
763 }
764
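/*
 * Fail every remaining ticket with -ENOSPC.  Returns true if any ticket had
 * already received part of its reservation, in which case the caller restarts
 * the flush state machine instead of giving up.
 */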
765 static bool wake_all_tickets(struct list_head *head)
766 {
767         struct reserve_ticket *ticket;
768
769         while (!list_empty(head)) {
770                 ticket = list_first_entry(head, struct reserve_ticket, list);
771                 list_del_init(&ticket->list);
772                 ticket->error = -ENOSPC;
773                 wake_up(&ticket->wait);
774                 if (ticket->bytes != ticket->orig_bytes)
775                         return true;
776         }
777         return false;
778 }
779
780 /*
781  * This is for normal flushers; we can wait all goddamned day if we want to.  We
782  * will loop and continuously try to flush as long as we are making progress.
783  * We count progress as clearing off tickets each time we have to loop.
784  */
785 static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
786 {
787         struct btrfs_fs_info *fs_info;
788         struct btrfs_space_info *space_info;
789         u64 to_reclaim;
790         int flush_state;
791         int commit_cycles = 0;
792         u64 last_tickets_id;
793
794         fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
795         space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
796
797         spin_lock(&space_info->lock);
798         to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
799                                                       false);
800         if (!to_reclaim) {
801                 space_info->flush = 0;
802                 spin_unlock(&space_info->lock);
803                 return;
804         }
805         last_tickets_id = space_info->tickets_id;
806         spin_unlock(&space_info->lock);
807
808         flush_state = FLUSH_DELAYED_ITEMS_NR;
809         do {
810                 flush_space(fs_info, space_info, to_reclaim, flush_state);
811                 spin_lock(&space_info->lock);
812                 if (list_empty(&space_info->tickets)) {
813                         space_info->flush = 0;
814                         spin_unlock(&space_info->lock);
815                         return;
816                 }
817                 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
818                                                               space_info,
819                                                               false);
820                 if (last_tickets_id == space_info->tickets_id) {
821                         flush_state++;
822                 } else {
823                         last_tickets_id = space_info->tickets_id;
824                         flush_state = FLUSH_DELAYED_ITEMS_NR;
825                         if (commit_cycles)
826                                 commit_cycles--;
827                 }
828
829                 /*
830                  * We don't want to force a chunk allocation until we've tried
831                  * pretty hard to reclaim space.  Think of the case where we
832                  * freed up a bunch of space and so have a lot of pinned space
833                  * to reclaim.  We would rather use that than possibly create a
834                  * underutilized metadata chunk.  So if this is our first run
835                  * through the flushing state machine skip ALLOC_CHUNK_FORCE and
836                  * commit the transaction.  If nothing has changed the next go
837                  * around then we can force a chunk allocation.
838                  */
839                 if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
840                         flush_state++;
841
842                 if (flush_state > COMMIT_TRANS) {
843                         commit_cycles++;
844                         if (commit_cycles > 2) {
845                                 if (wake_all_tickets(&space_info->tickets)) {
846                                         flush_state = FLUSH_DELAYED_ITEMS_NR;
847                                         commit_cycles--;
848                                 } else {
849                                         space_info->flush = 0;
850                                 }
851                         } else {
852                                 flush_state = FLUSH_DELAYED_ITEMS_NR;
853                         }
854                 }
855                 spin_unlock(&space_info->lock);
856         } while (flush_state <= COMMIT_TRANS);
857 }
858
859 void btrfs_init_async_reclaim_work(struct work_struct *work)
860 {
861         INIT_WORK(work, btrfs_async_reclaim_metadata_space);
862 }
863
864 static const enum btrfs_flush_state priority_flush_states[] = {
865         FLUSH_DELAYED_ITEMS_NR,
866         FLUSH_DELAYED_ITEMS,
867         ALLOC_CHUNK,
868 };
869
870 static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
871                                             struct btrfs_space_info *space_info,
872                                             struct reserve_ticket *ticket)
873 {
874         u64 to_reclaim;
875         int flush_state;
876
877         spin_lock(&space_info->lock);
878         to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
879                                                       false);
880         if (!to_reclaim) {
881                 spin_unlock(&space_info->lock);
882                 return;
883         }
884         spin_unlock(&space_info->lock);
885
886         flush_state = 0;
887         do {
888                 flush_space(fs_info, space_info, to_reclaim,
889                             priority_flush_states[flush_state]);
890                 flush_state++;
891                 spin_lock(&space_info->lock);
892                 if (ticket->bytes == 0) {
893                         spin_unlock(&space_info->lock);
894                         return;
895                 }
896                 spin_unlock(&space_info->lock);
897         } while (flush_state < ARRAY_SIZE(priority_flush_states));
898 }
899
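/*
 * Sleep until our ticket has been satisfied (ticket->bytes reaches zero) or
 * the flusher gives up and sets an error.  A fatal signal gets us -EINTR, and
 * any partially granted space is returned to the space_info before we return.
 */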
900 static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
901                                struct btrfs_space_info *space_info,
902                                struct reserve_ticket *ticket)
903
904 {
905         DEFINE_WAIT(wait);
906         u64 reclaim_bytes = 0;
907         int ret = 0;
908
909         spin_lock(&space_info->lock);
910         while (ticket->bytes > 0 && ticket->error == 0) {
911                 ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
912                 if (ret) {
913                         ret = -EINTR;
914                         break;
915                 }
916                 spin_unlock(&space_info->lock);
917
918                 schedule();
919
920                 finish_wait(&ticket->wait, &wait);
921                 spin_lock(&space_info->lock);
922         }
923         if (!ret)
924                 ret = ticket->error;
925         if (!list_empty(&ticket->list))
926                 list_del_init(&ticket->list);
927         if (ticket->bytes && ticket->bytes < ticket->orig_bytes)
928                 reclaim_bytes = ticket->orig_bytes - ticket->bytes;
929         spin_unlock(&space_info->lock);
930
931         if (reclaim_bytes)
932                 btrfs_space_info_add_old_bytes(fs_info, space_info,
933                                                reclaim_bytes);
934         return ret;
935 }
936
937 /**
938  * __reserve_metadata_bytes - try to reserve bytes from a space_info
939  * @fs_info - the filesystem we're allocating in
940  * @space_info - the space info we want to allocate from
941  * @orig_bytes - the number of bytes we want
942  * @flush - whether or not we can flush to make our reservation
943  * @system_chunk - whether we are reserving on behalf of the chunk root
944  *
945  * This will reserve orig_bytes number of bytes from the given space_info. If
946  * there is not enough space it will make an attempt to flush out space to
947  * make room, by flushing delalloc if possible or committing the transaction.
948  * If flush is BTRFS_RESERVE_NO_FLUSH then no attempts to regain reservations
949  * will be made and this will fail if there is not enough space already.
950  */
951 static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
952                                     struct btrfs_space_info *space_info,
953                                     u64 orig_bytes,
954                                     enum btrfs_reserve_flush_enum flush,
955                                     bool system_chunk)
956 {
957         struct reserve_ticket ticket;
958         u64 used;
959         u64 reclaim_bytes = 0;
960         int ret = 0;
961
962         ASSERT(orig_bytes);
963         ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);
964
965         spin_lock(&space_info->lock);
966         ret = -ENOSPC;
967         used = btrfs_space_info_used(space_info, true);
968
969         /*
970          * Carry on if we have enough space (short-circuit) OR call
971          * can_overcommit() to ensure we can overcommit to continue.
972          */
973         if ((used + orig_bytes <= space_info->total_bytes) ||
974             can_overcommit(fs_info, space_info, orig_bytes, flush,
975                            system_chunk)) {
976                 btrfs_space_info_update_bytes_may_use(fs_info, space_info,
977                                                       orig_bytes);
978                 trace_btrfs_space_reservation(fs_info, "space_info",
979                                               space_info->flags, orig_bytes, 1);
980                 ret = 0;
981         }
982
983         /*
984          * If we couldn't make a reservation then setup our reservation ticket
985          * and kick the async worker if it's not already running.
986          *
987          * If we are a priority flusher then we just need to add our ticket to
988          * the list and we will do our own flushing further down.
989          */
990         if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
991                 ticket.orig_bytes = orig_bytes;
992                 ticket.bytes = orig_bytes;
993                 ticket.error = 0;
994                 init_waitqueue_head(&ticket.wait);
995                 if (flush == BTRFS_RESERVE_FLUSH_ALL) {
996                         list_add_tail(&ticket.list, &space_info->tickets);
997                         if (!space_info->flush) {
998                                 space_info->flush = 1;
999                                 trace_btrfs_trigger_flush(fs_info,
1000                                                           space_info->flags,
1001                                                           orig_bytes, flush,
1002                                                           "enospc");
1003                                 queue_work(system_unbound_wq,
1004                                            &fs_info->async_reclaim_work);
1005                         }
1006                 } else {
1007                         list_add_tail(&ticket.list,
1008                                       &space_info->priority_tickets);
1009                 }
1010         } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
1011                 used += orig_bytes;
1012                 /*
1013                  * We will do the space reservation dance during log replay,
1014                  * which means we won't have fs_info->fs_root set, so don't do
1015                  * the async reclaim as we will panic.
1016                  */
1017                 if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
1018                     need_do_async_reclaim(fs_info, space_info,
1019                                           used, system_chunk) &&
1020                     !work_busy(&fs_info->async_reclaim_work)) {
1021                         trace_btrfs_trigger_flush(fs_info, space_info->flags,
1022                                                   orig_bytes, flush, "preempt");
1023                         queue_work(system_unbound_wq,
1024                                    &fs_info->async_reclaim_work);
1025                 }
1026         }
1027         spin_unlock(&space_info->lock);
1028         if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
1029                 return ret;
1030
1031         if (flush == BTRFS_RESERVE_FLUSH_ALL)
1032                 return wait_reserve_ticket(fs_info, space_info, &ticket);
1033
1034         ret = 0;
1035         priority_reclaim_metadata_space(fs_info, space_info, &ticket);
1036         spin_lock(&space_info->lock);
1037         if (ticket.bytes) {
1038                 if (ticket.bytes < orig_bytes)
1039                         reclaim_bytes = orig_bytes - ticket.bytes;
1040                 list_del_init(&ticket.list);
1041                 ret = -ENOSPC;
1042         }
1043         spin_unlock(&space_info->lock);
1044
1045         if (reclaim_bytes)
1046                 btrfs_space_info_add_old_bytes(fs_info, space_info,
1047                                                reclaim_bytes);
1048         ASSERT(list_empty(&ticket.list));
1049         return ret;
1050 }
1051
1052 /**
1053  * btrfs_reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
1054  * @root - the root we're allocating for
1055  * @block_rsv - the block_rsv we're allocating for
1056  * @orig_bytes - the number of bytes we want
1057  * @flush - whether or not we can flush to make our reservation
1058  *
1059  * This will reserve orig_bytes number of bytes from the space info associated
1060  * with the block_rsv.  If there is not enough space it will make an attempt to
1061  * flush out space to make room.  It will do this by flushing delalloc if
1062  * possible or committing the transaction.  If flush is BTRFS_RESERVE_NO_FLUSH
1063  * then no attempts to regain reservations will be made and this will fail if
1064  * there is not enough space already.
1065  */
1066 int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
1067                                  struct btrfs_block_rsv *block_rsv,
1068                                  u64 orig_bytes,
1069                                  enum btrfs_reserve_flush_enum flush)
1070 {
1071         struct btrfs_fs_info *fs_info = root->fs_info;
1072         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
1073         int ret;
1074         bool system_chunk = (root == fs_info->chunk_root);
1075
1076         ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
1077                                        orig_bytes, flush, system_chunk);
1078         if (ret == -ENOSPC &&
1079             unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
1080                 if (block_rsv != global_rsv &&
1081                     !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes))
1082                         ret = 0;
1083         }
1084         if (ret == -ENOSPC) {
1085                 trace_btrfs_space_reservation(fs_info, "space_info:enospc",
1086                                               block_rsv->space_info->flags,
1087                                               orig_bytes, 1);
1088
1089                 if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
1090                         btrfs_dump_space_info(fs_info, block_rsv->space_info,
1091                                               orig_bytes, 0);
1092         }
1093         return ret;
1094 }