JBD2: Use the incompat macro for testing the incompat feature.
/*
 * linux/fs/jbd2/commit.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * This file is part of the Linux kernel and is made available under
 * the terms of the GNU General Public License, version 2, or at your
 * option, any later version, incorporated herein by reference.
 *
 * Journal commit routines for the generic filesystem journaling code;
 * part of the ext2fs journaling system.
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/crc32.h>

/*
 * Default IO end handler for temporary BJ_IO buffer_heads.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
        BUFFER_TRACE(bh, "");
        if (uptodate)
                set_buffer_uptodate(bh);
        else
                clear_buffer_uptodate(bh);
        unlock_buffer(bh);
}

/*
 * When an ext3-ordered file is truncated, it is possible that many pages are
 * not successfully freed, because they are attached to a committing transaction.
 * After the transaction commits, these pages are left on the LRU, with no
 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
 * by the VM, but their apparent absence upsets the VM accounting, and it makes
 * the numbers in /proc/meminfo look odd.
 *
 * So here, we have a buffer which has just come off the forget list.  Look to
 * see if we can strip all buffers from the backing page.
 *
 * Called under lock_journal(), and possibly under journal_datalist_lock.  The
 * caller provided us with a ref against the buffer, and we drop that here.
 */
static void release_buffer_page(struct buffer_head *bh)
{
        struct page *page;

        if (buffer_dirty(bh))
                goto nope;
        if (atomic_read(&bh->b_count) != 1)
                goto nope;
        page = bh->b_page;
        if (!page)
                goto nope;
        if (page->mapping)
                goto nope;

        /* OK, it's a truncated page */
        if (TestSetPageLocked(page))
                goto nope;

        page_cache_get(page);
        __brelse(bh);
        try_to_free_buffers(page);
        unlock_page(page);
        page_cache_release(page);
        return;

nope:
        __brelse(bh);
}

/*
 * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
 * held.  For ranking reasons we must trylock.  If we lose, schedule away and
 * return 0.  j_list_lock is dropped in this case.
 */
static int inverted_lock(journal_t *journal, struct buffer_head *bh)
{
        if (!jbd_trylock_bh_state(bh)) {
                spin_unlock(&journal->j_list_lock);
                schedule();
                return 0;
        }
        return 1;
}
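
/*
 * A minimal sketch of the caller pattern (distilled from the loops later in
 * this file, e.g. journal_wait_on_locked_list()): when inverted_lock() fails
 * it has already dropped j_list_lock and scheduled, so the caller retakes
 * j_list_lock and retries:
 *
 *      spin_lock(&journal->j_list_lock);
 *      while (<list non-empty>) {
 *              ...
 *              if (!inverted_lock(journal, bh)) {
 *                      spin_lock(&journal->j_list_lock);
 *                      continue;
 *              }
 *              <both locks now held, in the correct ranking order>
 *              jbd_unlock_bh_state(bh);
 *      }
 */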

/*
 * Done it all: now submit the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
static int journal_submit_commit_record(journal_t *journal,
                                        transaction_t *commit_transaction,
                                        struct buffer_head **cbh,
                                        __u32 crc32_sum)
{
        struct journal_head *descriptor;
        struct commit_header *tmp;
        struct buffer_head *bh;
        int ret;
        int barrier_done = 0;

        if (is_journal_aborted(journal))
                return 0;

        descriptor = jbd2_journal_get_descriptor_buffer(journal);
        if (!descriptor)
                return 1;

        bh = jh2bh(descriptor);

        tmp = (struct commit_header *)bh->b_data;
        tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
        tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
        tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);

        if (JBD2_HAS_COMPAT_FEATURE(journal,
                                    JBD2_FEATURE_COMPAT_CHECKSUM)) {
                tmp->h_chksum_type      = JBD2_CRC32_CHKSUM;
                tmp->h_chksum_size      = JBD2_CRC32_CHKSUM_SIZE;
                tmp->h_chksum[0]        = cpu_to_be32(crc32_sum);
        }

        JBUFFER_TRACE(descriptor, "submit commit block");
        lock_buffer(bh);
        get_bh(bh);
        set_buffer_dirty(bh);
        set_buffer_uptodate(bh);
        bh->b_end_io = journal_end_buffer_io_sync;

        if (journal->j_flags & JBD2_BARRIER &&
                !JBD2_HAS_INCOMPAT_FEATURE(journal,
                                         JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
                set_buffer_ordered(bh);
                barrier_done = 1;
        }
        ret = submit_bh(WRITE, bh);

        /* is it possible for another commit to fail at roughly
         * the same time as this one?  If so, we don't want to
         * trust the barrier flag in the super, but instead want
         * to remember if we sent a barrier request
         */
        if (ret == -EOPNOTSUPP && barrier_done) {
                char b[BDEVNAME_SIZE];

                printk(KERN_WARNING
                        "JBD: barrier-based sync failed on %s - "
                        "disabling barriers\n",
                        bdevname(journal->j_dev, b));
                spin_lock(&journal->j_state_lock);
                journal->j_flags &= ~JBD2_BARRIER;
                spin_unlock(&journal->j_state_lock);

                /* And try again, without the barrier */
                clear_buffer_ordered(bh);
                set_buffer_uptodate(bh);
                set_buffer_dirty(bh);
                ret = submit_bh(WRITE, bh);
        }
        *cbh = bh;
        return ret;
}

/*
 * This function, together with journal_submit_commit_record(), allows the
 * commit record to be written asynchronously.
 */
static int journal_wait_on_commit_record(struct buffer_head *bh)
{
        int ret = 0;

        clear_buffer_dirty(bh);
        wait_on_buffer(bh);

        if (unlikely(!buffer_uptodate(bh)))
                ret = -EIO;
        put_bh(bh);            /* One for getblk() */
        jbd2_journal_put_journal_head(bh2jh(bh));

        return ret;
}
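
/*
 * A minimal sketch of how the two halves pair up, as used later in
 * jbd2_journal_commit_transaction(): with JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT
 * the commit record is submitted early and only waited on after the metadata
 * and control buffers have completed; otherwise submit and wait run back to
 * back:
 *
 *      err = journal_submit_commit_record(journal, commit_transaction,
 *                                         &cbh, crc32_sum);
 *      ... wait for the remaining journal IO ...
 *      if (!err && !is_journal_aborted(journal))
 *              err = journal_wait_on_commit_record(cbh);
 */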

/*
 * Wait for all submitted IO to complete.
 */
static int journal_wait_on_locked_list(journal_t *journal,
                                       transaction_t *commit_transaction)
{
        int ret = 0;
        struct journal_head *jh;

        while (commit_transaction->t_locked_list) {
                struct buffer_head *bh;

                jh = commit_transaction->t_locked_list->b_tprev;
                bh = jh2bh(jh);
                get_bh(bh);
                if (buffer_locked(bh)) {
                        spin_unlock(&journal->j_list_lock);
                        wait_on_buffer(bh);
                        if (unlikely(!buffer_uptodate(bh)))
                                ret = -EIO;
                        spin_lock(&journal->j_list_lock);
                }
                if (!inverted_lock(journal, bh)) {
                        put_bh(bh);
                        spin_lock(&journal->j_list_lock);
                        continue;
                }
                if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
                        __jbd2_journal_unfile_buffer(jh);
                        jbd_unlock_bh_state(bh);
                        jbd2_journal_remove_journal_head(bh);
                        put_bh(bh);
                } else {
                        jbd_unlock_bh_state(bh);
                }
                put_bh(bh);
                cond_resched_lock(&journal->j_list_lock);
        }
        return ret;
}

static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
{
        int i;

        for (i = 0; i < bufs; i++) {
                wbuf[i]->b_end_io = end_buffer_write_sync;
                /* We use up our safety reference in submit_bh() */
                submit_bh(WRITE, wbuf[i]);
        }
}

/*
 *  Submit all the data buffers to disk
 */
static void journal_submit_data_buffers(journal_t *journal,
                                transaction_t *commit_transaction)
{
        struct journal_head *jh;
        struct buffer_head *bh;
        int locked;
        int bufs = 0;
        struct buffer_head **wbuf = journal->j_wbuf;

        /*
         * Whenever we unlock the journal and sleep, things can get added
         * onto ->t_sync_datalist, so we have to keep looping back to
         * write_out_data until we *know* that the list is empty.
         *
         * Clean up any flushed data buffers from the data list.  Even in
         * abort mode, we want to flush this out as soon as possible.
         */
write_out_data:
        cond_resched();
        spin_lock(&journal->j_list_lock);

        while (commit_transaction->t_sync_datalist) {
                jh = commit_transaction->t_sync_datalist;
                bh = jh2bh(jh);
                locked = 0;

                /* Get reference just to make sure buffer does not disappear
                 * when we are forced to drop various locks */
                get_bh(bh);
                /* If the buffer is dirty, we need to submit IO and hence
                 * we need the buffer lock. We try to lock the buffer without
                 * blocking. If we fail, we need to drop j_list_lock and do
                 * blocking lock_buffer().
                 */
                if (buffer_dirty(bh)) {
                        if (test_set_buffer_locked(bh)) {
                                BUFFER_TRACE(bh, "needs blocking lock");
                                spin_unlock(&journal->j_list_lock);
                                /* Write out all data to prevent deadlocks */
                                journal_do_submit_data(wbuf, bufs);
                                bufs = 0;
                                lock_buffer(bh);
                                spin_lock(&journal->j_list_lock);
                        }
                        locked = 1;
                }
                /* We have to get bh_state lock. Again out of order, sigh. */
                if (!inverted_lock(journal, bh)) {
                        jbd_lock_bh_state(bh);
                        spin_lock(&journal->j_list_lock);
                }
                /* Someone already cleaned up the buffer? */
                if (!buffer_jbd(bh)
                        || jh->b_transaction != commit_transaction
                        || jh->b_jlist != BJ_SyncData) {
                        jbd_unlock_bh_state(bh);
                        if (locked)
                                unlock_buffer(bh);
                        BUFFER_TRACE(bh, "already cleaned up");
                        put_bh(bh);
                        continue;
                }
                if (locked && test_clear_buffer_dirty(bh)) {
                        BUFFER_TRACE(bh, "needs writeout, adding to array");
                        wbuf[bufs++] = bh;
                        __jbd2_journal_file_buffer(jh, commit_transaction,
                                                BJ_Locked);
                        jbd_unlock_bh_state(bh);
                        if (bufs == journal->j_wbufsize) {
                                spin_unlock(&journal->j_list_lock);
                                journal_do_submit_data(wbuf, bufs);
                                bufs = 0;
                                goto write_out_data;
                        }
                } else if (!locked && buffer_locked(bh)) {
                        __jbd2_journal_file_buffer(jh, commit_transaction,
                                                BJ_Locked);
                        jbd_unlock_bh_state(bh);
                        put_bh(bh);
                } else {
                        BUFFER_TRACE(bh, "writeout complete: unfile");
                        __jbd2_journal_unfile_buffer(jh);
                        jbd_unlock_bh_state(bh);
                        if (locked)
                                unlock_buffer(bh);
                        jbd2_journal_remove_journal_head(bh);
                        /* Once for our safety reference, once for
                         * jbd2_journal_remove_journal_head() */
                        put_bh(bh);
                        put_bh(bh);
                }

                if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
                        spin_unlock(&journal->j_list_lock);
                        goto write_out_data;
                }
        }
        spin_unlock(&journal->j_list_lock);
        journal_do_submit_data(wbuf, bufs);
}
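
/*
 * Summary of the batching contract above (no new behaviour): dirty data
 * buffers accumulate in journal->j_wbuf and are flushed via
 * journal_do_submit_data() whenever the array fills up
 * (bufs == journal->j_wbufsize), whenever we are about to block in
 * lock_buffer(), and once more for any stragglers at the end.
 */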

static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
{
        struct page *page = bh->b_page;
        char *addr;
        __u32 checksum;

        addr = kmap_atomic(page, KM_USER0);
        checksum = crc32_be(crc32_sum,
                (void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
        kunmap_atomic(addr, KM_USER0);

        return checksum;
}
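
/*
 * A minimal sketch of how the running checksum is built up (this mirrors the
 * commit path below): seed with ~0, fold in each journal block in submission
 * order, then store the result in the commit header:
 *
 *      __u32 crc32_sum = ~0;
 *      for (i = 0; i < bufs; i++)
 *              crc32_sum = jbd2_checksum_data(crc32_sum, wbuf[i]);
 *      tmp->h_chksum[0] = cpu_to_be32(crc32_sum);
 */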

static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
                                   unsigned long long block)
{
        tag->t_blocknr = cpu_to_be32(block & (u32)~0);
        if (tag_bytes > JBD2_TAG_SIZE32)
                tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
}
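
/*
 * Worked example (illustrative): with 64-bit tags and
 * block == 0x123456789aULL, t_blocknr receives the low 32 bits (0x3456789a)
 * and t_blocknr_high the remainder (0x12).  The high half is written as
 * (block >> 31) >> 1 rather than block >> 32, a common kernel idiom that
 * stays well-defined even when the value being shifted is only 32 bits wide.
 */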

/*
 * jbd2_journal_commit_transaction
 *
 * The primary function for committing a transaction to the log.  This
 * function is called by the journal thread to begin a complete commit.
 */
void jbd2_journal_commit_transaction(journal_t *journal)
{
        struct transaction_stats_s stats;
        transaction_t *commit_transaction;
        struct journal_head *jh, *new_jh, *descriptor;
        struct buffer_head **wbuf = journal->j_wbuf;
        int bufs;
        int flags;
        int err;
        unsigned long long blocknr;
        char *tagp = NULL;
        journal_header_t *header;
        journal_block_tag_t *tag = NULL;
        int space_left = 0;
        int first_tag = 0;
        int tag_flag;
        int i;
        int tag_bytes = journal_tag_bytes(journal);
        struct buffer_head *cbh = NULL; /* For transactional checksums */
        __u32 crc32_sum = ~0;

        /*
         * First job: lock down the current transaction and wait for
         * all outstanding updates to complete.
         */

#ifdef COMMIT_STATS
        spin_lock(&journal->j_list_lock);
        summarise_journal_usage(journal);
        spin_unlock(&journal->j_list_lock);
#endif

        /* Do we need to erase the effects of a prior jbd2_journal_flush? */
        if (journal->j_flags & JBD2_FLUSHED) {
                jbd_debug(3, "super block updated\n");
                jbd2_journal_update_superblock(journal, 1);
        } else {
                jbd_debug(3, "superblock not updated\n");
        }

        J_ASSERT(journal->j_running_transaction != NULL);
        J_ASSERT(journal->j_committing_transaction == NULL);

        commit_transaction = journal->j_running_transaction;
        J_ASSERT(commit_transaction->t_state == T_RUNNING);

        jbd_debug(1, "JBD: starting commit of transaction %d\n",
                        commit_transaction->t_tid);

        spin_lock(&journal->j_state_lock);
        commit_transaction->t_state = T_LOCKED;

        stats.u.run.rs_wait = commit_transaction->t_max_wait;
        stats.u.run.rs_locked = jiffies;
        stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
                                                stats.u.run.rs_locked);

        spin_lock(&commit_transaction->t_handle_lock);
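        /*
         * Standard prepare_to_wait()/schedule() sleep loop: both locks are
         * dropped before sleeping and retaken before t_updates is re-checked,
         * so that updaters can finish and wake us.
         */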
        while (commit_transaction->t_updates) {
                DEFINE_WAIT(wait);

                prepare_to_wait(&journal->j_wait_updates, &wait,
                                        TASK_UNINTERRUPTIBLE);
                if (commit_transaction->t_updates) {
                        spin_unlock(&commit_transaction->t_handle_lock);
                        spin_unlock(&journal->j_state_lock);
                        schedule();
                        spin_lock(&journal->j_state_lock);
                        spin_lock(&commit_transaction->t_handle_lock);
                }
                finish_wait(&journal->j_wait_updates, &wait);
        }
        spin_unlock(&commit_transaction->t_handle_lock);

        J_ASSERT(commit_transaction->t_outstanding_credits <=
                        journal->j_max_transaction_buffers);

        /*
         * First thing we are allowed to do is to discard any remaining
         * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
         * that there are no such buffers: if a large filesystem
         * operation like a truncate needs to split itself over multiple
         * transactions, then it may try to do a jbd2_journal_restart() while
         * there are still BJ_Reserved buffers outstanding.  These must
         * be released cleanly from the current transaction.
         *
         * In this case, the filesystem must still reserve write access
         * again before modifying the buffer in the new transaction, but
         * we do not require it to remember exactly which old buffers it
         * has reserved.  This is consistent with the existing behaviour
         * that multiple jbd2_journal_get_write_access() calls to the same
         * buffer are perfectly permissible.
         */
        while (commit_transaction->t_reserved_list) {
                jh = commit_transaction->t_reserved_list;
                JBUFFER_TRACE(jh, "reserved, unused: refile");
                /*
                 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
                 * leave undo-committed data.
                 */
                if (jh->b_committed_data) {
                        struct buffer_head *bh = jh2bh(jh);

                        jbd_lock_bh_state(bh);
                        jbd2_free(jh->b_committed_data, bh->b_size);
                        jh->b_committed_data = NULL;
                        jbd_unlock_bh_state(bh);
                }
                jbd2_journal_refile_buffer(journal, jh);
        }
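
        /*
         * A minimal sketch of the filesystem-side pattern described above
         * (illustrative use of the jbd2 API; the handle and buffer are
         * hypothetical):
         *
         *      handle = jbd2_journal_start(journal, nblocks);
         *      err = jbd2_journal_get_write_access(handle, bh);
         *      ...
         *      err = jbd2_journal_restart(handle, nblocks);
         *      err = jbd2_journal_get_write_access(handle, bh);
         *
         * i.e. write access must be re-reserved before bh is modified in
         * the new transaction.
         */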

        /*
         * Now try to drop any written-back buffers from the journal's
         * checkpoint lists.  We do this *before* commit because it potentially
         * frees some memory.
         */
        spin_lock(&journal->j_list_lock);
        __jbd2_journal_clean_checkpoint_list(journal);
        spin_unlock(&journal->j_list_lock);

        jbd_debug(3, "JBD: commit phase 1\n");

        /*
         * Switch to a new revoke table.
         */
        jbd2_journal_switch_revoke_table(journal);

        stats.u.run.rs_flushing = jiffies;
        stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked,
                                               stats.u.run.rs_flushing);

        commit_transaction->t_state = T_FLUSH;
        journal->j_committing_transaction = commit_transaction;
        journal->j_running_transaction = NULL;
        commit_transaction->t_log_start = journal->j_head;
        wake_up(&journal->j_wait_transaction_locked);
        spin_unlock(&journal->j_state_lock);

        jbd_debug(3, "JBD: commit phase 2\n");

        /*
         * First, drop the modified flag: all accesses to the buffers
         * will be tracked for a new transaction only -bzzz
         */
        spin_lock(&journal->j_list_lock);
        if (commit_transaction->t_buffers) {
                new_jh = jh = commit_transaction->t_buffers->b_tnext;
                do {
                        J_ASSERT_JH(new_jh, new_jh->b_modified == 1 ||
                                        new_jh->b_modified == 0);
                        new_jh->b_modified = 0;
                        new_jh = new_jh->b_tnext;
                } while (new_jh != jh);
        }
        spin_unlock(&journal->j_list_lock);

        /*
         * Now start flushing things to disk, in the order they appear
         * on the transaction lists.  Data blocks go first.
         */
        err = 0;
        journal_submit_data_buffers(journal, commit_transaction);

        /*
         * Wait for all previously submitted IO to complete if the commit
         * record is to be written synchronously.
         */
        spin_lock(&journal->j_list_lock);
        if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
                JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
                err = journal_wait_on_locked_list(journal,
                                                commit_transaction);

        spin_unlock(&journal->j_list_lock);

        if (err)
                jbd2_journal_abort(journal, err);

        jbd2_journal_write_revoke_records(journal, commit_transaction);

        jbd_debug(3, "JBD: commit phase 2\n");

        /*
         * If we found any dirty or locked buffers, then we should have
         * looped back up to the write_out_data label.  If there weren't
         * any, then journal_clean_data_list should have wiped the list
         * clean by now, so check that it is in fact empty.
         */
        J_ASSERT(commit_transaction->t_sync_datalist == NULL);

        jbd_debug(3, "JBD: commit phase 3\n");

        /*
         * Way to go: we have now written out all of the data for a
         * transaction!  Now comes the tricky part: we need to write out
         * metadata.  Loop over the transaction's entire buffer list:
         */
        commit_transaction->t_state = T_COMMIT;

        stats.u.run.rs_logging = jiffies;
        stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing,
                                                 stats.u.run.rs_logging);
        stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits;
        stats.u.run.rs_blocks_logged = 0;

        descriptor = NULL;
        bufs = 0;
        while (commit_transaction->t_buffers) {

                /* Find the next buffer to be journaled... */

                jh = commit_transaction->t_buffers;

                /* If we're in abort mode, we just un-journal the buffer and
                   release it for background writing. */

                if (is_journal_aborted(journal)) {
                        JBUFFER_TRACE(jh, "journal is aborting: refile");
                        jbd2_journal_refile_buffer(journal, jh);
                        /* If that was the last one, we need to clean up
                         * any descriptor buffers which may have been
                         * already allocated, even if we are now
                         * aborting. */
                        if (!commit_transaction->t_buffers)
                                goto start_journal_io;
                        continue;
                }

                /* Make sure we have a descriptor block in which to
                   record the metadata buffer. */

                if (!descriptor) {
                        struct buffer_head *bh;

                        J_ASSERT(bufs == 0);

                        jbd_debug(4, "JBD: get descriptor\n");

                        descriptor = jbd2_journal_get_descriptor_buffer(journal);
                        if (!descriptor) {
                                jbd2_journal_abort(journal, -EIO);
                                continue;
                        }

                        bh = jh2bh(descriptor);
                        jbd_debug(4, "JBD: got buffer %llu (%p)\n",
                                (unsigned long long)bh->b_blocknr, bh->b_data);
                        header = (journal_header_t *)&bh->b_data[0];
                        header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
                        header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
                        header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);

                        tagp = &bh->b_data[sizeof(journal_header_t)];
                        space_left = bh->b_size - sizeof(journal_header_t);
                        first_tag = 1;
                        set_buffer_jwrite(bh);
                        set_buffer_dirty(bh);
                        wbuf[bufs++] = bh;

                        /* Record it so that we can wait for IO
                           completion later */
                        BUFFER_TRACE(bh, "ph3: file as descriptor");
                        jbd2_journal_file_buffer(descriptor, commit_transaction,
                                        BJ_LogCtl);
                }

                /* Where is the buffer to be written? */

                err = jbd2_journal_next_log_block(journal, &blocknr);
                /* If the block mapping failed, just abandon the buffer
                   and repeat this loop: we'll fall into the
                   refile-on-abort condition above. */
                if (err) {
                        jbd2_journal_abort(journal, err);
                        continue;
                }

                /*
                 * start_this_handle() uses t_outstanding_credits to determine
                 * the free space in the log, but this counter is changed
                 * by jbd2_journal_next_log_block() also.
                 */
                commit_transaction->t_outstanding_credits--;

                /* Bump b_count to prevent truncate from stumbling over
                   the shadowed buffer!  @@@ This can go if we ever get
                   rid of the BJ_IO/BJ_Shadow pairing of buffers. */
                atomic_inc(&jh2bh(jh)->b_count);

                /* Make a temporary IO buffer with which to write it out
                   (this will requeue both the metadata buffer and the
                   temporary IO buffer). new_bh goes on BJ_IO */

                set_bit(BH_JWrite, &jh2bh(jh)->b_state);
                /*
                 * akpm: jbd2_journal_write_metadata_buffer() sets
                 * new_bh->b_transaction to commit_transaction.
                 * We need to clean this up before we release new_bh
                 * (which is of type BJ_IO)
                 */
                JBUFFER_TRACE(jh, "ph3: write metadata");
                flags = jbd2_journal_write_metadata_buffer(commit_transaction,
                                                      jh, &new_jh, blocknr);
                set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
                wbuf[bufs++] = jh2bh(new_jh);

                /* Record the new block's tag in the current descriptor
                   buffer */

                tag_flag = 0;
                if (flags & 1)
                        tag_flag |= JBD2_FLAG_ESCAPE;
                if (!first_tag)
                        tag_flag |= JBD2_FLAG_SAME_UUID;

                tag = (journal_block_tag_t *) tagp;
                write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
                tag->t_flags = cpu_to_be32(tag_flag);
                tagp += tag_bytes;
                space_left -= tag_bytes;

                if (first_tag) {
                        memcpy(tagp, journal->j_uuid, 16);
                        tagp += 16;
                        space_left -= 16;
                        first_tag = 0;
                }

                /* If there's no more to do, or if the descriptor is full,
                   let the IO rip! */

                if (bufs == journal->j_wbufsize ||
                    commit_transaction->t_buffers == NULL ||
                    space_left < tag_bytes + 16) {

                        jbd_debug(4, "JBD: Submit %d IOs\n", bufs);

                        /* Write an end-of-descriptor marker before
                           submitting the IOs.  "tag" still points to
                           the last tag we set up. */

                        tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);

start_journal_io:
                        for (i = 0; i < bufs; i++) {
                                struct buffer_head *bh = wbuf[i];
                                /*
                                 * Compute checksum.
                                 */
                                if (JBD2_HAS_COMPAT_FEATURE(journal,
                                        JBD2_FEATURE_COMPAT_CHECKSUM)) {
                                        crc32_sum =
                                            jbd2_checksum_data(crc32_sum, bh);
                                }

                                lock_buffer(bh);
                                clear_buffer_dirty(bh);
                                set_buffer_uptodate(bh);
                                bh->b_end_io = journal_end_buffer_io_sync;
                                submit_bh(WRITE, bh);
                        }
                        cond_resched();
                        stats.u.run.rs_blocks_logged += bufs;

                        /* Force a new descriptor to be generated next
                           time round the loop. */
                        descriptor = NULL;
                        bufs = 0;
                }
        }

        /* Done it all: now write the commit record asynchronously. */

        if (JBD2_HAS_INCOMPAT_FEATURE(journal,
                JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
                err = journal_submit_commit_record(journal, commit_transaction,
                                                 &cbh, crc32_sum);
                if (err)
                        __jbd2_journal_abort_hard(journal);

                spin_lock(&journal->j_list_lock);
                err = journal_wait_on_locked_list(journal,
                                                commit_transaction);
                spin_unlock(&journal->j_list_lock);
                if (err)
                        __jbd2_journal_abort_hard(journal);
        }

        /* Lo and behold: we have just managed to send a transaction to
           the log.  Before we can commit it, wait for the IO so far to
           complete.  Control buffers being written are on the
           transaction's t_log_list queue, and metadata buffers are on
           the t_iobuf_list queue.

           Wait for the buffers in reverse order.  That way we are
           less likely to be woken up until all IOs have completed, and
           so we incur less scheduling load.
        */

        jbd_debug(3, "JBD: commit phase 4\n");

        /*
         * akpm: these are BJ_IO, and j_list_lock is not needed.
         * See __journal_try_to_free_buffer.
         */
wait_for_iobuf:
        while (commit_transaction->t_iobuf_list != NULL) {
                struct buffer_head *bh;

                jh = commit_transaction->t_iobuf_list->b_tprev;
                bh = jh2bh(jh);
                if (buffer_locked(bh)) {
                        wait_on_buffer(bh);
                        goto wait_for_iobuf;
                }
                if (cond_resched())
                        goto wait_for_iobuf;

                if (unlikely(!buffer_uptodate(bh)))
                        err = -EIO;

                clear_buffer_jwrite(bh);

                JBUFFER_TRACE(jh, "ph4: unfile after journal write");
                jbd2_journal_unfile_buffer(journal, jh);

                /*
                 * ->t_iobuf_list should contain only dummy buffer_heads
                 * which were created by jbd2_journal_write_metadata_buffer().
                 */
                BUFFER_TRACE(bh, "dumping temporary bh");
                jbd2_journal_put_journal_head(jh);
                __brelse(bh);
                J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
                free_buffer_head(bh);

                /* We also have to unlock and free the corresponding
                   shadowed buffer */
                jh = commit_transaction->t_shadow_list->b_tprev;
                bh = jh2bh(jh);
                clear_bit(BH_JWrite, &bh->b_state);
                J_ASSERT_BH(bh, buffer_jbddirty(bh));

                /* The metadata is now released for reuse, but we need
                   to remember it against this transaction so that when
                   we finally commit, we can do any checkpointing
                   required. */
                JBUFFER_TRACE(jh, "file as BJ_Forget");
                jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
                /* Wake up any transactions which were waiting for this
                   IO to complete */
                wake_up_bit(&bh->b_state, BH_Unshadow);
                JBUFFER_TRACE(jh, "brelse shadowed buffer");
                __brelse(bh);
        }

        J_ASSERT(commit_transaction->t_shadow_list == NULL);

        jbd_debug(3, "JBD: commit phase 5\n");

        /* Here we wait for the revoke record and descriptor record buffers */
wait_for_ctlbuf:
        while (commit_transaction->t_log_list != NULL) {
                struct buffer_head *bh;

                jh = commit_transaction->t_log_list->b_tprev;
                bh = jh2bh(jh);
                if (buffer_locked(bh)) {
                        wait_on_buffer(bh);
                        goto wait_for_ctlbuf;
                }
                if (cond_resched())
                        goto wait_for_ctlbuf;

                if (unlikely(!buffer_uptodate(bh)))
                        err = -EIO;

                BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
                clear_buffer_jwrite(bh);
                jbd2_journal_unfile_buffer(journal, jh);
                jbd2_journal_put_journal_head(jh);
                __brelse(bh);           /* One for getblk */
                /* AKPM: bforget here */
        }

        jbd_debug(3, "JBD: commit phase 6\n");

        if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
                JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
                err = journal_submit_commit_record(journal, commit_transaction,
                                                &cbh, crc32_sum);
                if (err)
                        __jbd2_journal_abort_hard(journal);
        }
        if (!err && !is_journal_aborted(journal))
                err = journal_wait_on_commit_record(cbh);

        if (err)
                jbd2_journal_abort(journal, err);

        /* End of a transaction!  Finally, we can do checkpoint
           processing: any buffers committed as a result of this
           transaction can be removed from any checkpoint list they were
           on before. */

        jbd_debug(3, "JBD: commit phase 7\n");

        J_ASSERT(commit_transaction->t_sync_datalist == NULL);
        J_ASSERT(commit_transaction->t_buffers == NULL);
        J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
        J_ASSERT(commit_transaction->t_iobuf_list == NULL);
        J_ASSERT(commit_transaction->t_shadow_list == NULL);
        J_ASSERT(commit_transaction->t_log_list == NULL);

restart_loop:
        /*
         * As there are other places (journal_unmap_buffer()) adding buffers
         * to this list, we have to be careful and hold the j_list_lock.
         */
        spin_lock(&journal->j_list_lock);
        while (commit_transaction->t_forget) {
                transaction_t *cp_transaction;
                struct buffer_head *bh;

                jh = commit_transaction->t_forget;
                spin_unlock(&journal->j_list_lock);
                bh = jh2bh(jh);
                jbd_lock_bh_state(bh);
                J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
                        jh->b_transaction == journal->j_running_transaction);

                /*
                 * If there is undo-protected committed data against
                 * this buffer, then we can remove it now.  If it is a
                 * buffer needing such protection, the old frozen_data
                 * field now points to a committed version of the
                 * buffer, so rotate that field to the new committed
                 * data.
                 *
                 * Otherwise, we can just throw away the frozen data now.
                 */
                if (jh->b_committed_data) {
                        jbd2_free(jh->b_committed_data, bh->b_size);
                        jh->b_committed_data = NULL;
                        if (jh->b_frozen_data) {
                                jh->b_committed_data = jh->b_frozen_data;
                                jh->b_frozen_data = NULL;
                        }
                } else if (jh->b_frozen_data) {
                        jbd2_free(jh->b_frozen_data, bh->b_size);
                        jh->b_frozen_data = NULL;
                }

                spin_lock(&journal->j_list_lock);
                cp_transaction = jh->b_cp_transaction;
                if (cp_transaction) {
                        JBUFFER_TRACE(jh, "remove from old cp transaction");
                        cp_transaction->t_chp_stats.cs_dropped++;
                        __jbd2_journal_remove_checkpoint(jh);
                }

                /* Only re-checkpoint the buffer_head if it is marked
                 * dirty.  If the buffer was added to the BJ_Forget list
                 * by jbd2_journal_forget, it may no longer be dirty and
                 * there's no point in keeping a checkpoint record for
                 * it. */

                /* A buffer which has been freed while still being
                 * journaled by a previous transaction may end up still
                 * being dirty here, but we want to avoid writing back
                 * that buffer in the future now that the last use has
                 * been committed.  That's not only a performance gain,
                 * it also stops aliasing problems if the buffer is left
                 * behind for writeback and gets reallocated for another
                 * use in a different page. */
                if (buffer_freed(bh)) {
                        clear_buffer_freed(bh);
                        clear_buffer_jbddirty(bh);
                }

                if (buffer_jbddirty(bh)) {
                        JBUFFER_TRACE(jh, "add to new checkpointing trans");
                        __jbd2_journal_insert_checkpoint(jh, commit_transaction);
                        JBUFFER_TRACE(jh, "refile for checkpoint writeback");
                        __jbd2_journal_refile_buffer(jh);
                        jbd_unlock_bh_state(bh);
                } else {
                        J_ASSERT_BH(bh, !buffer_dirty(bh));
                        /* A buffer on the BJ_Forget list that is not jbddirty
                         * has been freed by this transaction and hence it
                         * could not have been reallocated until this
                         * transaction has committed.  *BUT* it could be
                         * reallocated once we have written all the data to
                         * disk and before we process the buffer on the
                         * BJ_Forget list. */
                        JBUFFER_TRACE(jh, "refile or unfile freed buffer");
                        __jbd2_journal_refile_buffer(jh);
                        if (!jh->b_transaction) {
                                jbd_unlock_bh_state(bh);
                                /* needs a brelse */
                                jbd2_journal_remove_journal_head(bh);
                                release_buffer_page(bh);
                        } else
                                jbd_unlock_bh_state(bh);
                }
                cond_resched_lock(&journal->j_list_lock);
        }
        spin_unlock(&journal->j_list_lock);
        /*
         * This is a bit sleazy.  We use j_list_lock to protect the transition
         * of a transaction into T_FINISHED state and the calling of
         * __jbd2_journal_drop_transaction().  Otherwise we could race with
         * other checkpointing code processing the transaction...
         */
        spin_lock(&journal->j_state_lock);
        spin_lock(&journal->j_list_lock);
        /*
         * Now recheck if some buffers did not get attached to the transaction
         * while the lock was dropped...
         */
        if (commit_transaction->t_forget) {
                spin_unlock(&journal->j_list_lock);
                spin_unlock(&journal->j_state_lock);
                goto restart_loop;
        }

        /* Done with this transaction! */

        jbd_debug(3, "JBD: commit phase 8\n");

        J_ASSERT(commit_transaction->t_state == T_COMMIT);

        commit_transaction->t_start = jiffies;
        stats.u.run.rs_logging = jbd2_time_diff(stats.u.run.rs_logging,
                                                commit_transaction->t_start);

        /*
         * File the transaction for history
         */
        stats.ts_type = JBD2_STATS_RUN;
        stats.ts_tid = commit_transaction->t_tid;
        stats.u.run.rs_handle_count = commit_transaction->t_handle_count;
        spin_lock(&journal->j_history_lock);
        memcpy(journal->j_history + journal->j_history_cur, &stats,
                        sizeof(stats));
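        /* j_history is a fixed-size ring: wrap the cursor once it reaches
         * j_history_max */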
        if (++journal->j_history_cur == journal->j_history_max)
                journal->j_history_cur = 0;

        /*
         * Calculate overall stats
         */
        journal->j_stats.ts_tid++;
        journal->j_stats.u.run.rs_wait += stats.u.run.rs_wait;
        journal->j_stats.u.run.rs_running += stats.u.run.rs_running;
        journal->j_stats.u.run.rs_locked += stats.u.run.rs_locked;
        journal->j_stats.u.run.rs_flushing += stats.u.run.rs_flushing;
        journal->j_stats.u.run.rs_logging += stats.u.run.rs_logging;
        journal->j_stats.u.run.rs_handle_count += stats.u.run.rs_handle_count;
        journal->j_stats.u.run.rs_blocks += stats.u.run.rs_blocks;
        journal->j_stats.u.run.rs_blocks_logged += stats.u.run.rs_blocks_logged;
        spin_unlock(&journal->j_history_lock);

        commit_transaction->t_state = T_FINISHED;
        J_ASSERT(commit_transaction == journal->j_committing_transaction);
        journal->j_commit_sequence = commit_transaction->t_tid;
        journal->j_committing_transaction = NULL;
        spin_unlock(&journal->j_state_lock);

        if (commit_transaction->t_checkpoint_list == NULL &&
            commit_transaction->t_checkpoint_io_list == NULL) {
                __jbd2_journal_drop_transaction(journal, commit_transaction);
        } else {
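                /*
                 * The transaction still has buffers awaiting checkpoint:
                 * splice it into the journal's circular list of
                 * checkpointing transactions.
                 */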
                if (journal->j_checkpoint_transactions == NULL) {
                        journal->j_checkpoint_transactions = commit_transaction;
                        commit_transaction->t_cpnext = commit_transaction;
                        commit_transaction->t_cpprev = commit_transaction;
                } else {
                        commit_transaction->t_cpnext =
                                journal->j_checkpoint_transactions;
                        commit_transaction->t_cpprev =
                                commit_transaction->t_cpnext->t_cpprev;
                        commit_transaction->t_cpnext->t_cpprev =
                                commit_transaction;
                        commit_transaction->t_cpprev->t_cpnext =
                                commit_transaction;
                }
        }
        spin_unlock(&journal->j_list_lock);

        jbd_debug(1, "JBD: commit %d complete, head %d\n",
                  journal->j_commit_sequence, journal->j_tail_sequence);

        wake_up(&journal->j_wait_done_commit);
}