/*
 * linux/fs/jbd2/commit.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * This file is part of the Linux kernel and is made available under
 * the terms of the GNU General Public License, version 2, or at your
 * option, any later version, incorporated herein by reference.
 *
 * Journal commit routines for the generic filesystem journaling code;
 * part of the ext2fs journaling system.
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/crc32.h>

/*
 * Default IO end handler for temporary BJ_IO buffer_heads.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
        BUFFER_TRACE(bh, "");
        if (uptodate)
                set_buffer_uptodate(bh);
        else
                clear_buffer_uptodate(bh);
        unlock_buffer(bh);
}

/*
 * When an ordered-mode file is truncated, it is possible that many pages are
 * not successfully freed, because they are attached to a committing transaction.
 * After the transaction commits, these pages are left on the LRU, with no
 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
 * by the VM, but their apparent absence upsets the VM accounting, and it makes
 * the numbers in /proc/meminfo look odd.
 *
 * So here, we have a buffer which has just come off the forget list.  Look to
 * see if we can strip all buffers from the backing page.
 *
 * Called with j_list_lock held.  The caller provided us with a ref
 * against the buffer, and we drop that here.
 */
static void release_buffer_page(struct buffer_head *bh)
{
        struct page *page;

        if (buffer_dirty(bh))
                goto nope;
        if (atomic_read(&bh->b_count) != 1)
                goto nope;
        page = bh->b_page;
        if (!page)
                goto nope;
        if (page->mapping)
                goto nope;

        /* OK, it's a truncated page */
        if (TestSetPageLocked(page))
                goto nope;

        page_cache_get(page);
        __brelse(bh);
        try_to_free_buffers(page);
        unlock_page(page);
        page_cache_release(page);
        return;

nope:
        __brelse(bh);
}

/*
 * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
 * held.  For ranking reasons we must trylock.  If we lose, schedule away and
 * return 0.  j_list_lock is dropped in this case.
 */
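/*
 * (The established lock ranking takes jbd_lock_bh_state() before
 * j_list_lock, so acquiring bh_state while already holding j_list_lock
 * is the "inverted" order -- hence the trylock.)
 */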
static int inverted_lock(journal_t *journal, struct buffer_head *bh)
{
        if (!jbd_trylock_bh_state(bh)) {
                spin_unlock(&journal->j_list_lock);
                schedule();
                return 0;
        }
        return 1;
}

/*
 * Done it all: now submit the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
static int journal_submit_commit_record(journal_t *journal,
                                        transaction_t *commit_transaction,
                                        struct buffer_head **cbh,
                                        __u32 crc32_sum)
{
        struct journal_head *descriptor;
        struct commit_header *tmp;
        struct buffer_head *bh;
        int ret;
        int barrier_done = 0;

        if (is_journal_aborted(journal))
                return 0;

        descriptor = jbd2_journal_get_descriptor_buffer(journal);
        if (!descriptor)
                return 1;

        bh = jh2bh(descriptor);

        tmp = (struct commit_header *)bh->b_data;
        tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
        tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
        tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);

        if (JBD2_HAS_COMPAT_FEATURE(journal,
                                    JBD2_FEATURE_COMPAT_CHECKSUM)) {
                tmp->h_chksum_type      = JBD2_CRC32_CHKSUM;
                tmp->h_chksum_size      = JBD2_CRC32_CHKSUM_SIZE;
                tmp->h_chksum[0]        = cpu_to_be32(crc32_sum);
        }

        JBUFFER_TRACE(descriptor, "submit commit block");
        lock_buffer(bh);
        get_bh(bh);
        set_buffer_dirty(bh);
        set_buffer_uptodate(bh);
        bh->b_end_io = journal_end_buffer_io_sync;

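        /*
         * Submit the commit block as a barrier (ordered) write unless the
         * async-commit feature is set: a checksummed async commit does not
         * need to be ordered against the preceding journal IO, since
         * recovery validates the commit block by its checksum.
         */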
        if (journal->j_flags & JBD2_BARRIER &&
                !JBD2_HAS_INCOMPAT_FEATURE(journal,
                                         JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
                set_buffer_ordered(bh);
                barrier_done = 1;
        }
        ret = submit_bh(WRITE, bh);
        if (barrier_done)
                clear_buffer_ordered(bh);

        /* is it possible for another commit to fail at roughly
         * the same time as this one?  If so, we don't want to
         * trust the barrier flag in the super, but instead want
         * to remember if we sent a barrier request
         */
        if (ret == -EOPNOTSUPP && barrier_done) {
                char b[BDEVNAME_SIZE];

                printk(KERN_WARNING
                        "JBD: barrier-based sync failed on %s - "
                        "disabling barriers\n",
                        bdevname(journal->j_dev, b));
                spin_lock(&journal->j_state_lock);
                journal->j_flags &= ~JBD2_BARRIER;
                spin_unlock(&journal->j_state_lock);

                /* And try again, without the barrier */
                lock_buffer(bh);
                set_buffer_uptodate(bh);
                set_buffer_dirty(bh);
                ret = submit_bh(WRITE, bh);
        }
        *cbh = bh;
        return ret;
}

/*
 * This function, together with journal_submit_commit_record(), allows
 * the commit record to be written asynchronously.
 */
static int journal_wait_on_commit_record(struct buffer_head *bh)
{
        int ret = 0;

        clear_buffer_dirty(bh);
        wait_on_buffer(bh);

        if (unlikely(!buffer_uptodate(bh)))
                ret = -EIO;
        put_bh(bh);            /* One for getblk() */
        jbd2_journal_put_journal_head(bh2jh(bh));

        return ret;
}

/*
 * Wait for all submitted IO to complete.  Called with j_list_lock held,
 * which may be dropped and retaken while we wait.
 */
static int journal_wait_on_locked_list(journal_t *journal,
                                       transaction_t *commit_transaction)
{
        int ret = 0;
        struct journal_head *jh;

        while (commit_transaction->t_locked_list) {
                struct buffer_head *bh;

                jh = commit_transaction->t_locked_list->b_tprev;
                bh = jh2bh(jh);
                get_bh(bh);
                if (buffer_locked(bh)) {
                        spin_unlock(&journal->j_list_lock);
                        wait_on_buffer(bh);
                        if (unlikely(!buffer_uptodate(bh)))
                                ret = -EIO;
                        spin_lock(&journal->j_list_lock);
                }
                if (!inverted_lock(journal, bh)) {
                        put_bh(bh);
                        spin_lock(&journal->j_list_lock);
                        continue;
                }
                if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
                        __jbd2_journal_unfile_buffer(jh);
                        jbd_unlock_bh_state(bh);
                        jbd2_journal_remove_journal_head(bh);
                        put_bh(bh);
                } else {
                        jbd_unlock_bh_state(bh);
                }
                put_bh(bh);
                cond_resched_lock(&journal->j_list_lock);
        }
        return ret;
}

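/*
 * Submit all of the data buffers batched up in wbuf[] for write-out.
 * The safety reference taken when each buffer was queued is dropped by
 * end_buffer_write_sync() when its IO completes.
 */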
static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
{
        int i;

        for (i = 0; i < bufs; i++) {
                wbuf[i]->b_end_io = end_buffer_write_sync;
                /* We use-up our safety reference in submit_bh() */
                submit_bh(WRITE, wbuf[i]);
        }
}

/*
 *  Submit all the data buffers to disk
 */
static void journal_submit_data_buffers(journal_t *journal,
                                transaction_t *commit_transaction)
{
        struct journal_head *jh;
        struct buffer_head *bh;
        int locked;
        int bufs = 0;
        struct buffer_head **wbuf = journal->j_wbuf;

        /*
         * Whenever we unlock the journal and sleep, things can get added
         * onto ->t_sync_datalist, so we have to keep looping back to
         * write_out_data until we *know* that the list is empty.
         *
         * Cleanup any flushed data buffers from the data list.  Even in
         * abort mode, we want to flush this out as soon as possible.
         */
write_out_data:
        cond_resched();
        spin_lock(&journal->j_list_lock);

        while (commit_transaction->t_sync_datalist) {
                jh = commit_transaction->t_sync_datalist;
                bh = jh2bh(jh);
                locked = 0;

                /* Get reference just to make sure buffer does not disappear
                 * when we are forced to drop various locks */
                get_bh(bh);
                /* If the buffer is dirty, we need to submit IO and hence
                 * we need the buffer lock. We try to lock the buffer without
                 * blocking. If we fail, we need to drop j_list_lock and do
                 * blocking lock_buffer().
                 */
                if (buffer_dirty(bh)) {
                        if (test_set_buffer_locked(bh)) {
                                BUFFER_TRACE(bh, "needs blocking lock");
                                spin_unlock(&journal->j_list_lock);
                                /* Write out all data to prevent deadlocks */
                                journal_do_submit_data(wbuf, bufs);
                                bufs = 0;
                                lock_buffer(bh);
                                spin_lock(&journal->j_list_lock);
                        }
                        locked = 1;
                }
                /* We have to get bh_state lock. Again out of order, sigh. */
                if (!inverted_lock(journal, bh)) {
                        jbd_lock_bh_state(bh);
                        spin_lock(&journal->j_list_lock);
                }
                /* Someone already cleaned up the buffer? */
                if (!buffer_jbd(bh)
                        || jh->b_transaction != commit_transaction
                        || jh->b_jlist != BJ_SyncData) {
                        jbd_unlock_bh_state(bh);
                        if (locked)
                                unlock_buffer(bh);
                        BUFFER_TRACE(bh, "already cleaned up");
                        put_bh(bh);
                        continue;
                }
                if (locked && test_clear_buffer_dirty(bh)) {
                        BUFFER_TRACE(bh, "needs writeout, adding to array");
                        wbuf[bufs++] = bh;
                        __jbd2_journal_file_buffer(jh, commit_transaction,
                                                BJ_Locked);
                        jbd_unlock_bh_state(bh);
                        if (bufs == journal->j_wbufsize) {
                                spin_unlock(&journal->j_list_lock);
                                journal_do_submit_data(wbuf, bufs);
                                bufs = 0;
                                goto write_out_data;
                        }
                } else if (!locked && buffer_locked(bh)) {
                        __jbd2_journal_file_buffer(jh, commit_transaction,
                                                BJ_Locked);
                        jbd_unlock_bh_state(bh);
                        put_bh(bh);
                } else {
                        BUFFER_TRACE(bh, "writeout complete: unfile");
                        __jbd2_journal_unfile_buffer(jh);
                        jbd_unlock_bh_state(bh);
                        if (locked)
                                unlock_buffer(bh);
                        jbd2_journal_remove_journal_head(bh);
                        /* Once for our safety reference, once for
                         * jbd2_journal_remove_journal_head() */
                        put_bh(bh);
                        put_bh(bh);
                }

                if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
                        spin_unlock(&journal->j_list_lock);
                        goto write_out_data;
                }
        }
        spin_unlock(&journal->j_list_lock);
        journal_do_submit_data(wbuf, bufs);
}

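/*
 * Checksum the data in a buffer_head.  kmap_atomic() is used because the
 * page may live in highmem; the CRC covers the whole on-disk block.
 */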
static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
{
        struct page *page = bh->b_page;
        char *addr;
        __u32 checksum;

        addr = kmap_atomic(page, KM_USER0);
        checksum = crc32_be(crc32_sum,
                (void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
        kunmap_atomic(addr, KM_USER0);

        return checksum;
}

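/*
 * Write a block tag into the descriptor.  The low 32 bits of the block
 * number always go in t_blocknr; on a 64-bit journal (tag_bytes larger
 * than JBD2_TAG_SIZE32) the high bits go in t_blocknr_high.  The shift
 * is split as (block >> 31) >> 1 so it stays well-defined even if the
 * block number type is ever only 32 bits wide.
 */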
static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
                                   unsigned long long block)
{
        tag->t_blocknr = cpu_to_be32(block & (u32)~0);
        if (tag_bytes > JBD2_TAG_SIZE32)
                tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
}

/*
 * jbd2_journal_commit_transaction
 *
 * The primary function for committing a transaction to the log.  This
 * function is called by the journal thread to begin a complete commit.
 */
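/*
 * Commit proceeds through the phases announced by the jbd_debug() calls
 * below: first wait for outstanding updates to drain and discard any
 * reserved buffers; (1) switch revoke tables; (2) flush the ordered
 * data buffers and write the revoke records; (3) write the metadata and
 * descriptor blocks to the log; (4) wait for the metadata IO; (5) wait
 * for the control-buffer IO; (6) write and wait on the commit record;
 * (7) move committed buffers onto the checkpoint lists; (8) record
 * statistics and mark the transaction T_FINISHED.
 */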
void jbd2_journal_commit_transaction(journal_t *journal)
{
        struct transaction_stats_s stats;
        transaction_t *commit_transaction;
        struct journal_head *jh, *new_jh, *descriptor;
        struct buffer_head **wbuf = journal->j_wbuf;
        int bufs;
        int flags;
        int err;
        unsigned long long blocknr;
        char *tagp = NULL;
        journal_header_t *header;
        journal_block_tag_t *tag = NULL;
        int space_left = 0;
        int first_tag = 0;
        int tag_flag;
        int i;
        int tag_bytes = journal_tag_bytes(journal);
        struct buffer_head *cbh = NULL; /* For transactional checksums */
        __u32 crc32_sum = ~0;

        /*
         * First job: lock down the current transaction and wait for
         * all outstanding updates to complete.
         */

#ifdef COMMIT_STATS
        spin_lock(&journal->j_list_lock);
        summarise_journal_usage(journal);
        spin_unlock(&journal->j_list_lock);
#endif

        /* Do we need to erase the effects of a prior jbd2_journal_flush? */
        if (journal->j_flags & JBD2_FLUSHED) {
                jbd_debug(3, "super block updated\n");
                jbd2_journal_update_superblock(journal, 1);
        } else {
                jbd_debug(3, "superblock not updated\n");
        }

        J_ASSERT(journal->j_running_transaction != NULL);
        J_ASSERT(journal->j_committing_transaction == NULL);

        commit_transaction = journal->j_running_transaction;
        J_ASSERT(commit_transaction->t_state == T_RUNNING);

        jbd_debug(1, "JBD: starting commit of transaction %d\n",
                        commit_transaction->t_tid);

        spin_lock(&journal->j_state_lock);
        commit_transaction->t_state = T_LOCKED;

        stats.u.run.rs_wait = commit_transaction->t_max_wait;
        stats.u.run.rs_locked = jiffies;
        stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
                                                stats.u.run.rs_locked);

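        /*
         * Anyone still running a handle holds t_updates elevated.  Sleep
         * on j_wait_updates with the prepare_to_wait()/recheck pattern so
         * the wakeup issued by jbd2_journal_stop() when the last handle
         * completes cannot be missed.
         */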
        spin_lock(&commit_transaction->t_handle_lock);
        while (commit_transaction->t_updates) {
                DEFINE_WAIT(wait);

                prepare_to_wait(&journal->j_wait_updates, &wait,
                                        TASK_UNINTERRUPTIBLE);
                if (commit_transaction->t_updates) {
                        spin_unlock(&commit_transaction->t_handle_lock);
                        spin_unlock(&journal->j_state_lock);
                        schedule();
                        spin_lock(&journal->j_state_lock);
                        spin_lock(&commit_transaction->t_handle_lock);
                }
                finish_wait(&journal->j_wait_updates, &wait);
        }
        spin_unlock(&commit_transaction->t_handle_lock);

        J_ASSERT (commit_transaction->t_outstanding_credits <=
                        journal->j_max_transaction_buffers);

        /*
         * First thing we are allowed to do is to discard any remaining
         * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
         * that there are no such buffers: if a large filesystem
         * operation like a truncate needs to split itself over multiple
         * transactions, then it may try to do a jbd2_journal_restart() while
         * there are still BJ_Reserved buffers outstanding.  These must
         * be released cleanly from the current transaction.
         *
         * In this case, the filesystem must still reserve write access
         * again before modifying the buffer in the new transaction, but
         * we do not require it to remember exactly which old buffers it
         * has reserved.  This is consistent with the existing behaviour
         * that multiple jbd2_journal_get_write_access() calls to the same
 * buffer are perfectly permissible.
         */
        while (commit_transaction->t_reserved_list) {
                jh = commit_transaction->t_reserved_list;
                JBUFFER_TRACE(jh, "reserved, unused: refile");
                /*
                 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
                 * leave undo-committed data.
                 */
                if (jh->b_committed_data) {
                        struct buffer_head *bh = jh2bh(jh);

                        jbd_lock_bh_state(bh);
                        jbd2_free(jh->b_committed_data, bh->b_size);
                        jh->b_committed_data = NULL;
                        jbd_unlock_bh_state(bh);
                }
                jbd2_journal_refile_buffer(journal, jh);
        }

        /*
         * Now try to drop any written-back buffers from the journal's
         * checkpoint lists.  We do this *before* commit because it potentially
         * frees some memory
         */
        spin_lock(&journal->j_list_lock);
        __jbd2_journal_clean_checkpoint_list(journal);
        spin_unlock(&journal->j_list_lock);

        jbd_debug (3, "JBD: commit phase 1\n");

        /*
         * Switch to a new revoke table.
         */
        jbd2_journal_switch_revoke_table(journal);

        stats.u.run.rs_flushing = jiffies;
        stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked,
                                               stats.u.run.rs_flushing);

        commit_transaction->t_state = T_FLUSH;
        journal->j_committing_transaction = commit_transaction;
        journal->j_running_transaction = NULL;
        commit_transaction->t_log_start = journal->j_head;
        wake_up(&journal->j_wait_transaction_locked);
        spin_unlock(&journal->j_state_lock);

        jbd_debug (3, "JBD: commit phase 2\n");

        /*
         * Now start flushing things to disk, in the order they appear
         * on the transaction lists.  Data blocks go first.
         */
        err = 0;
        journal_submit_data_buffers(journal, commit_transaction);

        /*
         * Wait for all previously submitted IO to complete if commit
         * record is to be written synchronously.
         */
        spin_lock(&journal->j_list_lock);
        if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
                JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
                err = journal_wait_on_locked_list(journal,
                                                commit_transaction);

        spin_unlock(&journal->j_list_lock);

        if (err)
                jbd2_journal_abort(journal, err);

        jbd2_journal_write_revoke_records(journal, commit_transaction);

        jbd_debug(3, "JBD: commit phase 2\n");

        /*
         * If we found any dirty or locked buffers, then we should have
         * looped back up to the write_out_data label.  If there weren't
         * any then journal_clean_data_list should have wiped the list
         * clean by now, so check that it is in fact empty.
         */
        J_ASSERT (commit_transaction->t_sync_datalist == NULL);

        jbd_debug (3, "JBD: commit phase 3\n");

        /*
         * Way to go: we have now written out all of the data for a
         * transaction!  Now comes the tricky part: we need to write out
         * metadata.  Loop over the transaction's entire buffer list:
         */
        spin_lock(&journal->j_state_lock);
        commit_transaction->t_state = T_COMMIT;
        spin_unlock(&journal->j_state_lock);

        stats.u.run.rs_logging = jiffies;
        stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing,
                                                 stats.u.run.rs_logging);
        stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits;
        stats.u.run.rs_blocks_logged = 0;

        J_ASSERT(commit_transaction->t_nr_buffers <=
                 commit_transaction->t_outstanding_credits);

        descriptor = NULL;
        bufs = 0;
        while (commit_transaction->t_buffers) {

                /* Find the next buffer to be journaled... */

                jh = commit_transaction->t_buffers;

                /* If we're in abort mode, we just un-journal the buffer and
                   release it for background writing. */

                if (is_journal_aborted(journal)) {
                        JBUFFER_TRACE(jh, "journal is aborting: refile");
                        jbd2_journal_refile_buffer(journal, jh);
                        /* If that was the last one, we need to clean up
                         * any descriptor buffers which may have been
                         * already allocated, even if we are now
                         * aborting. */
                        if (!commit_transaction->t_buffers)
                                goto start_journal_io;
                        continue;
                }

                /* Make sure we have a descriptor block in which to
                   record the metadata buffer. */

                if (!descriptor) {
                        struct buffer_head *bh;

                        J_ASSERT (bufs == 0);

                        jbd_debug(4, "JBD: get descriptor\n");

                        descriptor = jbd2_journal_get_descriptor_buffer(journal);
                        if (!descriptor) {
                                jbd2_journal_abort(journal, -EIO);
                                continue;
                        }

                        bh = jh2bh(descriptor);
                        jbd_debug(4, "JBD: got buffer %llu (%p)\n",
                                (unsigned long long)bh->b_blocknr, bh->b_data);
                        header = (journal_header_t *)&bh->b_data[0];
                        header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
                        header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
                        header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);

                        tagp = &bh->b_data[sizeof(journal_header_t)];
                        space_left = bh->b_size - sizeof(journal_header_t);
                        first_tag = 1;
                        set_buffer_jwrite(bh);
                        set_buffer_dirty(bh);
                        wbuf[bufs++] = bh;

                        /* Record it so that we can wait for IO
                           completion later */
                        BUFFER_TRACE(bh, "ph3: file as descriptor");
                        jbd2_journal_file_buffer(descriptor, commit_transaction,
                                        BJ_LogCtl);
                }

                /* Where is the buffer to be written? */

                err = jbd2_journal_next_log_block(journal, &blocknr);
                /* If the block mapping failed, just abandon the buffer
                   and repeat this loop: we'll fall into the
                   refile-on-abort condition above. */
                if (err) {
                        jbd2_journal_abort(journal, err);
                        continue;
                }

                /*
                 * start_this_handle() uses t_outstanding_credits to determine
                 * the free space in the log, but this counter is changed
                 * by jbd2_journal_next_log_block() also.
                 */
                commit_transaction->t_outstanding_credits--;

                /* Bump b_count to prevent truncate from stumbling over
                   the shadowed buffer!  @@@ This can go if we ever get
                   rid of the BJ_IO/BJ_Shadow pairing of buffers. */
                atomic_inc(&jh2bh(jh)->b_count);

                /* Make a temporary IO buffer with which to write it out
                   (this will requeue both the metadata buffer and the
                   temporary IO buffer). new_bh goes on BJ_IO*/

                set_bit(BH_JWrite, &jh2bh(jh)->b_state);
                /*
                 * akpm: jbd2_journal_write_metadata_buffer() sets
                 * new_bh->b_transaction to commit_transaction.
                 * We need to clean this up before we release new_bh
                 * (which is of type BJ_IO)
                 */
                JBUFFER_TRACE(jh, "ph3: write metadata");
                flags = jbd2_journal_write_metadata_buffer(commit_transaction,
                                                      jh, &new_jh, blocknr);
                set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
                wbuf[bufs++] = jh2bh(new_jh);

                /* Record the new block's tag in the current descriptor
                   buffer */

                tag_flag = 0;
                if (flags & 1)
                        tag_flag |= JBD2_FLAG_ESCAPE;
                if (!first_tag)
                        tag_flag |= JBD2_FLAG_SAME_UUID;

                tag = (journal_block_tag_t *) tagp;
                write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
                tag->t_flags = cpu_to_be32(tag_flag);
                tagp += tag_bytes;
                space_left -= tag_bytes;

                if (first_tag) {
                        memcpy (tagp, journal->j_uuid, 16);
                        tagp += 16;
                        space_left -= 16;
                        first_tag = 0;
                }

                /* If there's no more to do, or if the descriptor is full,
                   let the IO rip! */

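                /* The space check conservatively reserves room for one
                   more tag plus the 16 bytes of UUID that accompany a
                   first tag. */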
                if (bufs == journal->j_wbufsize ||
                    commit_transaction->t_buffers == NULL ||
                    space_left < tag_bytes + 16) {

                        jbd_debug(4, "JBD: Submit %d IOs\n", bufs);

                        /* Write an end-of-descriptor marker before
                           submitting the IOs.  "tag" still points to
                           the last tag we set up. */

                        tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);

start_journal_io:
                        for (i = 0; i < bufs; i++) {
                                struct buffer_head *bh = wbuf[i];
                                /*
                                 * Compute checksum.
                                 */
                                if (JBD2_HAS_COMPAT_FEATURE(journal,
                                        JBD2_FEATURE_COMPAT_CHECKSUM)) {
                                        crc32_sum =
                                            jbd2_checksum_data(crc32_sum, bh);
                                }

                                lock_buffer(bh);
                                clear_buffer_dirty(bh);
                                set_buffer_uptodate(bh);
                                bh->b_end_io = journal_end_buffer_io_sync;
                                submit_bh(WRITE, bh);
                        }
                        cond_resched();
                        stats.u.run.rs_blocks_logged += bufs;

                        /* Force a new descriptor to be generated next
                           time round the loop. */
                        descriptor = NULL;
                        bufs = 0;
                }
        }

        /* Done it all: now write the commit record asynchronously. */

        if (JBD2_HAS_INCOMPAT_FEATURE(journal,
                JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
                err = journal_submit_commit_record(journal, commit_transaction,
                                                 &cbh, crc32_sum);
                if (err)
                        __jbd2_journal_abort_hard(journal);

                spin_lock(&journal->j_list_lock);
                err = journal_wait_on_locked_list(journal,
                                                commit_transaction);
                spin_unlock(&journal->j_list_lock);
                if (err)
                        __jbd2_journal_abort_hard(journal);
        }

        /* Lo and behold: we have just managed to send a transaction to
           the log.  Before we can commit it, wait for the IO so far to
           complete.  Control buffers being written are on the
           transaction's t_log_list queue, and metadata buffers are on
           the t_iobuf_list queue.

           Wait for the buffers in reverse order.  That way we are
           less likely to be woken up until all IOs have completed, and
           so we incur less scheduling load.
        */

        jbd_debug(3, "JBD: commit phase 4\n");

        /*
         * akpm: these are BJ_IO, and j_list_lock is not needed.
         * See __journal_try_to_free_buffer.
         */
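        /*
         * t_iobuf_list holds the temporary buffers that were written to
         * the log; t_shadow_list holds the corresponding real metadata
         * buffers, filed in the same order, so the two lists are walked
         * in lockstep below.
         */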
wait_for_iobuf:
        while (commit_transaction->t_iobuf_list != NULL) {
                struct buffer_head *bh;

                jh = commit_transaction->t_iobuf_list->b_tprev;
                bh = jh2bh(jh);
                if (buffer_locked(bh)) {
                        wait_on_buffer(bh);
                        goto wait_for_iobuf;
                }
                if (cond_resched())
                        goto wait_for_iobuf;

                if (unlikely(!buffer_uptodate(bh)))
                        err = -EIO;

                clear_buffer_jwrite(bh);

                JBUFFER_TRACE(jh, "ph4: unfile after journal write");
                jbd2_journal_unfile_buffer(journal, jh);

                /*
                 * ->t_iobuf_list should contain only dummy buffer_heads
                 * which were created by jbd2_journal_write_metadata_buffer().
                 */
                BUFFER_TRACE(bh, "dumping temporary bh");
                jbd2_journal_put_journal_head(jh);
                __brelse(bh);
                J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
                free_buffer_head(bh);

                /* We also have to unlock and free the corresponding
                   shadowed buffer */
                jh = commit_transaction->t_shadow_list->b_tprev;
                bh = jh2bh(jh);
                clear_bit(BH_JWrite, &bh->b_state);
                J_ASSERT_BH(bh, buffer_jbddirty(bh));

                /* The metadata is now released for reuse, but we need
                   to remember it against this transaction so that when
                   we finally commit, we can do any checkpointing
                   required. */
                JBUFFER_TRACE(jh, "file as BJ_Forget");
                jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
                /* Wake up any transactions which were waiting for this
                   IO to complete */
                wake_up_bit(&bh->b_state, BH_Unshadow);
                JBUFFER_TRACE(jh, "brelse shadowed buffer");
                __brelse(bh);
        }

        J_ASSERT (commit_transaction->t_shadow_list == NULL);

        jbd_debug(3, "JBD: commit phase 5\n");

        /* Here we wait for the revoke record and descriptor record buffers */
wait_for_ctlbuf:
        while (commit_transaction->t_log_list != NULL) {
                struct buffer_head *bh;

                jh = commit_transaction->t_log_list->b_tprev;
                bh = jh2bh(jh);
                if (buffer_locked(bh)) {
                        wait_on_buffer(bh);
                        goto wait_for_ctlbuf;
                }
                if (cond_resched())
                        goto wait_for_ctlbuf;

                if (unlikely(!buffer_uptodate(bh)))
                        err = -EIO;

                BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
                clear_buffer_jwrite(bh);
                jbd2_journal_unfile_buffer(journal, jh);
                jbd2_journal_put_journal_head(jh);
                __brelse(bh);           /* One for getblk */
                /* AKPM: bforget here */
        }

        jbd_debug(3, "JBD: commit phase 6\n");

        if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
                JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
                err = journal_submit_commit_record(journal, commit_transaction,
                                                &cbh, crc32_sum);
                if (err)
                        __jbd2_journal_abort_hard(journal);
        }
        if (!err && !is_journal_aborted(journal))
                err = journal_wait_on_commit_record(cbh);

        if (err)
                jbd2_journal_abort(journal, err);

        /* End of a transaction!  Finally, we can do checkpoint
           processing: any buffers committed as a result of this
           transaction can be removed from any checkpoint list it was on
           before. */

        jbd_debug(3, "JBD: commit phase 7\n");

        J_ASSERT(commit_transaction->t_sync_datalist == NULL);
        J_ASSERT(commit_transaction->t_buffers == NULL);
        J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
        J_ASSERT(commit_transaction->t_iobuf_list == NULL);
        J_ASSERT(commit_transaction->t_shadow_list == NULL);
        J_ASSERT(commit_transaction->t_log_list == NULL);

restart_loop:
        /*
         * As there are other places (journal_unmap_buffer()) adding buffers
         * to this list we have to be careful and hold the j_list_lock.
         */
        spin_lock(&journal->j_list_lock);
        while (commit_transaction->t_forget) {
                transaction_t *cp_transaction;
                struct buffer_head *bh;

                jh = commit_transaction->t_forget;
                spin_unlock(&journal->j_list_lock);
                bh = jh2bh(jh);
                jbd_lock_bh_state(bh);
                J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
                        jh->b_transaction == journal->j_running_transaction);

                /*
                 * If there is undo-protected committed data against
                 * this buffer, then we can remove it now.  If it is a
                 * buffer needing such protection, the old frozen_data
                 * field now points to a committed version of the
                 * buffer, so rotate that field to the new committed
                 * data.
                 *
                 * Otherwise, we can just throw away the frozen data now.
                 */
                if (jh->b_committed_data) {
                        jbd2_free(jh->b_committed_data, bh->b_size);
                        jh->b_committed_data = NULL;
                        if (jh->b_frozen_data) {
                                jh->b_committed_data = jh->b_frozen_data;
                                jh->b_frozen_data = NULL;
                        }
                } else if (jh->b_frozen_data) {
                        jbd2_free(jh->b_frozen_data, bh->b_size);
                        jh->b_frozen_data = NULL;
                }

                spin_lock(&journal->j_list_lock);
                cp_transaction = jh->b_cp_transaction;
                if (cp_transaction) {
                        JBUFFER_TRACE(jh, "remove from old cp transaction");
                        cp_transaction->t_chp_stats.cs_dropped++;
                        __jbd2_journal_remove_checkpoint(jh);
                }

                /* Only re-checkpoint the buffer_head if it is marked
                 * dirty.  If the buffer was added to the BJ_Forget list
                 * by jbd2_journal_forget, it may no longer be dirty and
                 * there's no point in keeping a checkpoint record for
                 * it. */

                /* A buffer which has been freed while still being
                 * journaled by a previous transaction may end up still
                 * being dirty here, but we want to avoid writing back
                 * that buffer in the future now that the last use has
                 * been committed.  That's not only a performance gain,
                 * it also stops aliasing problems if the buffer is left
                 * behind for writeback and gets reallocated for another
                 * use in a different page. */
                if (buffer_freed(bh)) {
                        clear_buffer_freed(bh);
                        clear_buffer_jbddirty(bh);
                }

                if (buffer_jbddirty(bh)) {
                        JBUFFER_TRACE(jh, "add to new checkpointing trans");
                        __jbd2_journal_insert_checkpoint(jh, commit_transaction);
                        JBUFFER_TRACE(jh, "refile for checkpoint writeback");
                        __jbd2_journal_refile_buffer(jh);
                        jbd_unlock_bh_state(bh);
                } else {
                        J_ASSERT_BH(bh, !buffer_dirty(bh));
                        /* The buffer on BJ_Forget list and not jbddirty means
                         * it has been freed by this transaction and hence it
                         * could not have been reallocated until this
                         * transaction has committed. *BUT* it could be
                         * reallocated once we have written all the data to
                         * disk and before we process the buffer on BJ_Forget
                         * list. */
                        JBUFFER_TRACE(jh, "refile or unfile freed buffer");
                        __jbd2_journal_refile_buffer(jh);
                        if (!jh->b_transaction) {
                                jbd_unlock_bh_state(bh);
                                /* needs a brelse */
                                jbd2_journal_remove_journal_head(bh);
                                release_buffer_page(bh);
                        } else
                                jbd_unlock_bh_state(bh);
                }
                cond_resched_lock(&journal->j_list_lock);
        }
        spin_unlock(&journal->j_list_lock);
        /*
         * This is a bit sleazy.  We use j_list_lock to protect transition
         * of a transaction into T_FINISHED state and calling
         * __jbd2_journal_drop_transaction(). Otherwise we could race with
         * other checkpointing code processing the transaction...
         */
        spin_lock(&journal->j_state_lock);
        spin_lock(&journal->j_list_lock);
        /*
         * Now recheck if some buffers did not get attached to the transaction
         * while the lock was dropped...
         */
        if (commit_transaction->t_forget) {
                spin_unlock(&journal->j_list_lock);
                spin_unlock(&journal->j_state_lock);
                goto restart_loop;
        }

        /* Done with this transaction! */

        jbd_debug(3, "JBD: commit phase 8\n");

        J_ASSERT(commit_transaction->t_state == T_COMMIT);

        commit_transaction->t_start = jiffies;
        stats.u.run.rs_logging = jbd2_time_diff(stats.u.run.rs_logging,
                                                commit_transaction->t_start);

        /*
         * File the transaction for history
         */
        stats.ts_type = JBD2_STATS_RUN;
        stats.ts_tid = commit_transaction->t_tid;
        stats.u.run.rs_handle_count = commit_transaction->t_handle_count;
        spin_lock(&journal->j_history_lock);
        memcpy(journal->j_history + journal->j_history_cur, &stats,
                        sizeof(stats));
        if (++journal->j_history_cur == journal->j_history_max)
                journal->j_history_cur = 0;

        /*
         * Calculate overall stats
         */
        journal->j_stats.ts_tid++;
        journal->j_stats.u.run.rs_wait += stats.u.run.rs_wait;
        journal->j_stats.u.run.rs_running += stats.u.run.rs_running;
        journal->j_stats.u.run.rs_locked += stats.u.run.rs_locked;
        journal->j_stats.u.run.rs_flushing += stats.u.run.rs_flushing;
        journal->j_stats.u.run.rs_logging += stats.u.run.rs_logging;
        journal->j_stats.u.run.rs_handle_count += stats.u.run.rs_handle_count;
        journal->j_stats.u.run.rs_blocks += stats.u.run.rs_blocks;
        journal->j_stats.u.run.rs_blocks_logged += stats.u.run.rs_blocks_logged;
        spin_unlock(&journal->j_history_lock);

        commit_transaction->t_state = T_FINISHED;
        J_ASSERT(commit_transaction == journal->j_committing_transaction);
        journal->j_commit_sequence = commit_transaction->t_tid;
        journal->j_committing_transaction = NULL;
        spin_unlock(&journal->j_state_lock);

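        /*
         * If the transaction left nothing behind on its checkpoint lists
         * it can be dropped right away; otherwise link it onto the
         * circular j_checkpoint_transactions list so the checkpointing
         * code can write it back later.
         */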
        if (commit_transaction->t_checkpoint_list == NULL &&
            commit_transaction->t_checkpoint_io_list == NULL) {
                __jbd2_journal_drop_transaction(journal, commit_transaction);
        } else {
                if (journal->j_checkpoint_transactions == NULL) {
                        journal->j_checkpoint_transactions = commit_transaction;
                        commit_transaction->t_cpnext = commit_transaction;
                        commit_transaction->t_cpprev = commit_transaction;
                } else {
                        commit_transaction->t_cpnext =
                                journal->j_checkpoint_transactions;
                        commit_transaction->t_cpprev =
                                commit_transaction->t_cpnext->t_cpprev;
                        commit_transaction->t_cpnext->t_cpprev =
                                commit_transaction;
                        commit_transaction->t_cpprev->t_cpnext =
                                commit_transaction;
                }
        }
        spin_unlock(&journal->j_list_lock);

        jbd_debug(1, "JBD: commit %d complete, head %d\n",
                  journal->j_commit_sequence, journal->j_tail_sequence);

        wake_up(&journal->j_wait_done_commit);
}