fs/ext4: fix comments mentioning i_mutex
// SPDX-License-Identifier: GPL-2.0

/*
 * fs/ext4/fast_commit.c
 *
 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
 *
 * Ext4 fast commits routines.
 */
#include "ext4.h"
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "mballoc.h"

/*
 * Ext4 Fast Commits
 * -----------------
 *
 * Ext4 fast commits implement fine-grained journalling for Ext4.
 *
 * Fast commits are organized as a log of tag-length-value (TLV) structs (see
 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
 * TLV during the recovery phase. For the scenarios for which we currently
 * don't have replay code, fast commit falls back to full commits.
 * Fast commits record deltas in one of the following three categories; a
 * sketch of the on-disk TLV layout follows the list below.
 *
 * (A) Directory entry updates:
 *
 * - EXT4_FC_TAG_UNLINK         - records directory entry unlink
 * - EXT4_FC_TAG_LINK           - records directory entry link
 * - EXT4_FC_TAG_CREAT          - records inode and directory entry creation
 *
 * (B) File specific data range updates:
 *
 * - EXT4_FC_TAG_ADD_RANGE      - records addition of new blocks to an inode
 * - EXT4_FC_TAG_DEL_RANGE      - records deletion of blocks from an inode
 *
 * (C) Inode metadata (mtime / ctime etc):
 *
 * - EXT4_FC_TAG_INODE          - records the inode that should be replayed
 *                                during recovery. Note that the iblocks field
 *                                is not replayed but is instead derived during
 *                                replay.
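 *
 * A rough sketch of a single log entry's layout (the authoritative
 * definition is struct ext4_fc_tl; the annotations here are illustrative):
 *
 *      struct ext4_fc_tl {
 *              __le16 fc_tag;  // e.g. EXT4_FC_TAG_ADD_RANGE
 *              __le16 fc_len;  // length of the value that follows
 *      };
 *      // ... followed by fc_len bytes of tag-specific value
 *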
 * Commit Operation
 * ----------------
 * With fast commits, we maintain all the directory entry operations in the
 * order in which they are issued in an in-memory queue. This queue is flushed
 * to disk during the commit operation. We also maintain a list of inodes
 * that need to be committed during a fast commit in another in-memory queue of
 * inodes. During the commit operation, we commit in the following order:
 *
 * [1] Lock inodes for any further data updates by setting COMMITTING state
 * [2] Submit data buffers of all the inodes
 * [3] Wait for [2] to complete
 * [4] Commit all the directory entry updates in the fast commit space
 * [5] Commit all the changed inode structures
 * [6] Write tail tag (this tag ensures the atomicity, please read the following
 *     section for more details).
 * [7] Wait for [4], [5] and [6] to complete.
 *
 * All the inode updates must call ext4_fc_start_update() before starting an
 * update. If such an ongoing update is present, fast commit waits for it to
 * complete. The completion of such an update is marked by
 * ext4_fc_stop_update(), as sketched below.
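 *
 * A minimal sketch of an update-path caller (the update step itself is only
 * indicative):
 *
 *      ext4_fc_start_update(inode);    // waits out an ongoing fast commit
 *      // ... modify the inode under the usual locks ...
 *      ext4_fc_stop_update(inode);     // wakes up a waiting fast commit, if any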
 *
 * Fast Commit Ineligibility
 * -------------------------
 *
 * Not all operations are supported by fast commits today (e.g. extended
 * attributes). Fast commit ineligibility is marked by calling
 * ext4_fc_mark_ineligible(): this makes the next fast commit operation fall
 * back to a full commit. For example (an illustrative call, as made from the
 * xattr code paths):
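 *
 *      ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_XATTR, handle);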
 *
 * Atomicity of commits
 * --------------------
 * In order to guarantee atomicity during the commit operation, fast commit
 * uses the "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. The
 * tail tag contains the CRC of the contents and the TID of the transaction
 * after which this fast commit should be applied. Recovery code replays fast
 * commit logs only if there's at least 1 valid tail present. For every fast
 * commit operation, there is 1 tail. This means we may end up with multiple
 * tails in the fast commit space. Here's an example:
 *
 * - Create a new file A and remove existing file B
 * - fsync()
 * - Append contents to file A
 * - Truncate file A
 * - fsync()
 *
 * The fast commit space at the end of the above operations would look like:
 *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
 *
 * Replay code should thus check for all the valid tails in the FC area.
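 *
 * Roughly (and glossing over the details of the actual scan code), a tail is
 * considered valid when both of the following hold; the right-hand side names
 * are illustrative:
 *
 *      le32_to_cpu(tail->fc_tid) == expected_tid &&
 *      le32_to_cpu(tail->fc_crc) == crc_of_fc_log_so_far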
 *
 * Fast Commit Replay Idempotence
 * ------------------------------
 *
 * Fast commit tags are idempotent in nature provided the recovery code follows
 * certain rules. The guiding principle that the commit path follows while
 * committing is that it stores the result of a particular operation instead of
 * storing the procedure.
 *
 * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
 * was associated with inode 10. During fast commit, instead of storing this
 * operation as a procedure "rename a to b", we store the resulting file system
 * state as a "series" of outcomes:
 *
 * - Link dirent b to inode 10
 * - Unlink dirent a
 * - Inode <10> with valid refcount
 *
 * Now when recovery code runs, it needs to "enforce" this state on the file
 * system. This is what guarantees idempotence of fast commit replay.
 *
 * Let's take an example of a procedure that is not idempotent and see how fast
 * commits make it idempotent. Consider the following sequence of operations:
 *
 *     rm A;    mv B A;    read A
 *  (x)     (y)        (z)
 *
 * (x), (y) and (z) are the points at which we can crash. If we store this
 * sequence of operations as-is, then the replay is not idempotent. Let's say
 * that while in replay, we crash at (z). During the second replay, file A
 * (which was actually created as a result of the "mv B A" operation) would get
 * deleted. Thus, a file named A would be absent when we try to read A. So,
 * this sequence of operations is not idempotent. However, as mentioned above,
 * instead of storing the procedure, fast commits store the outcome of each
 * procedure. Thus the fast commit log for the above procedure would be as
 * follows:
 *
 * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
 * inode 11 before the replay)
 *
 *    [Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
 * (w)          (x)                    (y)          (z)
 *
 * If we crash at (z), we will have file A linked to inode 11. During the second
 * replay, we will remove file A (inode 11). But we will create it back and make
 * it point to inode 11. We won't find B, so we'll just skip that step. At this
 * point, the refcount for inode 11 is not reliable, but that gets fixed by the
 * replay of the last inode 11 tag. Crashes at points (w), (x) and (y) get
 * handled similarly. Thus, by converting a non-idempotent procedure into a
 * series of idempotent outcomes, fast commits ensure idempotence during the
 * replay.
 *
 * TODOs
 * -----
 *
 * 0) Fast commit replay path hardening: Fast commit replay code should use
 *    journal handles to make sure all the updates it does during the replay
 *    path are atomic. With that, if we crash during fast commit replay and then
 *    try to do recovery again, we will find a file system where the fast commit
 *    area is invalid (because a new full commit would be found). In order to
 *    deal with that, fast commit replay code should ensure that the "FC_REPLAY"
 *    superblock state is persisted before starting the replay, so that after
 *    the crash, fast commit recovery code can look at that flag and perform
 *    fast commit recovery even if that area is invalidated by later full
 *    commits.
 *
 * 1) Fast commit's commit path locks the entire file system during fast
 *    commit. This has a significant performance penalty. Instead of that, we
 *    should use ext4_fc_start/stop_update functions to start inode level
 *    updates from ext4_journal_start/stop. Once we do that, we can drop file
 *    system locking during the commit path.
 *
 * 2) Handle more ineligible cases.
 */

#include <trace/events/ext4.h>
static struct kmem_cache *ext4_fc_dentry_cachep;

static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
        BUFFER_TRACE(bh, "");
        if (uptodate) {
                ext4_debug("%s: Block %lld up-to-date",
                           __func__, bh->b_blocknr);
                set_buffer_uptodate(bh);
        } else {
                ext4_debug("%s: Block %lld not up-to-date",
                           __func__, bh->b_blocknr);
                clear_buffer_uptodate(bh);
        }

        unlock_buffer(bh);
}

static inline void ext4_fc_reset_inode(struct inode *inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);

        ei->i_fc_lblk_start = 0;
        ei->i_fc_lblk_len = 0;
}

void ext4_fc_init_inode(struct inode *inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);

        ext4_fc_reset_inode(inode);
        ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
        INIT_LIST_HEAD(&ei->i_fc_list);
        init_waitqueue_head(&ei->i_fc_wait);
        atomic_set(&ei->i_fc_updates, 0);
}

/* This function must be called with sbi->s_fc_lock held. */
static void ext4_fc_wait_committing_inode(struct inode *inode)
__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
{
        wait_queue_head_t *wq;
        struct ext4_inode_info *ei = EXT4_I(inode);

#if (BITS_PER_LONG < 64)
        DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
                        EXT4_STATE_FC_COMMITTING);
        wq = bit_waitqueue(&ei->i_state_flags,
                                EXT4_STATE_FC_COMMITTING);
#else
        DEFINE_WAIT_BIT(wait, &ei->i_flags,
                        EXT4_STATE_FC_COMMITTING);
        wq = bit_waitqueue(&ei->i_flags,
                                EXT4_STATE_FC_COMMITTING);
#endif
        lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
        prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
        spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
        schedule();
        finish_wait(wq, &wait.wq_entry);
}
/*
 * Inform Ext4's fast commit subsystem about the start of an inode update.
 *
 * This function is called by the high-level VFS callbacks before
 * performing any inode update. This function blocks if there's an ongoing
 * fast commit on the inode in question.
 */
void ext4_fc_start_update(struct inode *inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);

        if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
            (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
                return;

restart:
        spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
        if (list_empty(&ei->i_fc_list))
                goto out;

        if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
                ext4_fc_wait_committing_inode(inode);
                goto restart;
        }
out:
        atomic_inc(&ei->i_fc_updates);
        spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Stop inode update and wake up waiting fast commits if any.
 */
void ext4_fc_stop_update(struct inode *inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);

        if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
            (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
                return;

        if (atomic_dec_and_test(&ei->i_fc_updates))
                wake_up_all(&ei->i_fc_wait);
}

/*
 * Remove the inode from the fast commit list. If the inode is being
 * committed, we wait until the inode commit is done.
 */
void ext4_fc_del(struct inode *inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);

        if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
            (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
                return;

restart:
        spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
        if (list_empty(&ei->i_fc_list)) {
                spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
                return;
        }

        if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
                ext4_fc_wait_committing_inode(inode);
                goto restart;
        }
        list_del_init(&ei->i_fc_list);
        spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Mark the file system as fast commit ineligible, and record the latest
 * ineligible transaction tid. This means that until the recorded
 * transaction, a commit operation would result in a full jbd2 commit.
 */
void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        tid_t tid;

        if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
            (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
                return;

        ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
        if (handle && !IS_ERR(handle))
                tid = handle->h_transaction->t_tid;
        else {
                read_lock(&sbi->s_journal->j_state_lock);
                tid = sbi->s_journal->j_running_transaction ?
                                sbi->s_journal->j_running_transaction->t_tid : 0;
                read_unlock(&sbi->s_journal->j_state_lock);
        }
        spin_lock(&sbi->s_fc_lock);
        if (sbi->s_fc_ineligible_tid < tid)
                sbi->s_fc_ineligible_tid = tid;
        spin_unlock(&sbi->s_fc_lock);
        WARN_ON(reason >= EXT4_FC_REASON_MAX);
        sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
}

/*
 * Generic fast commit tracking function. If this is the first time we are
 * called after a full commit, we initialize fast commit fields and then call
 * __fc_track_fn() with update = 0. If we have already been called after a full
 * commit, we pass update = 1. Based on that, the track function can determine
 * if it needs to track a field for the first time or if it needs to just
 * update the previously tracked value.
 *
 * If enqueue is set, this function enqueues the inode in the fast commit list.
 */
static int ext4_fc_track_template(
        handle_t *handle, struct inode *inode,
        int (*__fc_track_fn)(struct inode *, void *, bool),
        void *args, int enqueue)
{
        bool update = false;
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        tid_t tid = 0;
        int ret;

        if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
            (sbi->s_mount_state & EXT4_FC_REPLAY))
                return -EOPNOTSUPP;

        if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
                return -EINVAL;

        tid = handle->h_transaction->t_tid;
        mutex_lock(&ei->i_fc_lock);
        if (tid == ei->i_sync_tid) {
                update = true;
        } else {
                ext4_fc_reset_inode(inode);
                ei->i_sync_tid = tid;
        }
        ret = __fc_track_fn(inode, args, update);
        mutex_unlock(&ei->i_fc_lock);

        if (!enqueue)
                return ret;

        spin_lock(&sbi->s_fc_lock);
        if (list_empty(&EXT4_I(inode)->i_fc_list))
                list_add_tail(&EXT4_I(inode)->i_fc_list,
                                (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
                                 sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ?
                                &sbi->s_fc_q[FC_Q_STAGING] :
                                &sbi->s_fc_q[FC_Q_MAIN]);
        spin_unlock(&sbi->s_fc_lock);

        return ret;
}

struct __track_dentry_update_args {
        struct dentry *dentry;
        int op;
};

/* __track_fn for directory entry updates. Called with ei->i_fc_lock held. */
static int __track_dentry_update(struct inode *inode, void *arg, bool update)
{
        struct ext4_fc_dentry_update *node;
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct __track_dentry_update_args *dentry_update =
                (struct __track_dentry_update_args *)arg;
        struct dentry *dentry = dentry_update->dentry;
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

        mutex_unlock(&ei->i_fc_lock);
        node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
        if (!node) {
                ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL);
                mutex_lock(&ei->i_fc_lock);
                return -ENOMEM;
        }

        node->fcd_op = dentry_update->op;
        node->fcd_parent = dentry->d_parent->d_inode->i_ino;
        node->fcd_ino = inode->i_ino;
        if (dentry->d_name.len > DNAME_INLINE_LEN) {
                node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
                if (!node->fcd_name.name) {
                        kmem_cache_free(ext4_fc_dentry_cachep, node);
                        ext4_fc_mark_ineligible(inode->i_sb,
                                EXT4_FC_REASON_NOMEM, NULL);
                        mutex_lock(&ei->i_fc_lock);
                        return -ENOMEM;
                }
                memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
                        dentry->d_name.len);
        } else {
                memcpy(node->fcd_iname, dentry->d_name.name,
                        dentry->d_name.len);
                node->fcd_name.name = node->fcd_iname;
        }
        node->fcd_name.len = dentry->d_name.len;

        spin_lock(&sbi->s_fc_lock);
        if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
                sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING)
                list_add_tail(&node->fcd_list,
                                &sbi->s_fc_dentry_q[FC_Q_STAGING]);
        else
                list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
        spin_unlock(&sbi->s_fc_lock);
        mutex_lock(&ei->i_fc_lock);

        return 0;
}

void __ext4_fc_track_unlink(handle_t *handle,
                struct inode *inode, struct dentry *dentry)
{
        struct __track_dentry_update_args args;
        int ret;

        args.dentry = dentry;
        args.op = EXT4_FC_TAG_UNLINK;

        ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
                                        (void *)&args, 0);
        trace_ext4_fc_track_unlink(inode, dentry, ret);
}

void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
{
        __ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
}

void __ext4_fc_track_link(handle_t *handle,
        struct inode *inode, struct dentry *dentry)
{
        struct __track_dentry_update_args args;
        int ret;

        args.dentry = dentry;
        args.op = EXT4_FC_TAG_LINK;

        ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
                                        (void *)&args, 0);
        trace_ext4_fc_track_link(inode, dentry, ret);
}

void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
{
        __ext4_fc_track_link(handle, d_inode(dentry), dentry);
}

void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
                          struct dentry *dentry)
{
        struct __track_dentry_update_args args;
        int ret;

        args.dentry = dentry;
        args.op = EXT4_FC_TAG_CREAT;

        ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
                                        (void *)&args, 0);
        trace_ext4_fc_track_create(inode, dentry, ret);
}

void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
{
        __ext4_fc_track_create(handle, d_inode(dentry), dentry);
}

/* __track_fn for inode tracking */
static int __track_inode(struct inode *inode, void *arg, bool update)
{
        if (update)
                return -EEXIST;

        EXT4_I(inode)->i_fc_lblk_len = 0;

        return 0;
}

void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
        int ret;

        if (S_ISDIR(inode->i_mode))
                return;

        if (ext4_should_journal_data(inode)) {
                ext4_fc_mark_ineligible(inode->i_sb,
                                        EXT4_FC_REASON_INODE_JOURNAL_DATA, handle);
                return;
        }

        ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
        trace_ext4_fc_track_inode(inode, ret);
}

struct __track_range_args {
        ext4_lblk_t start, end;
};

/* __track_fn for tracking data updates */
static int __track_range(struct inode *inode, void *arg, bool update)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        ext4_lblk_t oldstart;
        struct __track_range_args *__arg =
                (struct __track_range_args *)arg;

        if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
                ext4_debug("Special inode %ld being modified\n", inode->i_ino);
                return -ECANCELED;
        }

        oldstart = ei->i_fc_lblk_start;

        if (update && ei->i_fc_lblk_len > 0) {
                ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
                ei->i_fc_lblk_len =
                        max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
                                ei->i_fc_lblk_start + 1;
        } else {
                ei->i_fc_lblk_start = __arg->start;
                ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
        }

        return 0;
}

void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
                         ext4_lblk_t end)
{
        struct __track_range_args args;
        int ret;

        if (S_ISDIR(inode->i_mode))
                return;

        args.start = start;
        args.end = end;

        ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1);

        trace_ext4_fc_track_range(inode, start, end, ret);
}

static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
{
        int write_flags = REQ_SYNC;
        struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;

        /* Add REQ_FUA | REQ_PREFLUSH only if it's the tail */
        if (test_opt(sb, BARRIER) && is_tail)
                write_flags |= REQ_FUA | REQ_PREFLUSH;
        lock_buffer(bh);
        set_buffer_dirty(bh);
        set_buffer_uptodate(bh);
        bh->b_end_io = ext4_end_buffer_io_sync;
        submit_bh(REQ_OP_WRITE, write_flags, bh);
        EXT4_SB(sb)->s_fc_bh = NULL;
}

/* Ext4 commit path routines */

/* memzero and update CRC */
static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
                                u32 *crc)
{
        void *ret;

        ret = memset(dst, 0, len);
        if (crc)
                *crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
        return ret;
}

/*
 * Allocate len bytes on a fast commit buffer.
 *
 * During the commit time this function is used to manage fast commit
 * block space. We don't split a fast commit log onto different
 * blocks. So this function makes sure that if there's not enough space
 * on the current block, the remaining space in the current block is
 * marked as unused by adding an EXT4_FC_TAG_PAD tag. In that case, a
 * new block is obtained from jbd2 and the CRC is updated to reflect
 * the padding we added.
 */
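/*
 * For example (illustrative numbers only): with a 4k block of which 4000
 * bytes are already used, a 100-byte reservation does not fit, so the
 * remainder of the block becomes [PAD tl][zeroes ...] and the 100 bytes
 * are served from the start of a new jbd2 fast commit buffer.
 */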
static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
{
        struct ext4_fc_tl *tl;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct buffer_head *bh;
        int bsize = sbi->s_journal->j_blocksize;
        int ret, off = sbi->s_fc_bytes % bsize;
        int pad_len;

        /*
         * After allocating len, we should have space at least for a 0 byte
         * padding.
         */
        if (len + sizeof(struct ext4_fc_tl) > bsize)
                return NULL;

        if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
                /*
                 * Only allocate from current buffer if we have enough space for
                 * this request AND we have space to add a zero byte padding.
                 */
                if (!sbi->s_fc_bh) {
                        ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
                        if (ret)
                                return NULL;
                        sbi->s_fc_bh = bh;
                }
                sbi->s_fc_bytes += len;
                return sbi->s_fc_bh->b_data + off;
        }
        /* Need to add PAD tag */
        tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
        tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
        pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
        tl->fc_len = cpu_to_le16(pad_len);
        if (crc)
                *crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
        if (pad_len > 0)
                ext4_fc_memzero(sb, tl + 1, pad_len, crc);
        ext4_fc_submit_bh(sb, false);

        ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
        if (ret)
                return NULL;
        sbi->s_fc_bh = bh;
        sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
        return sbi->s_fc_bh->b_data;
}

/* memcpy to fc reserved space and update CRC */
static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
                                int len, u32 *crc)
{
        if (crc)
                *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
        return memcpy(dst, src, len);
}

/*
 * Complete a fast commit by writing the tail tag.
 *
 * Writing the tail tag marks the end of a fast commit. In order to guarantee
 * atomicity, after writing the tail tag, even if there's space remaining
 * in the block, the next commit shouldn't use it. That's why the tail tag's
 * length covers the remaining space in the block.
 */
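/*
 * On disk, the tail thus roughly looks like (sketch):
 *
 *      [tl: EXT4_FC_TAG_TAIL, fc_len reaching block end][fc_tid][fc_crc]
 */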
static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_fc_tl tl;
        struct ext4_fc_tail tail;
        int off, bsize = sbi->s_journal->j_blocksize;
        u8 *dst;

        /*
         * ext4_fc_reserve_space takes care of allocating an extra block if
         * there's not enough space on this block to accommodate this tail.
         */
        dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
        if (!dst)
                return -ENOSPC;

        off = sbi->s_fc_bytes % bsize;

        tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
        tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
        sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);

        ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
        dst += sizeof(tl);
        tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
        ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
        dst += sizeof(tail.fc_tid);
        tail.fc_crc = cpu_to_le32(crc);
        ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);

        ext4_fc_submit_bh(sb, true);

        return 0;
}

/*
 * Adds a tag, length and value, and updates the CRC. Returns true if the TLV
 * was added. Returns false if there's not enough space.
 */
static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
                           u32 *crc)
{
        struct ext4_fc_tl tl;
        u8 *dst;

        dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
        if (!dst)
                return false;

        tl.fc_tag = cpu_to_le16(tag);
        tl.fc_len = cpu_to_le16(len);

        ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
        ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);

        return true;
}

/* Same as above, but adds dentry tlv. */
static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
                                   struct ext4_fc_dentry_update *fc_dentry)
{
        struct ext4_fc_dentry_info fcd;
        struct ext4_fc_tl tl;
        int dlen = fc_dentry->fcd_name.len;
        u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
                                        crc);

        if (!dst)
                return false;

        fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent);
        fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino);
        tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op);
        tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
        ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
        dst += sizeof(tl);
        ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
        dst += sizeof(fcd);
        ext4_fc_memcpy(sb, dst, fc_dentry->fcd_name.name, dlen, crc);

        return true;
}

/*
 * Writes the inode in the fast commit space as an EXT4_FC_TAG_INODE TLV.
 * Returns 0 on success, error on failure.
 */
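/* Layout (sketch): [tl: EXT4_FC_TAG_INODE][fc_ino][raw on-disk inode bytes] */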
static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
        int ret;
        struct ext4_iloc iloc;
        struct ext4_fc_inode fc_inode;
        struct ext4_fc_tl tl;
        u8 *dst;

        ret = ext4_get_inode_loc(inode, &iloc);
        if (ret)
                return ret;

        if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
                inode_len = EXT4_INODE_SIZE(inode->i_sb);
        else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
                inode_len += ei->i_extra_isize;

        fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
        tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
        tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));

        dst = ext4_fc_reserve_space(inode->i_sb,
                        sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
        if (!dst)
                return -ECANCELED;

        if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
                return -ECANCELED;
        dst += sizeof(tl);
        if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
                return -ECANCELED;
        dst += sizeof(fc_inode);
        if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
                                        inode_len, crc))
                return -ECANCELED;

        return 0;
}

/*
 * Writes updated data ranges for the inode in question. Updates CRC.
 * Returns 0 on success, error otherwise.
 */
static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
{
        ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct ext4_map_blocks map;
        struct ext4_fc_add_range fc_ext;
        struct ext4_fc_del_range lrange;
        struct ext4_extent *ex;
        int ret;

        mutex_lock(&ei->i_fc_lock);
        if (ei->i_fc_lblk_len == 0) {
                mutex_unlock(&ei->i_fc_lock);
                return 0;
        }
        old_blk_size = ei->i_fc_lblk_start;
        new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
        ei->i_fc_lblk_len = 0;
        mutex_unlock(&ei->i_fc_lock);

        cur_lblk_off = old_blk_size;
        jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
                  __func__, cur_lblk_off, new_blk_size, inode->i_ino);

        while (cur_lblk_off <= new_blk_size) {
                map.m_lblk = cur_lblk_off;
                map.m_len = new_blk_size - cur_lblk_off + 1;
                ret = ext4_map_blocks(NULL, inode, &map, 0);
                if (ret < 0)
                        return -ECANCELED;

                if (map.m_len == 0) {
                        cur_lblk_off++;
                        continue;
                }

                if (ret == 0) {
                        lrange.fc_ino = cpu_to_le32(inode->i_ino);
                        lrange.fc_lblk = cpu_to_le32(map.m_lblk);
                        lrange.fc_len = cpu_to_le32(map.m_len);
                        if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
                                            sizeof(lrange), (u8 *)&lrange, crc))
                                return -ENOSPC;
                } else {
                        unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
                                EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;

                        /* Limit the number of blocks in one extent */
                        map.m_len = min(max, map.m_len);

                        fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
                        ex = (struct ext4_extent *)&fc_ext.fc_ex;
                        ex->ee_block = cpu_to_le32(map.m_lblk);
                        ex->ee_len = cpu_to_le16(map.m_len);
                        ext4_ext_store_pblock(ex, map.m_pblk);
                        if (map.m_flags & EXT4_MAP_UNWRITTEN)
                                ext4_ext_mark_unwritten(ex);
                        else
                                ext4_ext_mark_initialized(ex);
                        if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
                                            sizeof(fc_ext), (u8 *)&fc_ext, crc))
                                return -ENOSPC;
                }

                cur_lblk_off += map.m_len;
        }

        return 0;
}


/* Submit data for all the fast commit inodes */
static int ext4_fc_submit_inode_data_all(journal_t *journal)
{
        struct super_block *sb = (struct super_block *)(journal->j_private);
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_inode_info *ei;
        int ret = 0;

        spin_lock(&sbi->s_fc_lock);
        list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
                ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
                while (atomic_read(&ei->i_fc_updates)) {
                        DEFINE_WAIT(wait);

                        prepare_to_wait(&ei->i_fc_wait, &wait,
                                                TASK_UNINTERRUPTIBLE);
                        if (atomic_read(&ei->i_fc_updates)) {
                                spin_unlock(&sbi->s_fc_lock);
                                schedule();
                                spin_lock(&sbi->s_fc_lock);
                        }
                        finish_wait(&ei->i_fc_wait, &wait);
                }
                spin_unlock(&sbi->s_fc_lock);
                ret = jbd2_submit_inode_data(ei->jinode);
                if (ret)
                        return ret;
                spin_lock(&sbi->s_fc_lock);
        }
        spin_unlock(&sbi->s_fc_lock);

        return ret;
}

/* Wait for completion of data for all the fast commit inodes */
static int ext4_fc_wait_inode_data_all(journal_t *journal)
{
        struct super_block *sb = (struct super_block *)(journal->j_private);
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_inode_info *pos, *n;
        int ret = 0;

        spin_lock(&sbi->s_fc_lock);
        list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
                if (!ext4_test_inode_state(&pos->vfs_inode,
                                           EXT4_STATE_FC_COMMITTING))
                        continue;
                spin_unlock(&sbi->s_fc_lock);

                ret = jbd2_wait_inode_data(journal, pos->jinode);
                if (ret)
                        return ret;
                spin_lock(&sbi->s_fc_lock);
        }
        spin_unlock(&sbi->s_fc_lock);

        return 0;
}

/* Commit all the directory entry updates */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
__acquires(&sbi->s_fc_lock)
__releases(&sbi->s_fc_lock)
{
        struct super_block *sb = (struct super_block *)(journal->j_private);
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
        struct inode *inode;
        struct ext4_inode_info *ei, *ei_n;
        int ret;

        if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
                return 0;
        list_for_each_entry_safe(fc_dentry, fc_dentry_n,
                                 &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
                if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
                        spin_unlock(&sbi->s_fc_lock);
                        if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
                                ret = -ENOSPC;
                                goto lock_and_exit;
                        }
                        spin_lock(&sbi->s_fc_lock);
                        continue;
                }

                inode = NULL;
                list_for_each_entry_safe(ei, ei_n, &sbi->s_fc_q[FC_Q_MAIN],
                                         i_fc_list) {
                        if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
                                inode = &ei->vfs_inode;
                                break;
                        }
                }
                /*
                 * If we don't find the inode in our list, then it was deleted,
                 * in which case we don't need to record its create tag.
                 */
                if (!inode)
                        continue;
                spin_unlock(&sbi->s_fc_lock);

                /*
                 * We first write the inode and then the create dirent. This
                 * allows the recovery code to create an unnamed inode first
                 * and then link it to a directory entry. This allows us
                 * to use namei.c routines almost as is and simplifies
                 * the recovery code.
                 */
                ret = ext4_fc_write_inode(inode, crc);
                if (ret)
                        goto lock_and_exit;

                ret = ext4_fc_write_inode_data(inode, crc);
                if (ret)
                        goto lock_and_exit;

                if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
                        ret = -ENOSPC;
                        goto lock_and_exit;
                }

                spin_lock(&sbi->s_fc_lock);
        }
        return 0;
lock_and_exit:
        spin_lock(&sbi->s_fc_lock);
        return ret;
}

static int ext4_fc_perform_commit(journal_t *journal)
{
        struct super_block *sb = (struct super_block *)(journal->j_private);
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_inode_info *iter;
        struct ext4_fc_head head;
        struct inode *inode;
        struct blk_plug plug;
        int ret = 0;
        u32 crc = 0;

        ret = ext4_fc_submit_inode_data_all(journal);
        if (ret)
                return ret;

        ret = ext4_fc_wait_inode_data_all(journal);
        if (ret)
                return ret;

        /*
         * If file system device is different from journal device, issue a cache
         * flush before we start writing fast commit blocks.
         */
        if (journal->j_fs_dev != journal->j_dev)
                blkdev_issue_flush(journal->j_fs_dev);

        blk_start_plug(&plug);
        if (sbi->s_fc_bytes == 0) {
                /*
                 * Add a head tag only if this is the first fast commit
                 * in this TID.
                 */
                head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
                head.fc_tid = cpu_to_le32(
                        sbi->s_journal->j_running_transaction->t_tid);
                if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
                        (u8 *)&head, &crc)) {
                        ret = -ENOSPC;
                        goto out;
                }
        }

        spin_lock(&sbi->s_fc_lock);
        ret = ext4_fc_commit_dentry_updates(journal, &crc);
        if (ret) {
                spin_unlock(&sbi->s_fc_lock);
                goto out;
        }

        list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
                inode = &iter->vfs_inode;
                if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
                        continue;

                spin_unlock(&sbi->s_fc_lock);
                ret = ext4_fc_write_inode_data(inode, &crc);
                if (ret)
                        goto out;
                ret = ext4_fc_write_inode(inode, &crc);
                if (ret)
                        goto out;
                spin_lock(&sbi->s_fc_lock);
        }
        spin_unlock(&sbi->s_fc_lock);

        ret = ext4_fc_write_tail(sb, crc);

out:
        blk_finish_plug(&plug);
        return ret;
}

static void ext4_fc_update_stats(struct super_block *sb, int status,
                                 u64 commit_time, int nblks)
{
        struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats;

        jbd_debug(1, "Fast commit ended with status = %d", status);
        if (status == EXT4_FC_STATUS_OK) {
                stats->fc_num_commits++;
                stats->fc_numblks += nblks;
                if (likely(stats->s_fc_avg_commit_time))
                        stats->s_fc_avg_commit_time =
                                (commit_time +
                                 stats->s_fc_avg_commit_time * 3) / 4;
                else
                        stats->s_fc_avg_commit_time = commit_time;
        } else if (status == EXT4_FC_STATUS_FAILED ||
                   status == EXT4_FC_STATUS_INELIGIBLE) {
                if (status == EXT4_FC_STATUS_FAILED)
                        stats->fc_failed_commits++;
                stats->fc_ineligible_commits++;
        } else {
                stats->fc_skipped_commits++;
        }
        trace_ext4_fc_commit_stop(sb, nblks, status);
}

/*
 * The main commit entry point. Performs a fast commit for transaction
 * commit_tid if needed. If it's not possible to perform a fast commit
 * due to various reasons, we fall back to a full commit. Returns 0
 * on success, error otherwise.
 */
int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
{
        struct super_block *sb = (struct super_block *)(journal->j_private);
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        int nblks = 0, ret, bsize = journal->j_blocksize;
        int subtid = atomic_read(&sbi->s_fc_subtid);
        int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0;
        ktime_t start_time, commit_time;

        trace_ext4_fc_commit_start(sb);

        start_time = ktime_get();

        if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
                return jbd2_complete_transaction(journal, commit_tid);

restart_fc:
        ret = jbd2_fc_begin_commit(journal, commit_tid);
        if (ret == -EALREADY) {
                /* There was an ongoing commit, check if we need to restart */
                if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
                        commit_tid > journal->j_commit_sequence)
                        goto restart_fc;
                ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0);
                return 0;
        } else if (ret) {
                /*
                 * Commit couldn't start. Just update stats and perform a
                 * full commit.
                 */
                ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0);
                return jbd2_complete_transaction(journal, commit_tid);
        }

        /*
         * After establishing journal barrier via jbd2_fc_begin_commit(), check
         * if we are fast commit ineligible.
         */
        if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) {
                status = EXT4_FC_STATUS_INELIGIBLE;
                goto fallback;
        }

        fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
        ret = ext4_fc_perform_commit(journal);
        if (ret < 0) {
                status = EXT4_FC_STATUS_FAILED;
                goto fallback;
        }
        nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
        ret = jbd2_fc_wait_bufs(journal, nblks);
        if (ret < 0) {
                status = EXT4_FC_STATUS_FAILED;
                goto fallback;
        }
        atomic_inc(&sbi->s_fc_subtid);
        ret = jbd2_fc_end_commit(journal);
        /*
         * weight the commit time higher than the average time so we
         * don't react too strongly to vast changes in the commit time
         */
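        /*
         * Illustrative numbers: an average of 100us and a new 200us commit
         * move the average to (200 + 100 * 3) / 4 = 125us rather than 150us.
         */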
        commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
        ext4_fc_update_stats(sb, status, commit_time, nblks);
        return ret;

fallback:
        ret = jbd2_fc_end_commit_fallback(journal);
        ext4_fc_update_stats(sb, status, 0, 0);
        return ret;
}

/*
 * Fast commit cleanup routine. This is called after every fast commit and
 * full commit. full is true if we are called after a full commit.
 */
static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
{
        struct super_block *sb = journal->j_private;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_inode_info *iter, *iter_n;
        struct ext4_fc_dentry_update *fc_dentry;

        if (full && sbi->s_fc_bh)
                sbi->s_fc_bh = NULL;

        jbd2_fc_release_bufs(journal);

        spin_lock(&sbi->s_fc_lock);
        list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
                                 i_fc_list) {
                list_del_init(&iter->i_fc_list);
                ext4_clear_inode_state(&iter->vfs_inode,
                                       EXT4_STATE_FC_COMMITTING);
                if (iter->i_sync_tid <= tid)
                        ext4_fc_reset_inode(&iter->vfs_inode);
                /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
                smp_mb();
#if (BITS_PER_LONG < 64)
                wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
#else
                wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
#endif
        }

        while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
                fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
                                             struct ext4_fc_dentry_update,
                                             fcd_list);
                list_del_init(&fc_dentry->fcd_list);
                spin_unlock(&sbi->s_fc_lock);

                if (fc_dentry->fcd_name.name &&
                        fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
                        kfree(fc_dentry->fcd_name.name);
                kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
                spin_lock(&sbi->s_fc_lock);
        }

        list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
                                &sbi->s_fc_dentry_q[FC_Q_MAIN]);
        list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
                                &sbi->s_fc_q[FC_Q_MAIN]);

        if (tid >= sbi->s_fc_ineligible_tid) {
                sbi->s_fc_ineligible_tid = 0;
                ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
        }

        if (full)
                sbi->s_fc_bytes = 0;
        spin_unlock(&sbi->s_fc_lock);
        trace_ext4_fc_stats(sb);
}

/* Ext4 Replay Path Routines */

/* Helper struct for dentry replay routines */
struct dentry_info_args {
        int parent_ino, dname_len, ino, inode_len;
        char *dname;
};

static inline void tl_to_darg(struct dentry_info_args *darg,
                              struct ext4_fc_tl *tl, u8 *val)
{
        struct ext4_fc_dentry_info fcd;

        memcpy(&fcd, val, sizeof(fcd));

        darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
        darg->ino = le32_to_cpu(fcd.fc_ino);
        darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
        darg->dname_len = le16_to_cpu(tl->fc_len) -
                sizeof(struct ext4_fc_dentry_info);
}

/* Unlink replay function */
static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
                                 u8 *val)
{
        struct inode *inode, *old_parent;
        struct qstr entry;
        struct dentry_info_args darg;
        int ret = 0;

        tl_to_darg(&darg, tl, val);

        trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
                        darg.parent_ino, darg.dname_len);

        entry.name = darg.dname;
        entry.len = darg.dname_len;
        inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);

        if (IS_ERR(inode)) {
                jbd_debug(1, "Inode %d not found", darg.ino);
                return 0;
        }

        old_parent = ext4_iget(sb, darg.parent_ino,
                                EXT4_IGET_NORMAL);
        if (IS_ERR(old_parent)) {
                jbd_debug(1, "Dir with inode %d not found", darg.parent_ino);
                iput(inode);
                return 0;
        }

        ret = __ext4_unlink(NULL, old_parent, &entry, inode);
        /* -ENOENT is ok because the entry might not exist anymore. */
        if (ret == -ENOENT)
                ret = 0;
        iput(old_parent);
        iput(inode);
        return ret;
}

static int ext4_fc_replay_link_internal(struct super_block *sb,
                                struct dentry_info_args *darg,
                                struct inode *inode)
{
        struct inode *dir = NULL;
        struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
        struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
        int ret = 0;

        dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
        if (IS_ERR(dir)) {
                jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
                dir = NULL;
                goto out;
        }

        dentry_dir = d_obtain_alias(dir);
        if (IS_ERR(dentry_dir)) {
                jbd_debug(1, "Failed to obtain dentry");
                dentry_dir = NULL;
                goto out;
        }

        dentry_inode = d_alloc(dentry_dir, &qstr_dname);
        if (!dentry_inode) {
                jbd_debug(1, "Inode dentry not created.");
                ret = -ENOMEM;
                goto out;
        }

        ret = __ext4_link(dir, inode, dentry_inode);
        /*
         * It's possible that the link already existed since the data blocks
         * for the dir in question got persisted before we crashed, OR we
         * replayed this tag and crashed before the entire replay could
         * complete.
         */
        if (ret && ret != -EEXIST) {
                jbd_debug(1, "Failed to link\n");
                goto out;
        }

        ret = 0;
out:
        if (dentry_dir) {
                d_drop(dentry_dir);
                dput(dentry_dir);
        } else if (dir) {
                iput(dir);
        }
        if (dentry_inode) {
                d_drop(dentry_inode);
                dput(dentry_inode);
        }

        return ret;
}

/* Link replay function */
static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl,
                               u8 *val)
{
        struct inode *inode;
        struct dentry_info_args darg;
        int ret = 0;

        tl_to_darg(&darg, tl, val);
        trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
                        darg.parent_ino, darg.dname_len);

        inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
        if (IS_ERR(inode)) {
                jbd_debug(1, "Inode not found.");
                return 0;
        }

        ret = ext4_fc_replay_link_internal(sb, &darg, inode);
        iput(inode);
        return ret;
}

/*
 * Record all the modified inodes during replay. We use this later to set up
 * block bitmaps correctly.
 */
1403 static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1404 {
1405         struct ext4_fc_replay_state *state;
1406         int i;
1407
1408         state = &EXT4_SB(sb)->s_fc_replay_state;
1409         for (i = 0; i < state->fc_modified_inodes_used; i++)
1410                 if (state->fc_modified_inodes[i] == ino)
1411                         return 0;
        if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
                int *fc_modified_inodes;

                /*
                 * Use a temporary so that the old array is not leaked if
                 * krealloc() fails.
                 */
                fc_modified_inodes = krealloc(state->fc_modified_inodes,
                                sizeof(int) * (state->fc_modified_inodes_size +
                                EXT4_FC_REPLAY_REALLOC_INCREMENT),
                                GFP_KERNEL);
                if (!fc_modified_inodes)
                        return -ENOMEM;
                state->fc_modified_inodes = fc_modified_inodes;
                state->fc_modified_inodes_size +=
                        EXT4_FC_REPLAY_REALLOC_INCREMENT;
        }
1423         state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1424         return 0;
1425 }
1426
1427 /*
1428  * Inode replay function
1429  */
1430 static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
1431                                 u8 *val)
1432 {
1433         struct ext4_fc_inode fc_inode;
1434         struct ext4_inode *raw_inode;
1435         struct ext4_inode *raw_fc_inode;
1436         struct inode *inode = NULL;
1437         struct ext4_iloc iloc;
1438         int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
1439         struct ext4_extent_header *eh;
1440
1441         memcpy(&fc_inode, val, sizeof(fc_inode));
1442
1443         ino = le32_to_cpu(fc_inode.fc_ino);
1444         trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1445
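        /*
         * If this inode was seen earlier in the replay, clear the block
         * bitmap bits claimed by its current extent tree; the on-disk inode
         * is about to be rewritten from the fast commit record below.
         */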
1446         inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1447         if (!IS_ERR(inode)) {
1448                 ext4_ext_clear_bb(inode);
1449                 iput(inode);
1450         }
1451         inode = NULL;
1452
1453         ret = ext4_fc_record_modified_inode(sb, ino);
1454         if (ret)
1455                 goto out;
1456
1457         raw_fc_inode = (struct ext4_inode *)
1458                 (val + offsetof(struct ext4_fc_inode, fc_raw_inode));
1459         ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1460         if (ret)
1461                 goto out;
1462
1463         inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode);
1464         raw_inode = ext4_raw_inode(&iloc);
1465
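        /*
         * Copy everything up to (but excluding) i_block, then everything
         * from i_generation onwards; i_block itself gets the special
         * handling below.
         */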
1466         memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1467         memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
1468                 inode_len - offsetof(struct ext4_inode, i_generation));
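        /*
         * If the inode uses extents but i_block does not contain a valid
         * extent header, install a fresh empty root header.
         */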
1469         if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1470                 eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1471                 if (eh->eh_magic != EXT4_EXT_MAGIC) {
1472                         memset(eh, 0, sizeof(*eh));
1473                         eh->eh_magic = EXT4_EXT_MAGIC;
1474                         eh->eh_max = cpu_to_le16(
1475                                 (sizeof(raw_inode->i_block) -
1476                                  sizeof(struct ext4_extent_header))
1477                                  / sizeof(struct ext4_extent));
1478                 }
1479         } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1480                 memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1481                         sizeof(raw_inode->i_block));
1482         }
1483
1484         /* Immediately update the inode on disk. */
1485         ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1486         if (ret)
1487                 goto out;
1488         ret = sync_dirty_buffer(iloc.bh);
1489         if (ret)
1490                 goto out;
1491         ret = ext4_mark_inode_used(sb, ino);
1492         if (ret)
1493                 goto out;
1494
1495         /* Given that we just wrote the inode on disk, this SHOULD succeed. */
1496         inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1497         if (IS_ERR(inode)) {
1498                 jbd_debug(1, "Inode not found.");
1499                 return -EFSCORRUPTED;
1500         }
1501
        /*
         * Our allocator could have made different decisions than before
         * crashing. This should be fixed but until then, we calculate
         * the number of blocks the inode occupies.
         */
1507         if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
1508                 ext4_ext_replay_set_iblocks(inode);
1509
1510         inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1511         ext4_reset_inode_seed(inode);
1512
1513         ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1514         ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1515         sync_dirty_buffer(iloc.bh);
1516         brelse(iloc.bh);
1517 out:
1518         iput(inode);
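        /*
         * On success, flush the device cache so that the replayed inode is
         * durable before replay moves on.
         */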
1519         if (!ret)
1520                 blkdev_issue_flush(sb->s_bdev);
1521
1522         return 0;
1523 }
1524
/*
 * Dentry create replay function.
 *
 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE, which means that the
 * inode for which we are trying to create a dentry here should already have
 * been replayed before we start here.
 */
1532 static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
1533                                  u8 *val)
1534 {
1535         int ret = 0;
1536         struct inode *inode = NULL;
1537         struct inode *dir = NULL;
1538         struct dentry_info_args darg;
1539
1540         tl_to_darg(&darg, tl, val);
1541
1542         trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
1543                         darg.parent_ino, darg.dname_len);
1544
        /* This takes care of updating the group descriptor and other metadata */
1546         ret = ext4_mark_inode_used(sb, darg.ino);
1547         if (ret)
1548                 goto out;
1549
1550         inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1551         if (IS_ERR(inode)) {
1552                 jbd_debug(1, "inode %d not found.", darg.ino);
1553                 inode = NULL;
1554                 ret = -EINVAL;
1555                 goto out;
1556         }
1557
1558         if (S_ISDIR(inode->i_mode)) {
                /*
                 * If we are creating a directory, we need to make sure that the
                 * dot and dot dot dirents are set up properly.
                 */
1563                 dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
1564                 if (IS_ERR(dir)) {
                        jbd_debug(1, "Dir %d not found.", darg.parent_ino);
1566                         goto out;
1567                 }
1568                 ret = ext4_init_new_dir(NULL, dir, inode);
1569                 iput(dir);
1570                 if (ret) {
1571                         ret = 0;
1572                         goto out;
1573                 }
1574         }
1575         ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1576         if (ret)
1577                 goto out;
1578         set_nlink(inode, 1);
1579         ext4_mark_inode_dirty(NULL, inode);
1580 out:
1581         if (inode)
1582                 iput(inode);
1583         return ret;
1584 }
1585
/*
 * Record physical disk regions which are in use, as per the fast commit area,
 * and used by inodes during the replay phase. Our simple replay phase
 * allocator excludes these regions from allocation.
 */
1591 int ext4_fc_record_regions(struct super_block *sb, int ino,
1592                 ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay)
1593 {
1594         struct ext4_fc_replay_state *state;
1595         struct ext4_fc_alloc_region *region;
1596
1597         state = &EXT4_SB(sb)->s_fc_replay_state;
        /*
         * During the replay phase, fc_regions_valid may not be the same as
         * fc_regions_used; resync it before recording new additions.
         */
1602         if (replay && state->fc_regions_used != state->fc_regions_valid)
1603                 state->fc_regions_used = state->fc_regions_valid;
        if (state->fc_regions_used == state->fc_regions_size) {
                struct ext4_fc_alloc_region *fc_regions;

                /*
                 * Use a temporary and grow the bookkeeping only on success,
                 * so that a failed krealloc() neither leaks the old array
                 * nor leaves fc_regions_size out of sync with it.
                 */
                fc_regions = krealloc(state->fc_regions,
                                sizeof(struct ext4_fc_alloc_region) *
                                (state->fc_regions_size +
                                 EXT4_FC_REPLAY_REALLOC_INCREMENT),
                                GFP_KERNEL);
                if (!fc_regions)
                        return -ENOMEM;
                state->fc_regions = fc_regions;
                state->fc_regions_size +=
                        EXT4_FC_REPLAY_REALLOC_INCREMENT;
        }
1615         region = &state->fc_regions[state->fc_regions_used++];
1616         region->ino = ino;
1617         region->lblk = lblk;
1618         region->pblk = pblk;
1619         region->len = len;
1620
1621         if (replay)
1622                 state->fc_regions_valid++;
1623
1624         return 0;
1625 }
1626
1627 /* Replay add range tag */
1628 static int ext4_fc_replay_add_range(struct super_block *sb,
1629                                     struct ext4_fc_tl *tl, u8 *val)
1630 {
1631         struct ext4_fc_add_range fc_add_ex;
1632         struct ext4_extent newex, *ex;
1633         struct inode *inode;
1634         ext4_lblk_t start, cur;
1635         int remaining, len;
1636         ext4_fsblk_t start_pblk;
1637         struct ext4_map_blocks map;
1638         struct ext4_ext_path *path = NULL;
1639         int ret;
1640
1641         memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
1642         ex = (struct ext4_extent *)&fc_add_ex.fc_ex;
1643
1644         trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
1645                 le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
1646                 ext4_ext_get_actual_len(ex));
1647
1648         inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
1649         if (IS_ERR(inode)) {
1650                 jbd_debug(1, "Inode not found.");
1651                 return 0;
1652         }
1653
1654         ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1655         if (ret)
1656                 goto out;
1657
1658         start = le32_to_cpu(ex->ee_block);
1659         start_pblk = ext4_ext_pblock(ex);
1660         len = ext4_ext_get_actual_len(ex);
1661
1662         cur = start;
1663         remaining = len;
1664         jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
1665                   start, start_pblk, len, ext4_ext_is_unwritten(ex),
1666                   inode->i_ino);
1667
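        /*
         * Walk the range in chunks as reported by ext4_map_blocks(). Each
         * iteration handles one of three cases: the chunk is not mapped,
         * mapped to the wrong physical block, or mapped but with the wrong
         * unwritten state.
         */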
1668         while (remaining > 0) {
1669                 map.m_lblk = cur;
1670                 map.m_len = remaining;
1671                 map.m_pblk = 0;
1672                 ret = ext4_map_blocks(NULL, inode, &map, 0);
1673
1674                 if (ret < 0)
1675                         goto out;
1676
1677                 if (ret == 0) {
1678                         /* Range is not mapped */
1679                         path = ext4_find_extent(inode, cur, NULL, 0);
1680                         if (IS_ERR(path))
1681                                 goto out;
1682                         memset(&newex, 0, sizeof(newex));
1683                         newex.ee_block = cpu_to_le32(cur);
1684                         ext4_ext_store_pblock(
1685                                 &newex, start_pblk + cur - start);
1686                         newex.ee_len = cpu_to_le16(map.m_len);
1687                         if (ext4_ext_is_unwritten(ex))
1688                                 ext4_ext_mark_unwritten(&newex);
1689                         down_write(&EXT4_I(inode)->i_data_sem);
1690                         ret = ext4_ext_insert_extent(
1691                                 NULL, inode, &path, &newex, 0);
1692                         up_write((&EXT4_I(inode)->i_data_sem));
1693                         ext4_ext_drop_refs(path);
1694                         kfree(path);
1695                         if (ret)
1696                                 goto out;
1697                         goto next;
1698                 }
1699
1700                 if (start_pblk + cur - start != map.m_pblk) {
1701                         /*
1702                          * Logical to physical mapping changed. This can happen
1703                          * if this range was removed and then reallocated to
1704                          * map to new physical blocks during a fast commit.
1705                          */
1706                         ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1707                                         ext4_ext_is_unwritten(ex),
1708                                         start_pblk + cur - start);
1709                         if (ret)
1710                                 goto out;
1711                         /*
1712                          * Mark the old blocks as free since they aren't used
1713                          * anymore. We maintain an array of all the modified
1714                          * inodes. In case these blocks are still used at either
1715                          * a different logical range in the same inode or in
1716                          * some different inode, we will mark them as allocated
1717                          * at the end of the FC replay using our array of
1718                          * modified inodes.
1719                          */
1720                         ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1721                         goto next;
1722                 }
1723
1724                 /* Range is mapped and needs a state change */
                jbd_debug(1, "Converting from %d to %d %lld",
                          map.m_flags & EXT4_MAP_UNWRITTEN,
                          ext4_ext_is_unwritten(ex), map.m_pblk);
1728                 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1729                                         ext4_ext_is_unwritten(ex), map.m_pblk);
1730                 if (ret)
1731                         goto out;
1732                 /*
1733                  * We may have split the extent tree while toggling the state.
1734                  * Try to shrink the extent tree now.
1735                  */
1736                 ext4_ext_replay_shrink_inode(inode, start + len);
1737 next:
1738                 cur += map.m_len;
1739                 remaining -= map.m_len;
1740         }
1741         ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
1742                                         sb->s_blocksize_bits);
1743 out:
1744         iput(inode);
1745         return 0;
1746 }
1747
1748 /* Replay DEL_RANGE tag */
1749 static int
1750 ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
1751                          u8 *val)
1752 {
1753         struct inode *inode;
1754         struct ext4_fc_del_range lrange;
1755         struct ext4_map_blocks map;
1756         ext4_lblk_t cur, remaining;
1757         int ret;
1758
1759         memcpy(&lrange, val, sizeof(lrange));
1760         cur = le32_to_cpu(lrange.fc_lblk);
1761         remaining = le32_to_cpu(lrange.fc_len);
1762
1763         trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1764                 le32_to_cpu(lrange.fc_ino), cur, remaining);
1765
1766         inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
1767         if (IS_ERR(inode)) {
1768                 jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange.fc_ino));
1769                 return 0;
1770         }
1771
1772         ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1773         if (ret)
1774                 goto out;
1775
1776         jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
1777                         inode->i_ino, le32_to_cpu(lrange.fc_lblk),
1778                         le32_to_cpu(lrange.fc_len));
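        /*
         * Mark the blocks backing this range as free in the block bitmaps
         * before punching the range out of the extent tree below.
         */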
1779         while (remaining > 0) {
1780                 map.m_lblk = cur;
1781                 map.m_len = remaining;
1782
1783                 ret = ext4_map_blocks(NULL, inode, &map, 0);
1784                 if (ret < 0)
1785                         goto out;
1786                 if (ret > 0) {
1787                         remaining -= ret;
1788                         cur += ret;
1789                         ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1790                 } else {
1791                         remaining -= map.m_len;
1792                         cur += map.m_len;
1793                 }
1794         }
1795
1796         down_write(&EXT4_I(inode)->i_data_sem);
1797         ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk),
1798                                 le32_to_cpu(lrange.fc_lblk) +
1799                                 le32_to_cpu(lrange.fc_len) - 1);
1800         up_write(&EXT4_I(inode)->i_data_sem);
1801         if (ret)
1802                 goto out;
1803         ext4_ext_replay_shrink_inode(inode,
1804                 i_size_read(inode) >> sb->s_blocksize_bits);
1805         ext4_mark_inode_dirty(NULL, inode);
1806 out:
1807         iput(inode);
1808         return 0;
1809 }
1810
1811 static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
1812 {
1813         struct ext4_fc_replay_state *state;
1814         struct inode *inode;
1815         struct ext4_ext_path *path = NULL;
1816         struct ext4_map_blocks map;
1817         int i, ret, j;
1818         ext4_lblk_t cur, end;
1819
1820         state = &EXT4_SB(sb)->s_fc_replay_state;
1821         for (i = 0; i < state->fc_modified_inodes_used; i++) {
1822                 inode = ext4_iget(sb, state->fc_modified_inodes[i],
1823                         EXT4_IGET_NORMAL);
1824                 if (IS_ERR(inode)) {
1825                         jbd_debug(1, "Inode %d not found.",
1826                                 state->fc_modified_inodes[i]);
1827                         continue;
1828                 }
1829                 cur = 0;
1830                 end = EXT_MAX_BLOCKS;
1831                 if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) {
1832                         iput(inode);
1833                         continue;
1834                 }
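                /*
                 * Mark both the extent tree blocks and the mapped data
                 * blocks of this inode as in use in the block bitmaps.
                 */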
1835                 while (cur < end) {
1836                         map.m_lblk = cur;
1837                         map.m_len = end - cur;
1838
1839                         ret = ext4_map_blocks(NULL, inode, &map, 0);
1840                         if (ret < 0)
1841                                 break;
1842
1843                         if (ret > 0) {
1844                                 path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
1845                                 if (!IS_ERR(path)) {
1846                                         for (j = 0; j < path->p_depth; j++)
1847                                                 ext4_mb_mark_bb(inode->i_sb,
1848                                                         path[j].p_block, 1, 1);
1849                                         ext4_ext_drop_refs(path);
1850                                         kfree(path);
1851                                 }
1852                                 cur += ret;
1853                                 ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
1854                                                         map.m_len, 1);
1855                         } else {
1856                                 cur = cur + (map.m_len ? map.m_len : 1);
1857                         }
1858                 }
1859                 iput(inode);
1860         }
1861 }
1862
/*
 * Check if a block is in the excluded regions for block allocation. The simple
 * allocator that runs during the replay phase calls this function to see
 * if it is okay to use a block.
 */
1868 bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1869 {
1870         int i;
1871         struct ext4_fc_replay_state *state;
1872
1873         state = &EXT4_SB(sb)->s_fc_replay_state;
1874         for (i = 0; i < state->fc_regions_valid; i++) {
1875                 if (state->fc_regions[i].ino == 0 ||
1876                         state->fc_regions[i].len == 0)
1877                         continue;
1878                 if (blk >= state->fc_regions[i].pblk &&
1879                     blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
1880                         return true;
1881         }
1882         return false;
1883 }
1884
1885 /* Cleanup function called after replay */
1886 void ext4_fc_replay_cleanup(struct super_block *sb)
1887 {
1888         struct ext4_sb_info *sbi = EXT4_SB(sb);
1889
1890         sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1891         kfree(sbi->s_fc_replay_state.fc_regions);
1892         kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1893 }
1894
1895 /*
1896  * Recovery Scan phase handler
1897  *
1898  * This function is called during the scan phase and is responsible
 * for doing the following things:
1900  * - Make sure the fast commit area has valid tags for replay
1901  * - Count number of tags that need to be replayed by the replay handler
1902  * - Verify CRC
1903  * - Create a list of excluded blocks for allocation during replay phase
1904  *
1905  * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
1906  * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
1907  * to indicate that scan has finished and JBD2 can now start replay phase.
1908  * It returns a negative error to indicate that there was an error. At the end
1909  * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
 * to indicate the number of tags that need to be replayed during the replay phase.
1911  */
1912 static int ext4_fc_replay_scan(journal_t *journal,
1913                                 struct buffer_head *bh, int off,
1914                                 tid_t expected_tid)
1915 {
1916         struct super_block *sb = journal->j_private;
1917         struct ext4_sb_info *sbi = EXT4_SB(sb);
1918         struct ext4_fc_replay_state *state;
1919         int ret = JBD2_FC_REPLAY_CONTINUE;
1920         struct ext4_fc_add_range ext;
1921         struct ext4_fc_tl tl;
1922         struct ext4_fc_tail tail;
1923         __u8 *start, *end, *cur, *val;
1924         struct ext4_fc_head head;
1925         struct ext4_extent *ex;
1926
1927         state = &sbi->s_fc_replay_state;
1928
1929         start = (u8 *)bh->b_data;
1930         end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
1931
1932         if (state->fc_replay_expected_off == 0) {
1933                 state->fc_cur_tag = 0;
1934                 state->fc_replay_num_tags = 0;
1935                 state->fc_crc = 0;
1936                 state->fc_regions = NULL;
1937                 state->fc_regions_valid = state->fc_regions_used =
1938                         state->fc_regions_size = 0;
1939                 /* Check if we can stop early */
1940                 if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
1941                         != EXT4_FC_TAG_HEAD)
1942                         return 0;
1943         }
1944
1945         if (off != state->fc_replay_expected_off) {
1946                 ret = -EFSCORRUPTED;
1947                 goto out_err;
1948         }
1949
1950         state->fc_replay_expected_off++;
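        /*
         * Walk the block as a sequence of TLVs: read each tag and length,
         * then step past the value to the next TLV.
         */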
1951         for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
1952                 memcpy(&tl, cur, sizeof(tl));
1953                 val = cur + sizeof(tl);
1954                 jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
1955                           tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr);
1956                 switch (le16_to_cpu(tl.fc_tag)) {
1957                 case EXT4_FC_TAG_ADD_RANGE:
1958                         memcpy(&ext, val, sizeof(ext));
1959                         ex = (struct ext4_extent *)&ext.fc_ex;
1960                         ret = ext4_fc_record_regions(sb,
1961                                 le32_to_cpu(ext.fc_ino),
1962                                 le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
1963                                 ext4_ext_get_actual_len(ex), 0);
1964                         if (ret < 0)
1965                                 break;
1966                         ret = JBD2_FC_REPLAY_CONTINUE;
1967                         fallthrough;
1968                 case EXT4_FC_TAG_DEL_RANGE:
1969                 case EXT4_FC_TAG_LINK:
1970                 case EXT4_FC_TAG_UNLINK:
1971                 case EXT4_FC_TAG_CREAT:
1972                 case EXT4_FC_TAG_INODE:
1973                 case EXT4_FC_TAG_PAD:
1974                         state->fc_cur_tag++;
1975                         state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
1976                                         sizeof(tl) + le16_to_cpu(tl.fc_len));
1977                         break;
1978                 case EXT4_FC_TAG_TAIL:
1979                         state->fc_cur_tag++;
1980                         memcpy(&tail, val, sizeof(tail));
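                        /*
                         * The tail's CRC covers everything up to, but not
                         * including, its own fc_crc field.
                         */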
1981                         state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
1982                                                 sizeof(tl) +
1983                                                 offsetof(struct ext4_fc_tail,
1984                                                 fc_crc));
1985                         if (le32_to_cpu(tail.fc_tid) == expected_tid &&
1986                                 le32_to_cpu(tail.fc_crc) == state->fc_crc) {
1987                                 state->fc_replay_num_tags = state->fc_cur_tag;
1988                                 state->fc_regions_valid =
1989                                         state->fc_regions_used;
1990                         } else {
1991                                 ret = state->fc_replay_num_tags ?
1992                                         JBD2_FC_REPLAY_STOP : -EFSBADCRC;
1993                         }
1994                         state->fc_crc = 0;
1995                         break;
1996                 case EXT4_FC_TAG_HEAD:
1997                         memcpy(&head, val, sizeof(head));
1998                         if (le32_to_cpu(head.fc_features) &
1999                                 ~EXT4_FC_SUPPORTED_FEATURES) {
2000                                 ret = -EOPNOTSUPP;
2001                                 break;
2002                         }
2003                         if (le32_to_cpu(head.fc_tid) != expected_tid) {
2004                                 ret = JBD2_FC_REPLAY_STOP;
2005                                 break;
2006                         }
2007                         state->fc_cur_tag++;
2008                         state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2009                                             sizeof(tl) + le16_to_cpu(tl.fc_len));
2010                         break;
2011                 default:
2012                         ret = state->fc_replay_num_tags ?
2013                                 JBD2_FC_REPLAY_STOP : -ECANCELED;
2014                 }
2015                 if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
2016                         break;
2017         }
2018
2019 out_err:
2020         trace_ext4_fc_replay_scan(sb, ret, off);
2021         return ret;
2022 }
2023
2024 /*
2025  * Main recovery path entry point.
 * The meaning of the return codes is similar to the above.
2027  */
2028 static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
2029                                 enum passtype pass, int off, tid_t expected_tid)
2030 {
2031         struct super_block *sb = journal->j_private;
2032         struct ext4_sb_info *sbi = EXT4_SB(sb);
2033         struct ext4_fc_tl tl;
2034         __u8 *start, *end, *cur, *val;
2035         int ret = JBD2_FC_REPLAY_CONTINUE;
2036         struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
2037         struct ext4_fc_tail tail;
2038
2039         if (pass == PASS_SCAN) {
2040                 state->fc_current_pass = PASS_SCAN;
2041                 return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2042         }
2043
2044         if (state->fc_current_pass != pass) {
2045                 state->fc_current_pass = pass;
2046                 sbi->s_mount_state |= EXT4_FC_REPLAY;
2047         }
2048         if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
2049                 jbd_debug(1, "Replay stops\n");
2050                 ext4_fc_set_bitmaps_and_counters(sb);
2051                 return 0;
2052         }
2053
2054 #ifdef CONFIG_EXT4_DEBUG
2055         if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2056                 pr_warn("Dropping fc block %d because max_replay set\n", off);
2057                 return JBD2_FC_REPLAY_STOP;
2058         }
2059 #endif
2060
2061         start = (u8 *)bh->b_data;
2062         end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2063
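        /*
         * Replay each TLV in this block, stopping once all the tags counted
         * during the scan phase have been consumed.
         */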
2064         for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
2065                 memcpy(&tl, cur, sizeof(tl));
2066                 val = cur + sizeof(tl);
2067
2068                 if (state->fc_replay_num_tags == 0) {
2069                         ret = JBD2_FC_REPLAY_STOP;
2070                         ext4_fc_set_bitmaps_and_counters(sb);
2071                         break;
2072                 }
2073                 jbd_debug(3, "Replay phase, tag:%s\n",
2074                                 tag2str(le16_to_cpu(tl.fc_tag)));
2075                 state->fc_replay_num_tags--;
2076                 switch (le16_to_cpu(tl.fc_tag)) {
2077                 case EXT4_FC_TAG_LINK:
2078                         ret = ext4_fc_replay_link(sb, &tl, val);
2079                         break;
2080                 case EXT4_FC_TAG_UNLINK:
2081                         ret = ext4_fc_replay_unlink(sb, &tl, val);
2082                         break;
2083                 case EXT4_FC_TAG_ADD_RANGE:
2084                         ret = ext4_fc_replay_add_range(sb, &tl, val);
2085                         break;
2086                 case EXT4_FC_TAG_CREAT:
2087                         ret = ext4_fc_replay_create(sb, &tl, val);
2088                         break;
2089                 case EXT4_FC_TAG_DEL_RANGE:
2090                         ret = ext4_fc_replay_del_range(sb, &tl, val);
2091                         break;
2092                 case EXT4_FC_TAG_INODE:
2093                         ret = ext4_fc_replay_inode(sb, &tl, val);
2094                         break;
2095                 case EXT4_FC_TAG_PAD:
2096                         trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2097                                              le16_to_cpu(tl.fc_len), 0);
2098                         break;
2099                 case EXT4_FC_TAG_TAIL:
2100                         trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
2101                                              le16_to_cpu(tl.fc_len), 0);
2102                         memcpy(&tail, val, sizeof(tail));
2103                         WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
2104                         break;
2105                 case EXT4_FC_TAG_HEAD:
2106                         break;
2107                 default:
2108                         trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0,
2109                                              le16_to_cpu(tl.fc_len), 0);
2110                         ret = -ECANCELED;
2111                         break;
2112                 }
2113                 if (ret < 0)
2114                         break;
2115                 ret = JBD2_FC_REPLAY_CONTINUE;
2116         }
2117         return ret;
2118 }
2119
2120 void ext4_fc_init(struct super_block *sb, journal_t *journal)
2121 {
        /*
         * We set the replay callback even if fast commit is disabled because
         * we may still have fast commit blocks that need to be replayed even
         * if fast commit has now been turned off.
         */
2127         journal->j_fc_replay_callback = ext4_fc_replay;
2128         if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2129                 return;
2130         journal->j_fc_cleanup_callback = ext4_fc_cleanup;
2131 }
2132
2133 static const char *fc_ineligible_reasons[] = {
2134         "Extended attributes changed",
2135         "Cross rename",
2136         "Journal flag changed",
2137         "Insufficient memory",
2138         "Swap boot",
2139         "Resize",
2140         "Dir renamed",
2141         "Falloc range op",
2142         "Data journalling",
2143         "FC Commit Failed"
2144 };
2145
2146 int ext4_fc_info_show(struct seq_file *seq, void *v)
2147 {
2148         struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2149         struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2150         int i;
2151
2152         if (v != SEQ_START_TOKEN)
2153                 return 0;
2154
2155         seq_printf(seq,
2156                 "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2157                    stats->fc_num_commits, stats->fc_ineligible_commits,
2158                    stats->fc_numblks,
2159                    div_u64(stats->s_fc_avg_commit_time, 1000));
2160         seq_puts(seq, "Ineligible reasons:\n");
2161         for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2162                 seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2163                         stats->fc_ineligible_reason_count[i]);
2164
2165         return 0;
2166 }
2167
2168 int __init ext4_fc_init_dentry_cache(void)
2169 {
2170         ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2171                                            SLAB_RECLAIM_ACCOUNT);
2172
2173         if (ext4_fc_dentry_cachep == NULL)
2174                 return -ENOMEM;
2175
2176         return 0;
2177 }
2178
2179 void ext4_fc_destroy_dentry_cache(void)
2180 {
2181         kmem_cache_destroy(ext4_fc_dentry_cachep);
2182 }