Merge tag 'xfs-for-linus-4.8-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git...
[sfrench/cifs-2.6.git] / fs / xfs / libxfs / xfs_btree.c
1 /*
2  * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3  * All Rights Reserved.
4  *
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU General Public License as
7  * published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope that it would be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write the Free Software Foundation,
16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17  */
18 #include "xfs.h"
19 #include "xfs_fs.h"
20 #include "xfs_shared.h"
21 #include "xfs_format.h"
22 #include "xfs_log_format.h"
23 #include "xfs_trans_resv.h"
24 #include "xfs_bit.h"
25 #include "xfs_mount.h"
26 #include "xfs_inode.h"
27 #include "xfs_trans.h"
28 #include "xfs_inode_item.h"
29 #include "xfs_buf_item.h"
30 #include "xfs_btree.h"
31 #include "xfs_error.h"
32 #include "xfs_trace.h"
33 #include "xfs_cksum.h"
34 #include "xfs_alloc.h"
35 #include "xfs_log.h"
36
37 /*
38  * Cursor allocation zone.
39  */
40 kmem_zone_t     *xfs_btree_cur_zone;
41
42 /*
43  * Btree magic numbers.
44  */
45 static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
46         { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC,
47           XFS_FIBT_MAGIC },
48         { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC,
49           XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC }
50 };
51 #define xfs_btree_magic(cur) \
52         xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum]
53
54
55 STATIC int                              /* error (0 or EFSCORRUPTED) */
56 xfs_btree_check_lblock(
57         struct xfs_btree_cur    *cur,   /* btree cursor */
58         struct xfs_btree_block  *block, /* btree long form block pointer */
59         int                     level,  /* level of the btree block */
60         struct xfs_buf          *bp)    /* buffer for block, if any */
61 {
62         int                     lblock_ok = 1; /* block passes checks */
63         struct xfs_mount        *mp;    /* file system mount point */
64
65         mp = cur->bc_mp;
66
67         if (xfs_sb_version_hascrc(&mp->m_sb)) {
68                 lblock_ok = lblock_ok &&
69                         uuid_equal(&block->bb_u.l.bb_uuid,
70                                    &mp->m_sb.sb_meta_uuid) &&
71                         block->bb_u.l.bb_blkno == cpu_to_be64(
72                                 bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
73         }
74
75         lblock_ok = lblock_ok &&
76                 be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) &&
77                 be16_to_cpu(block->bb_level) == level &&
78                 be16_to_cpu(block->bb_numrecs) <=
79                         cur->bc_ops->get_maxrecs(cur, level) &&
80                 block->bb_u.l.bb_leftsib &&
81                 (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK) ||
82                  XFS_FSB_SANITY_CHECK(mp,
83                         be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
84                 block->bb_u.l.bb_rightsib &&
85                 (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK) ||
86                  XFS_FSB_SANITY_CHECK(mp,
87                         be64_to_cpu(block->bb_u.l.bb_rightsib)));
88
89         if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp,
90                         XFS_ERRTAG_BTREE_CHECK_LBLOCK,
91                         XFS_RANDOM_BTREE_CHECK_LBLOCK))) {
92                 if (bp)
93                         trace_xfs_btree_corrupt(bp, _RET_IP_);
94                 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
95                 return -EFSCORRUPTED;
96         }
97         return 0;
98 }
99
100 STATIC int                              /* error (0 or EFSCORRUPTED) */
101 xfs_btree_check_sblock(
102         struct xfs_btree_cur    *cur,   /* btree cursor */
103         struct xfs_btree_block  *block, /* btree short form block pointer */
104         int                     level,  /* level of the btree block */
105         struct xfs_buf          *bp)    /* buffer containing block */
106 {
107         struct xfs_mount        *mp;    /* file system mount point */
108         struct xfs_buf          *agbp;  /* buffer for ag. freespace struct */
109         struct xfs_agf          *agf;   /* ag. freespace structure */
110         xfs_agblock_t           agflen; /* native ag. freespace length */
111         int                     sblock_ok = 1; /* block passes checks */
112
113         mp = cur->bc_mp;
114         agbp = cur->bc_private.a.agbp;
115         agf = XFS_BUF_TO_AGF(agbp);
116         agflen = be32_to_cpu(agf->agf_length);
117
118         if (xfs_sb_version_hascrc(&mp->m_sb)) {
119                 sblock_ok = sblock_ok &&
120                         uuid_equal(&block->bb_u.s.bb_uuid,
121                                    &mp->m_sb.sb_meta_uuid) &&
122                         block->bb_u.s.bb_blkno == cpu_to_be64(
123                                 bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
124         }
125
126         sblock_ok = sblock_ok &&
127                 be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) &&
128                 be16_to_cpu(block->bb_level) == level &&
129                 be16_to_cpu(block->bb_numrecs) <=
130                         cur->bc_ops->get_maxrecs(cur, level) &&
131                 (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) ||
132                  be32_to_cpu(block->bb_u.s.bb_leftsib) < agflen) &&
133                 block->bb_u.s.bb_leftsib &&
134                 (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) ||
135                  be32_to_cpu(block->bb_u.s.bb_rightsib) < agflen) &&
136                 block->bb_u.s.bb_rightsib;
137
138         if (unlikely(XFS_TEST_ERROR(!sblock_ok, mp,
139                         XFS_ERRTAG_BTREE_CHECK_SBLOCK,
140                         XFS_RANDOM_BTREE_CHECK_SBLOCK))) {
141                 if (bp)
142                         trace_xfs_btree_corrupt(bp, _RET_IP_);
143                 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
144                 return -EFSCORRUPTED;
145         }
146         return 0;
147 }
148
149 /*
150  * Debug routine: check that block header is ok.
151  */
152 int
153 xfs_btree_check_block(
154         struct xfs_btree_cur    *cur,   /* btree cursor */
155         struct xfs_btree_block  *block, /* generic btree block pointer */
156         int                     level,  /* level of the btree block */
157         struct xfs_buf          *bp)    /* buffer containing block, if any */
158 {
159         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
160                 return xfs_btree_check_lblock(cur, block, level, bp);
161         else
162                 return xfs_btree_check_sblock(cur, block, level, bp);
163 }
164
165 /*
166  * Check that (long) pointer is ok.
167  */
168 int                                     /* error (0 or EFSCORRUPTED) */
169 xfs_btree_check_lptr(
170         struct xfs_btree_cur    *cur,   /* btree cursor */
171         xfs_fsblock_t           bno,    /* btree block disk address */
172         int                     level)  /* btree block level */
173 {
174         XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
175                 level > 0 &&
176                 bno != NULLFSBLOCK &&
177                 XFS_FSB_SANITY_CHECK(cur->bc_mp, bno));
178         return 0;
179 }
180
181 #ifdef DEBUG
182 /*
183  * Check that (short) pointer is ok.
184  */
185 STATIC int                              /* error (0 or EFSCORRUPTED) */
186 xfs_btree_check_sptr(
187         struct xfs_btree_cur    *cur,   /* btree cursor */
188         xfs_agblock_t           bno,    /* btree block disk address */
189         int                     level)  /* btree block level */
190 {
191         xfs_agblock_t           agblocks = cur->bc_mp->m_sb.sb_agblocks;
192
193         XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
194                 level > 0 &&
195                 bno != NULLAGBLOCK &&
196                 bno != 0 &&
197                 bno < agblocks);
198         return 0;
199 }
200
201 /*
202  * Check that block ptr is ok.
203  */
204 STATIC int                              /* error (0 or EFSCORRUPTED) */
205 xfs_btree_check_ptr(
206         struct xfs_btree_cur    *cur,   /* btree cursor */
207         union xfs_btree_ptr     *ptr,   /* btree block disk address */
208         int                     index,  /* offset from ptr to check */
209         int                     level)  /* btree block level */
210 {
211         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
212                 return xfs_btree_check_lptr(cur,
213                                 be64_to_cpu((&ptr->l)[index]), level);
214         } else {
215                 return xfs_btree_check_sptr(cur,
216                                 be32_to_cpu((&ptr->s)[index]), level);
217         }
218 }
219 #endif
220
221 /*
222  * Calculate CRC on the whole btree block and stuff it into the
223  * long-form btree header.
224  *
225  * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
226  * it into the buffer so recovery knows what the last modification was that made
227  * it to disk.
228  */
229 void
230 xfs_btree_lblock_calc_crc(
231         struct xfs_buf          *bp)
232 {
233         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
234         struct xfs_buf_log_item *bip = bp->b_fspriv;
235
236         if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
237                 return;
238         if (bip)
239                 block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
240         xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
241 }
242
243 bool
244 xfs_btree_lblock_verify_crc(
245         struct xfs_buf          *bp)
246 {
247         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
248         struct xfs_mount        *mp = bp->b_target->bt_mount;
249
250         if (xfs_sb_version_hascrc(&mp->m_sb)) {
251                 if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.l.bb_lsn)))
252                         return false;
253                 return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
254         }
255
256         return true;
257 }
258
259 /*
260  * Calculate CRC on the whole btree block and stuff it into the
261  * short-form btree header.
262  *
263  * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
264  * it into the buffer so recovery knows what the last modification was that made
265  * it to disk.
266  */
267 void
268 xfs_btree_sblock_calc_crc(
269         struct xfs_buf          *bp)
270 {
271         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
272         struct xfs_buf_log_item *bip = bp->b_fspriv;
273
274         if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
275                 return;
276         if (bip)
277                 block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
278         xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
279 }
280
281 bool
282 xfs_btree_sblock_verify_crc(
283         struct xfs_buf          *bp)
284 {
285         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
286         struct xfs_mount        *mp = bp->b_target->bt_mount;
287
288         if (xfs_sb_version_hascrc(&mp->m_sb)) {
289                 if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn)))
290                         return false;
291                 return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
292         }
293
294         return true;
295 }
296
297 static int
298 xfs_btree_free_block(
299         struct xfs_btree_cur    *cur,
300         struct xfs_buf          *bp)
301 {
302         int                     error;
303
304         error = cur->bc_ops->free_block(cur, bp);
305         if (!error) {
306                 xfs_trans_binval(cur->bc_tp, bp);
307                 XFS_BTREE_STATS_INC(cur, free);
308         }
309         return error;
310 }
311
312 /*
313  * Delete the btree cursor.
314  */
315 void
316 xfs_btree_del_cursor(
317         xfs_btree_cur_t *cur,           /* btree cursor */
318         int             error)          /* del because of error */
319 {
320         int             i;              /* btree level */
321
322         /*
323          * Clear the buffer pointers, and release the buffers.
324          * If we're doing this in the face of an error, we
325          * need to make sure to inspect all of the entries
326          * in the bc_bufs array for buffers to be unlocked.
327          * This is because some of the btree code works from
328          * level n down to 0, and if we get an error along
329          * the way we won't have initialized all the entries
330          * down to 0.
331          */
332         for (i = 0; i < cur->bc_nlevels; i++) {
333                 if (cur->bc_bufs[i])
334                         xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
335                 else if (!error)
336                         break;
337         }
338         /*
339          * Can't free a bmap cursor without having dealt with the
340          * allocated indirect blocks' accounting.
341          */
342         ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP ||
343                cur->bc_private.b.allocated == 0);
344         /*
345          * Free the cursor.
346          */
347         kmem_zone_free(xfs_btree_cur_zone, cur);
348 }
349
350 /*
351  * Duplicate the btree cursor.
352  * Allocate a new one, copy the record, re-get the buffers.
353  */
354 int                                     /* error */
355 xfs_btree_dup_cursor(
356         xfs_btree_cur_t *cur,           /* input cursor */
357         xfs_btree_cur_t **ncur)         /* output cursor */
358 {
359         xfs_buf_t       *bp;            /* btree block's buffer pointer */
360         int             error;          /* error return value */
361         int             i;              /* level number of btree block */
362         xfs_mount_t     *mp;            /* mount structure for filesystem */
363         xfs_btree_cur_t *new;           /* new cursor value */
364         xfs_trans_t     *tp;            /* transaction pointer, can be NULL */
365
366         tp = cur->bc_tp;
367         mp = cur->bc_mp;
368
369         /*
370          * Allocate a new cursor like the old one.
371          */
372         new = cur->bc_ops->dup_cursor(cur);
373
374         /*
375          * Copy the record currently in the cursor.
376          */
377         new->bc_rec = cur->bc_rec;
378
379         /*
380          * For each level current, re-get the buffer and copy the ptr value.
381          */
382         for (i = 0; i < new->bc_nlevels; i++) {
383                 new->bc_ptrs[i] = cur->bc_ptrs[i];
384                 new->bc_ra[i] = cur->bc_ra[i];
385                 bp = cur->bc_bufs[i];
386                 if (bp) {
387                         error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
388                                                    XFS_BUF_ADDR(bp), mp->m_bsize,
389                                                    0, &bp,
390                                                    cur->bc_ops->buf_ops);
391                         if (error) {
392                                 xfs_btree_del_cursor(new, error);
393                                 *ncur = NULL;
394                                 return error;
395                         }
396                 }
397                 new->bc_bufs[i] = bp;
398         }
399         *ncur = new;
400         return 0;
401 }
402
403 /*
404  * XFS btree block layout and addressing:
405  *
406  * There are two types of blocks in the btree: leaf and non-leaf blocks.
407  *
408  * The leaf record start with a header then followed by records containing
409  * the values.  A non-leaf block also starts with the same header, and
410  * then first contains lookup keys followed by an equal number of pointers
411  * to the btree blocks at the previous level.
412  *
413  *              +--------+-------+-------+-------+-------+-------+-------+
414  * Leaf:        | header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N |
415  *              +--------+-------+-------+-------+-------+-------+-------+
416  *
417  *              +--------+-------+-------+-------+-------+-------+-------+
418  * Non-Leaf:    | header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N |
419  *              +--------+-------+-------+-------+-------+-------+-------+
420  *
421  * The header is called struct xfs_btree_block for reasons better left unknown
422  * and comes in different versions for short (32bit) and long (64bit) block
423  * pointers.  The record and key structures are defined by the btree instances
424  * and opaque to the btree core.  The block pointers are simple disk endian
425  * integers, available in a short (32bit) and long (64bit) variant.
426  *
427  * The helpers below calculate the offset of a given record, key or pointer
428  * into a btree block (xfs_btree_*_offset) or return a pointer to the given
429  * record, key or pointer (xfs_btree_*_addr).  Note that all addressing
430  * inside the btree block is done using indices starting at one, not zero!
431  */
432
433 /*
434  * Return size of the btree block header for this btree instance.
435  */
436 static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
437 {
438         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
439                 if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
440                         return XFS_BTREE_LBLOCK_CRC_LEN;
441                 return XFS_BTREE_LBLOCK_LEN;
442         }
443         if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
444                 return XFS_BTREE_SBLOCK_CRC_LEN;
445         return XFS_BTREE_SBLOCK_LEN;
446 }
447
448 /*
449  * Return size of btree block pointers for this btree instance.
450  */
451 static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur)
452 {
453         return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
454                 sizeof(__be64) : sizeof(__be32);
455 }
456
457 /*
458  * Calculate offset of the n-th record in a btree block.
459  */
460 STATIC size_t
461 xfs_btree_rec_offset(
462         struct xfs_btree_cur    *cur,
463         int                     n)
464 {
465         return xfs_btree_block_len(cur) +
466                 (n - 1) * cur->bc_ops->rec_len;
467 }
468
469 /*
470  * Calculate offset of the n-th key in a btree block.
471  */
472 STATIC size_t
473 xfs_btree_key_offset(
474         struct xfs_btree_cur    *cur,
475         int                     n)
476 {
477         return xfs_btree_block_len(cur) +
478                 (n - 1) * cur->bc_ops->key_len;
479 }
480
481 /*
482  * Calculate offset of the n-th block pointer in a btree block.
483  */
484 STATIC size_t
485 xfs_btree_ptr_offset(
486         struct xfs_btree_cur    *cur,
487         int                     n,
488         int                     level)
489 {
490         return xfs_btree_block_len(cur) +
491                 cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len +
492                 (n - 1) * xfs_btree_ptr_len(cur);
493 }
494
495 /*
496  * Return a pointer to the n-th record in the btree block.
497  */
498 STATIC union xfs_btree_rec *
499 xfs_btree_rec_addr(
500         struct xfs_btree_cur    *cur,
501         int                     n,
502         struct xfs_btree_block  *block)
503 {
504         return (union xfs_btree_rec *)
505                 ((char *)block + xfs_btree_rec_offset(cur, n));
506 }
507
508 /*
509  * Return a pointer to the n-th key in the btree block.
510  */
511 STATIC union xfs_btree_key *
512 xfs_btree_key_addr(
513         struct xfs_btree_cur    *cur,
514         int                     n,
515         struct xfs_btree_block  *block)
516 {
517         return (union xfs_btree_key *)
518                 ((char *)block + xfs_btree_key_offset(cur, n));
519 }
520
521 /*
522  * Return a pointer to the n-th block pointer in the btree block.
523  */
524 STATIC union xfs_btree_ptr *
525 xfs_btree_ptr_addr(
526         struct xfs_btree_cur    *cur,
527         int                     n,
528         struct xfs_btree_block  *block)
529 {
530         int                     level = xfs_btree_get_level(block);
531
532         ASSERT(block->bb_level != 0);
533
534         return (union xfs_btree_ptr *)
535                 ((char *)block + xfs_btree_ptr_offset(cur, n, level));
536 }
537
538 /*
539  * Get the root block which is stored in the inode.
540  *
541  * For now this btree implementation assumes the btree root is always
542  * stored in the if_broot field of an inode fork.
543  */
544 STATIC struct xfs_btree_block *
545 xfs_btree_get_iroot(
546         struct xfs_btree_cur    *cur)
547 {
548         struct xfs_ifork        *ifp;
549
550         ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork);
551         return (struct xfs_btree_block *)ifp->if_broot;
552 }
553
554 /*
555  * Retrieve the block pointer from the cursor at the given level.
556  * This may be an inode btree root or from a buffer.
557  */
558 STATIC struct xfs_btree_block *         /* generic btree block pointer */
559 xfs_btree_get_block(
560         struct xfs_btree_cur    *cur,   /* btree cursor */
561         int                     level,  /* level in btree */
562         struct xfs_buf          **bpp)  /* buffer containing the block */
563 {
564         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
565             (level == cur->bc_nlevels - 1)) {
566                 *bpp = NULL;
567                 return xfs_btree_get_iroot(cur);
568         }
569
570         *bpp = cur->bc_bufs[level];
571         return XFS_BUF_TO_BLOCK(*bpp);
572 }
573
574 /*
575  * Get a buffer for the block, return it with no data read.
576  * Long-form addressing.
577  */
578 xfs_buf_t *                             /* buffer for fsbno */
579 xfs_btree_get_bufl(
580         xfs_mount_t     *mp,            /* file system mount point */
581         xfs_trans_t     *tp,            /* transaction pointer */
582         xfs_fsblock_t   fsbno,          /* file system block number */
583         uint            lock)           /* lock flags for get_buf */
584 {
585         xfs_daddr_t             d;              /* real disk block address */
586
587         ASSERT(fsbno != NULLFSBLOCK);
588         d = XFS_FSB_TO_DADDR(mp, fsbno);
589         return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
590 }
591
592 /*
593  * Get a buffer for the block, return it with no data read.
594  * Short-form addressing.
595  */
596 xfs_buf_t *                             /* buffer for agno/agbno */
597 xfs_btree_get_bufs(
598         xfs_mount_t     *mp,            /* file system mount point */
599         xfs_trans_t     *tp,            /* transaction pointer */
600         xfs_agnumber_t  agno,           /* allocation group number */
601         xfs_agblock_t   agbno,          /* allocation group block number */
602         uint            lock)           /* lock flags for get_buf */
603 {
604         xfs_daddr_t             d;              /* real disk block address */
605
606         ASSERT(agno != NULLAGNUMBER);
607         ASSERT(agbno != NULLAGBLOCK);
608         d = XFS_AGB_TO_DADDR(mp, agno, agbno);
609         return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
610 }
611
612 /*
613  * Check for the cursor referring to the last block at the given level.
614  */
615 int                                     /* 1=is last block, 0=not last block */
616 xfs_btree_islastblock(
617         xfs_btree_cur_t         *cur,   /* btree cursor */
618         int                     level)  /* level to check */
619 {
620         struct xfs_btree_block  *block; /* generic btree block pointer */
621         xfs_buf_t               *bp;    /* buffer containing block */
622
623         block = xfs_btree_get_block(cur, level, &bp);
624         xfs_btree_check_block(cur, block, level, bp);
625         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
626                 return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK);
627         else
628                 return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK);
629 }
630
631 /*
632  * Change the cursor to point to the first record at the given level.
633  * Other levels are unaffected.
634  */
635 STATIC int                              /* success=1, failure=0 */
636 xfs_btree_firstrec(
637         xfs_btree_cur_t         *cur,   /* btree cursor */
638         int                     level)  /* level to change */
639 {
640         struct xfs_btree_block  *block; /* generic btree block pointer */
641         xfs_buf_t               *bp;    /* buffer containing block */
642
643         /*
644          * Get the block pointer for this level.
645          */
646         block = xfs_btree_get_block(cur, level, &bp);
647         xfs_btree_check_block(cur, block, level, bp);
648         /*
649          * It's empty, there is no such record.
650          */
651         if (!block->bb_numrecs)
652                 return 0;
653         /*
654          * Set the ptr value to 1, that's the first record/key.
655          */
656         cur->bc_ptrs[level] = 1;
657         return 1;
658 }
659
660 /*
661  * Change the cursor to point to the last record in the current block
662  * at the given level.  Other levels are unaffected.
663  */
664 STATIC int                              /* success=1, failure=0 */
665 xfs_btree_lastrec(
666         xfs_btree_cur_t         *cur,   /* btree cursor */
667         int                     level)  /* level to change */
668 {
669         struct xfs_btree_block  *block; /* generic btree block pointer */
670         xfs_buf_t               *bp;    /* buffer containing block */
671
672         /*
673          * Get the block pointer for this level.
674          */
675         block = xfs_btree_get_block(cur, level, &bp);
676         xfs_btree_check_block(cur, block, level, bp);
677         /*
678          * It's empty, there is no such record.
679          */
680         if (!block->bb_numrecs)
681                 return 0;
682         /*
683          * Set the ptr value to numrecs, that's the last record/key.
684          */
685         cur->bc_ptrs[level] = be16_to_cpu(block->bb_numrecs);
686         return 1;
687 }
688
689 /*
690  * Compute first and last byte offsets for the fields given.
691  * Interprets the offsets table, which contains struct field offsets.
692  */
693 void
694 xfs_btree_offsets(
695         __int64_t       fields,         /* bitmask of fields */
696         const short     *offsets,       /* table of field offsets */
697         int             nbits,          /* number of bits to inspect */
698         int             *first,         /* output: first byte offset */
699         int             *last)          /* output: last byte offset */
700 {
701         int             i;              /* current bit number */
702         __int64_t       imask;          /* mask for current bit number */
703
704         ASSERT(fields != 0);
705         /*
706          * Find the lowest bit, so the first byte offset.
707          */
708         for (i = 0, imask = 1LL; ; i++, imask <<= 1) {
709                 if (imask & fields) {
710                         *first = offsets[i];
711                         break;
712                 }
713         }
714         /*
715          * Find the highest bit, so the last byte offset.
716          */
717         for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) {
718                 if (imask & fields) {
719                         *last = offsets[i + 1] - 1;
720                         break;
721                 }
722         }
723 }
724
725 /*
726  * Get a buffer for the block, return it read in.
727  * Long-form addressing.
728  */
729 int
730 xfs_btree_read_bufl(
731         struct xfs_mount        *mp,            /* file system mount point */
732         struct xfs_trans        *tp,            /* transaction pointer */
733         xfs_fsblock_t           fsbno,          /* file system block number */
734         uint                    lock,           /* lock flags for read_buf */
735         struct xfs_buf          **bpp,          /* buffer for fsbno */
736         int                     refval,         /* ref count value for buffer */
737         const struct xfs_buf_ops *ops)
738 {
739         struct xfs_buf          *bp;            /* return value */
740         xfs_daddr_t             d;              /* real disk block address */
741         int                     error;
742
743         ASSERT(fsbno != NULLFSBLOCK);
744         d = XFS_FSB_TO_DADDR(mp, fsbno);
745         error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
746                                    mp->m_bsize, lock, &bp, ops);
747         if (error)
748                 return error;
749         if (bp)
750                 xfs_buf_set_ref(bp, refval);
751         *bpp = bp;
752         return 0;
753 }
754
755 /*
756  * Read-ahead the block, don't wait for it, don't return a buffer.
757  * Long-form addressing.
758  */
759 /* ARGSUSED */
760 void
761 xfs_btree_reada_bufl(
762         struct xfs_mount        *mp,            /* file system mount point */
763         xfs_fsblock_t           fsbno,          /* file system block number */
764         xfs_extlen_t            count,          /* count of filesystem blocks */
765         const struct xfs_buf_ops *ops)
766 {
767         xfs_daddr_t             d;
768
769         ASSERT(fsbno != NULLFSBLOCK);
770         d = XFS_FSB_TO_DADDR(mp, fsbno);
771         xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
772 }
773
774 /*
775  * Read-ahead the block, don't wait for it, don't return a buffer.
776  * Short-form addressing.
777  */
778 /* ARGSUSED */
779 void
780 xfs_btree_reada_bufs(
781         struct xfs_mount        *mp,            /* file system mount point */
782         xfs_agnumber_t          agno,           /* allocation group number */
783         xfs_agblock_t           agbno,          /* allocation group block number */
784         xfs_extlen_t            count,          /* count of filesystem blocks */
785         const struct xfs_buf_ops *ops)
786 {
787         xfs_daddr_t             d;
788
789         ASSERT(agno != NULLAGNUMBER);
790         ASSERT(agbno != NULLAGBLOCK);
791         d = XFS_AGB_TO_DADDR(mp, agno, agbno);
792         xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
793 }
794
795 STATIC int
796 xfs_btree_readahead_lblock(
797         struct xfs_btree_cur    *cur,
798         int                     lr,
799         struct xfs_btree_block  *block)
800 {
801         int                     rval = 0;
802         xfs_fsblock_t           left = be64_to_cpu(block->bb_u.l.bb_leftsib);
803         xfs_fsblock_t           right = be64_to_cpu(block->bb_u.l.bb_rightsib);
804
805         if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
806                 xfs_btree_reada_bufl(cur->bc_mp, left, 1,
807                                      cur->bc_ops->buf_ops);
808                 rval++;
809         }
810
811         if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
812                 xfs_btree_reada_bufl(cur->bc_mp, right, 1,
813                                      cur->bc_ops->buf_ops);
814                 rval++;
815         }
816
817         return rval;
818 }
819
820 STATIC int
821 xfs_btree_readahead_sblock(
822         struct xfs_btree_cur    *cur,
823         int                     lr,
824         struct xfs_btree_block *block)
825 {
826         int                     rval = 0;
827         xfs_agblock_t           left = be32_to_cpu(block->bb_u.s.bb_leftsib);
828         xfs_agblock_t           right = be32_to_cpu(block->bb_u.s.bb_rightsib);
829
830
831         if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
832                 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
833                                      left, 1, cur->bc_ops->buf_ops);
834                 rval++;
835         }
836
837         if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
838                 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
839                                      right, 1, cur->bc_ops->buf_ops);
840                 rval++;
841         }
842
843         return rval;
844 }
845
846 /*
847  * Read-ahead btree blocks, at the given level.
848  * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
849  */
850 STATIC int
851 xfs_btree_readahead(
852         struct xfs_btree_cur    *cur,           /* btree cursor */
853         int                     lev,            /* level in btree */
854         int                     lr)             /* left/right bits */
855 {
856         struct xfs_btree_block  *block;
857
858         /*
859          * No readahead needed if we are at the root level and the
860          * btree root is stored in the inode.
861          */
862         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
863             (lev == cur->bc_nlevels - 1))
864                 return 0;
865
866         if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
867                 return 0;
868
869         cur->bc_ra[lev] |= lr;
870         block = XFS_BUF_TO_BLOCK(cur->bc_bufs[lev]);
871
872         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
873                 return xfs_btree_readahead_lblock(cur, lr, block);
874         return xfs_btree_readahead_sblock(cur, lr, block);
875 }
876
877 STATIC xfs_daddr_t
878 xfs_btree_ptr_to_daddr(
879         struct xfs_btree_cur    *cur,
880         union xfs_btree_ptr     *ptr)
881 {
882         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
883                 ASSERT(ptr->l != cpu_to_be64(NULLFSBLOCK));
884
885                 return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
886         } else {
887                 ASSERT(cur->bc_private.a.agno != NULLAGNUMBER);
888                 ASSERT(ptr->s != cpu_to_be32(NULLAGBLOCK));
889
890                 return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
891                                         be32_to_cpu(ptr->s));
892         }
893 }
894
895 /*
896  * Readahead @count btree blocks at the given @ptr location.
897  *
898  * We don't need to care about long or short form btrees here as we have a
899  * method of converting the ptr directly to a daddr available to us.
900  */
901 STATIC void
902 xfs_btree_readahead_ptr(
903         struct xfs_btree_cur    *cur,
904         union xfs_btree_ptr     *ptr,
905         xfs_extlen_t            count)
906 {
907         xfs_buf_readahead(cur->bc_mp->m_ddev_targp,
908                           xfs_btree_ptr_to_daddr(cur, ptr),
909                           cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops);
910 }
911
912 /*
913  * Set the buffer for level "lev" in the cursor to bp, releasing
914  * any previous buffer.
915  */
916 STATIC void
917 xfs_btree_setbuf(
918         xfs_btree_cur_t         *cur,   /* btree cursor */
919         int                     lev,    /* level in btree */
920         xfs_buf_t               *bp)    /* new buffer to set */
921 {
922         struct xfs_btree_block  *b;     /* btree block */
923
924         if (cur->bc_bufs[lev])
925                 xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]);
926         cur->bc_bufs[lev] = bp;
927         cur->bc_ra[lev] = 0;
928
929         b = XFS_BUF_TO_BLOCK(bp);
930         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
931                 if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK))
932                         cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
933                 if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK))
934                         cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
935         } else {
936                 if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK))
937                         cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
938                 if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
939                         cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
940         }
941 }
942
943 STATIC int
944 xfs_btree_ptr_is_null(
945         struct xfs_btree_cur    *cur,
946         union xfs_btree_ptr     *ptr)
947 {
948         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
949                 return ptr->l == cpu_to_be64(NULLFSBLOCK);
950         else
951                 return ptr->s == cpu_to_be32(NULLAGBLOCK);
952 }
953
954 STATIC void
955 xfs_btree_set_ptr_null(
956         struct xfs_btree_cur    *cur,
957         union xfs_btree_ptr     *ptr)
958 {
959         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
960                 ptr->l = cpu_to_be64(NULLFSBLOCK);
961         else
962                 ptr->s = cpu_to_be32(NULLAGBLOCK);
963 }
964
965 /*
966  * Get/set/init sibling pointers
967  */
968 STATIC void
969 xfs_btree_get_sibling(
970         struct xfs_btree_cur    *cur,
971         struct xfs_btree_block  *block,
972         union xfs_btree_ptr     *ptr,
973         int                     lr)
974 {
975         ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
976
977         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
978                 if (lr == XFS_BB_RIGHTSIB)
979                         ptr->l = block->bb_u.l.bb_rightsib;
980                 else
981                         ptr->l = block->bb_u.l.bb_leftsib;
982         } else {
983                 if (lr == XFS_BB_RIGHTSIB)
984                         ptr->s = block->bb_u.s.bb_rightsib;
985                 else
986                         ptr->s = block->bb_u.s.bb_leftsib;
987         }
988 }
989
990 STATIC void
991 xfs_btree_set_sibling(
992         struct xfs_btree_cur    *cur,
993         struct xfs_btree_block  *block,
994         union xfs_btree_ptr     *ptr,
995         int                     lr)
996 {
997         ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
998
999         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
1000                 if (lr == XFS_BB_RIGHTSIB)
1001                         block->bb_u.l.bb_rightsib = ptr->l;
1002                 else
1003                         block->bb_u.l.bb_leftsib = ptr->l;
1004         } else {
1005                 if (lr == XFS_BB_RIGHTSIB)
1006                         block->bb_u.s.bb_rightsib = ptr->s;
1007                 else
1008                         block->bb_u.s.bb_leftsib = ptr->s;
1009         }
1010 }
1011
1012 void
1013 xfs_btree_init_block_int(
1014         struct xfs_mount        *mp,
1015         struct xfs_btree_block  *buf,
1016         xfs_daddr_t             blkno,
1017         __u32                   magic,
1018         __u16                   level,
1019         __u16                   numrecs,
1020         __u64                   owner,
1021         unsigned int            flags)
1022 {
1023         buf->bb_magic = cpu_to_be32(magic);
1024         buf->bb_level = cpu_to_be16(level);
1025         buf->bb_numrecs = cpu_to_be16(numrecs);
1026
1027         if (flags & XFS_BTREE_LONG_PTRS) {
1028                 buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
1029                 buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
1030                 if (flags & XFS_BTREE_CRC_BLOCKS) {
1031                         buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
1032                         buf->bb_u.l.bb_owner = cpu_to_be64(owner);
1033                         uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid);
1034                         buf->bb_u.l.bb_pad = 0;
1035                         buf->bb_u.l.bb_lsn = 0;
1036                 }
1037         } else {
1038                 /* owner is a 32 bit value on short blocks */
1039                 __u32 __owner = (__u32)owner;
1040
1041                 buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
1042                 buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
1043                 if (flags & XFS_BTREE_CRC_BLOCKS) {
1044                         buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
1045                         buf->bb_u.s.bb_owner = cpu_to_be32(__owner);
1046                         uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid);
1047                         buf->bb_u.s.bb_lsn = 0;
1048                 }
1049         }
1050 }
1051
1052 void
1053 xfs_btree_init_block(
1054         struct xfs_mount *mp,
1055         struct xfs_buf  *bp,
1056         __u32           magic,
1057         __u16           level,
1058         __u16           numrecs,
1059         __u64           owner,
1060         unsigned int    flags)
1061 {
1062         xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
1063                                  magic, level, numrecs, owner, flags);
1064 }
1065
1066 STATIC void
1067 xfs_btree_init_block_cur(
1068         struct xfs_btree_cur    *cur,
1069         struct xfs_buf          *bp,
1070         int                     level,
1071         int                     numrecs)
1072 {
1073         __u64 owner;
1074
1075         /*
1076          * we can pull the owner from the cursor right now as the different
1077          * owners align directly with the pointer size of the btree. This may
1078          * change in future, but is safe for current users of the generic btree
1079          * code.
1080          */
1081         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1082                 owner = cur->bc_private.b.ip->i_ino;
1083         else
1084                 owner = cur->bc_private.a.agno;
1085
1086         xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
1087                                  xfs_btree_magic(cur), level, numrecs,
1088                                  owner, cur->bc_flags);
1089 }
1090
1091 /*
1092  * Return true if ptr is the last record in the btree and
1093  * we need to track updates to this record.  The decision
1094  * will be further refined in the update_lastrec method.
1095  */
1096 STATIC int
1097 xfs_btree_is_lastrec(
1098         struct xfs_btree_cur    *cur,
1099         struct xfs_btree_block  *block,
1100         int                     level)
1101 {
1102         union xfs_btree_ptr     ptr;
1103
1104         if (level > 0)
1105                 return 0;
1106         if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE))
1107                 return 0;
1108
1109         xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1110         if (!xfs_btree_ptr_is_null(cur, &ptr))
1111                 return 0;
1112         return 1;
1113 }
1114
1115 STATIC void
1116 xfs_btree_buf_to_ptr(
1117         struct xfs_btree_cur    *cur,
1118         struct xfs_buf          *bp,
1119         union xfs_btree_ptr     *ptr)
1120 {
1121         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1122                 ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
1123                                         XFS_BUF_ADDR(bp)));
1124         else {
1125                 ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
1126                                         XFS_BUF_ADDR(bp)));
1127         }
1128 }
1129
1130 STATIC void
1131 xfs_btree_set_refs(
1132         struct xfs_btree_cur    *cur,
1133         struct xfs_buf          *bp)
1134 {
1135         switch (cur->bc_btnum) {
1136         case XFS_BTNUM_BNO:
1137         case XFS_BTNUM_CNT:
1138                 xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF);
1139                 break;
1140         case XFS_BTNUM_INO:
1141         case XFS_BTNUM_FINO:
1142                 xfs_buf_set_ref(bp, XFS_INO_BTREE_REF);
1143                 break;
1144         case XFS_BTNUM_BMAP:
1145                 xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF);
1146                 break;
1147         default:
1148                 ASSERT(0);
1149         }
1150 }
1151
1152 STATIC int
1153 xfs_btree_get_buf_block(
1154         struct xfs_btree_cur    *cur,
1155         union xfs_btree_ptr     *ptr,
1156         int                     flags,
1157         struct xfs_btree_block  **block,
1158         struct xfs_buf          **bpp)
1159 {
1160         struct xfs_mount        *mp = cur->bc_mp;
1161         xfs_daddr_t             d;
1162
1163         /* need to sort out how callers deal with failures first */
1164         ASSERT(!(flags & XBF_TRYLOCK));
1165
1166         d = xfs_btree_ptr_to_daddr(cur, ptr);
1167         *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d,
1168                                  mp->m_bsize, flags);
1169
1170         if (!*bpp)
1171                 return -ENOMEM;
1172
1173         (*bpp)->b_ops = cur->bc_ops->buf_ops;
1174         *block = XFS_BUF_TO_BLOCK(*bpp);
1175         return 0;
1176 }
1177
1178 /*
1179  * Read in the buffer at the given ptr and return the buffer and
1180  * the block pointer within the buffer.
1181  */
1182 STATIC int
1183 xfs_btree_read_buf_block(
1184         struct xfs_btree_cur    *cur,
1185         union xfs_btree_ptr     *ptr,
1186         int                     flags,
1187         struct xfs_btree_block  **block,
1188         struct xfs_buf          **bpp)
1189 {
1190         struct xfs_mount        *mp = cur->bc_mp;
1191         xfs_daddr_t             d;
1192         int                     error;
1193
1194         /* need to sort out how callers deal with failures first */
1195         ASSERT(!(flags & XBF_TRYLOCK));
1196
1197         d = xfs_btree_ptr_to_daddr(cur, ptr);
1198         error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
1199                                    mp->m_bsize, flags, bpp,
1200                                    cur->bc_ops->buf_ops);
1201         if (error)
1202                 return error;
1203
1204         xfs_btree_set_refs(cur, *bpp);
1205         *block = XFS_BUF_TO_BLOCK(*bpp);
1206         return 0;
1207 }
1208
1209 /*
1210  * Copy keys from one btree block to another.
1211  */
1212 STATIC void
1213 xfs_btree_copy_keys(
1214         struct xfs_btree_cur    *cur,
1215         union xfs_btree_key     *dst_key,
1216         union xfs_btree_key     *src_key,
1217         int                     numkeys)
1218 {
1219         ASSERT(numkeys >= 0);
1220         memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len);
1221 }
1222
1223 /*
1224  * Copy records from one btree block to another.
1225  */
1226 STATIC void
1227 xfs_btree_copy_recs(
1228         struct xfs_btree_cur    *cur,
1229         union xfs_btree_rec     *dst_rec,
1230         union xfs_btree_rec     *src_rec,
1231         int                     numrecs)
1232 {
1233         ASSERT(numrecs >= 0);
1234         memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len);
1235 }
1236
1237 /*
1238  * Copy block pointers from one btree block to another.
1239  */
1240 STATIC void
1241 xfs_btree_copy_ptrs(
1242         struct xfs_btree_cur    *cur,
1243         union xfs_btree_ptr     *dst_ptr,
1244         union xfs_btree_ptr     *src_ptr,
1245         int                     numptrs)
1246 {
1247         ASSERT(numptrs >= 0);
1248         memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur));
1249 }
1250
1251 /*
1252  * Shift keys one index left/right inside a single btree block.
1253  */
1254 STATIC void
1255 xfs_btree_shift_keys(
1256         struct xfs_btree_cur    *cur,
1257         union xfs_btree_key     *key,
1258         int                     dir,
1259         int                     numkeys)
1260 {
1261         char                    *dst_key;
1262
1263         ASSERT(numkeys >= 0);
1264         ASSERT(dir == 1 || dir == -1);
1265
1266         dst_key = (char *)key + (dir * cur->bc_ops->key_len);
1267         memmove(dst_key, key, numkeys * cur->bc_ops->key_len);
1268 }
1269
1270 /*
1271  * Shift records one index left/right inside a single btree block.
1272  */
1273 STATIC void
1274 xfs_btree_shift_recs(
1275         struct xfs_btree_cur    *cur,
1276         union xfs_btree_rec     *rec,
1277         int                     dir,
1278         int                     numrecs)
1279 {
1280         char                    *dst_rec;
1281
1282         ASSERT(numrecs >= 0);
1283         ASSERT(dir == 1 || dir == -1);
1284
1285         dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len);
1286         memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len);
1287 }
1288
1289 /*
1290  * Shift block pointers one index left/right inside a single btree block.
1291  */
1292 STATIC void
1293 xfs_btree_shift_ptrs(
1294         struct xfs_btree_cur    *cur,
1295         union xfs_btree_ptr     *ptr,
1296         int                     dir,
1297         int                     numptrs)
1298 {
1299         char                    *dst_ptr;
1300
1301         ASSERT(numptrs >= 0);
1302         ASSERT(dir == 1 || dir == -1);
1303
1304         dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur));
1305         memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur));
1306 }
1307
1308 /*
1309  * Log key values from the btree block.
1310  */
1311 STATIC void
1312 xfs_btree_log_keys(
1313         struct xfs_btree_cur    *cur,
1314         struct xfs_buf          *bp,
1315         int                     first,
1316         int                     last)
1317 {
1318         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1319         XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
1320
1321         if (bp) {
1322                 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1323                 xfs_trans_log_buf(cur->bc_tp, bp,
1324                                   xfs_btree_key_offset(cur, first),
1325                                   xfs_btree_key_offset(cur, last + 1) - 1);
1326         } else {
1327                 xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1328                                 xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1329         }
1330
1331         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1332 }
1333
1334 /*
1335  * Log record values from the btree block.
1336  */
1337 void
1338 xfs_btree_log_recs(
1339         struct xfs_btree_cur    *cur,
1340         struct xfs_buf          *bp,
1341         int                     first,
1342         int                     last)
1343 {
1344         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1345         XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
1346
1347         xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1348         xfs_trans_log_buf(cur->bc_tp, bp,
1349                           xfs_btree_rec_offset(cur, first),
1350                           xfs_btree_rec_offset(cur, last + 1) - 1);
1351
1352         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1353 }
1354
1355 /*
1356  * Log block pointer fields from a btree block (nonleaf).
1357  */
1358 STATIC void
1359 xfs_btree_log_ptrs(
1360         struct xfs_btree_cur    *cur,   /* btree cursor */
1361         struct xfs_buf          *bp,    /* buffer containing btree block */
1362         int                     first,  /* index of first pointer to log */
1363         int                     last)   /* index of last pointer to log */
1364 {
1365         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1366         XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
1367
1368         if (bp) {
1369                 struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
1370                 int                     level = xfs_btree_get_level(block);
1371
1372                 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1373                 xfs_trans_log_buf(cur->bc_tp, bp,
1374                                 xfs_btree_ptr_offset(cur, first, level),
1375                                 xfs_btree_ptr_offset(cur, last + 1, level) - 1);
1376         } else {
1377                 xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1378                         xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1379         }
1380
1381         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1382 }
1383
1384 /*
1385  * Log fields from a btree block header.
1386  */
1387 void
1388 xfs_btree_log_block(
1389         struct xfs_btree_cur    *cur,   /* btree cursor */
1390         struct xfs_buf          *bp,    /* buffer containing btree block */
1391         int                     fields) /* mask of fields: XFS_BB_... */
1392 {
1393         int                     first;  /* first byte offset logged */
1394         int                     last;   /* last byte offset logged */
1395         static const short      soffsets[] = {  /* table of offsets (short) */
1396                 offsetof(struct xfs_btree_block, bb_magic),
1397                 offsetof(struct xfs_btree_block, bb_level),
1398                 offsetof(struct xfs_btree_block, bb_numrecs),
1399                 offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib),
1400                 offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib),
1401                 offsetof(struct xfs_btree_block, bb_u.s.bb_blkno),
1402                 offsetof(struct xfs_btree_block, bb_u.s.bb_lsn),
1403                 offsetof(struct xfs_btree_block, bb_u.s.bb_uuid),
1404                 offsetof(struct xfs_btree_block, bb_u.s.bb_owner),
1405                 offsetof(struct xfs_btree_block, bb_u.s.bb_crc),
1406                 XFS_BTREE_SBLOCK_CRC_LEN
1407         };
1408         static const short      loffsets[] = {  /* table of offsets (long) */
1409                 offsetof(struct xfs_btree_block, bb_magic),
1410                 offsetof(struct xfs_btree_block, bb_level),
1411                 offsetof(struct xfs_btree_block, bb_numrecs),
1412                 offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib),
1413                 offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib),
1414                 offsetof(struct xfs_btree_block, bb_u.l.bb_blkno),
1415                 offsetof(struct xfs_btree_block, bb_u.l.bb_lsn),
1416                 offsetof(struct xfs_btree_block, bb_u.l.bb_uuid),
1417                 offsetof(struct xfs_btree_block, bb_u.l.bb_owner),
1418                 offsetof(struct xfs_btree_block, bb_u.l.bb_crc),
1419                 offsetof(struct xfs_btree_block, bb_u.l.bb_pad),
1420                 XFS_BTREE_LBLOCK_CRC_LEN
1421         };
1422
1423         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1424         XFS_BTREE_TRACE_ARGBI(cur, bp, fields);
1425
1426         if (bp) {
1427                 int nbits;
1428
1429                 if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
1430                         /*
1431                          * We don't log the CRC when updating a btree
1432                          * block but instead recreate it during log
1433                          * recovery.  As the log buffers have checksums
1434                          * of their own this is safe and avoids logging a crc
1435                          * update in a lot of places.
1436                          */
1437                         if (fields == XFS_BB_ALL_BITS)
1438                                 fields = XFS_BB_ALL_BITS_CRC;
1439                         nbits = XFS_BB_NUM_BITS_CRC;
1440                 } else {
1441                         nbits = XFS_BB_NUM_BITS;
1442                 }
1443                 xfs_btree_offsets(fields,
1444                                   (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
1445                                         loffsets : soffsets,
1446                                   nbits, &first, &last);
1447                 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1448                 xfs_trans_log_buf(cur->bc_tp, bp, first, last);
1449         } else {
1450                 xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1451                         xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1452         }
1453
1454         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1455 }
1456
1457 /*
1458  * Increment cursor by one record at the level.
1459  * For nonzero levels the leaf-ward information is untouched.
1460  */
1461 int                                             /* error */
1462 xfs_btree_increment(
1463         struct xfs_btree_cur    *cur,
1464         int                     level,
1465         int                     *stat)          /* success/failure */
1466 {
1467         struct xfs_btree_block  *block;
1468         union xfs_btree_ptr     ptr;
1469         struct xfs_buf          *bp;
1470         int                     error;          /* error return value */
1471         int                     lev;
1472
1473         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1474         XFS_BTREE_TRACE_ARGI(cur, level);
1475
1476         ASSERT(level < cur->bc_nlevels);
1477
1478         /* Read-ahead to the right at this level. */
1479         xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
1480
1481         /* Get a pointer to the btree block. */
1482         block = xfs_btree_get_block(cur, level, &bp);
1483
1484 #ifdef DEBUG
1485         error = xfs_btree_check_block(cur, block, level, bp);
1486         if (error)
1487                 goto error0;
1488 #endif
1489
1490         /* We're done if we remain in the block after the increment. */
1491         if (++cur->bc_ptrs[level] <= xfs_btree_get_numrecs(block))
1492                 goto out1;
1493
1494         /* Fail if we just went off the right edge of the tree. */
1495         xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1496         if (xfs_btree_ptr_is_null(cur, &ptr))
1497                 goto out0;
1498
1499         XFS_BTREE_STATS_INC(cur, increment);
1500
1501         /*
1502          * March up the tree incrementing pointers.
1503          * Stop when we don't go off the right edge of a block.
1504          */
1505         for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1506                 block = xfs_btree_get_block(cur, lev, &bp);
1507
1508 #ifdef DEBUG
1509                 error = xfs_btree_check_block(cur, block, lev, bp);
1510                 if (error)
1511                         goto error0;
1512 #endif
1513
1514                 if (++cur->bc_ptrs[lev] <= xfs_btree_get_numrecs(block))
1515                         break;
1516
1517                 /* Read-ahead the right block for the next loop. */
1518                 xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
1519         }
1520
1521         /*
1522          * If we went off the root then we are either seriously
1523          * confused or have the tree root in an inode.
1524          */
1525         if (lev == cur->bc_nlevels) {
1526                 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1527                         goto out0;
1528                 ASSERT(0);
1529                 error = -EFSCORRUPTED;
1530                 goto error0;
1531         }
1532         ASSERT(lev < cur->bc_nlevels);
1533
1534         /*
1535          * Now walk back down the tree, fixing up the cursor's buffer
1536          * pointers and key numbers.
1537          */
1538         for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
1539                 union xfs_btree_ptr     *ptrp;
1540
1541                 ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
1542                 --lev;
1543                 error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
1544                 if (error)
1545                         goto error0;
1546
1547                 xfs_btree_setbuf(cur, lev, bp);
1548                 cur->bc_ptrs[lev] = 1;
1549         }
1550 out1:
1551         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1552         *stat = 1;
1553         return 0;
1554
1555 out0:
1556         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1557         *stat = 0;
1558         return 0;
1559
1560 error0:
1561         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1562         return error;
1563 }
1564
1565 /*
1566  * Decrement cursor by one record at the level.
1567  * For nonzero levels the leaf-ward information is untouched.
1568  */
1569 int                                             /* error */
1570 xfs_btree_decrement(
1571         struct xfs_btree_cur    *cur,
1572         int                     level,
1573         int                     *stat)          /* success/failure */
1574 {
1575         struct xfs_btree_block  *block;
1576         xfs_buf_t               *bp;
1577         int                     error;          /* error return value */
1578         int                     lev;
1579         union xfs_btree_ptr     ptr;
1580
1581         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1582         XFS_BTREE_TRACE_ARGI(cur, level);
1583
1584         ASSERT(level < cur->bc_nlevels);
1585
1586         /* Read-ahead to the left at this level. */
1587         xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
1588
1589         /* We're done if we remain in the block after the decrement. */
1590         if (--cur->bc_ptrs[level] > 0)
1591                 goto out1;
1592
1593         /* Get a pointer to the btree block. */
1594         block = xfs_btree_get_block(cur, level, &bp);
1595
1596 #ifdef DEBUG
1597         error = xfs_btree_check_block(cur, block, level, bp);
1598         if (error)
1599                 goto error0;
1600 #endif
1601
1602         /* Fail if we just went off the left edge of the tree. */
1603         xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
1604         if (xfs_btree_ptr_is_null(cur, &ptr))
1605                 goto out0;
1606
1607         XFS_BTREE_STATS_INC(cur, decrement);
1608
1609         /*
1610          * March up the tree decrementing pointers.
1611          * Stop when we don't go off the left edge of a block.
1612          */
1613         for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1614                 if (--cur->bc_ptrs[lev] > 0)
1615                         break;
1616                 /* Read-ahead the left block for the next loop. */
1617                 xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
1618         }
1619
1620         /*
1621          * If we went off the root then we are seriously confused.
1622          * or the root of the tree is in an inode.
1623          */
1624         if (lev == cur->bc_nlevels) {
1625                 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1626                         goto out0;
1627                 ASSERT(0);
1628                 error = -EFSCORRUPTED;
1629                 goto error0;
1630         }
1631         ASSERT(lev < cur->bc_nlevels);
1632
1633         /*
1634          * Now walk back down the tree, fixing up the cursor's buffer
1635          * pointers and key numbers.
1636          */
1637         for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
1638                 union xfs_btree_ptr     *ptrp;
1639
1640                 ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
1641                 --lev;
1642                 error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
1643                 if (error)
1644                         goto error0;
1645                 xfs_btree_setbuf(cur, lev, bp);
1646                 cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block);
1647         }
1648 out1:
1649         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1650         *stat = 1;
1651         return 0;
1652
1653 out0:
1654         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1655         *stat = 0;
1656         return 0;
1657
1658 error0:
1659         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1660         return error;
1661 }
1662
1663 STATIC int
1664 xfs_btree_lookup_get_block(
1665         struct xfs_btree_cur    *cur,   /* btree cursor */
1666         int                     level,  /* level in the btree */
1667         union xfs_btree_ptr     *pp,    /* ptr to btree block */
1668         struct xfs_btree_block  **blkp) /* return btree block */
1669 {
1670         struct xfs_buf          *bp;    /* buffer pointer for btree block */
1671         int                     error = 0;
1672
1673         /* special case the root block if in an inode */
1674         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
1675             (level == cur->bc_nlevels - 1)) {
1676                 *blkp = xfs_btree_get_iroot(cur);
1677                 return 0;
1678         }
1679
1680         /*
1681          * If the old buffer at this level for the disk address we are
1682          * looking for re-use it.
1683          *
1684          * Otherwise throw it away and get a new one.
1685          */
1686         bp = cur->bc_bufs[level];
1687         if (bp && XFS_BUF_ADDR(bp) == xfs_btree_ptr_to_daddr(cur, pp)) {
1688                 *blkp = XFS_BUF_TO_BLOCK(bp);
1689                 return 0;
1690         }
1691
1692         error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp);
1693         if (error)
1694                 return error;
1695
1696         xfs_btree_setbuf(cur, level, bp);
1697         return 0;
1698 }
1699
1700 /*
1701  * Get current search key.  For level 0 we don't actually have a key
1702  * structure so we make one up from the record.  For all other levels
1703  * we just return the right key.
1704  */
1705 STATIC union xfs_btree_key *
1706 xfs_lookup_get_search_key(
1707         struct xfs_btree_cur    *cur,
1708         int                     level,
1709         int                     keyno,
1710         struct xfs_btree_block  *block,
1711         union xfs_btree_key     *kp)
1712 {
1713         if (level == 0) {
1714                 cur->bc_ops->init_key_from_rec(kp,
1715                                 xfs_btree_rec_addr(cur, keyno, block));
1716                 return kp;
1717         }
1718
1719         return xfs_btree_key_addr(cur, keyno, block);
1720 }
1721
1722 /*
1723  * Lookup the record.  The cursor is made to point to it, based on dir.
1724  * stat is set to 0 if can't find any such record, 1 for success.
1725  */
1726 int                                     /* error */
1727 xfs_btree_lookup(
1728         struct xfs_btree_cur    *cur,   /* btree cursor */
1729         xfs_lookup_t            dir,    /* <=, ==, or >= */
1730         int                     *stat)  /* success/failure */
1731 {
1732         struct xfs_btree_block  *block; /* current btree block */
1733         __int64_t               diff;   /* difference for the current key */
1734         int                     error;  /* error return value */
1735         int                     keyno;  /* current key number */
1736         int                     level;  /* level in the btree */
1737         union xfs_btree_ptr     *pp;    /* ptr to btree block */
1738         union xfs_btree_ptr     ptr;    /* ptr to btree block */
1739
1740         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1741         XFS_BTREE_TRACE_ARGI(cur, dir);
1742
1743         XFS_BTREE_STATS_INC(cur, lookup);
1744
1745         block = NULL;
1746         keyno = 0;
1747
1748         /* initialise start pointer from cursor */
1749         cur->bc_ops->init_ptr_from_cur(cur, &ptr);
1750         pp = &ptr;
1751
1752         /*
1753          * Iterate over each level in the btree, starting at the root.
1754          * For each level above the leaves, find the key we need, based
1755          * on the lookup record, then follow the corresponding block
1756          * pointer down to the next level.
1757          */
1758         for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
1759                 /* Get the block we need to do the lookup on. */
1760                 error = xfs_btree_lookup_get_block(cur, level, pp, &block);
1761                 if (error)
1762                         goto error0;
1763
1764                 if (diff == 0) {
1765                         /*
1766                          * If we already had a key match at a higher level, we
1767                          * know we need to use the first entry in this block.
1768                          */
1769                         keyno = 1;
1770                 } else {
1771                         /* Otherwise search this block. Do a binary search. */
1772
1773                         int     high;   /* high entry number */
1774                         int     low;    /* low entry number */
1775
1776                         /* Set low and high entry numbers, 1-based. */
1777                         low = 1;
1778                         high = xfs_btree_get_numrecs(block);
1779                         if (!high) {
1780                                 /* Block is empty, must be an empty leaf. */
1781                                 ASSERT(level == 0 && cur->bc_nlevels == 1);
1782
1783                                 cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
1784                                 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1785                                 *stat = 0;
1786                                 return 0;
1787                         }
1788
1789                         /* Binary search the block. */
1790                         while (low <= high) {
1791                                 union xfs_btree_key     key;
1792                                 union xfs_btree_key     *kp;
1793
1794                                 XFS_BTREE_STATS_INC(cur, compare);
1795
1796                                 /* keyno is average of low and high. */
1797                                 keyno = (low + high) >> 1;
1798
1799                                 /* Get current search key */
1800                                 kp = xfs_lookup_get_search_key(cur, level,
1801                                                 keyno, block, &key);
1802
1803                                 /*
1804                                  * Compute difference to get next direction:
1805                                  *  - less than, move right
1806                                  *  - greater than, move left
1807                                  *  - equal, we're done
1808                                  */
1809                                 diff = cur->bc_ops->key_diff(cur, kp);
1810                                 if (diff < 0)
1811                                         low = keyno + 1;
1812                                 else if (diff > 0)
1813                                         high = keyno - 1;
1814                                 else
1815                                         break;
1816                         }
1817                 }
1818
1819                 /*
1820                  * If there are more levels, set up for the next level
1821                  * by getting the block number and filling in the cursor.
1822                  */
1823                 if (level > 0) {
1824                         /*
1825                          * If we moved left, need the previous key number,
1826                          * unless there isn't one.
1827                          */
1828                         if (diff > 0 && --keyno < 1)
1829                                 keyno = 1;
1830                         pp = xfs_btree_ptr_addr(cur, keyno, block);
1831
1832 #ifdef DEBUG
1833                         error = xfs_btree_check_ptr(cur, pp, 0, level);
1834                         if (error)
1835                                 goto error0;
1836 #endif
1837                         cur->bc_ptrs[level] = keyno;
1838                 }
1839         }
1840
1841         /* Done with the search. See if we need to adjust the results. */
1842         if (dir != XFS_LOOKUP_LE && diff < 0) {
1843                 keyno++;
1844                 /*
1845                  * If ge search and we went off the end of the block, but it's
1846                  * not the last block, we're in the wrong block.
1847                  */
1848                 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1849                 if (dir == XFS_LOOKUP_GE &&
1850                     keyno > xfs_btree_get_numrecs(block) &&
1851                     !xfs_btree_ptr_is_null(cur, &ptr)) {
1852                         int     i;
1853
1854                         cur->bc_ptrs[0] = keyno;
1855                         error = xfs_btree_increment(cur, 0, &i);
1856                         if (error)
1857                                 goto error0;
1858                         XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
1859                         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1860                         *stat = 1;
1861                         return 0;
1862                 }
1863         } else if (dir == XFS_LOOKUP_LE && diff > 0)
1864                 keyno--;
1865         cur->bc_ptrs[0] = keyno;
1866
1867         /* Return if we succeeded or not. */
1868         if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
1869                 *stat = 0;
1870         else if (dir != XFS_LOOKUP_EQ || diff == 0)
1871                 *stat = 1;
1872         else
1873                 *stat = 0;
1874         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1875         return 0;
1876
1877 error0:
1878         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1879         return error;
1880 }
1881
1882 /*
1883  * Update keys at all levels from here to the root along the cursor's path.
1884  */
1885 STATIC int
1886 xfs_btree_updkey(
1887         struct xfs_btree_cur    *cur,
1888         union xfs_btree_key     *keyp,
1889         int                     level)
1890 {
1891         struct xfs_btree_block  *block;
1892         struct xfs_buf          *bp;
1893         union xfs_btree_key     *kp;
1894         int                     ptr;
1895
1896         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1897         XFS_BTREE_TRACE_ARGIK(cur, level, keyp);
1898
1899         ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) || level >= 1);
1900
1901         /*
1902          * Go up the tree from this level toward the root.
1903          * At each level, update the key value to the value input.
1904          * Stop when we reach a level where the cursor isn't pointing
1905          * at the first entry in the block.
1906          */
1907         for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
1908 #ifdef DEBUG
1909                 int             error;
1910 #endif
1911                 block = xfs_btree_get_block(cur, level, &bp);
1912 #ifdef DEBUG
1913                 error = xfs_btree_check_block(cur, block, level, bp);
1914                 if (error) {
1915                         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1916                         return error;
1917                 }
1918 #endif
1919                 ptr = cur->bc_ptrs[level];
1920                 kp = xfs_btree_key_addr(cur, ptr, block);
1921                 xfs_btree_copy_keys(cur, kp, keyp, 1);
1922                 xfs_btree_log_keys(cur, bp, ptr, ptr);
1923         }
1924
1925         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1926         return 0;
1927 }
1928
1929 /*
1930  * Update the record referred to by cur to the value in the
1931  * given record. This either works (return 0) or gets an
1932  * EFSCORRUPTED error.
1933  */
1934 int
1935 xfs_btree_update(
1936         struct xfs_btree_cur    *cur,
1937         union xfs_btree_rec     *rec)
1938 {
1939         struct xfs_btree_block  *block;
1940         struct xfs_buf          *bp;
1941         int                     error;
1942         int                     ptr;
1943         union xfs_btree_rec     *rp;
1944
1945         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1946         XFS_BTREE_TRACE_ARGR(cur, rec);
1947
1948         /* Pick up the current block. */
1949         block = xfs_btree_get_block(cur, 0, &bp);
1950
1951 #ifdef DEBUG
1952         error = xfs_btree_check_block(cur, block, 0, bp);
1953         if (error)
1954                 goto error0;
1955 #endif
1956         /* Get the address of the rec to be updated. */
1957         ptr = cur->bc_ptrs[0];
1958         rp = xfs_btree_rec_addr(cur, ptr, block);
1959
1960         /* Fill in the new contents and log them. */
1961         xfs_btree_copy_recs(cur, rp, rec, 1);
1962         xfs_btree_log_recs(cur, bp, ptr, ptr);
1963
1964         /*
1965          * If we are tracking the last record in the tree and
1966          * we are at the far right edge of the tree, update it.
1967          */
1968         if (xfs_btree_is_lastrec(cur, block, 0)) {
1969                 cur->bc_ops->update_lastrec(cur, block, rec,
1970                                             ptr, LASTREC_UPDATE);
1971         }
1972
1973         /* Updating first rec in leaf. Pass new key value up to our parent. */
1974         if (ptr == 1) {
1975                 union xfs_btree_key     key;
1976
1977                 cur->bc_ops->init_key_from_rec(&key, rec);
1978                 error = xfs_btree_updkey(cur, &key, 1);
1979                 if (error)
1980                         goto error0;
1981         }
1982
1983         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1984         return 0;
1985
1986 error0:
1987         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1988         return error;
1989 }
1990
1991 /*
1992  * Move 1 record left from cur/level if possible.
1993  * Update cur to reflect the new path.
1994  */
1995 STATIC int                                      /* error */
1996 xfs_btree_lshift(
1997         struct xfs_btree_cur    *cur,
1998         int                     level,
1999         int                     *stat)          /* success/failure */
2000 {
2001         union xfs_btree_key     key;            /* btree key */
2002         struct xfs_buf          *lbp;           /* left buffer pointer */
2003         struct xfs_btree_block  *left;          /* left btree block */
2004         int                     lrecs;          /* left record count */
2005         struct xfs_buf          *rbp;           /* right buffer pointer */
2006         struct xfs_btree_block  *right;         /* right btree block */
2007         int                     rrecs;          /* right record count */
2008         union xfs_btree_ptr     lptr;           /* left btree pointer */
2009         union xfs_btree_key     *rkp = NULL;    /* right btree key */
2010         union xfs_btree_ptr     *rpp = NULL;    /* right address pointer */
2011         union xfs_btree_rec     *rrp = NULL;    /* right record pointer */
2012         int                     error;          /* error return value */
2013
2014         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2015         XFS_BTREE_TRACE_ARGI(cur, level);
2016
2017         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2018             level == cur->bc_nlevels - 1)
2019                 goto out0;
2020
2021         /* Set up variables for this block as "right". */
2022         right = xfs_btree_get_block(cur, level, &rbp);
2023
2024 #ifdef DEBUG
2025         error = xfs_btree_check_block(cur, right, level, rbp);
2026         if (error)
2027                 goto error0;
2028 #endif
2029
2030         /* If we've got no left sibling then we can't shift an entry left. */
2031         xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2032         if (xfs_btree_ptr_is_null(cur, &lptr))
2033                 goto out0;
2034
2035         /*
2036          * If the cursor entry is the one that would be moved, don't
2037          * do it... it's too complicated.
2038          */
2039         if (cur->bc_ptrs[level] <= 1)
2040                 goto out0;
2041
2042         /* Set up the left neighbor as "left". */
2043         error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
2044         if (error)
2045                 goto error0;
2046
2047         /* If it's full, it can't take another entry. */
2048         lrecs = xfs_btree_get_numrecs(left);
2049         if (lrecs == cur->bc_ops->get_maxrecs(cur, level))
2050                 goto out0;
2051
2052         rrecs = xfs_btree_get_numrecs(right);
2053
2054         /*
2055          * We add one entry to the left side and remove one for the right side.
2056          * Account for it here, the changes will be updated on disk and logged
2057          * later.
2058          */
2059         lrecs++;
2060         rrecs--;
2061
2062         XFS_BTREE_STATS_INC(cur, lshift);
2063         XFS_BTREE_STATS_ADD(cur, moves, 1);
2064
2065         /*
2066          * If non-leaf, copy a key and a ptr to the left block.
2067          * Log the changes to the left block.
2068          */
2069         if (level > 0) {
2070                 /* It's a non-leaf.  Move keys and pointers. */
2071                 union xfs_btree_key     *lkp;   /* left btree key */
2072                 union xfs_btree_ptr     *lpp;   /* left address pointer */
2073
2074                 lkp = xfs_btree_key_addr(cur, lrecs, left);
2075                 rkp = xfs_btree_key_addr(cur, 1, right);
2076
2077                 lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2078                 rpp = xfs_btree_ptr_addr(cur, 1, right);
2079 #ifdef DEBUG
2080                 error = xfs_btree_check_ptr(cur, rpp, 0, level);
2081                 if (error)
2082                         goto error0;
2083 #endif
2084                 xfs_btree_copy_keys(cur, lkp, rkp, 1);
2085                 xfs_btree_copy_ptrs(cur, lpp, rpp, 1);
2086
2087                 xfs_btree_log_keys(cur, lbp, lrecs, lrecs);
2088                 xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs);
2089
2090                 ASSERT(cur->bc_ops->keys_inorder(cur,
2091                         xfs_btree_key_addr(cur, lrecs - 1, left), lkp));
2092         } else {
2093                 /* It's a leaf.  Move records.  */
2094                 union xfs_btree_rec     *lrp;   /* left record pointer */
2095
2096                 lrp = xfs_btree_rec_addr(cur, lrecs, left);
2097                 rrp = xfs_btree_rec_addr(cur, 1, right);
2098
2099                 xfs_btree_copy_recs(cur, lrp, rrp, 1);
2100                 xfs_btree_log_recs(cur, lbp, lrecs, lrecs);
2101
2102                 ASSERT(cur->bc_ops->recs_inorder(cur,
2103                         xfs_btree_rec_addr(cur, lrecs - 1, left), lrp));
2104         }
2105
2106         xfs_btree_set_numrecs(left, lrecs);
2107         xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2108
2109         xfs_btree_set_numrecs(right, rrecs);
2110         xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2111
2112         /*
2113          * Slide the contents of right down one entry.
2114          */
2115         XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1);
2116         if (level > 0) {
2117                 /* It's a nonleaf. operate on keys and ptrs */
2118 #ifdef DEBUG
2119                 int                     i;              /* loop index */
2120
2121                 for (i = 0; i < rrecs; i++) {
2122                         error = xfs_btree_check_ptr(cur, rpp, i + 1, level);
2123                         if (error)
2124                                 goto error0;
2125                 }
2126 #endif
2127                 xfs_btree_shift_keys(cur,
2128                                 xfs_btree_key_addr(cur, 2, right),
2129                                 -1, rrecs);
2130                 xfs_btree_shift_ptrs(cur,
2131                                 xfs_btree_ptr_addr(cur, 2, right),
2132                                 -1, rrecs);
2133
2134                 xfs_btree_log_keys(cur, rbp, 1, rrecs);
2135                 xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2136         } else {
2137                 /* It's a leaf. operate on records */
2138                 xfs_btree_shift_recs(cur,
2139                         xfs_btree_rec_addr(cur, 2, right),
2140                         -1, rrecs);
2141                 xfs_btree_log_recs(cur, rbp, 1, rrecs);
2142
2143                 /*
2144                  * If it's the first record in the block, we'll need a key
2145                  * structure to pass up to the next level (updkey).
2146                  */
2147                 cur->bc_ops->init_key_from_rec(&key,
2148                         xfs_btree_rec_addr(cur, 1, right));
2149                 rkp = &key;
2150         }
2151
2152         /* Update the parent key values of right. */
2153         error = xfs_btree_updkey(cur, rkp, level + 1);
2154         if (error)
2155                 goto error0;
2156
2157         /* Slide the cursor value left one. */
2158         cur->bc_ptrs[level]--;
2159
2160         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2161         *stat = 1;
2162         return 0;
2163
2164 out0:
2165         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2166         *stat = 0;
2167         return 0;
2168
2169 error0:
2170         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2171         return error;
2172 }
2173
2174 /*
2175  * Move 1 record right from cur/level if possible.
2176  * Update cur to reflect the new path.
2177  */
2178 STATIC int                                      /* error */
2179 xfs_btree_rshift(
2180         struct xfs_btree_cur    *cur,
2181         int                     level,
2182         int                     *stat)          /* success/failure */
2183 {
2184         union xfs_btree_key     key;            /* btree key */
2185         struct xfs_buf          *lbp;           /* left buffer pointer */
2186         struct xfs_btree_block  *left;          /* left btree block */
2187         struct xfs_buf          *rbp;           /* right buffer pointer */
2188         struct xfs_btree_block  *right;         /* right btree block */
2189         struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
2190         union xfs_btree_ptr     rptr;           /* right block pointer */
2191         union xfs_btree_key     *rkp;           /* right btree key */
2192         int                     rrecs;          /* right record count */
2193         int                     lrecs;          /* left record count */
2194         int                     error;          /* error return value */
2195         int                     i;              /* loop counter */
2196
2197         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2198         XFS_BTREE_TRACE_ARGI(cur, level);
2199
2200         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2201             (level == cur->bc_nlevels - 1))
2202                 goto out0;
2203
2204         /* Set up variables for this block as "left". */
2205         left = xfs_btree_get_block(cur, level, &lbp);
2206
2207 #ifdef DEBUG
2208         error = xfs_btree_check_block(cur, left, level, lbp);
2209         if (error)
2210                 goto error0;
2211 #endif
2212
2213         /* If we've got no right sibling then we can't shift an entry right. */
2214         xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2215         if (xfs_btree_ptr_is_null(cur, &rptr))
2216                 goto out0;
2217
2218         /*
2219          * If the cursor entry is the one that would be moved, don't
2220          * do it... it's too complicated.
2221          */
2222         lrecs = xfs_btree_get_numrecs(left);
2223         if (cur->bc_ptrs[level] >= lrecs)
2224                 goto out0;
2225
2226         /* Set up the right neighbor as "right". */
2227         error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
2228         if (error)
2229                 goto error0;
2230
2231         /* If it's full, it can't take another entry. */
2232         rrecs = xfs_btree_get_numrecs(right);
2233         if (rrecs == cur->bc_ops->get_maxrecs(cur, level))
2234                 goto out0;
2235
2236         XFS_BTREE_STATS_INC(cur, rshift);
2237         XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2238
2239         /*
2240          * Make a hole at the start of the right neighbor block, then
2241          * copy the last left block entry to the hole.
2242          */
2243         if (level > 0) {
2244                 /* It's a nonleaf. make a hole in the keys and ptrs */
2245                 union xfs_btree_key     *lkp;
2246                 union xfs_btree_ptr     *lpp;
2247                 union xfs_btree_ptr     *rpp;
2248
2249                 lkp = xfs_btree_key_addr(cur, lrecs, left);
2250                 lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2251                 rkp = xfs_btree_key_addr(cur, 1, right);
2252                 rpp = xfs_btree_ptr_addr(cur, 1, right);
2253
2254 #ifdef DEBUG
2255                 for (i = rrecs - 1; i >= 0; i--) {
2256                         error = xfs_btree_check_ptr(cur, rpp, i, level);
2257                         if (error)
2258                                 goto error0;
2259                 }
2260 #endif
2261
2262                 xfs_btree_shift_keys(cur, rkp, 1, rrecs);
2263                 xfs_btree_shift_ptrs(cur, rpp, 1, rrecs);
2264
2265 #ifdef DEBUG
2266                 error = xfs_btree_check_ptr(cur, lpp, 0, level);
2267                 if (error)
2268                         goto error0;
2269 #endif
2270
2271                 /* Now put the new data in, and log it. */
2272                 xfs_btree_copy_keys(cur, rkp, lkp, 1);
2273                 xfs_btree_copy_ptrs(cur, rpp, lpp, 1);
2274
2275                 xfs_btree_log_keys(cur, rbp, 1, rrecs + 1);
2276                 xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1);
2277
2278                 ASSERT(cur->bc_ops->keys_inorder(cur, rkp,
2279                         xfs_btree_key_addr(cur, 2, right)));
2280         } else {
2281                 /* It's a leaf. make a hole in the records */
2282                 union xfs_btree_rec     *lrp;
2283                 union xfs_btree_rec     *rrp;
2284
2285                 lrp = xfs_btree_rec_addr(cur, lrecs, left);
2286                 rrp = xfs_btree_rec_addr(cur, 1, right);
2287
2288                 xfs_btree_shift_recs(cur, rrp, 1, rrecs);
2289
2290                 /* Now put the new data in, and log it. */
2291                 xfs_btree_copy_recs(cur, rrp, lrp, 1);
2292                 xfs_btree_log_recs(cur, rbp, 1, rrecs + 1);
2293
2294                 cur->bc_ops->init_key_from_rec(&key, rrp);
2295                 rkp = &key;
2296
2297                 ASSERT(cur->bc_ops->recs_inorder(cur, rrp,
2298                         xfs_btree_rec_addr(cur, 2, right)));
2299         }
2300
2301         /*
2302          * Decrement and log left's numrecs, bump and log right's numrecs.
2303          */
2304         xfs_btree_set_numrecs(left, --lrecs);
2305         xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2306
2307         xfs_btree_set_numrecs(right, ++rrecs);
2308         xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2309
2310         /*
2311          * Using a temporary cursor, update the parent key values of the
2312          * block on the right.
2313          */
2314         error = xfs_btree_dup_cursor(cur, &tcur);
2315         if (error)
2316                 goto error0;
2317         i = xfs_btree_lastrec(tcur, level);
2318         XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
2319
2320         error = xfs_btree_increment(tcur, level, &i);
2321         if (error)
2322                 goto error1;
2323
2324         error = xfs_btree_updkey(tcur, rkp, level + 1);
2325         if (error)
2326                 goto error1;
2327
2328         xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
2329
2330         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2331         *stat = 1;
2332         return 0;
2333
2334 out0:
2335         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2336         *stat = 0;
2337         return 0;
2338
2339 error0:
2340         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2341         return error;
2342
2343 error1:
2344         XFS_BTREE_TRACE_CURSOR(tcur, XBT_ERROR);
2345         xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
2346         return error;
2347 }
2348
2349 /*
2350  * Split cur/level block in half.
2351  * Return new block number and the key to its first
2352  * record (to be inserted into parent).
2353  */
2354 STATIC int                                      /* error */
2355 __xfs_btree_split(
2356         struct xfs_btree_cur    *cur,
2357         int                     level,
2358         union xfs_btree_ptr     *ptrp,
2359         union xfs_btree_key     *key,
2360         struct xfs_btree_cur    **curp,
2361         int                     *stat)          /* success/failure */
2362 {
2363         union xfs_btree_ptr     lptr;           /* left sibling block ptr */
2364         struct xfs_buf          *lbp;           /* left buffer pointer */
2365         struct xfs_btree_block  *left;          /* left btree block */
2366         union xfs_btree_ptr     rptr;           /* right sibling block ptr */
2367         struct xfs_buf          *rbp;           /* right buffer pointer */
2368         struct xfs_btree_block  *right;         /* right btree block */
2369         union xfs_btree_ptr     rrptr;          /* right-right sibling ptr */
2370         struct xfs_buf          *rrbp;          /* right-right buffer pointer */
2371         struct xfs_btree_block  *rrblock;       /* right-right btree block */
2372         int                     lrecs;
2373         int                     rrecs;
2374         int                     src_index;
2375         int                     error;          /* error return value */
2376 #ifdef DEBUG
2377         int                     i;
2378 #endif
2379
2380         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2381         XFS_BTREE_TRACE_ARGIPK(cur, level, *ptrp, key);
2382
2383         XFS_BTREE_STATS_INC(cur, split);
2384
2385         /* Set up left block (current one). */
2386         left = xfs_btree_get_block(cur, level, &lbp);
2387
2388 #ifdef DEBUG
2389         error = xfs_btree_check_block(cur, left, level, lbp);
2390         if (error)
2391                 goto error0;
2392 #endif
2393
2394         xfs_btree_buf_to_ptr(cur, lbp, &lptr);
2395
2396         /* Allocate the new block. If we can't do it, we're toast. Give up. */
2397         error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, stat);
2398         if (error)
2399                 goto error0;
2400         if (*stat == 0)
2401                 goto out0;
2402         XFS_BTREE_STATS_INC(cur, alloc);
2403
2404         /* Set up the new block as "right". */
2405         error = xfs_btree_get_buf_block(cur, &rptr, 0, &right, &rbp);
2406         if (error)
2407                 goto error0;
2408
2409         /* Fill in the btree header for the new right block. */
2410         xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0);
2411
2412         /*
2413          * Split the entries between the old and the new block evenly.
2414          * Make sure that if there's an odd number of entries now, that
2415          * each new block will have the same number of entries.
2416          */
2417         lrecs = xfs_btree_get_numrecs(left);
2418         rrecs = lrecs / 2;
2419         if ((lrecs & 1) && cur->bc_ptrs[level] <= rrecs + 1)
2420                 rrecs++;
2421         src_index = (lrecs - rrecs + 1);
2422
2423         XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2424
2425         /*
2426          * Copy btree block entries from the left block over to the
2427          * new block, the right. Update the right block and log the
2428          * changes.
2429          */
2430         if (level > 0) {
2431                 /* It's a non-leaf.  Move keys and pointers. */
2432                 union xfs_btree_key     *lkp;   /* left btree key */
2433                 union xfs_btree_ptr     *lpp;   /* left address pointer */
2434                 union xfs_btree_key     *rkp;   /* right btree key */
2435                 union xfs_btree_ptr     *rpp;   /* right address pointer */
2436
2437                 lkp = xfs_btree_key_addr(cur, src_index, left);
2438                 lpp = xfs_btree_ptr_addr(cur, src_index, left);
2439                 rkp = xfs_btree_key_addr(cur, 1, right);
2440                 rpp = xfs_btree_ptr_addr(cur, 1, right);
2441
2442 #ifdef DEBUG
2443                 for (i = src_index; i < rrecs; i++) {
2444                         error = xfs_btree_check_ptr(cur, lpp, i, level);
2445                         if (error)
2446                                 goto error0;
2447                 }
2448 #endif
2449
2450                 xfs_btree_copy_keys(cur, rkp, lkp, rrecs);
2451                 xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs);
2452
2453                 xfs_btree_log_keys(cur, rbp, 1, rrecs);
2454                 xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2455
2456                 /* Grab the keys to the entries moved to the right block */
2457                 xfs_btree_copy_keys(cur, key, rkp, 1);
2458         } else {
2459                 /* It's a leaf.  Move records.  */
2460                 union xfs_btree_rec     *lrp;   /* left record pointer */
2461                 union xfs_btree_rec     *rrp;   /* right record pointer */
2462
2463                 lrp = xfs_btree_rec_addr(cur, src_index, left);
2464                 rrp = xfs_btree_rec_addr(cur, 1, right);
2465
2466                 xfs_btree_copy_recs(cur, rrp, lrp, rrecs);
2467                 xfs_btree_log_recs(cur, rbp, 1, rrecs);
2468
2469                 cur->bc_ops->init_key_from_rec(key,
2470                         xfs_btree_rec_addr(cur, 1, right));
2471         }
2472
2473
2474         /*
2475          * Find the left block number by looking in the buffer.
2476          * Adjust numrecs, sibling pointers.
2477          */
2478         xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB);
2479         xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB);
2480         xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2481         xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2482
2483         lrecs -= rrecs;
2484         xfs_btree_set_numrecs(left, lrecs);
2485         xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs);
2486
2487         xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS);
2488         xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
2489
2490         /*
2491          * If there's a block to the new block's right, make that block
2492          * point back to right instead of to left.
2493          */
2494         if (!xfs_btree_ptr_is_null(cur, &rrptr)) {
2495                 error = xfs_btree_read_buf_block(cur, &rrptr,
2496                                                         0, &rrblock, &rrbp);
2497                 if (error)
2498                         goto error0;
2499                 xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB);
2500                 xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
2501         }
2502         /*
2503          * If the cursor is really in the right block, move it there.
2504          * If it's just pointing past the last entry in left, then we'll
2505          * insert there, so don't change anything in that case.
2506          */
2507         if (cur->bc_ptrs[level] > lrecs + 1) {
2508                 xfs_btree_setbuf(cur, level, rbp);
2509                 cur->bc_ptrs[level] -= lrecs;
2510         }
2511         /*
2512          * If there are more levels, we'll need another cursor which refers
2513          * the right block, no matter where this cursor was.
2514          */
2515         if (level + 1 < cur->bc_nlevels) {
2516                 error = xfs_btree_dup_cursor(cur, curp);
2517                 if (error)
2518                         goto error0;
2519                 (*curp)->bc_ptrs[level + 1]++;
2520         }
2521         *ptrp = rptr;
2522         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2523         *stat = 1;
2524         return 0;
2525 out0:
2526         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2527         *stat = 0;
2528         return 0;
2529
2530 error0:
2531         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2532         return error;
2533 }
2534
2535 struct xfs_btree_split_args {
2536         struct xfs_btree_cur    *cur;
2537         int                     level;
2538         union xfs_btree_ptr     *ptrp;
2539         union xfs_btree_key     *key;
2540         struct xfs_btree_cur    **curp;
2541         int                     *stat;          /* success/failure */
2542         int                     result;
2543         bool                    kswapd; /* allocation in kswapd context */
2544         struct completion       *done;
2545         struct work_struct      work;
2546 };
2547
2548 /*
2549  * Stack switching interfaces for allocation
2550  */
2551 static void
2552 xfs_btree_split_worker(
2553         struct work_struct      *work)
2554 {
2555         struct xfs_btree_split_args     *args = container_of(work,
2556                                                 struct xfs_btree_split_args, work);
2557         unsigned long           pflags;
2558         unsigned long           new_pflags = PF_FSTRANS;
2559
2560         /*
2561          * we are in a transaction context here, but may also be doing work
2562          * in kswapd context, and hence we may need to inherit that state
2563          * temporarily to ensure that we don't block waiting for memory reclaim
2564          * in any way.
2565          */
2566         if (args->kswapd)
2567                 new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
2568
2569         current_set_flags_nested(&pflags, new_pflags);
2570
2571         args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
2572                                          args->key, args->curp, args->stat);
2573         complete(args->done);
2574
2575         current_restore_flags_nested(&pflags, new_pflags);
2576 }
2577
2578 /*
2579  * BMBT split requests often come in with little stack to work on. Push
2580  * them off to a worker thread so there is lots of stack to use. For the other
2581  * btree types, just call directly to avoid the context switch overhead here.
2582  */
2583 STATIC int                                      /* error */
2584 xfs_btree_split(
2585         struct xfs_btree_cur    *cur,
2586         int                     level,
2587         union xfs_btree_ptr     *ptrp,
2588         union xfs_btree_key     *key,
2589         struct xfs_btree_cur    **curp,
2590         int                     *stat)          /* success/failure */
2591 {
2592         struct xfs_btree_split_args     args;
2593         DECLARE_COMPLETION_ONSTACK(done);
2594
2595         if (cur->bc_btnum != XFS_BTNUM_BMAP)
2596                 return __xfs_btree_split(cur, level, ptrp, key, curp, stat);
2597
2598         args.cur = cur;
2599         args.level = level;
2600         args.ptrp = ptrp;
2601         args.key = key;
2602         args.curp = curp;
2603         args.stat = stat;
2604         args.done = &done;
2605         args.kswapd = current_is_kswapd();
2606         INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker);
2607         queue_work(xfs_alloc_wq, &args.work);
2608         wait_for_completion(&done);
2609         destroy_work_on_stack(&args.work);
2610         return args.result;
2611 }
2612
2613
2614 /*
2615  * Copy the old inode root contents into a real block and make the
2616  * broot point to it.
2617  */
2618 int                                             /* error */
2619 xfs_btree_new_iroot(
2620         struct xfs_btree_cur    *cur,           /* btree cursor */
2621         int                     *logflags,      /* logging flags for inode */
2622         int                     *stat)          /* return status - 0 fail */
2623 {
2624         struct xfs_buf          *cbp;           /* buffer for cblock */
2625         struct xfs_btree_block  *block;         /* btree block */
2626         struct xfs_btree_block  *cblock;        /* child btree block */
2627         union xfs_btree_key     *ckp;           /* child key pointer */
2628         union xfs_btree_ptr     *cpp;           /* child ptr pointer */
2629         union xfs_btree_key     *kp;            /* pointer to btree key */
2630         union xfs_btree_ptr     *pp;            /* pointer to block addr */
2631         union xfs_btree_ptr     nptr;           /* new block addr */
2632         int                     level;          /* btree level */
2633         int                     error;          /* error return code */
2634 #ifdef DEBUG
2635         int                     i;              /* loop counter */
2636 #endif
2637
2638         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2639         XFS_BTREE_STATS_INC(cur, newroot);
2640
2641         ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
2642
2643         level = cur->bc_nlevels - 1;
2644
2645         block = xfs_btree_get_iroot(cur);
2646         pp = xfs_btree_ptr_addr(cur, 1, block);
2647
2648         /* Allocate the new block. If we can't do it, we're toast. Give up. */
2649         error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat);
2650         if (error)
2651                 goto error0;
2652         if (*stat == 0) {
2653                 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2654                 return 0;
2655         }
2656         XFS_BTREE_STATS_INC(cur, alloc);
2657
2658         /* Copy the root into a real block. */
2659         error = xfs_btree_get_buf_block(cur, &nptr, 0, &cblock, &cbp);
2660         if (error)
2661                 goto error0;
2662
2663         /*
2664          * we can't just memcpy() the root in for CRC enabled btree blocks.
2665          * In that case have to also ensure the blkno remains correct
2666          */
2667         memcpy(cblock, block, xfs_btree_block_len(cur));
2668         if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
2669                 if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
2670                         cblock->bb_u.l.bb_blkno = cpu_to_be64(cbp->b_bn);
2671                 else
2672                         cblock->bb_u.s.bb_blkno = cpu_to_be64(cbp->b_bn);
2673         }
2674
2675         be16_add_cpu(&block->bb_level, 1);
2676         xfs_btree_set_numrecs(block, 1);
2677         cur->bc_nlevels++;
2678         cur->bc_ptrs[level + 1] = 1;
2679
2680         kp = xfs_btree_key_addr(cur, 1, block);
2681         ckp = xfs_btree_key_addr(cur, 1, cblock);
2682         xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock));
2683
2684         cpp = xfs_btree_ptr_addr(cur, 1, cblock);
2685 #ifdef DEBUG
2686         for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
2687                 error = xfs_btree_check_ptr(cur, pp, i, level);
2688                 if (error)
2689                         goto error0;
2690         }
2691 #endif
2692         xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock));
2693
2694 #ifdef DEBUG
2695         error = xfs_btree_check_ptr(cur, &nptr, 0, level);
2696         if (error)
2697                 goto error0;
2698 #endif
2699         xfs_btree_copy_ptrs(cur, pp, &nptr, 1);
2700
2701         xfs_iroot_realloc(cur->bc_private.b.ip,
2702                           1 - xfs_btree_get_numrecs(cblock),
2703                           cur->bc_private.b.whichfork);
2704
2705         xfs_btree_setbuf(cur, level, cbp);
2706
2707         /*
2708          * Do all this logging at the end so that
2709          * the root is at the right level.
2710          */
2711         xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
2712         xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
2713         xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
2714
2715         *logflags |=
2716                 XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork);
2717         *stat = 1;
2718         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2719         return 0;
2720 error0:
2721         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2722         return error;
2723 }
2724
2725 /*
2726  * Allocate a new root block, fill it in.
2727  */
2728 STATIC int                              /* error */
2729 xfs_btree_new_root(
2730         struct xfs_btree_cur    *cur,   /* btree cursor */
2731         int                     *stat)  /* success/failure */
2732 {
2733         struct xfs_btree_block  *block; /* one half of the old root block */
2734         struct xfs_buf          *bp;    /* buffer containing block */
2735         int                     error;  /* error return value */
2736         struct xfs_buf          *lbp;   /* left buffer pointer */
2737         struct xfs_btree_block  *left;  /* left btree block */
2738         struct xfs_buf          *nbp;   /* new (root) buffer */
2739         struct xfs_btree_block  *new;   /* new (root) btree block */
2740         int                     nptr;   /* new value for key index, 1 or 2 */
2741         struct xfs_buf          *rbp;   /* right buffer pointer */
2742         struct xfs_btree_block  *right; /* right btree block */
2743         union xfs_btree_ptr     rptr;
2744         union xfs_btree_ptr     lptr;
2745
2746         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2747         XFS_BTREE_STATS_INC(cur, newroot);
2748
2749         /* initialise our start point from the cursor */
2750         cur->bc_ops->init_ptr_from_cur(cur, &rptr);
2751
2752         /* Allocate the new block. If we can't do it, we're toast. Give up. */
2753         error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, stat);
2754         if (error)
2755                 goto error0;
2756         if (*stat == 0)
2757                 goto out0;
2758         XFS_BTREE_STATS_INC(cur, alloc);
2759
2760         /* Set up the new block. */
2761         error = xfs_btree_get_buf_block(cur, &lptr, 0, &new, &nbp);
2762         if (error)
2763                 goto error0;
2764
2765         /* Set the root in the holding structure  increasing the level by 1. */
2766         cur->bc_ops->set_root(cur, &lptr, 1);
2767
2768         /*
2769          * At the previous root level there are now two blocks: the old root,
2770          * and the new block generated when it was split.  We don't know which
2771          * one the cursor is pointing at, so we set up variables "left" and
2772          * "right" for each case.
2773          */
2774         block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp);
2775
2776 #ifdef DEBUG
2777         error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp);
2778         if (error)
2779                 goto error0;
2780 #endif
2781
2782         xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
2783         if (!xfs_btree_ptr_is_null(cur, &rptr)) {
2784                 /* Our block is left, pick up the right block. */
2785                 lbp = bp;
2786                 xfs_btree_buf_to_ptr(cur, lbp, &lptr);
2787                 left = block;
2788                 error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
2789                 if (error)
2790                         goto error0;
2791                 bp = rbp;
2792                 nptr = 1;
2793         } else {
2794                 /* Our block is right, pick up the left block. */
2795                 rbp = bp;
2796                 xfs_btree_buf_to_ptr(cur, rbp, &rptr);
2797                 right = block;
2798                 xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2799                 error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
2800                 if (error)
2801                         goto error0;
2802                 bp = lbp;
2803                 nptr = 2;
2804         }
2805         /* Fill in the new block's btree header and log it. */
2806         xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2);
2807         xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
2808         ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
2809                         !xfs_btree_ptr_is_null(cur, &rptr));
2810
2811         /* Fill in the key data in the new root. */
2812         if (xfs_btree_get_level(left) > 0) {
2813                 xfs_btree_copy_keys(cur,
2814                                 xfs_btree_key_addr(cur, 1, new),
2815                                 xfs_btree_key_addr(cur, 1, left), 1);
2816                 xfs_btree_copy_keys(cur,
2817                                 xfs_btree_key_addr(cur, 2, new),
2818                                 xfs_btree_key_addr(cur, 1, right), 1);
2819         } else {
2820                 cur->bc_ops->init_key_from_rec(
2821                                 xfs_btree_key_addr(cur, 1, new),
2822                                 xfs_btree_rec_addr(cur, 1, left));
2823                 cur->bc_ops->init_key_from_rec(
2824                                 xfs_btree_key_addr(cur, 2, new),
2825                                 xfs_btree_rec_addr(cur, 1, right));
2826         }
2827         xfs_btree_log_keys(cur, nbp, 1, 2);
2828
2829         /* Fill in the pointer data in the new root. */
2830         xfs_btree_copy_ptrs(cur,
2831                 xfs_btree_ptr_addr(cur, 1, new), &lptr, 1);
2832         xfs_btree_copy_ptrs(cur,
2833                 xfs_btree_ptr_addr(cur, 2, new), &rptr, 1);
2834         xfs_btree_log_ptrs(cur, nbp, 1, 2);
2835
2836         /* Fix up the cursor. */
2837         xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
2838         cur->bc_ptrs[cur->bc_nlevels] = nptr;
2839         cur->bc_nlevels++;
2840         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2841         *stat = 1;
2842         return 0;
2843 error0:
2844         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2845         return error;
2846 out0:
2847         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2848         *stat = 0;
2849         return 0;
2850 }
2851
2852 STATIC int
2853 xfs_btree_make_block_unfull(
2854         struct xfs_btree_cur    *cur,   /* btree cursor */
2855         int                     level,  /* btree level */
2856         int                     numrecs,/* # of recs in block */
2857         int                     *oindex,/* old tree index */
2858         int                     *index, /* new tree index */
2859         union xfs_btree_ptr     *nptr,  /* new btree ptr */
2860         struct xfs_btree_cur    **ncur, /* new btree cursor */
2861         union xfs_btree_rec     *nrec,  /* new record */
2862         int                     *stat)
2863 {
2864         union xfs_btree_key     key;    /* new btree key value */
2865         int                     error = 0;
2866
2867         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2868             level == cur->bc_nlevels - 1) {
2869                 struct xfs_inode *ip = cur->bc_private.b.ip;
2870
2871                 if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
2872                         /* A root block that can be made bigger. */
2873                         xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork);
2874                 } else {
2875                         /* A root block that needs replacing */
2876                         int     logflags = 0;
2877
2878                         error = xfs_btree_new_iroot(cur, &logflags, stat);
2879                         if (error || *stat == 0)
2880                                 return error;
2881
2882                         xfs_trans_log_inode(cur->bc_tp, ip, logflags);
2883                 }
2884
2885                 return 0;
2886         }
2887
2888         /* First, try shifting an entry to the right neighbor. */
2889         error = xfs_btree_rshift(cur, level, stat);
2890         if (error || *stat)
2891                 return error;
2892
2893         /* Next, try shifting an entry to the left neighbor. */
2894         error = xfs_btree_lshift(cur, level, stat);
2895         if (error)
2896                 return error;
2897
2898         if (*stat) {
2899                 *oindex = *index = cur->bc_ptrs[level];
2900                 return 0;
2901         }
2902
2903         /*
2904          * Next, try splitting the current block in half.
2905          *
2906          * If this works we have to re-set our variables because we
2907          * could be in a different block now.
2908          */
2909         error = xfs_btree_split(cur, level, nptr, &key, ncur, stat);
2910         if (error || *stat == 0)
2911                 return error;
2912
2913
2914         *index = cur->bc_ptrs[level];
2915         cur->bc_ops->init_rec_from_key(&key, nrec);
2916         return 0;
2917 }
2918
2919 /*
2920  * Insert one record/level.  Return information to the caller
2921  * allowing the next level up to proceed if necessary.
2922  */
2923 STATIC int
2924 xfs_btree_insrec(
2925         struct xfs_btree_cur    *cur,   /* btree cursor */
2926         int                     level,  /* level to insert record at */
2927         union xfs_btree_ptr     *ptrp,  /* i/o: block number inserted */
2928         union xfs_btree_rec     *recp,  /* i/o: record data inserted */
2929         struct xfs_btree_cur    **curp, /* output: new cursor replacing cur */
2930         int                     *stat)  /* success/failure */
2931 {
2932         struct xfs_btree_block  *block; /* btree block */
2933         struct xfs_buf          *bp;    /* buffer for block */
2934         union xfs_btree_key     key;    /* btree key */
2935         union xfs_btree_ptr     nptr;   /* new block ptr */
2936         struct xfs_btree_cur    *ncur;  /* new btree cursor */
2937         union xfs_btree_rec     nrec;   /* new record count */
2938         int                     optr;   /* old key/record index */
2939         int                     ptr;    /* key/record index */
2940         int                     numrecs;/* number of records */
2941         int                     error;  /* error return value */
2942 #ifdef DEBUG
2943         int                     i;
2944 #endif
2945
2946         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2947         XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, recp);
2948
2949         ncur = NULL;
2950
2951         /*
2952          * If we have an external root pointer, and we've made it to the
2953          * root level, allocate a new root block and we're done.
2954          */
2955         if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2956             (level >= cur->bc_nlevels)) {
2957                 error = xfs_btree_new_root(cur, stat);
2958                 xfs_btree_set_ptr_null(cur, ptrp);
2959
2960                 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2961                 return error;
2962         }
2963
2964         /* If we're off the left edge, return failure. */
2965         ptr = cur->bc_ptrs[level];
2966         if (ptr == 0) {
2967                 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2968                 *stat = 0;
2969                 return 0;
2970         }
2971
2972         /* Make a key out of the record data to be inserted, and save it. */
2973         cur->bc_ops->init_key_from_rec(&key, recp);
2974
2975         optr = ptr;
2976
2977         XFS_BTREE_STATS_INC(cur, insrec);
2978
2979         /* Get pointers to the btree buffer and block. */
2980         block = xfs_btree_get_block(cur, level, &bp);
2981         numrecs = xfs_btree_get_numrecs(block);
2982
2983 #ifdef DEBUG
2984         error = xfs_btree_check_block(cur, block, level, bp);
2985         if (error)
2986                 goto error0;
2987
2988         /* Check that the new entry is being inserted in the right place. */
2989         if (ptr <= numrecs) {
2990                 if (level == 0) {
2991                         ASSERT(cur->bc_ops->recs_inorder(cur, recp,
2992                                 xfs_btree_rec_addr(cur, ptr, block)));
2993                 } else {
2994                         ASSERT(cur->bc_ops->keys_inorder(cur, &key,
2995                                 xfs_btree_key_addr(cur, ptr, block)));
2996                 }
2997         }
2998 #endif
2999
3000         /*
3001          * If the block is full, we can't insert the new entry until we
3002          * make the block un-full.
3003          */
3004         xfs_btree_set_ptr_null(cur, &nptr);
3005         if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) {
3006                 error = xfs_btree_make_block_unfull(cur, level, numrecs,
3007                                         &optr, &ptr, &nptr, &ncur, &nrec, stat);
3008                 if (error || *stat == 0)
3009                         goto error0;
3010         }
3011
3012         /*
3013          * The current block may have changed if the block was
3014          * previously full and we have just made space in it.
3015          */
3016         block = xfs_btree_get_block(cur, level, &bp);
3017         numrecs = xfs_btree_get_numrecs(block);
3018
3019 #ifdef DEBUG
3020         error = xfs_btree_check_block(cur, block, level, bp);
3021         if (error)
3022                 return error;
3023 #endif
3024
3025         /*
3026          * At this point we know there's room for our new entry in the block
3027          * we're pointing at.
3028          */
3029         XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1);
3030
3031         if (level > 0) {
3032                 /* It's a nonleaf. make a hole in the keys and ptrs */
3033                 union xfs_btree_key     *kp;
3034                 union xfs_btree_ptr     *pp;
3035
3036                 kp = xfs_btree_key_addr(cur, ptr, block);
3037                 pp = xfs_btree_ptr_addr(cur, ptr, block);
3038
3039 #ifdef DEBUG
3040                 for (i = numrecs - ptr; i >= 0; i--) {
3041                         error = xfs_btree_check_ptr(cur, pp, i, level);
3042                         if (error)
3043                                 return error;
3044                 }
3045 #endif
3046
3047                 xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1);
3048                 xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1);
3049
3050 #ifdef DEBUG
3051                 error = xfs_btree_check_ptr(cur, ptrp, 0, level);
3052                 if (error)
3053                         goto error0;
3054 #endif
3055
3056                 /* Now put the new data in, bump numrecs and log it. */
3057                 xfs_btree_copy_keys(cur, kp, &key, 1);
3058                 xfs_btree_copy_ptrs(cur, pp, ptrp, 1);
3059                 numrecs++;
3060                 xfs_btree_set_numrecs(block, numrecs);
3061                 xfs_btree_log_ptrs(cur, bp, ptr, numrecs);
3062                 xfs_btree_log_keys(cur, bp, ptr, numrecs);
3063 #ifdef DEBUG
3064                 if (ptr < numrecs) {
3065                         ASSERT(cur->bc_ops->keys_inorder(cur, kp,
3066                                 xfs_btree_key_addr(cur, ptr + 1, block)));
3067                 }
3068 #endif
3069         } else {
3070                 /* It's a leaf. make a hole in the records */
3071                 union xfs_btree_rec             *rp;
3072
3073                 rp = xfs_btree_rec_addr(cur, ptr, block);
3074
3075                 xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1);
3076
3077                 /* Now put the new data in, bump numrecs and log it. */
3078                 xfs_btree_copy_recs(cur, rp, recp, 1);
3079                 xfs_btree_set_numrecs(block, ++numrecs);
3080                 xfs_btree_log_recs(cur, bp, ptr, numrecs);
3081 #ifdef DEBUG
3082                 if (ptr < numrecs) {
3083                         ASSERT(cur->bc_ops->recs_inorder(cur, rp,
3084                                 xfs_btree_rec_addr(cur, ptr + 1, block)));
3085                 }
3086 #endif
3087         }
3088
3089         /* Log the new number of records in the btree header. */
3090         xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
3091
3092         /* If we inserted at the start of a block, update the parents' keys. */
3093         if (optr == 1) {
3094                 error = xfs_btree_updkey(cur, &key, level + 1);
3095                 if (error)
3096                         goto error0;
3097         }
3098
3099         /*
3100          * If we are tracking the last record in the tree and
3101          * we are at the far right edge of the tree, update it.
3102          */
3103         if (xfs_btree_is_lastrec(cur, block, level)) {
3104                 cur->bc_ops->update_lastrec(cur, block, recp,
3105                                             ptr, LASTREC_INSREC);
3106         }
3107
3108         /*
3109          * Return the new block number, if any.
3110          * If there is one, give back a record value and a cursor too.
3111          */
3112         *ptrp = nptr;
3113         if (!xfs_btree_ptr_is_null(cur, &nptr)) {
3114                 *recp = nrec;
3115                 *curp = ncur;
3116         }
3117
3118         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3119         *stat = 1;
3120         return 0;
3121
3122 error0:
3123         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3124         return error;
3125 }
3126
3127 /*
3128  * Insert the record at the point referenced by cur.
3129  *
3130  * A multi-level split of the tree on insert will invalidate the original
3131  * cursor.  All callers of this function should assume that the cursor is
3132  * no longer valid and revalidate it.
3133  */
3134 int
3135 xfs_btree_insert(
3136         struct xfs_btree_cur    *cur,
3137         int                     *stat)
3138 {
3139         int                     error;  /* error return value */
3140         int                     i;      /* result value, 0 for failure */
3141         int                     level;  /* current level number in btree */
3142         union xfs_btree_ptr     nptr;   /* new block number (split result) */
3143         struct xfs_btree_cur    *ncur;  /* new cursor (split result) */
3144         struct xfs_btree_cur    *pcur;  /* previous level's cursor */
3145         union xfs_btree_rec     rec;    /* record to insert */
3146
3147         level = 0;
3148         ncur = NULL;
3149         pcur = cur;
3150
3151         xfs_btree_set_ptr_null(cur, &nptr);
3152         cur->bc_ops->init_rec_from_cur(cur, &rec);
3153
3154         /*
3155          * Loop going up the tree, starting at the leaf level.
3156          * Stop when we don't get a split block, that must mean that
3157          * the insert is finished with this level.
3158          */
3159         do {
3160                 /*
3161                  * Insert nrec/nptr into this level of the tree.
3162                  * Note if we fail, nptr will be null.
3163                  */
3164                 error = xfs_btree_insrec(pcur, level, &nptr, &rec, &ncur, &i);
3165                 if (error) {
3166                         if (pcur != cur)
3167                                 xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
3168                         goto error0;
3169                 }
3170
3171                 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3172                 level++;
3173
3174                 /*
3175                  * See if the cursor we just used is trash.
3176                  * Can't trash the caller's cursor, but otherwise we should
3177                  * if ncur is a new cursor or we're about to be done.
3178                  */
3179                 if (pcur != cur &&
3180                     (ncur || xfs_btree_ptr_is_null(cur, &nptr))) {
3181                         /* Save the state from the cursor before we trash it */
3182                         if (cur->bc_ops->update_cursor)
3183                                 cur->bc_ops->update_cursor(pcur, cur);
3184                         cur->bc_nlevels = pcur->bc_nlevels;
3185                         xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
3186                 }
3187                 /* If we got a new cursor, switch to it. */
3188                 if (ncur) {
3189                         pcur = ncur;
3190                         ncur = NULL;
3191                 }
3192         } while (!xfs_btree_ptr_is_null(cur, &nptr));
3193
3194         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3195         *stat = i;
3196         return 0;
3197 error0:
3198         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3199         return error;
3200 }
3201
3202 /*
3203  * Try to merge a non-leaf block back into the inode root.
3204  *
3205  * Note: the killroot names comes from the fact that we're effectively
3206  * killing the old root block.  But because we can't just delete the
3207  * inode we have to copy the single block it was pointing to into the
3208  * inode.
3209  */
3210 STATIC int
3211 xfs_btree_kill_iroot(
3212         struct xfs_btree_cur    *cur)
3213 {
3214         int                     whichfork = cur->bc_private.b.whichfork;
3215         struct xfs_inode        *ip = cur->bc_private.b.ip;
3216         struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, whichfork);
3217         struct xfs_btree_block  *block;
3218         struct xfs_btree_block  *cblock;
3219         union xfs_btree_key     *kp;
3220         union xfs_btree_key     *ckp;
3221         union xfs_btree_ptr     *pp;
3222         union xfs_btree_ptr     *cpp;
3223         struct xfs_buf          *cbp;
3224         int                     level;
3225         int                     index;
3226         int                     numrecs;
3227         int                     error;
3228 #ifdef DEBUG
3229         union xfs_btree_ptr     ptr;
3230         int                     i;
3231 #endif
3232
3233         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3234
3235         ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
3236         ASSERT(cur->bc_nlevels > 1);
3237
3238         /*
3239          * Don't deal with the root block needs to be a leaf case.
3240          * We're just going to turn the thing back into extents anyway.
3241          */
3242         level = cur->bc_nlevels - 1;
3243         if (level == 1)
3244                 goto out0;
3245
3246         /*
3247          * Give up if the root has multiple children.
3248          */
3249         block = xfs_btree_get_iroot(cur);
3250         if (xfs_btree_get_numrecs(block) != 1)
3251                 goto out0;
3252
3253         cblock = xfs_btree_get_block(cur, level - 1, &cbp);
3254         numrecs = xfs_btree_get_numrecs(cblock);
3255
3256         /*
3257          * Only do this if the next level will fit.
3258          * Then the data must be copied up to the inode,
3259          * instead of freeing the root you free the next level.
3260          */
3261         if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level))
3262                 goto out0;
3263
3264         XFS_BTREE_STATS_INC(cur, killroot);
3265
3266 #ifdef DEBUG
3267         xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
3268         ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3269         xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
3270         ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3271 #endif
3272
3273         index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
3274         if (index) {
3275                 xfs_iroot_realloc(cur->bc_private.b.ip, index,
3276                                   cur->bc_private.b.whichfork);
3277                 block = ifp->if_broot;
3278         }
3279
3280         be16_add_cpu(&block->bb_numrecs, index);
3281         ASSERT(block->bb_numrecs == cblock->bb_numrecs);
3282
3283         kp = xfs_btree_key_addr(cur, 1, block);
3284         ckp = xfs_btree_key_addr(cur, 1, cblock);
3285         xfs_btree_copy_keys(cur, kp, ckp, numrecs);
3286
3287         pp = xfs_btree_ptr_addr(cur, 1, block);
3288         cpp = xfs_btree_ptr_addr(cur, 1, cblock);
3289 #ifdef DEBUG
3290         for (i = 0; i < numrecs; i++) {
3291                 error = xfs_btree_check_ptr(cur, cpp, i, level - 1);
3292                 if (error) {
3293                         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3294                         return error;
3295                 }
3296         }
3297 #endif
3298         xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
3299
3300         error = xfs_btree_free_block(cur, cbp);
3301         if (error) {
3302                 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3303                 return error;
3304         }
3305
3306         cur->bc_bufs[level - 1] = NULL;
3307         be16_add_cpu(&block->bb_level, -1);
3308         xfs_trans_log_inode(cur->bc_tp, ip,
3309                 XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork));
3310         cur->bc_nlevels--;
3311 out0:
3312         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3313         return 0;
3314 }
3315
3316 /*
3317  * Kill the current root node, and replace it with it's only child node.
3318  */
3319 STATIC int
3320 xfs_btree_kill_root(
3321         struct xfs_btree_cur    *cur,
3322         struct xfs_buf          *bp,
3323         int                     level,
3324         union xfs_btree_ptr     *newroot)
3325 {
3326         int                     error;
3327
3328         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3329         XFS_BTREE_STATS_INC(cur, killroot);
3330
3331         /*
3332          * Update the root pointer, decreasing the level by 1 and then
3333          * free the old root.
3334          */
3335         cur->bc_ops->set_root(cur, newroot, -1);
3336
3337         error = xfs_btree_free_block(cur, bp);
3338         if (error) {
3339                 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3340                 return error;
3341         }
3342
3343         cur->bc_bufs[level] = NULL;
3344         cur->bc_ra[level] = 0;
3345         cur->bc_nlevels--;
3346
3347         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3348         return 0;
3349 }
3350
3351 STATIC int
3352 xfs_btree_dec_cursor(
3353         struct xfs_btree_cur    *cur,
3354         int                     level,
3355         int                     *stat)
3356 {
3357         int                     error;
3358         int                     i;
3359
3360         if (level > 0) {
3361                 error = xfs_btree_decrement(cur, level, &i);
3362                 if (error)
3363                         return error;
3364         }
3365
3366         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3367         *stat = 1;
3368         return 0;
3369 }
3370
3371 /*
3372  * Single level of the btree record deletion routine.
3373  * Delete record pointed to by cur/level.
3374  * Remove the record from its block then rebalance the tree.
3375  * Return 0 for error, 1 for done, 2 to go on to the next level.
3376  */
3377 STATIC int                                      /* error */
3378 xfs_btree_delrec(
3379         struct xfs_btree_cur    *cur,           /* btree cursor */
3380         int                     level,          /* level removing record from */
3381         int                     *stat)          /* fail/done/go-on */
3382 {
3383         struct xfs_btree_block  *block;         /* btree block */
3384         union xfs_btree_ptr     cptr;           /* current block ptr */
3385         struct xfs_buf          *bp;            /* buffer for block */
3386         int                     error;          /* error return value */
3387         int                     i;              /* loop counter */
3388         union xfs_btree_key     key;            /* storage for keyp */
3389         union xfs_btree_key     *keyp = &key;   /* passed to the next level */
3390         union xfs_btree_ptr     lptr;           /* left sibling block ptr */
3391         struct xfs_buf          *lbp;           /* left buffer pointer */
3392         struct xfs_btree_block  *left;          /* left btree block */
3393         int                     lrecs = 0;      /* left record count */
3394         int                     ptr;            /* key/record index */
3395         union xfs_btree_ptr     rptr;           /* right sibling block ptr */
3396         struct xfs_buf          *rbp;           /* right buffer pointer */
3397         struct xfs_btree_block  *right;         /* right btree block */
3398         struct xfs_btree_block  *rrblock;       /* right-right btree block */
3399         struct xfs_buf          *rrbp;          /* right-right buffer pointer */
3400         int                     rrecs = 0;      /* right record count */
3401         struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
3402         int                     numrecs;        /* temporary numrec count */
3403
3404         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3405         XFS_BTREE_TRACE_ARGI(cur, level);
3406
3407         tcur = NULL;
3408
3409         /* Get the index of the entry being deleted, check for nothing there. */
3410         ptr = cur->bc_ptrs[level];
3411         if (ptr == 0) {
3412                 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3413                 *stat = 0;
3414                 return 0;
3415         }
3416
3417         /* Get the buffer & block containing the record or key/ptr. */
3418         block = xfs_btree_get_block(cur, level, &bp);
3419         numrecs = xfs_btree_get_numrecs(block);
3420
3421 #ifdef DEBUG
3422         error = xfs_btree_check_block(cur, block, level, bp);
3423         if (error)
3424                 goto error0;
3425 #endif
3426
3427         /* Fail if we're off the end of the block. */
3428         if (ptr > numrecs) {
3429                 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3430                 *stat = 0;
3431                 return 0;
3432         }
3433
3434         XFS_BTREE_STATS_INC(cur, delrec);
3435         XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr);
3436
3437         /* Excise the entries being deleted. */
3438         if (level > 0) {
3439                 /* It's a nonleaf. operate on keys and ptrs */
3440                 union xfs_btree_key     *lkp;
3441                 union xfs_btree_ptr     *lpp;
3442
3443                 lkp = xfs_btree_key_addr(cur, ptr + 1, block);
3444                 lpp = xfs_btree_ptr_addr(cur, ptr + 1, block);
3445
3446 #ifdef DEBUG
3447                 for (i = 0; i < numrecs - ptr; i++) {
3448                         error = xfs_btree_check_ptr(cur, lpp, i, level);
3449                         if (error)
3450                                 goto error0;
3451                 }
3452 #endif
3453
3454                 if (ptr < numrecs) {
3455                         xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr);
3456                         xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr);
3457                         xfs_btree_log_keys(cur, bp, ptr, numrecs - 1);
3458                         xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1);
3459                 }
3460
3461                 /*
3462                  * If it's the first record in the block, we'll need to pass a
3463                  * key up to the next level (updkey).
3464                  */
3465                 if (ptr == 1)
3466                         keyp = xfs_btree_key_addr(cur, 1, block);
3467         } else {
3468                 /* It's a leaf. operate on records */
3469                 if (ptr < numrecs) {
3470                         xfs_btree_shift_recs(cur,
3471                                 xfs_btree_rec_addr(cur, ptr + 1, block),
3472                                 -1, numrecs - ptr);
3473                         xfs_btree_log_recs(cur, bp, ptr, numrecs - 1);
3474                 }
3475
3476                 /*
3477                  * If it's the first record in the block, we'll need a key
3478                  * structure to pass up to the next level (updkey).
3479                  */
3480                 if (ptr == 1) {
3481                         cur->bc_ops->init_key_from_rec(&key,
3482                                         xfs_btree_rec_addr(cur, 1, block));
3483                         keyp = &key;
3484                 }
3485         }
3486
3487         /*
3488          * Decrement and log the number of entries in the block.
3489          */
3490         xfs_btree_set_numrecs(block, --numrecs);
3491         xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
3492
3493         /*
3494          * If we are tracking the last record in the tree and
3495          * we are at the far right edge of the tree, update it.
3496          */
3497         if (xfs_btree_is_lastrec(cur, block, level)) {
3498                 cur->bc_ops->update_lastrec(cur, block, NULL,
3499                                             ptr, LASTREC_DELREC);
3500         }
3501
3502         /*
3503          * We're at the root level.  First, shrink the root block in-memory.
3504          * Try to get rid of the next level down.  If we can't then there's
3505          * nothing left to do.
3506          */
3507         if (level == cur->bc_nlevels - 1) {
3508                 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3509                         xfs_iroot_realloc(cur->bc_private.b.ip, -1,
3510                                           cur->bc_private.b.whichfork);
3511
3512                         error = xfs_btree_kill_iroot(cur);
3513                         if (error)
3514                                 goto error0;
3515
3516                         error = xfs_btree_dec_cursor(cur, level, stat);
3517                         if (error)
3518                                 goto error0;
3519                         *stat = 1;
3520                         return 0;
3521                 }
3522
3523                 /*
3524                  * If this is the root level, and there's only one entry left,
3525                  * and it's NOT the leaf level, then we can get rid of this
3526                  * level.
3527                  */
3528                 if (numrecs == 1 && level > 0) {
3529                         union xfs_btree_ptr     *pp;
3530                         /*
3531                          * pp is still set to the first pointer in the block.
3532                          * Make it the new root of the btree.
3533                          */
3534                         pp = xfs_btree_ptr_addr(cur, 1, block);
3535                         error = xfs_btree_kill_root(cur, bp, level, pp);
3536                         if (error)
3537                                 goto error0;
3538                 } else if (level > 0) {
3539                         error = xfs_btree_dec_cursor(cur, level, stat);
3540                         if (error)
3541                                 goto error0;
3542                 }
3543                 *stat = 1;
3544                 return 0;
3545         }
3546
3547         /*
3548          * If we deleted the leftmost entry in the block, update the
3549          * key values above us in the tree.
3550          */
3551         if (ptr == 1) {
3552                 error = xfs_btree_updkey(cur, keyp, level + 1);
3553                 if (error)
3554                         goto error0;
3555         }
3556
3557         /*
3558          * If the number of records remaining in the block is at least
3559          * the minimum, we're done.
3560          */
3561         if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) {
3562                 error = xfs_btree_dec_cursor(cur, level, stat);
3563                 if (error)
3564                         goto error0;
3565                 return 0;
3566         }
3567
3568         /*
3569          * Otherwise, we have to move some records around to keep the
3570          * tree balanced.  Look at the left and right sibling blocks to
3571          * see if we can re-balance by moving only one record.
3572          */
3573         xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
3574         xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB);
3575
3576         if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3577                 /*
3578                  * One child of root, need to get a chance to copy its contents
3579                  * into the root and delete it. Can't go up to next level,
3580                  * there's nothing to delete there.
3581                  */
3582                 if (xfs_btree_ptr_is_null(cur, &rptr) &&
3583                     xfs_btree_ptr_is_null(cur, &lptr) &&
3584                     level == cur->bc_nlevels - 2) {
3585                         error = xfs_btree_kill_iroot(cur);
3586                         if (!error)
3587                                 error = xfs_btree_dec_cursor(cur, level, stat);
3588                         if (error)
3589                                 goto error0;
3590                         return 0;
3591                 }
3592         }
3593
3594         ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) ||
3595                !xfs_btree_ptr_is_null(cur, &lptr));
3596
3597         /*
3598          * Duplicate the cursor so our btree manipulations here won't
3599          * disrupt the next level up.
3600          */
3601         error = xfs_btree_dup_cursor(cur, &tcur);
3602         if (error)
3603                 goto error0;
3604
3605         /*
3606          * If there's a right sibling, see if it's ok to shift an entry
3607          * out of it.
3608          */
3609         if (!xfs_btree_ptr_is_null(cur, &rptr)) {
3610                 /*
3611                  * Move the temp cursor to the last entry in the next block.
3612                  * Actually any entry but the first would suffice.
3613                  */
3614                 i = xfs_btree_lastrec(tcur, level);
3615                 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3616
3617                 error = xfs_btree_increment(tcur, level, &i);
3618                 if (error)
3619                         goto error0;
3620                 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3621
3622                 i = xfs_btree_lastrec(tcur, level);
3623                 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3624
3625                 /* Grab a pointer to the block. */
3626                 right = xfs_btree_get_block(tcur, level, &rbp);
3627 #ifdef DEBUG
3628                 error = xfs_btree_check_block(tcur, right, level, rbp);
3629                 if (error)
3630                         goto error0;
3631 #endif
3632                 /* Grab the current block number, for future use. */
3633                 xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB);
3634
3635                 /*
3636                  * If right block is full enough so that removing one entry
3637                  * won't make it too empty, and left-shifting an entry out
3638                  * of right to us works, we're done.
3639                  */
3640                 if (xfs_btree_get_numrecs(right) - 1 >=
3641                     cur->bc_ops->get_minrecs(tcur, level)) {
3642                         error = xfs_btree_lshift(tcur, level, &i);
3643                         if (error)
3644                                 goto error0;
3645                         if (i) {
3646                                 ASSERT(xfs_btree_get_numrecs(block) >=
3647                                        cur->bc_ops->get_minrecs(tcur, level));
3648
3649                                 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3650                                 tcur = NULL;
3651
3652                                 error = xfs_btree_dec_cursor(cur, level, stat);
3653                                 if (error)
3654                                         goto error0;
3655                                 return 0;
3656                         }
3657                 }
3658
3659                 /*
3660                  * Otherwise, grab the number of records in right for
3661                  * future reference, and fix up the temp cursor to point
3662                  * to our block again (last record).
3663                  */
3664                 rrecs = xfs_btree_get_numrecs(right);
3665                 if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3666                         i = xfs_btree_firstrec(tcur, level);
3667                         XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3668
3669                         error = xfs_btree_decrement(tcur, level, &i);
3670                         if (error)
3671                                 goto error0;
3672                         XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3673                 }
3674         }
3675
3676         /*
3677          * If there's a left sibling, see if it's ok to shift an entry
3678          * out of it.
3679          */
3680         if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3681                 /*
3682                  * Move the temp cursor to the first entry in the
3683                  * previous block.
3684                  */
3685                 i = xfs_btree_firstrec(tcur, level);
3686                 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3687
3688                 error = xfs_btree_decrement(tcur, level, &i);
3689                 if (error)
3690                         goto error0;
3691                 i = xfs_btree_firstrec(tcur, level);
3692                 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3693
3694                 /* Grab a pointer to the block. */
3695                 left = xfs_btree_get_block(tcur, level, &lbp);
3696 #ifdef DEBUG
3697                 error = xfs_btree_check_block(cur, left, level, lbp);
3698                 if (error)
3699                         goto error0;
3700 #endif
3701                 /* Grab the current block number, for future use. */
3702                 xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB);
3703
3704                 /*
3705                  * If left block is full enough so that removing one entry
3706                  * won't make it too empty, and right-shifting an entry out
3707                  * of left to us works, we're done.
3708                  */
3709                 if (xfs_btree_get_numrecs(left) - 1 >=
3710                     cur->bc_ops->get_minrecs(tcur, level)) {
3711                         error = xfs_btree_rshift(tcur, level, &i);
3712                         if (error)
3713                                 goto error0;
3714                         if (i) {
3715                                 ASSERT(xfs_btree_get_numrecs(block) >=
3716                                        cur->bc_ops->get_minrecs(tcur, level));
3717                                 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3718                                 tcur = NULL;
3719                                 if (level == 0)
3720                                         cur->bc_ptrs[0]++;
3721                                 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3722                                 *stat = 1;
3723                                 return 0;
3724                         }
3725                 }
3726
3727                 /*
3728                  * Otherwise, grab the number of records in right for
3729                  * future reference.
3730                  */
3731                 lrecs = xfs_btree_get_numrecs(left);
3732         }
3733
3734         /* Delete the temp cursor, we're done with it. */
3735         xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3736         tcur = NULL;
3737
3738         /* If here, we need to do a join to keep the tree balanced. */
3739         ASSERT(!xfs_btree_ptr_is_null(cur, &cptr));
3740
3741         if (!xfs_btree_ptr_is_null(cur, &lptr) &&
3742             lrecs + xfs_btree_get_numrecs(block) <=
3743                         cur->bc_ops->get_maxrecs(cur, level)) {
3744                 /*
3745                  * Set "right" to be the starting block,
3746                  * "left" to be the left neighbor.
3747                  */
3748                 rptr = cptr;
3749                 right = block;
3750                 rbp = bp;
3751                 error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
3752                 if (error)
3753                         goto error0;
3754
3755         /*
3756          * If that won't work, see if we can join with the right neighbor block.
3757          */
3758         } else if (!xfs_btree_ptr_is_null(cur, &rptr) &&
3759                    rrecs + xfs_btree_get_numrecs(block) <=
3760                         cur->bc_ops->get_maxrecs(cur, level)) {
3761                 /*
3762                  * Set "left" to be the starting block,
3763                  * "right" to be the right neighbor.
3764                  */
3765                 lptr = cptr;
3766                 left = block;
3767                 lbp = bp;
3768                 error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
3769                 if (error)
3770                         goto error0;
3771
3772         /*
3773          * Otherwise, we can't fix the imbalance.
3774          * Just return.  This is probably a logic error, but it's not fatal.
3775          */
3776         } else {
3777                 error = xfs_btree_dec_cursor(cur, level, stat);
3778                 if (error)
3779                         goto error0;
3780                 return 0;
3781         }
3782
3783         rrecs = xfs_btree_get_numrecs(right);
3784         lrecs = xfs_btree_get_numrecs(left);
3785
3786         /*
3787          * We're now going to join "left" and "right" by moving all the stuff
3788          * in "right" to "left" and deleting "right".
3789          */
3790         XFS_BTREE_STATS_ADD(cur, moves, rrecs);
3791         if (level > 0) {
3792                 /* It's a non-leaf.  Move keys and pointers. */
3793                 union xfs_btree_key     *lkp;   /* left btree key */
3794                 union xfs_btree_ptr     *lpp;   /* left address pointer */
3795                 union xfs_btree_key     *rkp;   /* right btree key */
3796                 union xfs_btree_ptr     *rpp;   /* right address pointer */
3797
3798                 lkp = xfs_btree_key_addr(cur, lrecs + 1, left);
3799                 lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left);
3800                 rkp = xfs_btree_key_addr(cur, 1, right);
3801                 rpp = xfs_btree_ptr_addr(cur, 1, right);
3802 #ifdef DEBUG
3803                 for (i = 1; i < rrecs; i++) {
3804                         error = xfs_btree_check_ptr(cur, rpp, i, level);
3805                         if (error)
3806                                 goto error0;
3807                 }
3808 #endif
3809                 xfs_btree_copy_keys(cur, lkp, rkp, rrecs);
3810                 xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs);
3811
3812                 xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
3813                 xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
3814         } else {
3815                 /* It's a leaf.  Move records.  */
3816                 union xfs_btree_rec     *lrp;   /* left record pointer */
3817                 union xfs_btree_rec     *rrp;   /* right record pointer */
3818
3819                 lrp = xfs_btree_rec_addr(cur, lrecs + 1, left);
3820                 rrp = xfs_btree_rec_addr(cur, 1, right);
3821
3822                 xfs_btree_copy_recs(cur, lrp, rrp, rrecs);
3823                 xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
3824         }
3825
3826         XFS_BTREE_STATS_INC(cur, join);
3827
3828         /*
3829          * Fix up the number of records and right block pointer in the
3830          * surviving block, and log it.
3831          */
3832         xfs_btree_set_numrecs(left, lrecs + rrecs);
3833         xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB),
3834         xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
3835         xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
3836
3837         /* If there is a right sibling, point it to the remaining block. */
3838         xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
3839         if (!xfs_btree_ptr_is_null(cur, &cptr)) {
3840                 error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp);
3841                 if (error)
3842                         goto error0;
3843                 xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB);
3844                 xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
3845         }
3846
3847         /* Free the deleted block. */
3848         error = xfs_btree_free_block(cur, rbp);
3849         if (error)
3850                 goto error0;
3851
3852         /*
3853          * If we joined with the left neighbor, set the buffer in the
3854          * cursor to the left block, and fix up the index.
3855          */
3856         if (bp != lbp) {
3857                 cur->bc_bufs[level] = lbp;
3858                 cur->bc_ptrs[level] += lrecs;
3859                 cur->bc_ra[level] = 0;
3860         }
3861         /*
3862          * If we joined with the right neighbor and there's a level above
3863          * us, increment the cursor at that level.
3864          */
3865         else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) ||
3866                    (level + 1 < cur->bc_nlevels)) {
3867                 error = xfs_btree_increment(cur, level + 1, &i);
3868                 if (error)
3869                         goto error0;
3870         }
3871
3872         /*
3873          * Readjust the ptr at this level if it's not a leaf, since it's
3874          * still pointing at the deletion point, which makes the cursor
3875          * inconsistent.  If this makes the ptr 0, the caller fixes it up.
3876          * We can't use decrement because it would change the next level up.
3877          */
3878         if (level > 0)
3879                 cur->bc_ptrs[level]--;
3880
3881         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3882         /* Return value means the next level up has something to do. */
3883         *stat = 2;
3884         return 0;
3885
3886 error0:
3887         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3888         if (tcur)
3889                 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
3890         return error;
3891 }
3892
3893 /*
3894  * Delete the record pointed to by cur.
3895  * The cursor refers to the place where the record was (could be inserted)
3896  * when the operation returns.
3897  */
3898 int                                     /* error */
3899 xfs_btree_delete(
3900         struct xfs_btree_cur    *cur,
3901         int                     *stat)  /* success/failure */
3902 {
3903         int                     error;  /* error return value */
3904         int                     level;
3905         int                     i;
3906
3907         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3908
3909         /*
3910          * Go up the tree, starting at leaf level.
3911          *
3912          * If 2 is returned then a join was done; go to the next level.
3913          * Otherwise we are done.
3914          */
3915         for (level = 0, i = 2; i == 2; level++) {
3916                 error = xfs_btree_delrec(cur, level, &i);
3917                 if (error)
3918                         goto error0;
3919         }
3920
3921         if (i == 0) {
3922                 for (level = 1; level < cur->bc_nlevels; level++) {
3923                         if (cur->bc_ptrs[level] == 0) {
3924                                 error = xfs_btree_decrement(cur, level, &i);
3925                                 if (error)
3926                                         goto error0;
3927                                 break;
3928                         }
3929                 }
3930         }
3931
3932         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3933         *stat = i;
3934         return 0;
3935 error0:
3936         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3937         return error;
3938 }
3939
3940 /*
3941  * Get the data from the pointed-to record.
3942  */
3943 int                                     /* error */
3944 xfs_btree_get_rec(
3945         struct xfs_btree_cur    *cur,   /* btree cursor */
3946         union xfs_btree_rec     **recp, /* output: btree record */
3947         int                     *stat)  /* output: success/failure */
3948 {
3949         struct xfs_btree_block  *block; /* btree block */
3950         struct xfs_buf          *bp;    /* buffer pointer */
3951         int                     ptr;    /* record number */
3952 #ifdef DEBUG
3953         int                     error;  /* error return value */
3954 #endif
3955
3956         ptr = cur->bc_ptrs[0];
3957         block = xfs_btree_get_block(cur, 0, &bp);
3958
3959 #ifdef DEBUG
3960         error = xfs_btree_check_block(cur, block, 0, bp);
3961         if (error)
3962                 return error;
3963 #endif
3964
3965         /*
3966          * Off the right end or left end, return failure.
3967          */
3968         if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) {
3969                 *stat = 0;
3970                 return 0;
3971         }
3972
3973         /*
3974          * Point to the record and extract its data.
3975          */
3976         *recp = xfs_btree_rec_addr(cur, ptr, block);
3977         *stat = 1;
3978         return 0;
3979 }
3980
3981 /*
3982  * Change the owner of a btree.
3983  *
3984  * The mechanism we use here is ordered buffer logging. Because we don't know
3985  * how many buffers were are going to need to modify, we don't really want to
3986  * have to make transaction reservations for the worst case of every buffer in a
3987  * full size btree as that may be more space that we can fit in the log....
3988  *
3989  * We do the btree walk in the most optimal manner possible - we have sibling
3990  * pointers so we can just walk all the blocks on each level from left to right
3991  * in a single pass, and then move to the next level and do the same. We can
3992  * also do readahead on the sibling pointers to get IO moving more quickly,
3993  * though for slow disks this is unlikely to make much difference to performance
3994  * as the amount of CPU work we have to do before moving to the next block is
3995  * relatively small.
3996  *
3997  * For each btree block that we load, modify the owner appropriately, set the
3998  * buffer as an ordered buffer and log it appropriately. We need to ensure that
3999  * we mark the region we change dirty so that if the buffer is relogged in
4000  * a subsequent transaction the changes we make here as an ordered buffer are
4001  * correctly relogged in that transaction.  If we are in recovery context, then
4002  * just queue the modified buffer as delayed write buffer so the transaction
4003  * recovery completion writes the changes to disk.
4004  */
4005 static int
4006 xfs_btree_block_change_owner(
4007         struct xfs_btree_cur    *cur,
4008         int                     level,
4009         __uint64_t              new_owner,
4010         struct list_head        *buffer_list)
4011 {
4012         struct xfs_btree_block  *block;
4013         struct xfs_buf          *bp;
4014         union xfs_btree_ptr     rptr;
4015
4016         /* do right sibling readahead */
4017         xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
4018
4019         /* modify the owner */
4020         block = xfs_btree_get_block(cur, level, &bp);
4021         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
4022                 block->bb_u.l.bb_owner = cpu_to_be64(new_owner);
4023         else
4024                 block->bb_u.s.bb_owner = cpu_to_be32(new_owner);
4025
4026         /*
4027          * If the block is a root block hosted in an inode, we might not have a
4028          * buffer pointer here and we shouldn't attempt to log the change as the
4029          * information is already held in the inode and discarded when the root
4030          * block is formatted into the on-disk inode fork. We still change it,
4031          * though, so everything is consistent in memory.
4032          */
4033         if (bp) {
4034                 if (cur->bc_tp) {
4035                         xfs_trans_ordered_buf(cur->bc_tp, bp);
4036                         xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
4037                 } else {
4038                         xfs_buf_delwri_queue(bp, buffer_list);
4039                 }
4040         } else {
4041                 ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
4042                 ASSERT(level == cur->bc_nlevels - 1);
4043         }
4044
4045         /* now read rh sibling block for next iteration */
4046         xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
4047         if (xfs_btree_ptr_is_null(cur, &rptr))
4048                 return -ENOENT;
4049
4050         return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
4051 }
4052
4053 int
4054 xfs_btree_change_owner(
4055         struct xfs_btree_cur    *cur,
4056         __uint64_t              new_owner,
4057         struct list_head        *buffer_list)
4058 {
4059         union xfs_btree_ptr     lptr;
4060         int                     level;
4061         struct xfs_btree_block  *block = NULL;
4062         int                     error = 0;
4063
4064         cur->bc_ops->init_ptr_from_cur(cur, &lptr);
4065
4066         /* for each level */
4067         for (level = cur->bc_nlevels - 1; level >= 0; level--) {
4068                 /* grab the left hand block */
4069                 error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
4070                 if (error)
4071                         return error;
4072
4073                 /* readahead the left most block for the next level down */
4074                 if (level > 0) {
4075                         union xfs_btree_ptr     *ptr;
4076
4077                         ptr = xfs_btree_ptr_addr(cur, 1, block);
4078                         xfs_btree_readahead_ptr(cur, ptr, 1);
4079
4080                         /* save for the next iteration of the loop */
4081                         lptr = *ptr;
4082                 }
4083
4084                 /* for each buffer in the level */
4085                 do {
4086                         error = xfs_btree_block_change_owner(cur, level,
4087                                                              new_owner,
4088                                                              buffer_list);
4089                 } while (!error);
4090
4091                 if (error != -ENOENT)
4092                         return error;
4093         }
4094
4095         return 0;
4096 }
4097
4098 /**
4099  * xfs_btree_sblock_v5hdr_verify() -- verify the v5 fields of a short-format
4100  *                                    btree block
4101  *
4102  * @bp: buffer containing the btree block
4103  * @max_recs: pointer to the m_*_mxr max records field in the xfs mount
4104  * @pag_max_level: pointer to the per-ag max level field
4105  */
4106 bool
4107 xfs_btree_sblock_v5hdr_verify(
4108         struct xfs_buf          *bp)
4109 {
4110         struct xfs_mount        *mp = bp->b_target->bt_mount;
4111         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
4112         struct xfs_perag        *pag = bp->b_pag;
4113
4114         if (!xfs_sb_version_hascrc(&mp->m_sb))
4115                 return false;
4116         if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
4117                 return false;
4118         if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
4119                 return false;
4120         if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
4121                 return false;
4122         return true;
4123 }
4124
4125 /**
4126  * xfs_btree_sblock_verify() -- verify a short-format btree block
4127  *
4128  * @bp: buffer containing the btree block
4129  * @max_recs: maximum records allowed in this btree node
4130  */
4131 bool
4132 xfs_btree_sblock_verify(
4133         struct xfs_buf          *bp,
4134         unsigned int            max_recs)
4135 {
4136         struct xfs_mount        *mp = bp->b_target->bt_mount;
4137         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
4138
4139         /* numrecs verification */
4140         if (be16_to_cpu(block->bb_numrecs) > max_recs)
4141                 return false;
4142
4143         /* sibling pointer verification */
4144         if (!block->bb_u.s.bb_leftsib ||
4145             (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks &&
4146              block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK)))
4147                 return false;
4148         if (!block->bb_u.s.bb_rightsib ||
4149             (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks &&
4150              block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK)))
4151                 return false;
4152
4153         return true;
4154 }
4155
4156 /*
4157  * Calculate the number of btree levels needed to store a given number of
4158  * records in a short-format btree.
4159  */
4160 uint
4161 xfs_btree_compute_maxlevels(
4162         struct xfs_mount        *mp,
4163         uint                    *limits,
4164         unsigned long           len)
4165 {
4166         uint                    level;
4167         unsigned long           maxblocks;
4168
4169         maxblocks = (len + limits[0] - 1) / limits[0];
4170         for (level = 1; maxblocks > 1; level++)
4171                 maxblocks = (maxblocks + limits[1] - 1) / limits[1];
4172         return level;
4173 }