Merge tag 'cramfs_fixes' of git://git.linaro.org/people/nicolas.pitre/linux
[sfrench/cifs-2.6.git] / arch / arm64 / crypto / aes-modes.S
index 483a7130cf0e118de591837a067c4a489ca12a5e..67700045a0e0f7b0f70bb42e9bc50cf71b43c17e 100644 (file)
        .align          4
 
 aes_encrypt_block4x:
-       encrypt_block4x v0, v1, v2, v3, w22, x21, x8, w7
+       encrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7  /* w3/x2: same registers the callers hand to enc_prepare; x8/w7 presumably scratch (macro defined elsewhere) */
        ret
 ENDPROC(aes_encrypt_block4x)
 
 aes_decrypt_block4x:
-       decrypt_block4x v0, v1, v2, v3, w22, x21, x8, w7
+       decrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7  /* w3/x2: same registers the callers hand to dec_prepare; x8/w7 presumably scratch (macro defined elsewhere) */
        ret
 ENDPROC(aes_decrypt_block4x)
 
@@ -31,71 +31,57 @@ ENDPROC(aes_decrypt_block4x)
         */
 
 AES_ENTRY(aes_ecb_encrypt)
-       frame_push      5
+       stp             x29, x30, [sp, #-16]!           /* frame record: the bl below overwrites x30 */
+       mov             x29, sp
 
-       mov             x19, x0
-       mov             x20, x1
-       mov             x21, x2
-       mov             x22, x3
-       mov             x23, x4
-
-.Lecbencrestart:
-       enc_prepare     w22, x21, x5
+       enc_prepare     w3, x2, x5                      /* arguments are now used in place: x0/x1 data pointers, w4 block count */
 
 .LecbencloopNx:
-       subs            w23, w23, #4
+       subs            w4, w4, #4                      /* claim 4 blocks; negative => fewer than 4 left */
        bmi             .Lecbenc1x
-       ld1             {v0.16b-v3.16b}, [x20], #64     /* get 4 pt blocks */
+       ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 pt blocks */
        bl              aes_encrypt_block4x
-       st1             {v0.16b-v3.16b}, [x19], #64
-       cond_yield_neon .Lecbencrestart
+       st1             {v0.16b-v3.16b}, [x0], #64      /* write 4 results, advance output pointer */
        b               .LecbencloopNx
 .Lecbenc1x:
-       adds            w23, w23, #4
+       adds            w4, w4, #4                      /* undo the last subs; zero => nothing left */
        beq             .Lecbencout
 .Lecbencloop:
-       ld1             {v0.16b}, [x20], #16            /* get next pt block */
-       encrypt_block   v0, w22, x21, x5, w6
-       st1             {v0.16b}, [x19], #16
-       subs            w23, w23, #1
+       ld1             {v0.16b}, [x1], #16             /* get next pt block */
+       encrypt_block   v0, w3, x2, x5, w6
+       st1             {v0.16b}, [x0], #16
+       subs            w4, w4, #1                      /* one block done; loop until the count hits zero */
        bne             .Lecbencloop
 .Lecbencout:
-       frame_pop
+       ldp             x29, x30, [sp], #16             /* restore FP/LR saved at entry */
        ret
 AES_ENDPROC(aes_ecb_encrypt)
 
 
 AES_ENTRY(aes_ecb_decrypt)
-       frame_push      5
+       stp             x29, x30, [sp, #-16]!           /* frame record: the bl below overwrites x30 */
+       mov             x29, sp
 
-       mov             x19, x0
-       mov             x20, x1
-       mov             x21, x2
-       mov             x22, x3
-       mov             x23, x4
-
-.Lecbdecrestart:
-       dec_prepare     w22, x21, x5
+       dec_prepare     w3, x2, x5                      /* arguments are now used in place: x0/x1 data pointers, w4 block count */
 
 .LecbdecloopNx:
-       subs            w23, w23, #4
+       subs            w4, w4, #4                      /* claim 4 blocks; negative => fewer than 4 left */
        bmi             .Lecbdec1x
-       ld1             {v0.16b-v3.16b}, [x20], #64     /* get 4 ct blocks */
+       ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 ct blocks */
        bl              aes_decrypt_block4x
-       st1             {v0.16b-v3.16b}, [x19], #64
-       cond_yield_neon .Lecbdecrestart
+       st1             {v0.16b-v3.16b}, [x0], #64      /* write 4 results, advance output pointer */
        b               .LecbdecloopNx
 .Lecbdec1x:
-       adds            w23, w23, #4
+       adds            w4, w4, #4                      /* undo the last subs; zero => nothing left */
        beq             .Lecbdecout
 .Lecbdecloop:
-       ld1             {v0.16b}, [x20], #16            /* get next ct block */
-       decrypt_block   v0, w22, x21, x5, w6
-       st1             {v0.16b}, [x19], #16
-       subs            w23, w23, #1
+       ld1             {v0.16b}, [x1], #16             /* get next ct block */
+       decrypt_block   v0, w3, x2, x5, w6
+       st1             {v0.16b}, [x0], #16
+       subs            w4, w4, #1                      /* one block done; loop until the count hits zero */
        bne             .Lecbdecloop
 .Lecbdecout:
-       frame_pop
+       ldp             x29, x30, [sp], #16             /* restore FP/LR saved at entry */
        ret
 AES_ENDPROC(aes_ecb_decrypt)
 
@@ -108,162 +94,211 @@ AES_ENDPROC(aes_ecb_decrypt)
         */
 
 AES_ENTRY(aes_cbc_encrypt)
-       frame_push      6
-
-       mov             x19, x0
-       mov             x20, x1
-       mov             x21, x2
-       mov             x22, x3
-       mov             x23, x4
-       mov             x24, x5
-
-.Lcbcencrestart:
-       ld1             {v4.16b}, [x24]                 /* get iv */
-       enc_prepare     w22, x21, x6
+       ld1             {v4.16b}, [x5]                  /* get iv */
+       enc_prepare     w3, x2, x6                      /* no frame record is set up: this function has no explicit bl */
 
 .Lcbcencloop4x:
-       subs            w23, w23, #4
+       subs            w4, w4, #4                      /* claim 4 blocks; negative => fewer than 4 left */
        bmi             .Lcbcenc1x
-       ld1             {v0.16b-v3.16b}, [x20], #64     /* get 4 pt blocks */
+       ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 pt blocks */
        eor             v0.16b, v0.16b, v4.16b          /* ..and xor with iv */
-       encrypt_block   v0, w22, x21, x6, w7
+       encrypt_block   v0, w3, x2, x6, w7
        eor             v1.16b, v1.16b, v0.16b
-       encrypt_block   v1, w22, x21, x6, w7
+       encrypt_block   v1, w3, x2, x6, w7              /* each block is xored with the previous ciphertext first, so the 4 run one after another */
        eor             v2.16b, v2.16b, v1.16b
-       encrypt_block   v2, w22, x21, x6, w7
+       encrypt_block   v2, w3, x2, x6, w7
        eor             v3.16b, v3.16b, v2.16b
-       encrypt_block   v3, w22, x21, x6, w7
-       st1             {v0.16b-v3.16b}, [x19], #64
+       encrypt_block   v3, w3, x2, x6, w7
+       st1             {v0.16b-v3.16b}, [x0], #64      /* write 4 ct blocks, advance output pointer */
        mov             v4.16b, v3.16b
-       st1             {v4.16b}, [x24]                 /* return iv */
-       cond_yield_neon .Lcbcencrestart
        b               .Lcbcencloop4x
 .Lcbcenc1x:
-       adds            w23, w23, #4
+       adds            w4, w4, #4                      /* undo the last subs; zero => nothing left */
        beq             .Lcbcencout
 .Lcbcencloop:
-       ld1             {v0.16b}, [x20], #16            /* get next pt block */
+       ld1             {v0.16b}, [x1], #16             /* get next pt block */
        eor             v4.16b, v4.16b, v0.16b          /* ..and xor with iv */
-       encrypt_block   v4, w22, x21, x6, w7
-       st1             {v4.16b}, [x19], #16
-       subs            w23, w23, #1
+       encrypt_block   v4, w3, x2, x6, w7              /* v4 is both this block's output and the next block's iv */
+       st1             {v4.16b}, [x0], #16
+       subs            w4, w4, #1
        bne             .Lcbcencloop
 .Lcbcencout:
-       st1             {v4.16b}, [x24]                 /* return iv */
-       frame_pop
+       st1             {v4.16b}, [x5]                  /* return iv (written once here; the per-iteration store is gone) */
        ret
 AES_ENDPROC(aes_cbc_encrypt)
 
 
 AES_ENTRY(aes_cbc_decrypt)
-       frame_push      6
-
-       mov             x19, x0
-       mov             x20, x1
-       mov             x21, x2
-       mov             x22, x3
-       mov             x23, x4
-       mov             x24, x5
+       stp             x29, x30, [sp, #-16]!           /* frame record: the bl below overwrites x30 */
+       mov             x29, sp
 
-.Lcbcdecrestart:
-       ld1             {v7.16b}, [x24]                 /* get iv */
-       dec_prepare     w22, x21, x6
+       ld1             {v7.16b}, [x5]                  /* get iv */
+       dec_prepare     w3, x2, x6
 
 .LcbcdecloopNx:
-       subs            w23, w23, #4
+       subs            w4, w4, #4                      /* claim 4 blocks; negative => fewer than 4 left */
        bmi             .Lcbcdec1x
-       ld1             {v0.16b-v3.16b}, [x20], #64     /* get 4 ct blocks */
+       ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 ct blocks */
        mov             v4.16b, v0.16b
        mov             v5.16b, v1.16b
        mov             v6.16b, v2.16b
        bl              aes_decrypt_block4x
-       sub             x20, x20, #16
+       sub             x1, x1, #16                     /* step back onto the 4th ct block; only the first three were copied to v4-v6 */
        eor             v0.16b, v0.16b, v7.16b
        eor             v1.16b, v1.16b, v4.16b
-       ld1             {v7.16b}, [x20], #16            /* reload 1 ct block */
+       ld1             {v7.16b}, [x1], #16             /* reload 1 ct block: it becomes the iv for what follows */
        eor             v2.16b, v2.16b, v5.16b
        eor             v3.16b, v3.16b, v6.16b
-       st1             {v0.16b-v3.16b}, [x19], #64
-       st1             {v7.16b}, [x24]                 /* return iv */
-       cond_yield_neon .Lcbcdecrestart
+       st1             {v0.16b-v3.16b}, [x0], #64      /* write 4 pt blocks, advance output pointer */
        b               .LcbcdecloopNx
 .Lcbcdec1x:
-       adds            w23, w23, #4
+       adds            w4, w4, #4                      /* undo the last subs; zero => nothing left */
        beq             .Lcbcdecout
 .Lcbcdecloop:
-       ld1             {v1.16b}, [x20], #16            /* get next ct block */
+       ld1             {v1.16b}, [x1], #16             /* get next ct block */
        mov             v0.16b, v1.16b                  /* ...and copy to v0 */
-       decrypt_block   v0, w22, x21, x6, w7
+       decrypt_block   v0, w3, x2, x6, w7
        eor             v0.16b, v0.16b, v7.16b          /* xor with iv => pt */
        mov             v7.16b, v1.16b                  /* ct is next iv */
-       st1             {v0.16b}, [x19], #16
-       subs            w23, w23, #1
+       st1             {v0.16b}, [x0], #16
+       subs            w4, w4, #1
        bne             .Lcbcdecloop
 .Lcbcdecout:
-       st1             {v7.16b}, [x24]                 /* return iv */
-       frame_pop
+       st1             {v7.16b}, [x5]                  /* return iv (written once here; the per-iteration store is gone) */
+       ldp             x29, x30, [sp], #16             /* restore FP/LR saved at entry */
        ret
 AES_ENDPROC(aes_cbc_decrypt)
 
 
+       /*
+        * aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
+        *                     int rounds, int bytes, u8 const iv[])
+        * aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
+        *                     int rounds, int bytes, u8 const iv[])
+        */
+
+AES_ENTRY(aes_cbc_cts_encrypt)
+       adr_l           x8, .Lcts_permute_table         /* x8 = base of the 48-byte permute table */
+       sub             x4, x4, #16                     /* x4 = n = bytes - 16. NOTE(review): 64-bit use of an int argument assumes x4[63:32] is clean - confirm */
+       add             x9, x8, #32
+       add             x8, x8, x4                      /* x8 = table + n */
+       sub             x9, x9, x4                      /* x9 = table + 32 - n */
+       ld1             {v3.16b}, [x8]                  /* v3: tbl indices that move lanes 0..n-1 up to lanes 16-n..15 */
+       ld1             {v4.16b}, [x9]                  /* v4: tbl indices that move lanes 16-n..15 down to lanes 0..n-1 */
+
+       ld1             {v0.16b}, [x1], x4              /* overlapping loads: v0 = first 16 input bytes, then x1 += n */
+       ld1             {v1.16b}, [x1]                  /* v1 = last 16 input bytes */
+
+       ld1             {v5.16b}, [x5]                  /* get iv */
+       enc_prepare     w3, x2, x6
+
+       eor             v0.16b, v0.16b, v5.16b          /* xor with iv */
+       tbl             v1.16b, {v1.16b}, v4.16b        /* v1 = last n input bytes at the bottom, other lanes 0 (index 0xff) */
+       encrypt_block   v0, w3, x2, x6, w7              /* v0 = ciphertext of the first block */
+
+       eor             v1.16b, v1.16b, v0.16b          /* chain the padded tail with that ciphertext */
+       tbl             v0.16b, {v0.16b}, v3.16b        /* keep v0's first n bytes only, moved to the top lanes */
+       encrypt_block   v1, w3, x2, x6, w7
+
+       add             x4, x0, x4                      /* x4 = out + n */
+       st1             {v0.16b}, [x4]                  /* overlapping stores: the kept n bytes land at out[16..16+n) */
+       st1             {v1.16b}, [x0]                  /* out[0..16); done second so it covers the zero lanes written above */
+       ret
+AES_ENDPROC(aes_cbc_cts_encrypt)
+
+AES_ENTRY(aes_cbc_cts_decrypt)
+       adr_l           x8, .Lcts_permute_table         /* x8 = base of the 48-byte permute table */
+       sub             x4, x4, #16                     /* x4 = n = bytes - 16. NOTE(review): 64-bit use of an int argument assumes x4[63:32] is clean - confirm */
+       add             x9, x8, #32
+       add             x8, x8, x4                      /* x8 = table + n */
+       sub             x9, x9, x4                      /* x9 = table + 32 - n */
+       ld1             {v3.16b}, [x8]                  /* v3: tbl indices that move lanes 0..n-1 up to lanes 16-n..15 */
+       ld1             {v4.16b}, [x9]                  /* v4: tbl indices that move lanes 16-n..15 down to lanes 0..n-1 */
+
+       ld1             {v0.16b}, [x1], x4              /* overlapping loads: v0 = first 16 input bytes, then x1 += n */
+       ld1             {v1.16b}, [x1]                  /* v1 = last 16 input bytes */
+
+       ld1             {v5.16b}, [x5]                  /* get iv */
+       dec_prepare     w3, x2, x6
+
+       tbl             v2.16b, {v1.16b}, v4.16b        /* v2 = last n input bytes at the bottom, other lanes 0 (index 0xff) */
+       decrypt_block   v0, w3, x2, x6, w7              /* v0 = decryption of the first 16 input bytes */
+       eor             v2.16b, v2.16b, v0.16b          /* lanes 0..n-1 of v2 now hold the output tail */
+
+       tbx             v0.16b, {v1.16b}, v4.16b        /* put the last n input bytes into v0[0..n); tbx leaves the other lanes as decrypted */
+       tbl             v2.16b, {v2.16b}, v3.16b        /* move the n tail bytes to the top lanes for the store below */
+       decrypt_block   v0, w3, x2, x6, w7
+       eor             v0.16b, v0.16b, v5.16b          /* xor with iv */
+
+       add             x4, x0, x4                      /* x4 = out + n */
+       st1             {v2.16b}, [x4]                  /* overlapping stores: the n tail bytes land at out[16..16+n) */
+       st1             {v0.16b}, [x0]                  /* out[0..16); done second so it covers the zero lanes written above */
+       ret
+AES_ENDPROC(aes_cbc_cts_decrypt)
+
+       .section        ".rodata", "a"
+       .align          6                               /* 2^6 = 64-byte alignment */
+.Lcts_permute_table:                                   /* 48 bytes; the CTS routines load 16-byte windows at offsets n and 32 - n */
+       .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff  /* 0xff is out of range for a one-register tbl/tbx: */
+       .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff  /* tbl writes 0 there, tbx leaves the lane unchanged */
+       .byte            0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7  /* middle 16 bytes: identity permutation 0..15 */
+       .byte            0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
+       .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff  /* out-of-range padding again */
+       .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+       .previous                                       /* switch back to the section that was active before .section */
+
+
        /*
         * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                 int blocks, u8 ctr[])
         */
 
 AES_ENTRY(aes_ctr_encrypt)
-       frame_push      6
+       stp             x29, x30, [sp, #-16]!           /* frame record: bl aes_encrypt_block4x overwrites x30 */
+       mov             x29, sp
 
-       mov             x19, x0
-       mov             x20, x1
-       mov             x21, x2
-       mov             x22, x3
-       mov             x23, x4
-       mov             x24, x5
-
-.Lctrrestart:
-       enc_prepare     w22, x21, x6
-       ld1             {v4.16b}, [x24]
+       enc_prepare     w3, x2, x6
+       ld1             {v4.16b}, [x5]                  /* v4 = counter block read from ctr[] */
 
        umov            x6, v4.d[1]             /* keep swabbed ctr in reg */
        rev             x6, x6
+       cmn             w6, w4                  /* 32 bit overflow? */
+       bcs             .Lctrloop               /* low 32 ctr bits + block count carries: take the 1-block loop for the whole call */
 .LctrloopNx:
-       subs            w23, w23, #4
+       subs            w4, w4, #4              /* claim 4 blocks; negative => fewer than 4 left */
        bmi             .Lctr1x
-       cmn             w6, #4                  /* 32 bit overflow? */
-       bcs             .Lctr1x
-       ldr             q8, =0x30000000200000001        /* addends 1,2,3[,0] */
-       dup             v7.4s, w6
+       add             w7, w6, #1              /* w7/w8/w9 = low ctr word + 1/+2/+3 */
        mov             v0.16b, v4.16b
-       add             v7.4s, v7.4s, v8.4s
+       add             w8, w6, #2
        mov             v1.16b, v4.16b
-       rev32           v8.16b, v7.16b
+       add             w9, w6, #3
        mov             v2.16b, v4.16b
+       rev             w7, w7                  /* byte-swap back to the layout held in v4 */
        mov             v3.16b, v4.16b
-       mov             v1.s[3], v8.s[0]
-       mov             v2.s[3], v8.s[1]
-       mov             v3.s[3], v8.s[2]
-       ld1             {v5.16b-v7.16b}, [x20], #48     /* get 3 input blocks */
+       rev             w8, w8
+       mov             v1.s[3], w7             /* v1..v3 = copy of v4 with the last word replaced by ctr+1..+3 */
+       rev             w9, w9
+       mov             v2.s[3], w8
+       mov             v3.s[3], w9
+       ld1             {v5.16b-v7.16b}, [x1], #48      /* get 3 input blocks */
        bl              aes_encrypt_block4x
        eor             v0.16b, v5.16b, v0.16b
-       ld1             {v5.16b}, [x20], #16            /* get 1 input block  */
+       ld1             {v5.16b}, [x1], #16             /* get 1 input block  */
        eor             v1.16b, v6.16b, v1.16b
        eor             v2.16b, v7.16b, v2.16b
        eor             v3.16b, v5.16b, v3.16b
-       st1             {v0.16b-v3.16b}, [x19], #64
+       st1             {v0.16b-v3.16b}, [x0], #64      /* write 4 output blocks, advance output pointer */
        add             x6, x6, #4
        rev             x7, x6
        ins             v4.d[1], x7
-       cbz             w23, .Lctrout
-       st1             {v4.16b}, [x24]         /* return next CTR value */
-       cond_yield_neon .Lctrrestart
+       cbz             w4, .Lctrout            /* all blocks done: .Lctrout writes the counter back */
        b               .LctrloopNx
 .Lctr1x:
-       adds            w23, w23, #4
+       adds            w4, w4, #4              /* undo the last subs; zero => nothing left */
        beq             .Lctrout
 .Lctrloop:
        mov             v0.16b, v4.16b
-       encrypt_block   v0, w22, x21, x8, w7
+       encrypt_block   v0, w3, x2, x8, w7              /* v0 = encrypted copy of the current counter block */
 
        adds            x6, x6, #1              /* increment BE ctr */
        rev             x7, x6
@@ -271,22 +306,22 @@ AES_ENTRY(aes_ctr_encrypt)
        bcs             .Lctrcarry              /* overflow? */
 
 .Lctrcarrydone:
-       subs            w23, w23, #1
+       subs            w4, w4, #1              /* flags from this subs also drive the bne at the end of the loop */
        bmi             .Lctrtailblock          /* blocks <0 means tail block */
-       ld1             {v3.16b}, [x20], #16
+       ld1             {v3.16b}, [x1], #16
        eor             v3.16b, v0.16b, v3.16b
-       st1             {v3.16b}, [x19], #16
+       st1             {v3.16b}, [x0], #16
        bne             .Lctrloop
 
 .Lctrout:
-       st1             {v4.16b}, [x24]         /* return next CTR value */
-.Lctrret:
-       frame_pop
+       st1             {v4.16b}, [x5]          /* return next CTR value */
+       ldp             x29, x30, [sp], #16     /* restore FP/LR saved at entry */
        ret
 
 .Lctrtailblock:
-       st1             {v0.16b}, [x19]
-       b               .Lctrret
+       st1             {v0.16b}, [x0]          /* stores v0 as is: no input is read or xored on this path */
+       ldp             x29, x30, [sp], #16     /* own epilogue instead of the old branch to .Lctrret; ctr[] is not written here */
+       ret
 
 .Lctrcarry:
        umov            x7, v4.d[0]             /* load upper word of ctr  */
@@ -296,7 +331,6 @@ AES_ENTRY(aes_ctr_encrypt)
        ins             v4.d[0], x7
        b               .Lctrcarrydone
 AES_ENDPROC(aes_ctr_encrypt)
-       .ltorg
 
 
        /*
@@ -306,150 +340,132 @@ AES_ENDPROC(aes_ctr_encrypt)
         *                 int blocks, u8 const rk2[], u8 iv[], int first)
         */
 
-       .macro          next_tweak, out, in, const, tmp
+       .macro          next_tweak, out, in, tmp        /* doubles each 64-bit half of \in and folds the shifted-out top bits back in via xtsmask; clobbers \tmp */
        sshr            \tmp\().2d,  \in\().2d,   #63
-       and             \tmp\().16b, \tmp\().16b, \const\().16b
+       and             \tmp\().16b, \tmp\().16b, xtsmask.16b   /* xtsmask is not defined in this hunk; presumably a register alias set up elsewhere */
        add             \out\().2d,  \in\().2d,   \in\().2d
        ext             \tmp\().16b, \tmp\().16b, \tmp\().16b, #8
        eor             \out\().16b, \out\().16b, \tmp\().16b
        .endm
 
-.Lxts_mul_x:
-CPU_LE(        .quad           1, 0x87         )
-CPU_BE(        .quad           0x87, 1         )
+       .macro          xts_load_mask, tmp              /* builds xtsmask.2d = {1, 0x87} from immediates; clobbers \tmp */
+       movi            xtsmask.2s, #0x1                /* xtsmask.4s = {1, 1, 0, 0} */
+       movi            \tmp\().2s, #0x87               /* tmp.4s = {0x87, 0x87, 0, 0} */
+       uzp1            xtsmask.4s, xtsmask.4s, \tmp\().4s      /* even lanes of both inputs: xtsmask.4s = {1, 0, 0x87, 0} */
+       .endm
 
 AES_ENTRY(aes_xts_encrypt)
-       frame_push      6
+       stp             x29, x30, [sp, #-16]!           /* frame record: bl aes_encrypt_block4x overwrites x30 */
+       mov             x29, sp
 
-       mov             x19, x0
-       mov             x20, x1
-       mov             x21, x2
-       mov             x22, x3
-       mov             x23, x4
-       mov             x24, x6
-
-       ld1             {v4.16b}, [x24]
+       ld1             {v4.16b}, [x6]                  /* v4 = tweak, read through x6 */
+       xts_load_mask   v8                              /* set up xtsmask; v8 is only the macro's temporary */
        cbz             w7, .Lxtsencnotfirst
 
        enc_prepare     w3, x5, x8
        encrypt_block   v4, w3, x5, x8, w7              /* first tweak */
        enc_switch_key  w3, x2, x8
-       ldr             q7, .Lxts_mul_x
        b               .LxtsencNx
 
-.Lxtsencrestart:
-       ld1             {v4.16b}, [x24]
 .Lxtsencnotfirst:
-       enc_prepare     w22, x21, x8
+       enc_prepare     w3, x2, x8
 .LxtsencloopNx:
-       ldr             q7, .Lxts_mul_x
-       next_tweak      v4, v4, v7, v8
+       next_tweak      v4, v4, v8                      /* v4 holds a tweak that was already used: advance it */
 .LxtsencNx:
-       subs            w23, w23, #4
+       subs            w4, w4, #4                      /* claim 4 blocks; negative => fewer than 4 left */
        bmi             .Lxtsenc1x
-       ld1             {v0.16b-v3.16b}, [x20], #64     /* get 4 pt blocks */
-       next_tweak      v5, v4, v7, v8
+       ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 pt blocks */
+       next_tweak      v5, v4, v8                      /* v5, v6, v7 = tweaks for blocks 1..3 */
        eor             v0.16b, v0.16b, v4.16b
-       next_tweak      v6, v5, v7, v8
+       next_tweak      v6, v5, v8
        eor             v1.16b, v1.16b, v5.16b
        eor             v2.16b, v2.16b, v6.16b
-       next_tweak      v7, v6, v7, v8
+       next_tweak      v7, v6, v8
        eor             v3.16b, v3.16b, v7.16b
        bl              aes_encrypt_block4x
        eor             v3.16b, v3.16b, v7.16b
        eor             v0.16b, v0.16b, v4.16b
        eor             v1.16b, v1.16b, v5.16b
        eor             v2.16b, v2.16b, v6.16b
-       st1             {v0.16b-v3.16b}, [x19], #64
+       st1             {v0.16b-v3.16b}, [x0], #64      /* write 4 ct blocks, advance output pointer */
        mov             v4.16b, v7.16b
-       cbz             w23, .Lxtsencout
-       st1             {v4.16b}, [x24]
-       cond_yield_neon .Lxtsencrestart
+       cbz             w4, .Lxtsencout                 /* all blocks done: .Lxtsencout writes v4 back */
+       xts_reload_mask v8                              /* macro not defined in this hunk; presumably restores xtsmask when it aliases a register overwritten above - confirm */
        b               .LxtsencloopNx
 .Lxtsenc1x:
-       adds            w23, w23, #4
+       adds            w4, w4, #4                      /* undo the last subs; zero => nothing left */
        beq             .Lxtsencout
 .Lxtsencloop:
-       ld1             {v1.16b}, [x20], #16
+       ld1             {v1.16b}, [x1], #16             /* get next pt block */
        eor             v0.16b, v1.16b, v4.16b
-       encrypt_block   v0, w22, x21, x8, w7
+       encrypt_block   v0, w3, x2, x8, w7
        eor             v0.16b, v0.16b, v4.16b
-       st1             {v0.16b}, [x19], #16
-       subs            w23, w23, #1
+       st1             {v0.16b}, [x0], #16
+       subs            w4, w4, #1
        beq             .Lxtsencout
-       next_tweak      v4, v4, v7, v8
+       next_tweak      v4, v4, v8                      /* more blocks follow: advance the tweak */
        b               .Lxtsencloop
 .Lxtsencout:
-       st1             {v4.16b}, [x24]
-       frame_pop
+       st1             {v4.16b}, [x6]                  /* write back v4, the last tweak that was applied */
+       ldp             x29, x30, [sp], #16             /* restore FP/LR saved at entry */
        ret
 AES_ENDPROC(aes_xts_encrypt)
 
 
 AES_ENTRY(aes_xts_decrypt)
-       frame_push      6
+       stp             x29, x30, [sp, #-16]!           /* frame record: bl aes_decrypt_block4x overwrites x30 */
+       mov             x29, sp
 
-       mov             x19, x0
-       mov             x20, x1
-       mov             x21, x2
-       mov             x22, x3
-       mov             x23, x4
-       mov             x24, x6
-
-       ld1             {v4.16b}, [x24]
+       ld1             {v4.16b}, [x6]                  /* v4 = tweak, read through x6 */
+       xts_load_mask   v8                              /* set up xtsmask; v8 is only the macro's temporary */
        cbz             w7, .Lxtsdecnotfirst
 
        enc_prepare     w3, x5, x8
        encrypt_block   v4, w3, x5, x8, w7              /* first tweak */
        dec_prepare     w3, x2, x8
-       ldr             q7, .Lxts_mul_x
        b               .LxtsdecNx
 
-.Lxtsdecrestart:
-       ld1             {v4.16b}, [x24]
 .Lxtsdecnotfirst:
-       dec_prepare     w22, x21, x8
+       dec_prepare     w3, x2, x8
 .LxtsdecloopNx:
-       ldr             q7, .Lxts_mul_x
-       next_tweak      v4, v4, v7, v8
+       next_tweak      v4, v4, v8                      /* v4 holds a tweak that was already used: advance it */
 .LxtsdecNx:
-       subs            w23, w23, #4
+       subs            w4, w4, #4                      /* claim 4 blocks; negative => fewer than 4 left */
        bmi             .Lxtsdec1x
-       ld1             {v0.16b-v3.16b}, [x20], #64     /* get 4 ct blocks */
-       next_tweak      v5, v4, v7, v8
+       ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 ct blocks */
+       next_tweak      v5, v4, v8                      /* v5, v6, v7 = tweaks for blocks 1..3 */
        eor             v0.16b, v0.16b, v4.16b
-       next_tweak      v6, v5, v7, v8
+       next_tweak      v6, v5, v8
        eor             v1.16b, v1.16b, v5.16b
        eor             v2.16b, v2.16b, v6.16b
-       next_tweak      v7, v6, v7, v8
+       next_tweak      v7, v6, v8
        eor             v3.16b, v3.16b, v7.16b
        bl              aes_decrypt_block4x
        eor             v3.16b, v3.16b, v7.16b
        eor             v0.16b, v0.16b, v4.16b
        eor             v1.16b, v1.16b, v5.16b
        eor             v2.16b, v2.16b, v6.16b
-       st1             {v0.16b-v3.16b}, [x19], #64
+       st1             {v0.16b-v3.16b}, [x0], #64      /* write 4 pt blocks, advance output pointer */
        mov             v4.16b, v7.16b
-       cbz             w23, .Lxtsdecout
-       st1             {v4.16b}, [x24]
-       cond_yield_neon .Lxtsdecrestart
+       cbz             w4, .Lxtsdecout                 /* all blocks done: .Lxtsdecout writes v4 back */
+       xts_reload_mask v8                              /* macro not defined in this hunk; presumably restores xtsmask when it aliases a register overwritten above - confirm */
        b               .LxtsdecloopNx
 .Lxtsdec1x:
-       adds            w23, w23, #4
+       adds            w4, w4, #4                      /* undo the last subs; zero => nothing left */
        beq             .Lxtsdecout
 .Lxtsdecloop:
-       ld1             {v1.16b}, [x20], #16
+       ld1             {v1.16b}, [x1], #16             /* get next ct block */
        eor             v0.16b, v1.16b, v4.16b
-       decrypt_block   v0, w22, x21, x8, w7
+       decrypt_block   v0, w3, x2, x8, w7
        eor             v0.16b, v0.16b, v4.16b
-       st1             {v0.16b}, [x19], #16
-       subs            w23, w23, #1
+       st1             {v0.16b}, [x0], #16
+       subs            w4, w4, #1
        beq             .Lxtsdecout
-       next_tweak      v4, v4, v7, v8
+       next_tweak      v4, v4, v8                      /* more blocks follow: advance the tweak */
        b               .Lxtsdecloop
 .Lxtsdecout:
-       st1             {v4.16b}, [x24]
-       frame_pop
+       st1             {v4.16b}, [x6]                  /* write back v4, the last tweak that was applied */
+       ldp             x29, x30, [sp], #16             /* restore FP/LR saved at entry */
        ret
 AES_ENDPROC(aes_xts_decrypt)