arch/arm/crypto/speck-neon-core.S

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * NEON-accelerated implementation of Speck128-XTS and Speck64-XTS
   4  *
   5  * Copyright (c) 2018 Google, Inc
   6  *
   7  * Author: Eric Biggers <ebiggers@google.com>
   8  */
   9
  10 #include <linux/linkage.h>
  11
  12         .text
  13         .fpu            neon
  14
  15         // arguments
             // The first four are register arguments (r0-r3).  NBYTES and TWEAK are
             // stack arguments; _speck_xts_crypt loads them into r4 and r5 itself.
  16         ROUND_KEYS      .req    r0      // const {u64,u32} *round_keys
  17         NROUNDS         .req    r1      // int nrounds
  18         DST             .req    r2      // void *dst
  19         SRC             .req    r3      // const void *src
  20         NBYTES          .req    r4      // unsigned int nbytes
  21         TWEAK           .req    r5      // void *tweak
  22
  23         // registers which hold the data being encrypted/decrypted
             // (128 bytes in total: four X/Y pairs of 16-byte q registers).  The _L
             // and _H names are the low and high d-register halves of a q register;
             // only the halves that the code addresses individually are named.
  24         X0              .req    q0
  25         X0_L            .req    d0
  26         X0_H            .req    d1
  27         Y0              .req    q1
  28         Y0_H            .req    d3
  29         X1              .req    q2
  30         X1_L            .req    d4
  31         X1_H            .req    d5
  32         Y1              .req    q3
  33         Y1_H            .req    d7
  34         X2              .req    q4
  35         X2_L            .req    d8
  36         X2_H            .req    d9
  37         Y2              .req    q5
  38         Y2_H            .req    d11
  39         X3              .req    q6
  40         X3_L            .req    d12
  41         X3_H            .req    d13
  42         Y3              .req    q7
  43         Y3_H            .req    d15
  44
  45         // the round key, duplicated in all lanes
  46         ROUND_KEY       .req    q8
  47         ROUND_KEY_L     .req    d16
  48         ROUND_KEY_H     .req    d17
  49
  50         // index vector for vtbl-based 8-bit rotates
             // (loaded once per call from one of the .Lro{r,l}{64,32}_8_table entries)
  51         ROTATE_TABLE    .req    d18
  52
  53         // multiplication table for updating XTS tweaks
             // Both names refer to d19: a given expansion of _speck_xts_crypt loads
             // only the table that matches its lane size.
  54         GF128MUL_TABLE  .req    d19
  55         GF64MUL_TABLE   .req    d19
  56
  57         // current XTS tweak value(s)
             // Speck128: one 128-bit tweak.  Speck64: two consecutive 64-bit tweaks,
             // the earlier one in TWEAKV_L and the following one in TWEAKV_H.
  58         TWEAKV          .req    q10
  59         TWEAKV_L        .req    d20
  60         TWEAKV_H        .req    d21
  61
             // scratch registers
  62         TMP0            .req    q12
  63         TMP0_L          .req    d24
  64         TMP0_H          .req    d25
  65         TMP1            .req    q13
  66         TMP2            .req    q14
  67         TMP3            .req    q15
  68
  69         .align          4       // 16-byte boundary (on ARM the operand is log2 of the alignment)
             /*
              * Index vectors for vtbl.8: result byte i is source byte table[i].
              * Each vector is applied to one 8-byte d register and rotates every
              * 64-bit or 32-bit lane in it by 8 bits (byte 0 of a lane being its
              * least significant byte).  The "ror" tables are loaded when encrypting
              * and the "rol" tables when decrypting.
              *
              * Every table below starts at a multiple of 8 bytes from the aligned
              * base (offsets 0, 8, 16, 24, 32 and 48), which is what the [r12:64]
              * loads in _speck_xts_crypt declare.
              */
  70 .Lror64_8_table:
  71         .byte           1, 2, 3, 4, 5, 6, 7, 0
  72 .Lror32_8_table:
  73         .byte           1, 2, 3, 0, 5, 6, 7, 4
  74 .Lrol64_8_table:
  75         .byte           7, 0, 1, 2, 3, 4, 5, 6
  76 .Lrol32_8_table:
  77         .byte           3, 0, 1, 2, 7, 4, 5, 6
             /*
              * Reduction constants for the XTS tweak update.  They are looked up
              * with vtbl.8, using the bits shifted out of the top of a tweak as the
              * index.  Entry 0 is zero, so a zero index byte contributes nothing.
              *
              * 0x87 encodes x^7 + x^2 + x + 1 (index 0 or 1).  .fill emits zero
              * bytes, padding the table to 16 bytes.
              */
  78 .Lgf128mul_table:
  79         .byte           0, 0x87
  80         .fill           14
             /*
              * 0x1b encodes x^4 + x^3 + x + 1.  The four entries are that constant
              * multiplied by 0, 1, x and x + 1, covering the two bits shifted out
              * when a 64-bit tweak is multiplied by x^2 (index 0..3).
              */
  81 .Lgf64mul_table:
  82         .byte           0, 0x1b, (0x1b << 1), (0x1b << 1) ^ 0x1b
  83         .fill           12
  84
  85 /*
  86  * _speck_round_128bytes() - Speck encryption round on 128 bytes at a time
  87  *
  88  * Do one Speck encryption round on the 128 bytes (8 blocks for Speck128, 16 for
  89  * Speck64) stored in X0-X3 and Y0-Y3, using the round key stored in all lanes
  90  * of ROUND_KEY.  'n' is the lane size: 64 for Speck128, or 32 for Speck64.
  91  *
  92  * The 8-bit rotates are implemented using vtbl instead of vshr + vsli because
  93  * the vtbl approach is faster on some processors and the same speed on others.
  94  */
  95 .macro _speck_round_128bytes    n
  96
  97         // x = ror(x, 8)
  98         vtbl.8          X0_L, {X0_L}, ROTATE_TABLE
  99         vtbl.8          X0_H, {X0_H}, ROTATE_TABLE
 100         vtbl.8          X1_L, {X1_L}, ROTATE_TABLE
 101         vtbl.8          X1_H, {X1_H}, ROTATE_TABLE
 102         vtbl.8          X2_L, {X2_L}, ROTATE_TABLE
 103         vtbl.8          X2_H, {X2_H}, ROTATE_TABLE
 104         vtbl.8          X3_L, {X3_L}, ROTATE_TABLE
 105         vtbl.8          X3_H, {X3_H}, ROTATE_TABLE
 106
 107         // x += y
 108         vadd.u\n        X0, Y0
 109         vadd.u\n        X1, Y1
 110         vadd.u\n        X2, Y2
 111         vadd.u\n        X3, Y3
 112
 113         // x ^= k
 114         veor            X0, ROUND_KEY
 115         veor            X1, ROUND_KEY
 116         veor            X2, ROUND_KEY
 117         veor            X3, ROUND_KEY
 118
 119         // y = rol(y, 3)
 120         vshl.u\n        TMP0, Y0, #3
 121         vshl.u\n        TMP1, Y1, #3
 122         vshl.u\n        TMP2, Y2, #3
 123         vshl.u\n        TMP3, Y3, #3
 124         vsri.u\n        TMP0, Y0, #(\n - 3)
 125         vsri.u\n        TMP1, Y1, #(\n - 3)
 126         vsri.u\n        TMP2, Y2, #(\n - 3)
 127         vsri.u\n        TMP3, Y3, #(\n - 3)
 128
 129         // y ^= x
 130         veor            Y0, TMP0, X0
 131         veor            Y1, TMP1, X1
 132         veor            Y2, TMP2, X2
 133         veor            Y3, TMP3, X3
 134 .endm
 135
 136 /*
 137  * _speck_unround_128bytes() - Speck decryption round on 128 bytes at a time
 138  *
 139  * This is the inverse of _speck_round_128bytes().
 140  */
 141 .macro _speck_unround_128bytes  n
 142
 143         // y ^= x
 144         veor            TMP0, Y0, X0
 145         veor            TMP1, Y1, X1
 146         veor            TMP2, Y2, X2
 147         veor            TMP3, Y3, X3
 148
 149         // y = ror(y, 3)
 150         vshr.u\n        Y0, TMP0, #3
 151         vshr.u\n        Y1, TMP1, #3
 152         vshr.u\n        Y2, TMP2, #3
 153         vshr.u\n        Y3, TMP3, #3
 154         vsli.u\n        Y0, TMP0, #(\n - 3)
 155         vsli.u\n        Y1, TMP1, #(\n - 3)
 156         vsli.u\n        Y2, TMP2, #(\n - 3)
 157         vsli.u\n        Y3, TMP3, #(\n - 3)
 158
 159         // x ^= k
 160         veor            X0, ROUND_KEY
 161         veor            X1, ROUND_KEY
 162         veor            X2, ROUND_KEY
 163         veor            X3, ROUND_KEY
 164
 165         // x -= y
 166         vsub.u\n        X0, Y0
 167         vsub.u\n        X1, Y1
 168         vsub.u\n        X2, Y2
 169         vsub.u\n        X3, Y3
 170
 171         // x = rol(x, 8);
 172         vtbl.8          X0_L, {X0_L}, ROTATE_TABLE
 173         vtbl.8          X0_H, {X0_H}, ROTATE_TABLE
 174         vtbl.8          X1_L, {X1_L}, ROTATE_TABLE
 175         vtbl.8          X1_H, {X1_H}, ROTATE_TABLE
 176         vtbl.8          X2_L, {X2_L}, ROTATE_TABLE
 177         vtbl.8          X2_H, {X2_H}, ROTATE_TABLE
 178         vtbl.8          X3_L, {X3_L}, ROTATE_TABLE
 179         vtbl.8          X3_H, {X3_H}, ROTATE_TABLE
 180 .endm
 181
 182 .macro _xts128_precrypt_one     dst_reg, tweak_buf, tmp
             /*
              * Pre-process one 16-byte block for Speck128-XTS.
              *
              * dst_reg:   q register that receives the source block XORed with the
              *            current tweak
              * tweak_buf: core register pointing at the next free 16-byte slot of
              *            the tweak buffer (declared 16-byte aligned by ':128');
              *            advanced by 16
              * tmp:       scratch q register for which the <tmp>_L and <tmp>_H
              *            half-register names exist
              *
              * Also advances SRC by 16 and replaces TWEAKV with the next tweak.
              * GF128MUL_TABLE must already be loaded.
              */
 183
 184         // Load the next source block
 185         vld1.8          {\dst_reg}, [SRC]!
 186
 187         // Save the current tweak in the tweak buffer
 188         vst1.8          {TWEAKV}, [\tweak_buf:128]!
 189
 190         // XOR the next source block with the current tweak
 191         veor            \dst_reg, TWEAKV
 192
 193         /*
 194          * Calculate the next tweak by multiplying the current one by x,
 195          * modulo p(x) = x^128 + x^7 + x^2 + x + 1.
 196          */
 197         vshr.u64        \tmp, TWEAKV, #63       // low half = old bit 63, high half = old bit 127
 198         vshl.u64        TWEAKV, #1              // shift each 64-bit half left by one
 199         veor            TWEAKV_H, \tmp\()_L     // carry old bit 63 into bit 64
 200         vtbl.8          \tmp\()_H, {GF128MUL_TABLE}, \tmp\()_H  // 0x87 if old bit 127 was set, else 0
 201         veor            TWEAKV_L, \tmp\()_H     // fold the reduction into the low byte
 202 .endm
 203
 204 .macro _xts64_precrypt_two      dst_reg, tweak_buf, tmp
             /*
              * Pre-process two 8-byte blocks for Speck64-XTS.  TWEAKV holds two
              * consecutive tweaks, one per 64-bit half, so both blocks are handled
              * with single q-register operations.
              *
              * dst_reg:   q register that receives the two source blocks XORed with
              *            their tweaks
              * tweak_buf: core register pointing at the next free 16-byte slot of
              *            the tweak buffer (declared 16-byte aligned by ':128');
              *            advanced by 16
              * tmp:       scratch q register for which the <tmp>_L and <tmp>_H
              *            half-register names exist
              *
              * Also advances SRC by 16 and moves both tweaks in TWEAKV forward by
              * two positions.  GF64MUL_TABLE must already be loaded.
              */
 205
 206         // Load the next two source blocks
 207         vld1.8          {\dst_reg}, [SRC]!
 208
 209         // Save the current two tweaks in the tweak buffer
 210         vst1.8          {TWEAKV}, [\tweak_buf:128]!
 211
 212         // XOR the next two source blocks with the current two tweaks
 213         veor            \dst_reg, TWEAKV
 214
 215         /*
 216          * Calculate the next two tweaks by multiplying the current ones by x^2,
 217          * modulo p(x) = x^64 + x^4 + x^3 + x + 1.
 218          */
 219         vshr.u64        \tmp, TWEAKV, #62       // per half: the two bits about to be shifted out (0..3)
 220         vshl.u64        TWEAKV, #2              // shift each 64-bit tweak left by two
 221         vtbl.8          \tmp\()_L, {GF64MUL_TABLE}, \tmp\()_L   // reduction byte for the first tweak
 222         vtbl.8          \tmp\()_H, {GF64MUL_TABLE}, \tmp\()_H   // reduction byte for the second tweak
 223         veor            TWEAKV, \tmp            // fold both reductions in
 224 .endm
 225
 226 /*
 227  * _speck_xts_crypt() - Speck-XTS encryption/decryption
 228  *
 229  * Encrypt or decrypt NBYTES bytes of data from the SRC buffer to the DST buffer
 230  * using Speck-XTS, specifically the variant with a block size of '2n' and round
 231  * count given by NROUNDS.  The expanded round keys are given in ROUND_KEYS, and
 232  * the current XTS tweak value is given in TWEAK.  It's assumed that NBYTES is a
 233  * nonzero multiple of 128.
      *
      * Macro parameters:
      *   n          - lane size in bits: 64 selects Speck128, 32 selects Speck64
      *   decrypting - nonzero emits the decryption variant (round keys walked
      *                backwards, inverse round function); 0 emits encryption
      *
      * The macro expands to a complete function body that ends in 'bx lr'.  On
      * return, the tweak that follows the last one used has been written back to
      * the TWEAK buffer.  NROUNDS must be at least 1, because the round loop
      * tests its counter only after running a round.
      *
      * Core registers: r4-r7 are saved and restored.  r6 counts rounds, r7 keeps
      * the value of sp taken right after the push while sp is lowered for the
      * tweak buffer, and r12 is scratch.  NEON registers q0-q8, d18, d19, q10 and
      * q12-q15 are overwritten and not restored.
      * NOTE(review): presumably callers bracket this with the kernel's NEON
      * begin/end helpers, which would make clobbering q4-q7 acceptable --
      * confirm against the C glue code.
 234  */
 235 .macro _speck_xts_crypt n, decrypting
 236         push            {r4-r7}
 237         mov             r7, sp          // kept so the epilogue can undo the stack realignment
 238
 239         /*
 240          * The first four parameters were passed in registers r0-r3.  Load the
 241          * additional parameters, which were passed on the stack.
              * The push above stored four words (16 bytes), so the first stack
              * argument is now at sp + 16.
 242          */
 243         ldr             NBYTES, [sp, #16]
 244         ldr             TWEAK, [sp, #20]
 245
 246         /*
 247          * If decrypting, modify the ROUND_KEYS parameter to point to the last
 248          * round key rather than the first, since for decryption the round keys
 249          * are used in reverse order.
              * That is ROUND_KEYS + (NROUNDS - 1) * key size, with 8-byte keys for
              * n == 64 and 4-byte keys otherwise.
 250          */
 251 .if \decrypting
 252 .if \n == 64
 253         add             ROUND_KEYS, ROUND_KEYS, NROUNDS, lsl #3
 254         sub             ROUND_KEYS, #8
 255 .else
 256         add             ROUND_KEYS, ROUND_KEYS, NROUNDS, lsl #2
 257         sub             ROUND_KEYS, #4
 258 .endif
 259 .endif
 260
 261         // Load the index vector for vtbl-based 8-bit rotates
             // (decryption undoes the round's ror with a rol, so it takes the rol table)
 262 .if \decrypting
 263         ldr             r12, =.Lrol\n\()_8_table
 264 .else
 265         ldr             r12, =.Lror\n\()_8_table
 266 .endif
 267         vld1.8          {ROTATE_TABLE}, [r12:64]
 268
 269         // One-time XTS preparation
 270
 271         /*
 272          * Allocate stack space to store 128 bytes worth of tweaks.  For
 273          * performance, this space is aligned to a 16-byte boundary so that we
 274          * can use the load/store instructions that declare 16-byte alignment.
 275          * For Thumb2 compatibility, don't do the 'bic' directly on 'sp'.
 276          */
 277         sub             r12, sp, #128
 278         bic             r12, #0xf
 279         mov             sp, r12
 280
 281 .if \n == 64
 282         // Load first tweak
 283         vld1.8          {TWEAKV}, [TWEAK]
 284
 285         // Load GF(2^128) multiplication table
 286         ldr             r12, =.Lgf128mul_table
 287         vld1.8          {GF128MUL_TABLE}, [r12:64]
 288 .else
 289         // Load first tweak
 290         vld1.8          {TWEAKV_L}, [TWEAK]
 291
 292         // Load GF(2^64) multiplication table
 293         ldr             r12, =.Lgf64mul_table
 294         vld1.8          {GF64MUL_TABLE}, [r12:64]
 295
 296         // Calculate second tweak, packing it together with the first
             // (second = first * x: the bit shifted out selects 0 or 0x1b from the table)
 297         vshr.u64        TMP0_L, TWEAKV_L, #63
 298         vtbl.u8         TMP0_L, {GF64MUL_TABLE}, TMP0_L
 299         vshl.u64        TWEAKV_H, TWEAKV_L, #1
 300         veor            TWEAKV_H, TMP0_L
 301 .endif
 302
             // Start of the per-128-byte loop.  The label suffix is the assembler's
             // macro expansion counter, which keeps the labels of the four
             // expansions of this macro distinct.
 303 .Lnext_128bytes_\@:
 304
 305         /*
 306          * Load the source blocks into {X,Y}[0-3], XOR them with their XTS tweak
 307          * values, and save the tweaks on the stack for later.  Then
 308          * de-interleave the 'x' and 'y' elements of each block, i.e. make it so
 309          * that the X[0-3] registers contain only the second halves of blocks,
 310          * and the Y[0-3] registers contain only the first halves of blocks.
 311          * (Speck uses the order (y, x) rather than the more intuitive (x, y).)
 312          */
 313         mov             r12, sp         // r12 walks the tweak buffer
 314 .if \n == 64
 315         _xts128_precrypt_one    X0, r12, TMP0
 316         _xts128_precrypt_one    Y0, r12, TMP0
 317         _xts128_precrypt_one    X1, r12, TMP0
 318         _xts128_precrypt_one    Y1, r12, TMP0
 319         _xts128_precrypt_one    X2, r12, TMP0
 320         _xts128_precrypt_one    Y2, r12, TMP0
 321         _xts128_precrypt_one    X3, r12, TMP0
 322         _xts128_precrypt_one    Y3, r12, TMP0
             // Each Xi/Yi pair holds two whole blocks.  Exchanging the low half of
             // Xi with the high half of Yi leaves both 'x' words in Xi and both
             // 'y' words in Yi.
 323         vswp            X0_L, Y0_H
 324         vswp            X1_L, Y1_H
 325         vswp            X2_L, Y2_H
 326         vswp            X3_L, Y3_H
 327 .else
 328         _xts64_precrypt_two     X0, r12, TMP0
 329         _xts64_precrypt_two     Y0, r12, TMP0
 330         _xts64_precrypt_two     X1, r12, TMP0
 331         _xts64_precrypt_two     Y1, r12, TMP0
 332         _xts64_precrypt_two     X2, r12, TMP0
 333         _xts64_precrypt_two     Y2, r12, TMP0
 334         _xts64_precrypt_two     X3, r12, TMP0
 335         _xts64_precrypt_two     Y3, r12, TMP0
             // Each Xi/Yi pair holds four whole blocks.  The unzip gathers the
             // even-numbered 32-bit words (the 'y' halves) in Yi and the
             // odd-numbered ones (the 'x' halves) in Xi.
 336         vuzp.32         Y0, X0
 337         vuzp.32         Y1, X1
 338         vuzp.32         Y2, X2
 339         vuzp.32         Y3, X3
 340 .endif
 341
 342         // Do the cipher rounds
 343
             // r12 = round key cursor, r6 = rounds left.  Both are re-derived for
             // every 128-byte chunk; ROUND_KEYS and NROUNDS themselves stay intact.
 344         mov             r12, ROUND_KEYS
 345         mov             r6, NROUNDS
 346
 347 .Lnext_round_\@:
 348 .if \decrypting
 349 .if \n == 64
             // Load one 64-bit key, step back to the previous key, then copy the
             // key into the other half of ROUND_KEY.
 350         vld1.64         ROUND_KEY_L, [r12]
 351         sub             r12, #8
 352         vmov            ROUND_KEY_H, ROUND_KEY_L
 353 .else
             // Load one 32-bit key replicated into every lane of ROUND_KEY, then
             // step back to the previous key.
 354         vld1.32         {ROUND_KEY_L[],ROUND_KEY_H[]}, [r12]
 355         sub             r12, #4
 356 .endif
 357         _speck_unround_128bytes \n
 358 .else
 359 .if \n == 64
             // Same loads as above, but post-incrementing to the following key.
 360         vld1.64         ROUND_KEY_L, [r12]!
 361         vmov            ROUND_KEY_H, ROUND_KEY_L
 362 .else
 363         vld1.32         {ROUND_KEY_L[],ROUND_KEY_H[]}, [r12]!
 364 .endif
 365         _speck_round_128bytes   \n
 366 .endif
 367         subs            r6, r6, #1
 368         bne             .Lnext_round_\@
 369
 370         // Re-interleave the 'x' and 'y' elements of each block
             // (the exact inverse of the swap/unzip done before the rounds)
 371 .if \n == 64
 372         vswp            X0_L, Y0_H
 373         vswp            X1_L, Y1_H
 374         vswp            X2_L, Y2_H
 375         vswp            X3_L, Y3_H
 376 .else
 377         vzip.32         Y0, X0
 378         vzip.32         Y1, X1
 379         vzip.32         Y2, X2
 380         vzip.32         Y3, X3
 381 .endif
 382
 383         // XOR the encrypted/decrypted blocks with the tweaks we saved earlier
             // The buffer is read back in the order it was written:
             // X0, Y0, X1, Y1, X2, Y2, X3, Y3.
 384         mov             r12, sp
 385         vld1.8          {TMP0, TMP1}, [r12:128]!
 386         vld1.8          {TMP2, TMP3}, [r12:128]!
 387         veor            X0, TMP0
 388         veor            Y0, TMP1
 389         veor            X1, TMP2
 390         veor            Y1, TMP3
 391         vld1.8          {TMP0, TMP1}, [r12:128]!
 392         vld1.8          {TMP2, TMP3}, [r12:128]!
 393         veor            X2, TMP0
 394         veor            Y2, TMP1
 395         veor            X3, TMP2
 396         veor            Y3, TMP3
 397
 398         // Store the result (ciphertext or plaintext) in the destination buffer
 399         vst1.8          {X0, Y0}, [DST]!
 400         vst1.8          {X1, Y1}, [DST]!
 401         vst1.8          {X2, Y2}, [DST]!
 402         vst1.8          {X3, Y3}, [DST]!
 403
 404         // Continue if there are more 128-byte chunks remaining, else return
 405         subs            NBYTES, #128
 406         bne             .Lnext_128bytes_\@
 407
 408         // Store the next tweak
             // (for Speck64, the low half of TWEAKV is the earlier of the two packed
             // tweaks, i.e. the first one not yet used)
 409 .if \n == 64
 410         vst1.8          {TWEAKV}, [TWEAK]
 411 .else
 412         vst1.8          {TWEAKV_L}, [TWEAK]
 413 .endif
 414
             // Drop the tweak buffer by restoring the sp saved in the prologue,
             // then restore r4-r7 and return.
 415         mov             sp, r7
 416         pop             {r4-r7}
 417         bx              lr
 418 .endm
 419
 420 ENTRY(speck128_xts_encrypt_neon)
             // Speck128-XTS encryption: 64-bit lanes, round keys used first to last.
             // Arguments: round_keys, nrounds, dst, src, nbytes, tweak.
 421         _speck_xts_crypt        n=64, decrypting=0
 422 ENDPROC(speck128_xts_encrypt_neon)
 423
 424 ENTRY(speck128_xts_decrypt_neon)
             // Speck128-XTS decryption: 64-bit lanes, round keys used last to first.
             // Arguments: round_keys, nrounds, dst, src, nbytes, tweak.
 425         _speck_xts_crypt        n=64, decrypting=1
 426 ENDPROC(speck128_xts_decrypt_neon)
 427
 428 ENTRY(speck64_xts_encrypt_neon)
             // Speck64-XTS encryption: 32-bit lanes, round keys used first to last.
             // Arguments: round_keys, nrounds, dst, src, nbytes, tweak.
 429         _speck_xts_crypt        n=32, decrypting=0
 430 ENDPROC(speck64_xts_encrypt_neon)
 431
 432 ENTRY(speck64_xts_decrypt_neon)
             // Speck64-XTS decryption: 32-bit lanes, round keys used last to first.
             // Arguments: round_keys, nrounds, dst, src, nbytes, tweak.
 433         _speck_xts_crypt        n=32, decrypting=1
 434 ENDPROC(speck64_xts_decrypt_neon)