/*
 * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions.
 *
 * Copyright (C) 2015 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

        SHASH           .req    q0
        T1              .req    q1
        XL              .req    q2
        XM              .req    q3
        XH              .req    q4
        IN1             .req    q4

        SHASH_L         .req    d0
        SHASH_H         .req    d1
        T1_L            .req    d2
        T1_H            .req    d3
        XL_L            .req    d4
        XL_H            .req    d5
        XM_L            .req    d6
        XM_H            .req    d7
        XH_L            .req    d8

        t0l             .req    d10
        t0h             .req    d11
        t1l             .req    d12
        t1h             .req    d13
        t2l             .req    d14
        t2h             .req    d15
        t3l             .req    d16
        t3h             .req    d17
        t4l             .req    d18
        t4h             .req    d19

        t0q             .req    q5
        t1q             .req    q6
        t2q             .req    q7
        t3q             .req    q8
        t4q             .req    q9
        T2              .req    q9

        s1l             .req    d20
        s1h             .req    d21
        s2l             .req    d22
        s2h             .req    d23
        s3l             .req    d24
        s3h             .req    d25
        s4l             .req    d26
        s4h             .req    d27

        MASK            .req    d28
        SHASH2_p8       .req    d28

        k16             .req    d29
        k32             .req    d30
        k48             .req    d31
        SHASH2_p64      .req    d31

        HH              .req    q10
        HH3             .req    q11
        HH4             .req    q12
        HH34            .req    q13

        HH_L            .req    d20
        HH_H            .req    d21
        HH3_L           .req    d22
        HH3_H           .req    d23
        HH4_L           .req    d24
        HH4_H           .req    d25
        HH34_L          .req    d26
        HH34_H          .req    d27
        SHASH2_H        .req    d29

        XL2             .req    q5
        XM2             .req    q6
        XH2             .req    q7
        T3              .req    q8

        XL2_L           .req    d10
        XL2_H           .req    d11
        XM2_L           .req    d12
        XM2_H           .req    d13
        T3_L            .req    d16
        T3_H            .req    d17

        .text
        .fpu            crypto-neon-fp-armv8

        .macro          __pmull_p64, rd, rn, rm, b1, b2, b3, b4
        vmull.p64       \rd, \rn, \rm
        .endm
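
        // Note: the b1-b4 arguments of __pmull_p64 are unused; they exist
        // only so that the p64 and p8 variants can be invoked with the same
        // argument list from the shared ghash_update macro below.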

        /*
         * This implementation of 64x64 -> 128 bit polynomial multiplication
         * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
         * "Fast Software Polynomial Multiplication on ARM Processors Using
         * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
         * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
         *
         * It has been slightly tweaked for in-order performance, and to allow
         * 'rq' to overlap with 'ad' or 'bd'.
         */
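
        /*
         * For reference, a minimal C sketch of the plain 64x64 -> 128 bit
         * carry-less multiplication that both __pmull_p64 and __pmull_p8
         * compute (illustrative only, not part of the build; the helper name
         * and prototype are made up here):
         *
         *      static void clmul_64x64(u64 a, u64 b, u64 res[2])
         *      {
         *              u64 lo = 0, hi = 0;
         *              int i;
         *
         *              for (i = 0; i < 64; i++) {
         *                      if (b & (1ULL << i)) {
         *                              lo ^= a << i;           // bits 0..63
         *                              if (i)
         *                                      hi ^= a >> (64 - i); // bits 64..127
         *                      }
         *              }
         *              res[0] = lo;
         *              res[1] = hi;
         *      }
         */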
        .macro          __pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
        vext.8          t0l, \ad, \ad, #1       @ A1
        .ifc            \b1, t4l
        vext.8          t4l, \bd, \bd, #1       @ B1
        .endif
        vmull.p8        t0q, t0l, \bd           @ F = A1*B
        vext.8          t1l, \ad, \ad, #2       @ A2
        vmull.p8        t4q, \ad, \b1           @ E = A*B1
        .ifc            \b2, t3l
        vext.8          t3l, \bd, \bd, #2       @ B2
        .endif
        vmull.p8        t1q, t1l, \bd           @ H = A2*B
        vext.8          t2l, \ad, \ad, #3       @ A3
        vmull.p8        t3q, \ad, \b2           @ G = A*B2
        veor            t0q, t0q, t4q           @ L = E + F
        .ifc            \b3, t4l
        vext.8          t4l, \bd, \bd, #3       @ B3
        .endif
        vmull.p8        t2q, t2l, \bd           @ J = A3*B
        veor            t0l, t0l, t0h           @ t0 = (L) (P0 + P1) << 8
        veor            t1q, t1q, t3q           @ M = G + H
        .ifc            \b4, t3l
        vext.8          t3l, \bd, \bd, #4       @ B4
        .endif
        vmull.p8        t4q, \ad, \b3           @ I = A*B3
        veor            t1l, t1l, t1h           @ t1 = (M) (P2 + P3) << 16
        vmull.p8        t3q, \ad, \b4           @ K = A*B4
        vand            t0h, t0h, k48
        vand            t1h, t1h, k32
        veor            t2q, t2q, t4q           @ N = I + J
        veor            t0l, t0l, t0h
        veor            t1l, t1l, t1h
        veor            t2l, t2l, t2h           @ t2 = (N) (P4 + P5) << 24
        vand            t2h, t2h, k16
        veor            t3l, t3l, t3h           @ t3 = (K) (P6 + P7) << 32
        vmov.i64        t3h, #0
        vext.8          t0q, t0q, t0q, #15
        veor            t2l, t2l, t2h
        vext.8          t1q, t1q, t1q, #14
        vmull.p8        \rq, \ad, \bd           @ D = A*B
        vext.8          t2q, t2q, t2q, #13
        vext.8          t3q, t3q, t3q, #12
        veor            t0q, t0q, t1q
        veor            t2q, t2q, t3q
        veor            \rq, \rq, t0q
        veor            \rq, \rq, t2q
        .endm

        //
        // PMULL (64x64->128) based reduction for CPUs that can do
        // it in a single instruction.
        //
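        // The multiply steps leave a Karatsuba-style 256-bit product spread
        // across XL, XM and XH; the reduce macros (together with a few veors
        // in their caller) fold it back to 128 bits modulo the GHASH field
        // polynomial x^128 + x^7 + x^2 + x + 1.  This variant relies on the
        // reduction constant that pmull_ghash_update_p64 precomputes into
        // MASK; the p8 variant below expands the same reduction into shifts
        // and XORs.
        //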
        .macro          __pmull_reduce_p64
        vmull.p64       T1, XL_L, MASK

        veor            XH_L, XH_L, XM_H
        vext.8          T1, T1, T1, #8
        veor            XL_H, XL_H, XM_L
        veor            T1, T1, XL

        vmull.p64       XL, T1_H, MASK
        .endm

        //
        // Alternative reduction for CPUs that lack support for the
        // 64x64->128 PMULL instruction
        //
        .macro          __pmull_reduce_p8
        veor            XL_H, XL_H, XM_L
        veor            XH_L, XH_L, XM_H

        vshl.i64        T1, XL, #57
        vshl.i64        T2, XL, #62
        veor            T1, T1, T2
        vshl.i64        T2, XL, #63
        veor            T1, T1, T2
        veor            XL_H, XL_H, T1_L
        veor            XH_L, XH_L, T1_H

        vshr.u64        T1, XL, #1
        veor            XH, XH, XL
        veor            XL, XL, T1
        vshr.u64        T1, T1, #6
        vshr.u64        XL, XL, #1
        .endm

        .macro          ghash_update, pn
        vld1.64         {XL}, [r1]

        /* do the head block first, if supplied */
        ldr             ip, [sp]
        teq             ip, #0
        beq             0f
        vld1.64         {T1}, [ip]
        teq             r0, #0
        b               3f

0:      .ifc            \pn, p64
        tst             r0, #3                  // skip until #blocks is a
        bne             2f                      // round multiple of 4

        vld1.8          {XL2-XM2}, [r2]!
1:      vld1.8          {T3-T2}, [r2]!
        vrev64.8        XL2, XL2
        vrev64.8        XM2, XM2

        subs            r0, r0, #4

        vext.8          T1, XL2, XL2, #8
        veor            XL2_H, XL2_H, XL_L
        veor            XL, XL, T1

        vrev64.8        T3, T3
        vrev64.8        T1, T2

        vmull.p64       XH, HH4_H, XL_H                 // a1 * b1
        veor            XL2_H, XL2_H, XL_H
        vmull.p64       XL, HH4_L, XL_L                 // a0 * b0
        vmull.p64       XM, HH34_H, XL2_H               // (a1 + a0)(b1 + b0)

        vmull.p64       XH2, HH3_H, XM2_L               // a1 * b1
        veor            XM2_L, XM2_L, XM2_H
        vmull.p64       XL2, HH3_L, XM2_H               // a0 * b0
        vmull.p64       XM2, HH34_L, XM2_L              // (a1 + a0)(b1 + b0)

        veor            XH, XH, XH2
        veor            XL, XL, XL2
        veor            XM, XM, XM2

        vmull.p64       XH2, HH_H, T3_L                 // a1 * b1
        veor            T3_L, T3_L, T3_H
        vmull.p64       XL2, HH_L, T3_H                 // a0 * b0
        vmull.p64       XM2, SHASH2_H, T3_L             // (a1 + a0)(b1 + b0)

        veor            XH, XH, XH2
        veor            XL, XL, XL2
        veor            XM, XM, XM2

        vmull.p64       XH2, SHASH_H, T1_L              // a1 * b1
        veor            T1_L, T1_L, T1_H
        vmull.p64       XL2, SHASH_L, T1_H              // a0 * b0
        vmull.p64       XM2, SHASH2_p64, T1_L           // (a1 + a0)(b1 + b0)

        veor            XH, XH, XH2
        veor            XL, XL, XL2
        veor            XM, XM, XM2

        beq             4f

        vld1.8          {XL2-XM2}, [r2]!

        veor            T1, XL, XH
        veor            XM, XM, T1

        __pmull_reduce_p64

        veor            T1, T1, XH
        veor            XL, XL, T1

        b               1b
        .endif

2:      vld1.64         {T1}, [r2]!
        subs            r0, r0, #1

3:      /* multiply XL by SHASH in GF(2^128) */
#ifndef CONFIG_CPU_BIG_ENDIAN
        vrev64.8        T1, T1
#endif
        vext.8          IN1, T1, T1, #8
        veor            T1_L, T1_L, XL_H
        veor            XL, XL, IN1

        __pmull_\pn     XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h   @ a1 * b1
        veor            T1, T1, XL
        __pmull_\pn     XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l   @ a0 * b0
        __pmull_\pn     XM, T1_L, SHASH2_\pn                    @ (a1+a0)(b1+b0)

4:      veor            T1, XL, XH
        veor            XM, XM, T1

        __pmull_reduce_\pn

        veor            T1, T1, XH
        veor            XL, XL, T1

        bne             0b

        vst1.64         {XL}, [r1]
        bx              lr
        .endm

        /*
         * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
         *                         struct ghash_key const *k, const char *head)
         */
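        //
        // AAPCS passes the first four arguments in r0 (blocks), r1 (dg),
        // r2 (src) and r3 (k); the fifth argument, 'head', is passed on the
        // stack, which is why ghash_update loads it with 'ldr ip, [sp]'.
        //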
ENTRY(pmull_ghash_update_p64)
        vld1.64         {SHASH}, [r3]!
        vld1.64         {HH}, [r3]!
        vld1.64         {HH3-HH4}, [r3]

        veor            SHASH2_p64, SHASH_L, SHASH_H
        veor            SHASH2_H, HH_L, HH_H
        veor            HH34_L, HH3_L, HH3_H
        veor            HH34_H, HH4_L, HH4_H

        vmov.i8         MASK, #0xe1
        vshl.u64        MASK, MASK, #57

        ghash_update    p64
ENDPROC(pmull_ghash_update_p64)

ENTRY(pmull_ghash_update_p8)
        vld1.64         {SHASH}, [r3]
        veor            SHASH2_p8, SHASH_L, SHASH_H

        vext.8          s1l, SHASH_L, SHASH_L, #1
        vext.8          s2l, SHASH_L, SHASH_L, #2
        vext.8          s3l, SHASH_L, SHASH_L, #3
        vext.8          s4l, SHASH_L, SHASH_L, #4
        vext.8          s1h, SHASH_H, SHASH_H, #1
        vext.8          s2h, SHASH_H, SHASH_H, #2
        vext.8          s3h, SHASH_H, SHASH_H, #3
        vext.8          s4h, SHASH_H, SHASH_H, #4

        vmov.i64        k16, #0xffff
        vmov.i64        k32, #0xffffffff
        vmov.i64        k48, #0xffffffffffff

        ghash_update    p8
ENDPROC(pmull_ghash_update_p8)