arch/arm64/crypto/crct10dif-ce-core.S

   1 //
   2 // Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
   3 //
   4 // Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
   5 //
   6 // This program is free software; you can redistribute it and/or modify
   7 // it under the terms of the GNU General Public License version 2 as
   8 // published by the Free Software Foundation.
   9 //
  10
  11 //
  12 // Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
  13 //
  14 // Copyright (c) 2013, Intel Corporation
  15 //
  16 // Authors:
  17 //     Erdinc Ozturk <erdinc.ozturk@intel.com>
  18 //     Vinodh Gopal <vinodh.gopal@intel.com>
  19 //     James Guilford <james.guilford@intel.com>
  20 //     Tim Chen <tim.c.chen@linux.intel.com>
  21 //
  22 // This software is available to you under a choice of one of two
  23 // licenses.  You may choose to be licensed under the terms of the GNU
  24 // General Public License (GPL) Version 2, available from the file
  25 // COPYING in the main directory of this source tree, or the
  26 // OpenIB.org BSD license below:
  27 //
  28 // Redistribution and use in source and binary forms, with or without
  29 // modification, are permitted provided that the following conditions are
  30 // met:
  31 //
  32 // * Redistributions of source code must retain the above copyright
  33 //   notice, this list of conditions and the following disclaimer.
  34 //
  35 // * Redistributions in binary form must reproduce the above copyright
  36 //   notice, this list of conditions and the following disclaimer in the
  37 //   documentation and/or other materials provided with the
  38 //   distribution.
  39 //
  40 // * Neither the name of the Intel Corporation nor the names of its
  41 //   contributors may be used to endorse or promote products derived from
  42 //   this software without specific prior written permission.
  43 //
  44 //
   45 // THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
  46 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  47 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  48 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
  49 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  50 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  51 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  52 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  53 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  54 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  55 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  56 //
  57 //       Function API:
   58 //       UINT16 crc_t10dif_pmull(
  59 //               UINT16 init_crc, //initial CRC value, 16 bits
  60 //               const unsigned char *buf, //buffer pointer to calculate CRC on
  61 //               UINT64 len //buffer length in bytes (64-bit data)
  62 //       );
  63 //
  64 //       Reference paper titled "Fast CRC Computation for Generic
  65 //      Polynomials Using PCLMULQDQ Instruction"
  66 //       URL: http://www.intel.com/content/dam/www/public/us/en/documents
  67 //  /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
  68 //
  69 //
  70
  71 #include <linux/linkage.h>
  72 #include <asm/assembler.h>
  73
   74         .text
   75         .cpu            generic+crypto  // pmull/pmull2 are used below
   76
              // symbolic names for the arguments of crc_t10dif_pmull
   77         arg1_low32      .req    w0      // initial CRC value
   78         arg2            .req    x1      // pointer to the input buffer
   79         arg3            .req    x2      // length / remaining byte counter
   80
              // v13 is cleared at function entry and used as an all-zero operand
   81         vzr             .req    v13
  82
   83 ENTRY(crc_t10dif_pmull)
      //
      // crc_t10dif_pmull - CRC-T10DIF of a buffer, computed with pmull/pmull2
      //
      //   w0 (arg1_low32): initial CRC value
      //   x1 (arg2):       pointer to the input buffer
      //   x2 (arg3):       length of the buffer in bytes
      //   the resulting 16-bit CRC is returned in w0
      //
      //   other registers referenced: x4, x8, v0-v12 and v13 (vzr)
      //
   84         movi            vzr.16b, #0             // init zero register
   85
   86         // adjust the 16-bit initial_crc value, scale it to 32 bits
   87         lsl             arg1_low32, arg1_low32, #16
   88
   89         // check if smaller than 256
   90         cmp             arg3, #256
   91
   92         // for sizes less than 256, we can't fold 128B at a time, so take
              // the short-input path (the label name says 128, but the
              // threshold tested above is 256)
   93         b.lt            _less_than_128
   94
   95         // load the initial crc value
   96         // crc value does not need to be byte-reflected, but it needs
   97         // to be moved to the high part of the register.
   98         // because data will be byte-reflected and will align with
   99         // initial crc at correct place.
  100         movi            v10.16b, #0
  101         mov             v10.s[3], arg1_low32            // initial crc
  102
  103         // load the first 128B of data into v0-v7 and advance the buffer
              // pointer; the initial crc value is xor-ed into v0 further down
  104         ldp             q0, q1, [arg2]
  105         ldp             q2, q3, [arg2, #0x20]
  106         ldp             q4, q5, [arg2, #0x40]
  107         ldp             q6, q7, [arg2, #0x60]
  108         add             arg2, arg2, #0x80
  109
              // on little-endian CPUs, reverse the byte order of each 16-byte
              // vector: rev64 reverses the bytes within each 64-bit half and
              // the ext #8 that follows swaps the two halves
  110 CPU_LE( rev64           v0.16b, v0.16b                  )
  111 CPU_LE( rev64           v1.16b, v1.16b                  )
  112 CPU_LE( rev64           v2.16b, v2.16b                  )
  113 CPU_LE( rev64           v3.16b, v3.16b                  )
  114 CPU_LE( rev64           v4.16b, v4.16b                  )
  115 CPU_LE( rev64           v5.16b, v5.16b                  )
  116 CPU_LE( rev64           v6.16b, v6.16b                  )
  117 CPU_LE( rev64           v7.16b, v7.16b                  )
  118
  119 CPU_LE( ext             v0.16b, v0.16b, v0.16b, #8      )
  120 CPU_LE( ext             v1.16b, v1.16b, v1.16b, #8      )
  121 CPU_LE( ext             v2.16b, v2.16b, v2.16b, #8      )
  122 CPU_LE( ext             v3.16b, v3.16b, v3.16b, #8      )
  123 CPU_LE( ext             v4.16b, v4.16b, v4.16b, #8      )
  124 CPU_LE( ext             v5.16b, v5.16b, v5.16b, #8      )
  125 CPU_LE( ext             v6.16b, v6.16b, v6.16b, #8      )
  126 CPU_LE( ext             v7.16b, v7.16b, v7.16b, #8      )
  127
  128         // XOR the initial_crc value
  129         eor             v0.16b, v0.16b, v10.16b
  130
  131         ldr_l           q10, rk3, x8    // v10 has rk3 and rk4
  132                                         // type of pmull instruction
  133                                         // will determine which constant to use
                                              // (pmull: low half, pmull2: high half)
  134
  135         //
  136         // we subtract 256 instead of 128 to save one instruction from the loop
  137         //
  138         sub             arg3, arg3, #256
  139
  140         // at this point, 128 bytes are held in v0-v7 and 128*x+y
  141         // (x>=1, 0<=y<128) bytes are left in the buffer. The _fold_64_B_loop
  142         // will fold 128B at a time until 128+y Bytes remain to be reduced
 143
 144
  145         // fold 128B at a time. Each pass through the loop folds all 8
  146         // vector registers v0-v7, two per fold64 invocation
  147 _fold_64_B_loop:
  148
              // fold64: multiply the high and low halves of reg1 and of reg2
              // by the matching halves of v10, xor the two products of each
              // register together and xor in the next 32 bytes of input
              // (16 bytes per register, byte-reversed on little-endian).
              // Post-increments arg2 by 32; writes v8, v9, v11 and v12.
  149         .macro          fold64, reg1, reg2
  150         ldp             q11, q12, [arg2], #0x20
  151
  152         pmull2          v8.1q, \reg1\().2d, v10.2d
  153         pmull           \reg1\().1q, \reg1\().1d, v10.1d
  154
  155 CPU_LE( rev64           v11.16b, v11.16b                )
  156 CPU_LE( rev64           v12.16b, v12.16b                )
  157
  158         pmull2          v9.1q, \reg2\().2d, v10.2d
  159         pmull           \reg2\().1q, \reg2\().1d, v10.1d
  160
  161 CPU_LE( ext             v11.16b, v11.16b, v11.16b, #8   )
  162 CPU_LE( ext             v12.16b, v12.16b, v12.16b, #8   )
  163
  164         eor             \reg1\().16b, \reg1\().16b, v8.16b
  165         eor             \reg2\().16b, \reg2\().16b, v9.16b
  166         eor             \reg1\().16b, \reg1\().16b, v11.16b
  167         eor             \reg2\().16b, \reg2\().16b, v12.16b
  168         .endm
  169
              // four invocations consume 4 * 32 = 128 bytes of input
  170         fold64          v0, v1
  171         fold64          v2, v3
  172         fold64          v4, v5
  173         fold64          v6, v7
  174
              // account for the 128 bytes just consumed; arg3 goes negative
              // once fewer than 128 unread bytes are left
  175         subs            arg3, arg3, #128
  176
  177         // check if there is another 128B in the buffer to be able to fold
  178         b.ge            _fold_64_B_loop
 179
  180         // at this point, the buffer pointer is pointing at the last y Bytes
  181         // of the buffer (y = arg3 + 128, 0<=y<128); the 128B of folded data
  182         // is in the 8 vector registers v0-v7
  183
  184         // fold the 8 vector registers to 1 vector register with different
  185         // constants
  186
  187         ldr_l           q10, rk9, x8
  188
              // fold16: fold reg into v7 using the constant pair currently in
              // v10. If rk is given, that constant pair is loaded into v10 once
              // the two multiplies have read the old value, i.e. it is the
              // pair used by the NEXT fold16. Writes v8, v7, reg (and v10).
  189         .macro          fold16, reg, rk
  190         pmull           v8.1q, \reg\().1d, v10.1d
  191         pmull2          \reg\().1q, \reg\().2d, v10.2d
  192         .ifnb           \rk
  193         ldr_l           q10, \rk, x8
  194         .endif
  195         eor             v7.16b, v7.16b, v8.16b
  196         eor             v7.16b, v7.16b, \reg\().16b
  197         .endm
  198
              // v0 is folded with rk9/rk10 (loaded above), v1 with rk11/rk12,
              // ... v5 with rk19/rk20 and v6 with rk1/rk2; v10 is left holding
              // rk1/rk2 for the 16B folds that follow
  199         fold16          v0, rk11
  200         fold16          v1, rk13
  201         fold16          v2, rk15
  202         fold16          v3, rk17
  203         fold16          v4, rk19
  204         fold16          v5, rk1
  205         fold16          v6
  206
  207         // instead of 128, we add 112 (128-16) to the loop counter to save 1
  208         // instruction from the loop. Instead of a cmp instruction, we use
  209         // the flags set by adds with the b.lt instruction
              // (arg3 is now y-16, so it is negative when y < 16)
  210         adds            arg3, arg3, #(128-16)
  211         b.lt            _final_reduction_for_128
 212
  213         // now we have 16+y bytes left to reduce. 16 Bytes is in register v7
  214         // and the rest is in memory. We can fold 16 bytes at a time if y>=16
  215         // continue folding 16B at a time
              //
              // on entry (from above or from _less_than_128): v10 holds rk1/rk2
              // and arg3 = (unread bytes in the buffer) - 16, which is >= 0
  216
  217 _16B_reduction_loop:
              // fold the 16 bytes held in v7 ...
  218         pmull           v8.1q, v7.1d, v10.1d
  219         pmull2          v7.1q, v7.2d, v10.2d
  220         eor             v7.16b, v7.16b, v8.16b
  221
              // ... and xor in the next 16 bytes of input (byte-reversed on
              // little-endian)
  222         ldr             q0, [arg2], #16
  223 CPU_LE( rev64           v0.16b, v0.16b                  )
  224 CPU_LE( ext             v0.16b, v0.16b, v0.16b, #8      )
  225         eor             v7.16b, v7.16b, v0.16b
  226         subs            arg3, arg3, #16
  227
  228         // instead of a cmp instruction, we utilize the flags with the
  229         // b.ge instruction equivalent of: cmp arg3, 16-16
  230         // check if there is any more 16B in the buffer to be able to fold
  231         b.ge            _16B_reduction_loop
 232
  233         // now we have 16+z bytes left to reduce, where 0<= z < 16.
  234         // first, we reduce the data in the v7 register
  235
  236 _final_reduction_for_128:
  237         // check if any more data to fold. If not, compute the CRC of
  238         // the final 128 bits
              // (arg3 is z-16 here; adding 16 turns it into z)
  239         adds            arg3, arg3, #16
  240         b.eq            _128_done
  241
  242         // here we are getting data that is less than 16 bytes.
  243         // since we know that there was data before the pointer, we can
  244         // offset the input pointer before the actual point, to receive
  245         // exactly 16 bytes. after that the registers need to be adjusted.
              //
              // on entry: arg3 = number of trailing bytes (1..15), v7 = the 16
              // bytes folded so far, v10 = rk1/rk2
  246 _get_last_two_regs:
              // v1 = the last 16 bytes of the input; all but the final arg3
              // bytes of it have been consumed already
  247         add             arg2, arg2, arg3
  248         ldr             q1, [arg2, #-16]
  249 CPU_LE( rev64           v1.16b, v1.16b                  )
  250 CPU_LE( ext             v1.16b, v1.16b, v1.16b, #8      )
  251
  252         // get rid of the extra data that was loaded before
  253         // load the shift constant
              // (the 16 table bytes starting at tbl_shf_table + 16 - arg3)
  254         adr_l           x4, tbl_shf_table + 16
  255         sub             x4, x4, arg3
  256         ld1             {v0.16b}, [x4]
  257
  258         // v2 = v7 shifted to the left by arg3 bytes
  259         tbl             v2.16b, {v7.16b}, v0.16b
  260
  261         // shift v7 to the right by 16-arg3 bytes
              // (flipping bit 7 of every index turns the left-shift permutation
              // into the complementary right-shift one)
  262         movi            v9.16b, #0x80
  263         eor             v0.16b, v0.16b, v9.16b
  264         tbl             v7.16b, {v7.16b}, v0.16b
  265
  266         // blend
              // v0 = low arg3 bytes taken from v1 (the unconsumed tail of the
              // input), remaining bytes taken from v2
  267         sshr            v0.16b, v0.16b, #7      // convert to 8-bit mask
  268         bsl             v0.16b, v2.16b, v1.16b
  269
  270         // fold 16 Bytes
  271         pmull           v8.1q, v7.1d, v10.1d
  272         pmull2          v7.1q, v7.2d, v10.2d
  273         eor             v7.16b, v7.16b, v8.16b
  274         eor             v7.16b, v7.16b, v0.16b
 275
  276 _128_done:
  277         // compute crc of a 128-bit value
              // on entry: v7 = the 128-bit value to reduce, vzr = 0
  278         ldr_l           q10, rk5, x8            // rk5 and rk6 in v10
  279
  280         // 64b fold
  281         ext             v0.16b, vzr.16b, v7.16b, #8     // v0 = v7 << 64
  282         mov             v7.d[0], v7.d[1]
  283         pmull           v7.1q, v7.1d, v10.1d    // (high 64 bits of v7) * rk5
  284         eor             v7.16b, v7.16b, v0.16b
  285
  286         // 32b fold
  287         ext             v0.16b, v7.16b, vzr.16b, #4     // v0 = v7 >> 32
  288         mov             v7.s[3], vzr.s[0]       // clear the top 32 bits of v7
  289         pmull2          v0.1q, v0.2d, v10.2d    // (high 64 bits of v0) * rk6
  290         eor             v7.16b, v7.16b, v0.16b
  291
  292         // barrett reduction
              // on entry: the value to reduce is in bits 95:32 of v7; the
              // remainder ends up in bits 63:32 (v7.s[1])
  293 _barrett:
  294         ldr_l           q10, rk7, x8            // rk7 and rk8 in v10
  295         mov             v0.d[0], v7.d[1]
  296
  297         pmull           v0.1q, v0.1d, v10.1d    // (high 64 bits of v7) * rk7
  298         ext             v0.16b, vzr.16b, v0.16b, #12    // v0 <<= 32
  299         pmull2          v0.1q, v0.2d, v10.2d    // (high 64 bits of v0) * rk8
  300         ext             v0.16b, vzr.16b, v0.16b, #12    // v0 <<= 32
  301         eor             v7.16b, v7.16b, v0.16b
  302         mov             w0, v7.s[1]
  303
  304 _cleanup:
  305         // scale the result back to 16 bits
              // (also reached directly for zero-length input, in which case w0
              // still holds the initial crc shifted left by 16)
  306         lsr             x0, x0, #16
  307         ret
 308
 309 _less_than_128:
 310         cbz             arg3, _cleanup
 311
 312         movi            v0.16b, #0
 313         mov             v0.s[3], arg1_low32     // get the initial crc value
 314
 315         ldr             q7, [arg2], #0x10
 316 CPU_LE( rev64           v7.16b, v7.16b                  )
 317 CPU_LE( ext             v7.16b, v7.16b, v7.16b, #8      )
 318         eor             v7.16b, v7.16b, v0.16b  // xor the initial crc value
 319
 320         cmp             arg3, #16
 321         b.eq            _128_done               // exactly 16 left
 322         b.lt            _less_than_16_left
 323
 324         ldr_l           q10, rk1, x8            // rk1 and rk2 in xmm10
 325
 326         // update the counter. subtract 32 instead of 16 to save one
 327         // instruction from the loop
 328         subs            arg3, arg3, #32
 329         b.ge            _16B_reduction_loop
 330
 331         add             arg3, arg3, #16
 332         b               _get_last_two_regs
 333
 334 _less_than_16_left:
 335         // shl r9, 4
 336         adr_l           x0, tbl_shf_table + 16
 337         sub             x0, x0, arg3
 338         ld1             {v0.16b}, [x0]
 339         movi            v9.16b, #0x80
 340         eor             v0.16b, v0.16b, v9.16b
 341         tbl             v7.16b, {v7.16b}, v0.16b
 342         b               _128_done
 343 ENDPROC(crc_t10dif_pmull)
 344
  345 // precomputed constants
  346 // these constants are precomputed from the poly:
  347 // 0x8bb70000 (0x8bb7 scaled to 32 bits)
      //
      // each .octa below packs two 64-bit constants: rkN in the low 64 bits
      // (the operand used by pmull) and rkN+1 in the high 64 bits (the
      // operand used by pmull2)
  348         .section        ".rodata", "a"
  349         .align          4               // 2^4 = 16-byte alignment
  350 // Q = 0x18BB70000
  351 // rk1 = 2^(32*3) mod Q << 32
  352 // rk2 = 2^(32*5) mod Q << 32
  353 // rk3 = 2^(32*15) mod Q << 32
  354 // rk4 = 2^(32*17) mod Q << 32
  355 // rk5 = 2^(32*3) mod Q << 32
  356 // rk6 = 2^(32*2) mod Q << 32
  357 // rk7 = floor(2^64/Q)
  358 // rk8 = Q
      //
      // rk1/rk2:   16-byte folds
      // rk3/rk4:   128-byte fold loop
      // rk5/rk6:   final 64-bit and 32-bit folds
      // rk7/rk8:   barrett reduction
      // rk9..rk20: folding of v0-v5 into v7 (derivation not listed here)
  359
  360 rk1:    .octa           0x06df0000000000002d56000000000000
  361 rk3:    .octa           0x7cf50000000000009d9d000000000000
  362 rk5:    .octa           0x13680000000000002d56000000000000
  363 rk7:    .octa           0x000000018bb7000000000001f65a57f8
  364 rk9:    .octa           0xbfd6000000000000ceae000000000000
  365 rk11:   .octa           0x713c0000000000001e16000000000000
  366 rk13:   .octa           0x80a6000000000000f7f9000000000000
  367 rk15:   .octa           0xe658000000000000044c000000000000
  368 rk17:   .octa           0xa497000000000000ad18000000000000
  369 rk19:   .octa           0xe7b50000000000006ee3000000000000
 370
  371 tbl_shf_table:
  372 // use these values for shift constants for the tbl/tbx instruction
      // the code loads the 16 bytes starting at tbl_shf_table + 16 - n to get
      // the index vector for n trailing bytes: used as is, it shifts a vector
      // left by n bytes; with bit 7 of every byte flipped, it shifts right by
      // 16-n bytes (tbl yields zero for every index outside 0..15)
  373 // different alignments result in values as shown:
  374 //      DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
  375 //      DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-2) / shr2
  376 //      DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-3) / shr3
  377 //      DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
  378 //      DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
  379 //      DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
  380 //      DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9  (16-7) / shr7
  381 //      DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8  (16-8) / shr8
  382 //      DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7  (16-9) / shr9
  383 //      DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6  (16-10) / shr10
  384 //      DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5  (16-11) / shr11
  385 //      DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4  (16-12) / shr12
  386 //      DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3  (16-13) / shr13
  387 //      DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2  (16-14) / shr14
  388 //      DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1  (16-15) / shr15
  389
  390         .byte            0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
  391         .byte           0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
  392         .byte            0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
  393         .byte            0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe , 0x0