//
// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
//
// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License version 2 as
// published by the Free Software Foundation.
//

//
// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
//
// Copyright (c) 2013, Intel Corporation
//
// Authors:
//     Erdinc Ozturk <erdinc.ozturk@intel.com>
//     Vinodh Gopal <vinodh.gopal@intel.com>
//     James Guilford <james.guilford@intel.com>
//     Tim Chen <tim.c.chen@linux.intel.com>
//
// This software is available to you under a choice of one of two
// licenses.  You may choose to be licensed under the terms of the GNU
// General Public License (GPL) Version 2, available from the file
// COPYING in the main directory of this source tree, or the
// OpenIB.org BSD license below:
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
//   notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
//   notice, this list of conditions and the following disclaimer in the
//   documentation and/or other materials provided with the
//   distribution.
//
// * Neither the name of the Intel Corporation nor the names of its
//   contributors may be used to endorse or promote products derived from
//   this software without specific prior written permission.
//
//
// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
//       Function API:
//       UINT16 crc_t10dif_pcl(
//               UINT16 init_crc, //initial CRC value, 16 bits
//               const unsigned char *buf, //buffer pointer to calculate CRC on
//               UINT64 len //buffer length in bytes (64-bit data)
//       );
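//
//       For reference, the CRC computed here is plain CRC-16/T10-DIF
//       (polynomial 0x8bb7, init 0, no reflection, no final xor). A minimal
//       bit-at-a-time C sketch of the same function -- illustrative only,
//       not the table-driven generic implementation used elsewhere in the
//       kernel:
//
//       static u16 crc_t10dif_ref(u16 crc, const u8 *buf, u64 len)
//       {
//               while (len--) {
//                       int i;
//
//                       crc ^= (u16)*buf++ << 8;
//                       for (i = 0; i < 8; i++)
//                               crc = (crc & 0x8000) ? (crc << 1) ^ 0x8bb7
//                                                    : crc << 1;
//               }
//               return crc;
//       }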
//
//       Reference paper titled "Fast CRC Computation for Generic
//       Polynomials Using PCLMULQDQ Instruction"
//       URL: http://www.intel.com/content/dam/www/public/us/en/documents
//  /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
//
//

#include <linux/linkage.h>
#include <asm/assembler.h>

        .text
        .cpu            generic+crypto

        arg1_low32      .req    w19
        arg2            .req    x20
        arg3            .req    x21

        vzr             .req    v13

        ad              .req    v14
        bd              .req    v10

        k00_16          .req    v15
        k32_48          .req    v16

        t3              .req    v17
        t4              .req    v18
        t5              .req    v19
        t6              .req    v20
        t7              .req    v21
        t8              .req    v22
        t9              .req    v23

        perm1           .req    v24
        perm2           .req    v25
        perm3           .req    v26
        perm4           .req    v27

        bd1             .req    v28
        bd2             .req    v29
        bd3             .req    v30
        bd4             .req    v31

        .macro          __pmull_init_p64
        .endm

        .macro          __pmull_pre_p64, bd
        .endm

        .macro          __pmull_init_p8
        // k00_16 := 0x0000000000000000_000000000000ffff
        // k32_48 := 0x00000000ffffffff_0000ffffffffffff
        movi            k32_48.2d, #0xffffffff
        mov             k32_48.h[2], k32_48.h[0]
        ushr            k00_16.2d, k32_48.2d, #32

        // prepare the permutation vectors
        mov_q           x5, 0x080f0e0d0c0b0a09
        movi            perm4.8b, #8
        dup             perm1.2d, x5
        eor             perm1.16b, perm1.16b, perm4.16b
        ushr            perm2.2d, perm1.2d, #8
        ushr            perm3.2d, perm1.2d, #16
        ushr            perm4.2d, perm1.2d, #24
        sli             perm2.2d, perm1.2d, #56
        sli             perm3.2d, perm1.2d, #48
        sli             perm4.2d, perm1.2d, #40
        .endm

        .macro          __pmull_pre_p8, bd
        tbl             bd1.16b, {\bd\().16b}, perm1.16b
        tbl             bd2.16b, {\bd\().16b}, perm2.16b
        tbl             bd3.16b, {\bd\().16b}, perm3.16b
        tbl             bd4.16b, {\bd\().16b}, perm4.16b
        .endm

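// Fallback path for cores without the 64x64 PMULL (Crypto Extensions)
// instruction: a 64x64 carryless multiply is synthesised from 8x8 pmull
// operations. Roughly (a sketch of the idea; see the per-line comments
// below for the exact terms): with byte-rotated copies of B prepared in
// bd1..bd4 by __pmull_pre_p8 and byte-rotated copies of A formed on the
// fly, each pmull produces a vector of 8x8 partial products; the sums of
// cross terms (E+F, G+H, I+J, K) are then masked, realigned byte-wise
// with ext, and xor-ed into the main product A*B so that each partial
// product lands at its proper byte offset.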
__pmull_p8_core:
.L__pmull_p8_core:
        ext             t4.8b, ad.8b, ad.8b, #1                 // A1
        ext             t5.8b, ad.8b, ad.8b, #2                 // A2
        ext             t6.8b, ad.8b, ad.8b, #3                 // A3

        pmull           t4.8h, t4.8b, bd.8b                     // F = A1*B
        pmull           t8.8h, ad.8b, bd1.8b                    // E = A*B1
        pmull           t5.8h, t5.8b, bd.8b                     // H = A2*B
        pmull           t7.8h, ad.8b, bd2.8b                    // G = A*B2
        pmull           t6.8h, t6.8b, bd.8b                     // J = A3*B
        pmull           t9.8h, ad.8b, bd3.8b                    // I = A*B3
        pmull           t3.8h, ad.8b, bd4.8b                    // K = A*B4
        b               0f

.L__pmull_p8_core2:
        tbl             t4.16b, {ad.16b}, perm1.16b             // A1
        tbl             t5.16b, {ad.16b}, perm2.16b             // A2
        tbl             t6.16b, {ad.16b}, perm3.16b             // A3

        pmull2          t4.8h, t4.16b, bd.16b                   // F = A1*B
        pmull2          t8.8h, ad.16b, bd1.16b                  // E = A*B1
        pmull2          t5.8h, t5.16b, bd.16b                   // H = A2*B
        pmull2          t7.8h, ad.16b, bd2.16b                  // G = A*B2
        pmull2          t6.8h, t6.16b, bd.16b                   // J = A3*B
        pmull2          t9.8h, ad.16b, bd3.16b                  // I = A*B3
        pmull2          t3.8h, ad.16b, bd4.16b                  // K = A*B4

0:      eor             t4.16b, t4.16b, t8.16b                  // L = E + F
        eor             t5.16b, t5.16b, t7.16b                  // M = G + H
        eor             t6.16b, t6.16b, t9.16b                  // N = I + J

        uzp1            t8.2d, t4.2d, t5.2d
        uzp2            t4.2d, t4.2d, t5.2d
        uzp1            t7.2d, t6.2d, t3.2d
        uzp2            t6.2d, t6.2d, t3.2d

        // t4 = (L) (P0 + P1) << 8
        // t5 = (M) (P2 + P3) << 16
        eor             t8.16b, t8.16b, t4.16b
        and             t4.16b, t4.16b, k32_48.16b

        // t6 = (N) (P4 + P5) << 24
        // t7 = (K) (P6 + P7) << 32
        eor             t7.16b, t7.16b, t6.16b
        and             t6.16b, t6.16b, k00_16.16b

        eor             t8.16b, t8.16b, t4.16b
        eor             t7.16b, t7.16b, t6.16b

        zip2            t5.2d, t8.2d, t4.2d
        zip1            t4.2d, t8.2d, t4.2d
        zip2            t3.2d, t7.2d, t6.2d
        zip1            t6.2d, t7.2d, t6.2d

        ext             t4.16b, t4.16b, t4.16b, #15
        ext             t5.16b, t5.16b, t5.16b, #14
        ext             t6.16b, t6.16b, t6.16b, #13
        ext             t3.16b, t3.16b, t3.16b, #12

        eor             t4.16b, t4.16b, t5.16b
        eor             t6.16b, t6.16b, t3.16b
        ret
ENDPROC(__pmull_p8_core)

        .macro          __pmull_p8, rq, ad, bd, i
        .ifnc           \bd, v10
        .err
        .endif
        mov             ad.16b, \ad\().16b
        .ifb            \i
        pmull           \rq\().8h, \ad\().8b, bd.8b             // D = A*B
        .else
        pmull2          \rq\().8h, \ad\().16b, bd.16b           // D = A*B
        .endif

        bl              .L__pmull_p8_core\i

        eor             \rq\().16b, \rq\().16b, t4.16b
        eor             \rq\().16b, \rq\().16b, t6.16b
        .endm

        .macro          fold64, p, reg1, reg2
        ldp             q11, q12, [arg2], #0x20

        __pmull_\p      v8, \reg1, v10, 2
        __pmull_\p      \reg1, \reg1, v10

CPU_LE( rev64           v11.16b, v11.16b                )
CPU_LE( rev64           v12.16b, v12.16b                )

        __pmull_\p      v9, \reg2, v10, 2
        __pmull_\p      \reg2, \reg2, v10

CPU_LE( ext             v11.16b, v11.16b, v11.16b, #8   )
CPU_LE( ext             v12.16b, v12.16b, v12.16b, #8   )

        eor             \reg1\().16b, \reg1\().16b, v8.16b
        eor             \reg2\().16b, \reg2\().16b, v9.16b
        eor             \reg1\().16b, \reg1\().16b, v11.16b
        eor             \reg2\().16b, \reg2\().16b, v12.16b
        .endm

        .macro          fold16, p, reg, rk
        __pmull_\p      v8, \reg, v10
        __pmull_\p      \reg, \reg, v10, 2
        .ifnb           \rk
        ldr_l           q10, \rk, x8
        __pmull_pre_\p  v10
        .endif
        eor             v7.16b, v7.16b, v8.16b
        eor             v7.16b, v7.16b, \reg\().16b
        .endm

        .macro          __pmull_p64, rd, rn, rm, n
        .ifb            \n
        pmull           \rd\().1q, \rn\().1d, \rm\().1d
        .else
        pmull2          \rd\().1q, \rn\().2d, \rm\().2d
        .endif
        .endm

        .macro          crc_t10dif_pmull, p
        frame_push      3, 128

        mov             arg1_low32, w0
        mov             arg2, x1
        mov             arg3, x2

        movi            vzr.16b, #0             // init zero register

        __pmull_init_\p

        // adjust the 16-bit initial_crc value, scale it to 32 bits
        lsl             arg1_low32, arg1_low32, #16

        // check if smaller than 256
        cmp             arg3, #256

        // for sizes less than 256, we can't fold 64 bytes at a time...
        b.lt            .L_less_than_128_\@

        // load the initial crc value
        // crc value does not need to be byte-reflected, but it needs
        // to be moved to the high part of the register, because the data
        // will be byte-reflected and will then line up with the initial
        // crc in the right place.
        movi            v10.16b, #0
        mov             v10.s[3], arg1_low32            // initial crc

        // receive the initial 128 bytes of data, xor in the initial crc value
        ldp             q0, q1, [arg2]
        ldp             q2, q3, [arg2, #0x20]
        ldp             q4, q5, [arg2, #0x40]
        ldp             q6, q7, [arg2, #0x60]
        add             arg2, arg2, #0x80

CPU_LE( rev64           v0.16b, v0.16b                  )
CPU_LE( rev64           v1.16b, v1.16b                  )
CPU_LE( rev64           v2.16b, v2.16b                  )
CPU_LE( rev64           v3.16b, v3.16b                  )
CPU_LE( rev64           v4.16b, v4.16b                  )
CPU_LE( rev64           v5.16b, v5.16b                  )
CPU_LE( rev64           v6.16b, v6.16b                  )
CPU_LE( rev64           v7.16b, v7.16b                  )

CPU_LE( ext             v0.16b, v0.16b, v0.16b, #8      )
CPU_LE( ext             v1.16b, v1.16b, v1.16b, #8      )
CPU_LE( ext             v2.16b, v2.16b, v2.16b, #8      )
CPU_LE( ext             v3.16b, v3.16b, v3.16b, #8      )
CPU_LE( ext             v4.16b, v4.16b, v4.16b, #8      )
CPU_LE( ext             v5.16b, v5.16b, v5.16b, #8      )
CPU_LE( ext             v6.16b, v6.16b, v6.16b, #8      )
CPU_LE( ext             v7.16b, v7.16b, v7.16b, #8      )

        // XOR the initial_crc value
        eor             v0.16b, v0.16b, v10.16b

        ldr_l           q10, rk3, x8    // q10 has rk3 and rk4
                                        // type of pmull instruction
                                        // will determine which constant to use
        __pmull_pre_\p  v10

        //
        // we subtract 256 instead of 128 to save one instruction from the loop
        //
        sub             arg3, arg3, #256

        // at this section of the code, there is 128*x+y (0 <= y < 128) bytes
        // of buffer. The fold_64_B_loop will fold 128 bytes of input at a
        // time until we have 128+y bytes of buffer

        // fold 128 bytes at a time. This section of the code folds 8 vector
        // registers in parallel
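        //
        // The folding steps below rely on the usual CRC folding identity
        // over GF(2): (A(x) * x^k) mod P(x) == ((A(x) mod P(x)) *
        // (x^k mod P(x))) mod P(x). So each 128-bit chunk of accumulated
        // state can be advanced past k bits of input with a single
        // carryless multiply by a precomputed constant x^k mod P (the rk
        // values below), xor-ed with the freshly loaded data; the full
        // reduction mod P is deferred until .L_128_done. (A sketch of the
        // method from the Intel paper referenced in the header.)
        //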
.L_fold_64_B_loop_\@:

        fold64          \p, v0, v1
        fold64          \p, v2, v3
        fold64          \p, v4, v5
        fold64          \p, v6, v7

        subs            arg3, arg3, #128

        // check if there are another 128 bytes in the buffer to be able to fold
        b.lt            .L_fold_64_B_end_\@

        if_will_cond_yield_neon
        stp             q0, q1, [sp, #.Lframe_local_offset]
        stp             q2, q3, [sp, #.Lframe_local_offset + 32]
        stp             q4, q5, [sp, #.Lframe_local_offset + 64]
        stp             q6, q7, [sp, #.Lframe_local_offset + 96]
        do_cond_yield_neon
        ldp             q0, q1, [sp, #.Lframe_local_offset]
        ldp             q2, q3, [sp, #.Lframe_local_offset + 32]
        ldp             q4, q5, [sp, #.Lframe_local_offset + 64]
        ldp             q6, q7, [sp, #.Lframe_local_offset + 96]
        ldr_l           q10, rk3, x8
        movi            vzr.16b, #0             // init zero register
        __pmull_init_\p
        __pmull_pre_\p  v10
        endif_yield_neon

        b               .L_fold_64_B_loop_\@

.L_fold_64_B_end_\@:
        // at this point, the buffer pointer is pointing at the last y bytes
        // of the buffer, and the 128 bytes of folded data are held in 8 of
        // the vector registers: v0-v7

        // fold the 8 vector registers to 1 vector register with different
        // constants

        ldr_l           q10, rk9, x8
        __pmull_pre_\p  v10

        fold16          \p, v0, rk11
        fold16          \p, v1, rk13
        fold16          \p, v2, rk15
        fold16          \p, v3, rk17
        fold16          \p, v4, rk19
        fold16          \p, v5, rk1
        fold16          \p, v6

        // instead of 128, we add 112 (128 - 16) to the loop counter to save
        // one instruction from the loop; instead of a cmp instruction, we
        // use the negative flag with the b.lt instruction
        adds            arg3, arg3, #(128-16)
        b.lt            .L_final_reduction_for_128_\@

        // now we have 16+y bytes left to reduce. 16 bytes are in register v7
        // and the rest is in memory. We can fold 16 bytes at a time if y >= 16;
        // continue folding 16 bytes at a time

.L_16B_reduction_loop_\@:
        __pmull_\p      v8, v7, v10
        __pmull_\p      v7, v7, v10, 2
        eor             v7.16b, v7.16b, v8.16b

        ldr             q0, [arg2], #16
CPU_LE( rev64           v0.16b, v0.16b                  )
CPU_LE( ext             v0.16b, v0.16b, v0.16b, #8      )
        eor             v7.16b, v7.16b, v0.16b
        subs            arg3, arg3, #16

        // instead of a cmp instruction, we utilize the flags set by the
        // subs with the b.ge instruction (equivalent of: cmp arg3, #16-16)
        // check if there is any more 16B in the buffer to be able to fold
        b.ge            .L_16B_reduction_loop_\@

        // now we have 16+z bytes left to reduce, where 0 <= z < 16.
        // first, we reduce the data in the v7 register

.L_final_reduction_for_128_\@:
        // check if any more data to fold. If not, compute the CRC of
        // the final 128 bits
        adds            arg3, arg3, #16
        b.eq            .L_128_done_\@

        // here we are left with less than 16 bytes of data. Since we know
        // that there was data before the pointer, we can offset the input
        // pointer backwards so as to receive exactly 16 bytes; after that,
        // the registers need to be adjusted.
.L_get_last_two_regs_\@:
        add             arg2, arg2, arg3
        ldr             q1, [arg2, #-16]
CPU_LE( rev64           v1.16b, v1.16b                  )
CPU_LE( ext             v1.16b, v1.16b, v1.16b, #8      )

        // get rid of the extra data that was loaded before
        // load the shift constant
        adr_l           x4, tbl_shf_table + 16
        sub             x4, x4, arg3
        ld1             {v0.16b}, [x4]

        // shift v2 to the left by arg3 bytes
        tbl             v2.16b, {v7.16b}, v0.16b

        // shift v7 to the right by 16-arg3 bytes
        movi            v9.16b, #0x80
        eor             v0.16b, v0.16b, v9.16b
        tbl             v7.16b, {v7.16b}, v0.16b

        // blend
        sshr            v0.16b, v0.16b, #7      // convert to 8-bit mask
        bsl             v0.16b, v2.16b, v1.16b

        // fold 16 Bytes
        __pmull_\p      v8, v7, v10
        __pmull_\p      v7, v7, v10, 2
        eor             v7.16b, v7.16b, v8.16b
        eor             v7.16b, v7.16b, v0.16b

.L_128_done_\@:
        // compute crc of a 128-bit value
        ldr_l           q10, rk5, x8            // rk5 and rk6 in q10
        __pmull_pre_\p  v10

        // 64b fold
        ext             v0.16b, vzr.16b, v7.16b, #8
        mov             v7.d[0], v7.d[1]
        __pmull_\p      v7, v7, v10
        eor             v7.16b, v7.16b, v0.16b

        // 32b fold
        ext             v0.16b, v7.16b, vzr.16b, #4
        mov             v7.s[3], vzr.s[0]
        __pmull_\p      v0, v0, v10, 2
        eor             v7.16b, v7.16b, v0.16b

        // barrett reduction
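        // (For reference: Barrett reduction computes the final remainder
        // without a division. With mu = floor(x^64 / Q) precomputed in rk7
        // and Q itself in rk8, T1 = floor(R(x) / x^32) * mu and
        // T2 = floor(T1 / x^32) * Q, and the CRC is the low 32 bits of
        // R(x) xor T2 -- a summary of the method described in the Intel
        // white paper referenced in the header.)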
        ldr_l           q10, rk7, x8
        __pmull_pre_\p  v10
        mov             v0.d[0], v7.d[1]

        __pmull_\p      v0, v0, v10
        ext             v0.16b, vzr.16b, v0.16b, #12
        __pmull_\p      v0, v0, v10, 2
        ext             v0.16b, vzr.16b, v0.16b, #12
        eor             v7.16b, v7.16b, v0.16b
        mov             w0, v7.s[1]

.L_cleanup_\@:
        // scale the result back to 16 bits
        lsr             x0, x0, #16
        frame_pop
        ret

.L_less_than_128_\@:
        cbz             arg3, .L_cleanup_\@

        movi            v0.16b, #0
        mov             v0.s[3], arg1_low32     // get the initial crc value

        ldr             q7, [arg2], #0x10
CPU_LE( rev64           v7.16b, v7.16b                  )
CPU_LE( ext             v7.16b, v7.16b, v7.16b, #8      )
        eor             v7.16b, v7.16b, v0.16b  // xor the initial crc value

        cmp             arg3, #16
        b.eq            .L_128_done_\@          // exactly 16 left
        b.lt            .L_less_than_16_left_\@

        ldr_l           q10, rk1, x8            // rk1 and rk2 in q10
        __pmull_pre_\p  v10

        // update the counter. subtract 32 instead of 16 to save one
        // instruction from the loop
        subs            arg3, arg3, #32
        b.ge            .L_16B_reduction_loop_\@

        add             arg3, arg3, #16
        b               .L_get_last_two_regs_\@

.L_less_than_16_left_\@:
        // shift v7 to the right by 16 - arg3 bytes, keeping only the valid data
        adr_l           x0, tbl_shf_table + 16
        sub             x0, x0, arg3
        ld1             {v0.16b}, [x0]
        movi            v9.16b, #0x80
        eor             v0.16b, v0.16b, v9.16b
        tbl             v7.16b, {v7.16b}, v0.16b
        b               .L_128_done_\@
        .endm

ENTRY(crc_t10dif_pmull_p8)
        crc_t10dif_pmull        p8
ENDPROC(crc_t10dif_pmull_p8)

        .align          5
ENTRY(crc_t10dif_pmull_p64)
        crc_t10dif_pmull        p64
ENDPROC(crc_t10dif_pmull_p64)

// precomputed constants
// these constants are precomputed from the poly:
// 0x8bb70000 (0x8bb7 scaled to 32 bits)
        .section        ".rodata", "a"
        .align          4
// Q = 0x18BB70000
// rk1 = 2^(32*3) mod Q << 32
// rk2 = 2^(32*5) mod Q << 32
// rk3 = 2^(32*15) mod Q << 32
// rk4 = 2^(32*17) mod Q << 32
// rk5 = 2^(32*3) mod Q << 32
// rk6 = 2^(32*2) mod Q << 32
// rk7 = floor(2^64/Q)
// rk8 = Q

rk1:    .octa           0x06df0000000000002d56000000000000
rk3:    .octa           0x7cf50000000000009d9d000000000000
rk5:    .octa           0x13680000000000002d56000000000000
rk7:    .octa           0x000000018bb7000000000001f65a57f8
rk9:    .octa           0xbfd6000000000000ceae000000000000
rk11:   .octa           0x713c0000000000001e16000000000000
rk13:   .octa           0x80a6000000000000f7f9000000000000
rk15:   .octa           0xe658000000000000044c000000000000
rk17:   .octa           0xa497000000000000ad18000000000000
rk19:   .octa           0xe7b50000000000006ee3000000000000
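
// For reference, constants of the form "2^(32*n) mod Q << 32" above could be
// derived with a small GF(2) helper along these lines (an illustrative sketch
// only, not the script actually used to generate the values):
//
//      /* x^n mod Q over GF(2), with Q = 0x18bb70000 of degree 32 */
//      static u32 xn_mod_q(unsigned int n)
//      {
//              u64 r = 1;                      /* the polynomial "1" */
//
//              while (n--) {
//                      r <<= 1;                /* multiply by x */
//                      if (r & (1ULL << 32))   /* degree hit 32: reduce */
//                              r ^= 0x18bb70000ULL;
//              }
//              return r;
//      }
//
//      /* e.g. rk1 would then be (u64)xn_mod_q(32 * 3) << 32 */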

tbl_shf_table:
// use these values for shift constants for the tbl/tbx instruction
// different alignments result in values as shown:
//      DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
//      DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-2) / shr2
//      DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-3) / shr3
//      DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
//      DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
//      DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
//      DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9  (16-7) / shr7
//      DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8  (16-8) / shr8
//      DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7  (16-9) / shr9
//      DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6  (16-10) / shr10
//      DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5  (16-11) / shr11
//      DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4  (16-12) / shr12
//      DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3  (16-13) / shr13
//      DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2  (16-14) / shr14
//      DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1  (16-15) / shr15

        .byte            0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
        .byte           0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
        .byte            0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
        .byte            0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0x0