Merge tag 'for-linus-4.11-rc1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git...
[sfrench/cifs-2.6.git] / drivers / staging / skein / skein_block.c
1 /*
2  ***********************************************************************
3  *
4  * Implementation of the Skein block functions.
5  *
6  * Source code author: Doug Whiting, 2008.
7  *
8  * This algorithm and source code is released to the public domain.
9  *
10  * Compile-time switches:
11  *
12  *  SKEIN_USE_ASM  -- set bits (256/512/1024) to select which
13  *                    versions use ASM code for block processing
14  *                    [default: use C for all block sizes]
15  *
16  ***********************************************************************
17  */
18
19 #include <linux/string.h>
20 #include <linux/bitops.h>
21 #include "skein_base.h"
22 #include "skein_block.h"
23
#ifndef SKEIN_USE_ASM
#define SKEIN_USE_ASM   (0) /* default is all C code (no ASM) */
#endif

/*
 * SKEIN_LOOP is read as three decimal digits: hundreds -> Skein-256,
 * tens -> Skein-512, ones -> Skein-1024 (see the SKEIN_UNROLL_* macros).
 * A digit of 0 selects the fully unrolled code for that size.
 *
 * Do NOT write the value with leading zeros when overriding it: in C a
 * literal such as 011 is octal (decimal 9), not "tens = 1, ones = 1".
 */
#ifndef SKEIN_LOOP
#define SKEIN_LOOP 1 /* default: unroll 256 and 512, but not 1024 */
#endif

#define BLK_BITS        (WCNT * 64) /* some useful definitions for code here */
#define KW_TWK_BASE     (0)
#define KW_KEY_BASE     (3)
/* views into the caller's local kw[] array: key words and tweak words */
#define ks              (kw + KW_KEY_BASE)
#define ts              (kw + KW_TWK_BASE)

/*
 * debug_save_tweak() - copy the working tweak words back into the context.
 * Only active with SKEIN_DEBUG; otherwise it expands to an empty statement.
 * Wrapped in do { } while (0) so it behaves as a single statement everywhere
 * (including an unbraced if/else).
 */
#ifdef SKEIN_DEBUG
#define debug_save_tweak(ctx)                   \
        do {                                    \
                (ctx)->h.tweak[0] = ts[0];      \
                (ctx)->h.tweak[1] = ts[1];      \
        } while (0)
#else
#define debug_save_tweak(ctx) do { } while (0)
#endif
47
#if !(SKEIN_USE_ASM & 256)
/* RCNT: number of 8-round groups in Skein-256 */
#undef  RCNT
#define RCNT (SKEIN_256_ROUNDS_TOTAL / 8)

/*
 * SKEIN_UNROLL_256 is the hundreds digit of SKEIN_LOOP.  Zero selects the
 * fully unrolled code; otherwise it is the number of 8-round groups emitted
 * per pass of the round loop.
 */
#ifndef SKEIN_LOOP
#define SKEIN_UNROLL_256 (0)
#else
#define SKEIN_UNROLL_256 (((SKEIN_LOOP) / 100) % 10)
#endif

#if SKEIN_UNROLL_256
#if (RCNT % SKEIN_UNROLL_256)
#error "Invalid SKEIN_UNROLL_256" /* sanity check on unroll count */
#endif
#endif

/* one add / rotate-left / xor step on a pair of state words */
#define MIX256(a, b, rot)                       \
        do {                                    \
                a += b;                         \
                b = rol64(b, rot);              \
                b ^= a;                         \
        } while (0)

/*
 * One round: mix the pairs (Xp0, Xp1) and (Xp2, Xp3) using the rotation
 * constants ROT_0 and ROT_1.  r_num is accepted but not used.
 */
#define ROUND256(p0, p1, p2, p3, ROT, r_num)            \
        do {                                            \
                MIX256(X##p0, X##p1, ROT##_0);          \
                MIX256(X##p2, X##p3, ROT##_1);          \
        } while (0)

/* the unrolled and the looping configuration share the same round body */
#define R256(p0, p1, p2, p3, ROT, r_num) ROUND256(p0, p1, p2, p3, ROT, r_num)

#if SKEIN_UNROLL_256
/*
 * Looping version: indices are relative to the loop counter r.  The key
 * schedule is extended in place after each injection; kw[] is shared by
 * ks and ts, so the statement order below must be kept.
 */
#define I256(N)                                         \
        do {                                            \
                /* inject the key schedule value */     \
                X0 += ks[r + (N) + 0];                  \
                X1 += ks[r + (N) + 1] + ts[r + (N) + 0];\
                X2 += ks[r + (N) + 2] + ts[r + (N) + 1];\
                X3 += ks[r + (N) + 3] + r + (N);        \
                /* rotate key schedule */               \
                ks[r + (N) + 4] = ks[r + (N) - 1];      \
                ts[r + (N) + 2] = ts[r + (N) - 1];      \
        } while (0)
#else
/* fully unrolled version: key/tweak words are picked by modular index */
#define I256(N)                                                         \
        do {                                                            \
                /* inject the key schedule value */                     \
                X0   += ks[((N) + 1) % 5];                              \
                X1   += ks[((N) + 2) % 5] + ts[((N) + 1) % 3];          \
                X2   += ks[((N) + 3) % 5] + ts[((N) + 2) % 3];          \
                X3   += ks[((N) + 4) % 5] + (N) + 1;                    \
        } while (0)
#endif

/* eight rounds with a key injection after every fourth round */
#define R256_8_ROUNDS(R)                                \
        do {                                            \
                R256(0, 1, 2, 3, R_256_0, 8 * (R) + 1); \
                R256(0, 3, 2, 1, R_256_1, 8 * (R) + 2); \
                R256(0, 1, 2, 3, R_256_2, 8 * (R) + 3); \
                R256(0, 3, 2, 1, R_256_3, 8 * (R) + 4); \
                I256(2 * (R));                          \
                R256(0, 1, 2, 3, R_256_4, 8 * (R) + 5); \
                R256(0, 3, 2, 1, R_256_5, 8 * (R) + 6); \
                R256(0, 1, 2, 3, R_256_6, 8 * (R) + 7); \
                R256(0, 3, 2, 1, R_256_7, 8 * (R) + 8); \
                I256(2 * (R) + 1);                      \
        } while (0)

/* true when 8-round group number NN has to be emitted in the loop body */
#define R256_UNROLL_R(NN)                       \
        ((SKEIN_UNROLL_256 > (NN)) ||           \
        (SKEIN_UNROLL_256 == 0 &&               \
        SKEIN_256_ROUNDS_TOTAL / 8 > (NN)))

#if  (SKEIN_UNROLL_256 > 14)
#error  "need more unrolling in skein_256_process_block"
#endif
#endif
123
#if !(SKEIN_USE_ASM & 512)
/* RCNT: number of 8-round groups in Skein-512 */
#undef  RCNT
#define RCNT  (SKEIN_512_ROUNDS_TOTAL / 8)

/*
 * SKEIN_UNROLL_512 is the tens digit of SKEIN_LOOP.  Zero selects the fully
 * unrolled code; otherwise it is the number of 8-round groups emitted per
 * pass of the round loop.
 */
#ifdef SKEIN_LOOP /* configure how much to unroll the loop */
#define SKEIN_UNROLL_512 (((SKEIN_LOOP) / 10) % 10)
#else
#define SKEIN_UNROLL_512 (0)
#endif

#if SKEIN_UNROLL_512
#if (RCNT % SKEIN_UNROLL_512)
#error "Invalid SKEIN_UNROLL_512" /* sanity check on unroll count */
#endif
#endif

/*
 * One round: four add / rotate-left / xor steps on the word pairs
 * (Xp0, Xp1) ... (Xp6, Xp7), rotating by ROT_0 ... ROT_3.
 * r_num is accepted but not used.
 */
#define ROUND512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, r_num)    \
        do {                                                    \
                X##p0 += X##p1;                                 \
                X##p1 = rol64(X##p1, ROT##_0);                  \
                X##p1 ^= X##p0;                                 \
                X##p2 += X##p3;                                 \
                X##p3 = rol64(X##p3, ROT##_1);                  \
                X##p3 ^= X##p2;                                 \
                X##p4 += X##p5;                                 \
                X##p5 = rol64(X##p5, ROT##_2);                  \
                X##p5 ^= X##p4;                                 \
                X##p6 += X##p7;                                 \
                X##p7 = rol64(X##p7, ROT##_3);                  \
                X##p7 ^= X##p6;                                 \
        } while (0)

/*
 * The unrolled and the looping configuration use the same round body, so
 * R512 is defined once.  The definition deliberately ends without a trailing
 * line continuation, so the macro cannot absorb whatever line follows it.
 */
#define R512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, r_num)                \
        ROUND512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, r_num)

#if SKEIN_UNROLL_512 == 0
/* fully unrolled version: key/tweak words are picked by modular index */
#define I512(R)                                                         \
        do {                                                            \
                /* inject the key schedule value */                     \
                X0   += ks[((R) + 1) % 9];                              \
                X1   += ks[((R) + 2) % 9];                              \
                X2   += ks[((R) + 3) % 9];                              \
                X3   += ks[((R) + 4) % 9];                              \
                X4   += ks[((R) + 5) % 9];                              \
                X5   += ks[((R) + 6) % 9] + ts[((R) + 1) % 3];          \
                X6   += ks[((R) + 7) % 9] + ts[((R) + 2) % 3];          \
                X7   += ks[((R) + 8) % 9] + (R) + 1;                    \
        } while (0)

#else /* looping version */
/*
 * Indices are relative to the loop counter r.  The key schedule is extended
 * in place after each injection; kw[] is shared by ks and ts, so the
 * statement order below must be kept.
 */
#define I512(R)                                                         \
        do {                                                            \
                /* inject the key schedule value */                     \
                X0   += ks[r + (R) + 0];                                \
                X1   += ks[r + (R) + 1];                                \
                X2   += ks[r + (R) + 2];                                \
                X3   += ks[r + (R) + 3];                                \
                X4   += ks[r + (R) + 4];                                \
                X5   += ks[r + (R) + 5] + ts[r + (R) + 0];              \
                X6   += ks[r + (R) + 6] + ts[r + (R) + 1];              \
                X7   += ks[r + (R) + 7] + r + (R);                      \
                /* rotate key schedule */                               \
                ks[r + (R) + 8] = ks[r + (R) - 1];                      \
                ts[r + (R) + 2] = ts[r + (R) - 1];                      \
        } while (0)
#endif /* end of looped code definitions */

#define R512_8_ROUNDS(R)  /* do 8 full rounds */                        \
        do {                                                            \
                R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_0, 8 * (R) + 1);     \
                R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_1, 8 * (R) + 2);     \
                R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_2, 8 * (R) + 3);     \
                R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_3, 8 * (R) + 4);     \
                I512(2 * (R));                                          \
                R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_4, 8 * (R) + 5);     \
                R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_5, 8 * (R) + 6);     \
                R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_6, 8 * (R) + 7);     \
                R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_7, 8 * (R) + 8);     \
                I512(2 * (R) + 1); /* and key injection */              \
        } while (0)

/* true when 8-round group number NN has to be emitted in the loop body */
#define R512_UNROLL_R(NN)                             \
                ((SKEIN_UNROLL_512 == 0 &&            \
                SKEIN_512_ROUNDS_TOTAL / 8 > (NN)) || \
                (SKEIN_UNROLL_512 > (NN)))

#if  (SKEIN_UNROLL_512 > 14)
#error  "need more unrolling in skein_512_process_block"
#endif
#endif
214
#if !(SKEIN_USE_ASM & 1024)
/* RCNT: number of 8-round groups in Skein-1024 */
#undef  RCNT
#define RCNT  (SKEIN_1024_ROUNDS_TOTAL / 8)

/*
 * SKEIN_UNROLL_1024 is the ones digit of SKEIN_LOOP.  Zero selects the fully
 * unrolled code; otherwise it is the number of 8-round groups emitted per
 * pass of the round loop.
 */
#ifdef SKEIN_LOOP /* configure how much to unroll the loop */
#define SKEIN_UNROLL_1024 ((SKEIN_LOOP) % 10)
#else
#define SKEIN_UNROLL_1024 (0)
#endif

#if (SKEIN_UNROLL_1024 != 0)
#if (RCNT % SKEIN_UNROLL_1024)
#error "Invalid SKEIN_UNROLL_1024" /* sanity check on unroll count */
#endif
#endif

/*
 * One round: eight add / rotate-left / xor steps on the word pairs
 * (Xp0, Xp1) ... (XpE, XpF), rotating by ROT_0 ... ROT_7.
 * r_num is accepted but not used.
 */
#define ROUND1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, pE, \
                  pF, ROT, r_num)                                             \
        do {                                                                  \
                X##p0 += X##p1;                                               \
                X##p1 = rol64(X##p1, ROT##_0);                                \
                X##p1 ^= X##p0;                                               \
                X##p2 += X##p3;                                               \
                X##p3 = rol64(X##p3, ROT##_1);                                \
                X##p3 ^= X##p2;                                               \
                X##p4 += X##p5;                                               \
                X##p5 = rol64(X##p5, ROT##_2);                                \
                X##p5 ^= X##p4;                                               \
                X##p6 += X##p7;                                               \
                X##p7 = rol64(X##p7, ROT##_3);                                \
                X##p7 ^= X##p6;                                               \
                X##p8 += X##p9;                                               \
                X##p9 = rol64(X##p9, ROT##_4);                                \
                X##p9 ^= X##p8;                                               \
                X##pA += X##pB;                                               \
                X##pB = rol64(X##pB, ROT##_5);                                \
                X##pB ^= X##pA;                                               \
                X##pC += X##pD;                                               \
                X##pD = rol64(X##pD, ROT##_6);                                \
                X##pD ^= X##pC;                                               \
                X##pE += X##pF;                                               \
                X##pF = rol64(X##pF, ROT##_7);                                \
                X##pF ^= X##pE;                                               \
        } while (0)

/*
 * The unrolled and the looping configuration use the same round body, so
 * R1024 is defined once.  The definition deliberately ends without a
 * trailing line continuation, so the macro cannot absorb whatever line
 * follows it.
 */
#define R1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, pE, pF, \
              ROT, rn)                                                        \
        ROUND1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, pE, \
                  pF, ROT, rn)

#if SKEIN_UNROLL_1024 == 0
/* fully unrolled version: key/tweak words are picked by modular index */
#define I1024(R)                                                \
        do {                                                    \
                /* inject the key schedule value */             \
                X00 += ks[((R) + 1) % 17];                      \
                X01 += ks[((R) + 2) % 17];                      \
                X02 += ks[((R) + 3) % 17];                      \
                X03 += ks[((R) + 4) % 17];                      \
                X04 += ks[((R) + 5) % 17];                      \
                X05 += ks[((R) + 6) % 17];                      \
                X06 += ks[((R) + 7) % 17];                      \
                X07 += ks[((R) + 8) % 17];                      \
                X08 += ks[((R) + 9) % 17];                      \
                X09 += ks[((R) + 10) % 17];                     \
                X10 += ks[((R) + 11) % 17];                     \
                X11 += ks[((R) + 12) % 17];                     \
                X12 += ks[((R) + 13) % 17];                     \
                X13 += ks[((R) + 14) % 17] + ts[((R) + 1) % 3]; \
                X14 += ks[((R) + 15) % 17] + ts[((R) + 2) % 3]; \
                X15 += ks[((R) + 16) % 17] + (R) + 1;           \
        } while (0)
#else /* looping version */
/*
 * Indices are relative to the loop counter r.  The key schedule is extended
 * in place after each injection; kw[] is shared by ks and ts, so the
 * statement order below must be kept.
 */
#define I1024(R)                                                        \
        do {                                                            \
                /* inject the key schedule value */                     \
                X00 += ks[r + (R) + 0];                                 \
                X01 += ks[r + (R) + 1];                                 \
                X02 += ks[r + (R) + 2];                                 \
                X03 += ks[r + (R) + 3];                                 \
                X04 += ks[r + (R) + 4];                                 \
                X05 += ks[r + (R) + 5];                                 \
                X06 += ks[r + (R) + 6];                                 \
                X07 += ks[r + (R) + 7];                                 \
                X08 += ks[r + (R) + 8];                                 \
                X09 += ks[r + (R) + 9];                                 \
                X10 += ks[r + (R) + 10];                                \
                X11 += ks[r + (R) + 11];                                \
                X12 += ks[r + (R) + 12];                                \
                X13 += ks[r + (R) + 13] + ts[r + (R) + 0];              \
                X14 += ks[r + (R) + 14] + ts[r + (R) + 1];              \
                X15 += ks[r + (R) + 15] + r + (R);                      \
                /* rotate key schedule */                               \
                ks[r + (R) + 16] = ks[r + (R) - 1];                     \
                ts[r + (R) + 2] = ts[r + (R) - 1];                      \
        } while (0)

#endif
/* eight rounds with a key injection after every fourth round */
#define R1024_8_ROUNDS(R)                                                 \
        do {                                                              \
                R1024(00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, \
                      13, 14, 15, R1024_0, 8 * (R) + 1);                  \
                R1024(00, 09, 02, 13, 06, 11, 04, 15, 10, 07, 12, 03, 14, \
                      05, 08, 01, R1024_1, 8 * (R) + 2);                  \
                R1024(00, 07, 02, 05, 04, 03, 06, 01, 12, 15, 14, 13, 08, \
                      11, 10, 09, R1024_2, 8 * (R) + 3);                  \
                R1024(00, 15, 02, 11, 06, 13, 04, 09, 14, 01, 08, 05, 10, \
                      03, 12, 07, R1024_3, 8 * (R) + 4);                  \
                I1024(2 * (R));                                           \
                R1024(00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, \
                      13, 14, 15, R1024_4, 8 * (R) + 5);                  \
                R1024(00, 09, 02, 13, 06, 11, 04, 15, 10, 07, 12, 03, 14, \
                      05, 08, 01, R1024_5, 8 * (R) + 6);                  \
                R1024(00, 07, 02, 05, 04, 03, 06, 01, 12, 15, 14, 13, 08, \
                      11, 10, 09, R1024_6, 8 * (R) + 7);                  \
                R1024(00, 15, 02, 11, 06, 13, 04, 09, 14, 01, 08, 05, 10, \
                      03, 12, 07, R1024_7, 8 * (R) + 8);                  \
                I1024(2 * (R) + 1);                                       \
        } while (0)

/* true when 8-round group number NN has to be emitted in the loop body */
#define R1024_UNROLL_R(NN)                              \
                ((SKEIN_UNROLL_1024 == 0 &&             \
                SKEIN_1024_ROUNDS_TOTAL / 8 > (NN)) ||  \
                (SKEIN_UNROLL_1024 > (NN)))

#if  (SKEIN_UNROLL_1024 > 14)
#error  "need more unrolling in skein_1024_process_block"
#endif
#endif
346
347 /*****************************  SKEIN_256 ******************************/
348 #if !(SKEIN_USE_ASM & 256)
349 void skein_256_process_block(struct skein_256_ctx *ctx, const u8 *blk_ptr,
350                              size_t blk_cnt, size_t byte_cnt_add)
351 { /* do it in C */
352         enum {
353                 WCNT = SKEIN_256_STATE_WORDS
354         };
355         size_t r;
356 #if SKEIN_UNROLL_256
357         /* key schedule: chaining vars + tweak + "rot"*/
358         u64  kw[WCNT + 4 + (RCNT * 2)];
359 #else
360         /* key schedule words : chaining vars + tweak */
361         u64  kw[WCNT + 4];
362 #endif
363         u64  X0, X1, X2, X3; /* local copy of context vars, for speed */
364         u64  w[WCNT]; /* local copy of input block */
365 #ifdef SKEIN_DEBUG
366         const u64 *X_ptr[4]; /* use for debugging (help cc put Xn in regs) */
367
368         X_ptr[0] = &X0;
369         X_ptr[1] = &X1;
370         X_ptr[2] = &X2;
371         X_ptr[3] = &X3;
372 #endif
373         skein_assert(blk_cnt != 0); /* never call with blk_cnt == 0! */
374         ts[0] = ctx->h.tweak[0];
375         ts[1] = ctx->h.tweak[1];
376         do  {
377                 /*
378                  * this implementation only supports 2**64 input bytes
379                  * (no carry out here)
380                  */
381                 ts[0] += byte_cnt_add; /* update processed length */
382
383                 /* precompute the key schedule for this block */
384                 ks[0] = ctx->x[0];
385                 ks[1] = ctx->x[1];
386                 ks[2] = ctx->x[2];
387                 ks[3] = ctx->x[3];
388                 ks[4] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ SKEIN_KS_PARITY;
389
390                 ts[2] = ts[0] ^ ts[1];
391
392                 /* get input block in little-endian format */
393                 skein_get64_lsb_first(w, blk_ptr, WCNT);
394                 debug_save_tweak(ctx);
395
396                 /* do the first full key injection */
397                 X0 = w[0] + ks[0];
398                 X1 = w[1] + ks[1] + ts[0];
399                 X2 = w[2] + ks[2] + ts[1];
400                 X3 = w[3] + ks[3];
401
402                 blk_ptr += SKEIN_256_BLOCK_BYTES;
403
404                 /* run the rounds */
405                 for (r = 1;
406                         r < (SKEIN_UNROLL_256 ? 2 * RCNT : 2);
407                         r += (SKEIN_UNROLL_256 ? 2 * SKEIN_UNROLL_256 : 1)) {
408                         R256_8_ROUNDS(0);
409 #if   R256_UNROLL_R(1)
410                         R256_8_ROUNDS(1);
411 #endif
412 #if   R256_UNROLL_R(2)
413                         R256_8_ROUNDS(2);
414 #endif
415 #if   R256_UNROLL_R(3)
416                         R256_8_ROUNDS(3);
417 #endif
418 #if   R256_UNROLL_R(4)
419                         R256_8_ROUNDS(4);
420 #endif
421 #if   R256_UNROLL_R(5)
422                         R256_8_ROUNDS(5);
423 #endif
424 #if   R256_UNROLL_R(6)
425                         R256_8_ROUNDS(6);
426 #endif
427 #if   R256_UNROLL_R(7)
428                         R256_8_ROUNDS(7);
429 #endif
430 #if   R256_UNROLL_R(8)
431                         R256_8_ROUNDS(8);
432 #endif
433 #if   R256_UNROLL_R(9)
434                         R256_8_ROUNDS(9);
435 #endif
436 #if   R256_UNROLL_R(10)
437                         R256_8_ROUNDS(10);
438 #endif
439 #if   R256_UNROLL_R(11)
440                         R256_8_ROUNDS(11);
441 #endif
442 #if   R256_UNROLL_R(12)
443                         R256_8_ROUNDS(12);
444 #endif
445 #if   R256_UNROLL_R(13)
446                         R256_8_ROUNDS(13);
447 #endif
448 #if   R256_UNROLL_R(14)
449                         R256_8_ROUNDS(14);
450 #endif
451                 }
452                 /* do the final "feedforward" xor, update context chaining */
453                 ctx->x[0] = X0 ^ w[0];
454                 ctx->x[1] = X1 ^ w[1];
455                 ctx->x[2] = X2 ^ w[2];
456                 ctx->x[3] = X3 ^ w[3];
457
458                 ts[1] &= ~SKEIN_T1_FLAG_FIRST;
459         } while (--blk_cnt);
460         ctx->h.tweak[0] = ts[0];
461         ctx->h.tweak[1] = ts[1];
462 }
463
464 #if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
465 size_t skein_256_process_block_code_size(void)
466 {
467         return ((u8 *)skein_256_process_block_code_size) -
468                 ((u8 *)skein_256_process_block);
469 }
470
471 unsigned int skein_256_unroll_cnt(void)
472 {
473         return SKEIN_UNROLL_256;
474 }
475 #endif
476 #endif
477
478 /*****************************  SKEIN_512 ******************************/
479 #if !(SKEIN_USE_ASM & 512)
480 void skein_512_process_block(struct skein_512_ctx *ctx, const u8 *blk_ptr,
481                              size_t blk_cnt, size_t byte_cnt_add)
482 { /* do it in C */
483         enum {
484                 WCNT = SKEIN_512_STATE_WORDS
485         };
486         size_t  r;
487 #if SKEIN_UNROLL_512
488         /* key sched: chaining vars + tweak + "rot"*/
489         u64  kw[WCNT + 4 + RCNT * 2];
490 #else
491         /* key schedule words : chaining vars + tweak */
492         u64  kw[WCNT + 4];
493 #endif
494         u64  X0, X1, X2, X3, X4, X5, X6, X7; /* local copies, for speed */
495         u64  w[WCNT]; /* local copy of input block */
496 #ifdef SKEIN_DEBUG
497         const u64 *X_ptr[8]; /* use for debugging (help cc put Xn in regs) */
498
499         X_ptr[0] = &X0;
500         X_ptr[1] = &X1;
501         X_ptr[2] = &X2;
502         X_ptr[3] = &X3;
503         X_ptr[4] = &X4;
504         X_ptr[5] = &X5;
505         X_ptr[6] = &X6;
506         X_ptr[7] = &X7;
507 #endif
508
509         skein_assert(blk_cnt != 0); /* never call with blk_cnt == 0! */
510         ts[0] = ctx->h.tweak[0];
511         ts[1] = ctx->h.tweak[1];
512         do  {
513                 /*
514                  * this implementation only supports 2**64 input bytes
515                  * (no carry out here)
516                  */
517                 ts[0] += byte_cnt_add; /* update processed length */
518
519                 /* precompute the key schedule for this block */
520                 ks[0] = ctx->x[0];
521                 ks[1] = ctx->x[1];
522                 ks[2] = ctx->x[2];
523                 ks[3] = ctx->x[3];
524                 ks[4] = ctx->x[4];
525                 ks[5] = ctx->x[5];
526                 ks[6] = ctx->x[6];
527                 ks[7] = ctx->x[7];
528                 ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^
529                         ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY;
530
531                 ts[2] = ts[0] ^ ts[1];
532
533                 /* get input block in little-endian format */
534                 skein_get64_lsb_first(w, blk_ptr, WCNT);
535                 debug_save_tweak(ctx);
536
537                 /* do the first full key injection */
538                 X0 = w[0] + ks[0];
539                 X1 = w[1] + ks[1];
540                 X2 = w[2] + ks[2];
541                 X3 = w[3] + ks[3];
542                 X4 = w[4] + ks[4];
543                 X5 = w[5] + ks[5] + ts[0];
544                 X6 = w[6] + ks[6] + ts[1];
545                 X7 = w[7] + ks[7];
546
547                 blk_ptr += SKEIN_512_BLOCK_BYTES;
548
549                 /* run the rounds */
550                 for (r = 1;
551                         r < (SKEIN_UNROLL_512 ? 2 * RCNT : 2);
552                         r += (SKEIN_UNROLL_512 ? 2 * SKEIN_UNROLL_512 : 1)) {
553                         R512_8_ROUNDS(0);
554
555 #if   R512_UNROLL_R(1)
556                         R512_8_ROUNDS(1);
557 #endif
558 #if   R512_UNROLL_R(2)
559                         R512_8_ROUNDS(2);
560 #endif
561 #if   R512_UNROLL_R(3)
562                         R512_8_ROUNDS(3);
563 #endif
564 #if   R512_UNROLL_R(4)
565                         R512_8_ROUNDS(4);
566 #endif
567 #if   R512_UNROLL_R(5)
568                         R512_8_ROUNDS(5);
569 #endif
570 #if   R512_UNROLL_R(6)
571                         R512_8_ROUNDS(6);
572 #endif
573 #if   R512_UNROLL_R(7)
574                         R512_8_ROUNDS(7);
575 #endif
576 #if   R512_UNROLL_R(8)
577                         R512_8_ROUNDS(8);
578 #endif
579 #if   R512_UNROLL_R(9)
580                         R512_8_ROUNDS(9);
581 #endif
582 #if   R512_UNROLL_R(10)
583                         R512_8_ROUNDS(10);
584 #endif
585 #if   R512_UNROLL_R(11)
586                         R512_8_ROUNDS(11);
587 #endif
588 #if   R512_UNROLL_R(12)
589                         R512_8_ROUNDS(12);
590 #endif
591 #if   R512_UNROLL_R(13)
592                         R512_8_ROUNDS(13);
593 #endif
594 #if   R512_UNROLL_R(14)
595                         R512_8_ROUNDS(14);
596 #endif
597                 }
598
599                 /* do the final "feedforward" xor, update context chaining */
600                 ctx->x[0] = X0 ^ w[0];
601                 ctx->x[1] = X1 ^ w[1];
602                 ctx->x[2] = X2 ^ w[2];
603                 ctx->x[3] = X3 ^ w[3];
604                 ctx->x[4] = X4 ^ w[4];
605                 ctx->x[5] = X5 ^ w[5];
606                 ctx->x[6] = X6 ^ w[6];
607                 ctx->x[7] = X7 ^ w[7];
608
609                 ts[1] &= ~SKEIN_T1_FLAG_FIRST;
610         } while (--blk_cnt);
611         ctx->h.tweak[0] = ts[0];
612         ctx->h.tweak[1] = ts[1];
613 }
614
615 #if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
616 size_t skein_512_process_block_code_size(void)
617 {
618         return ((u8 *)skein_512_process_block_code_size) -
619                 ((u8 *)skein_512_process_block);
620 }
621
622 unsigned int skein_512_unroll_cnt(void)
623 {
624         return SKEIN_UNROLL_512;
625 }
626 #endif
627 #endif
628
629 /*****************************  SKEIN_1024 ******************************/
630 #if !(SKEIN_USE_ASM & 1024)
631 void skein_1024_process_block(struct skein_1024_ctx *ctx, const u8 *blk_ptr,
632                               size_t blk_cnt, size_t byte_cnt_add)
633 { /* do it in C, always looping (unrolled is bigger AND slower!) */
634         enum {
635                 WCNT = SKEIN_1024_STATE_WORDS
636         };
637         size_t  r;
638 #if (SKEIN_UNROLL_1024 != 0)
639         /* key sched: chaining vars + tweak + "rot" */
640         u64  kw[WCNT + 4 + (RCNT * 2)];
641 #else
642         /* key schedule words : chaining vars + tweak */
643         u64  kw[WCNT + 4];
644 #endif
645
646         /* local copy of vars, for speed */
647         u64  X00, X01, X02, X03, X04, X05, X06, X07,
648              X08, X09, X10, X11, X12, X13, X14, X15;
649         u64  w[WCNT]; /* local copy of input block */
650
651         skein_assert(blk_cnt != 0); /* never call with blk_cnt == 0! */
652         ts[0] = ctx->h.tweak[0];
653         ts[1] = ctx->h.tweak[1];
654         do  {
655                 /*
656                  * this implementation only supports 2**64 input bytes
657                  * (no carry out here)
658                  */
659                 ts[0] += byte_cnt_add; /* update processed length */
660
661                 /* precompute the key schedule for this block */
662                 ks[0]  = ctx->x[0];
663                 ks[1]  = ctx->x[1];
664                 ks[2]  = ctx->x[2];
665                 ks[3]  = ctx->x[3];
666                 ks[4]  = ctx->x[4];
667                 ks[5]  = ctx->x[5];
668                 ks[6]  = ctx->x[6];
669                 ks[7]  = ctx->x[7];
670                 ks[8]  = ctx->x[8];
671                 ks[9]  = ctx->x[9];
672                 ks[10] = ctx->x[10];
673                 ks[11] = ctx->x[11];
674                 ks[12] = ctx->x[12];
675                 ks[13] = ctx->x[13];
676                 ks[14] = ctx->x[14];
677                 ks[15] = ctx->x[15];
678                 ks[16] =  ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^
679                           ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^
680                           ks[8] ^ ks[9] ^ ks[10] ^ ks[11] ^
681                           ks[12] ^ ks[13] ^ ks[14] ^ ks[15] ^ SKEIN_KS_PARITY;
682
683                 ts[2] = ts[0] ^ ts[1];
684
685                 /* get input block in little-endian format */
686                 skein_get64_lsb_first(w, blk_ptr, WCNT);
687                 debug_save_tweak(ctx);
688
689                 /* do the first full key injection */
690                 X00 = w[0] + ks[0];
691                 X01 = w[1] + ks[1];
692                 X02 = w[2] + ks[2];
693                 X03 = w[3] + ks[3];
694                 X04 = w[4] + ks[4];
695                 X05 = w[5] + ks[5];
696                 X06 = w[6] + ks[6];
697                 X07 = w[7] + ks[7];
698                 X08 = w[8] + ks[8];
699                 X09 = w[9] + ks[9];
700                 X10 = w[10] + ks[10];
701                 X11 = w[11] + ks[11];
702                 X12 = w[12] + ks[12];
703                 X13 = w[13] + ks[13] + ts[0];
704                 X14 = w[14] + ks[14] + ts[1];
705                 X15 = w[15] + ks[15];
706
707                 for (r = 1;
708                         r < (SKEIN_UNROLL_1024 ? 2 * RCNT : 2);
709                         r += (SKEIN_UNROLL_1024 ? 2 * SKEIN_UNROLL_1024 : 1)) {
710                         R1024_8_ROUNDS(0);
711 #if   R1024_UNROLL_R(1)
712                         R1024_8_ROUNDS(1);
713 #endif
714 #if   R1024_UNROLL_R(2)
715                         R1024_8_ROUNDS(2);
716 #endif
717 #if   R1024_UNROLL_R(3)
718                         R1024_8_ROUNDS(3);
719 #endif
720 #if   R1024_UNROLL_R(4)
721                         R1024_8_ROUNDS(4);
722 #endif
723 #if   R1024_UNROLL_R(5)
724                         R1024_8_ROUNDS(5);
725 #endif
726 #if   R1024_UNROLL_R(6)
727                         R1024_8_ROUNDS(6);
728 #endif
729 #if   R1024_UNROLL_R(7)
730                         R1024_8_ROUNDS(7);
731 #endif
732 #if   R1024_UNROLL_R(8)
733                         R1024_8_ROUNDS(8);
734 #endif
735 #if   R1024_UNROLL_R(9)
736                         R1024_8_ROUNDS(9);
737 #endif
738 #if   R1024_UNROLL_R(10)
739                         R1024_8_ROUNDS(10);
740 #endif
741 #if   R1024_UNROLL_R(11)
742                         R1024_8_ROUNDS(11);
743 #endif
744 #if   R1024_UNROLL_R(12)
745                         R1024_8_ROUNDS(12);
746 #endif
747 #if   R1024_UNROLL_R(13)
748                         R1024_8_ROUNDS(13);
749 #endif
750 #if   R1024_UNROLL_R(14)
751                         R1024_8_ROUNDS(14);
752 #endif
753                 }
754                 /* do the final "feedforward" xor, update context chaining */
755
756                 ctx->x[0] = X00 ^ w[0];
757                 ctx->x[1] = X01 ^ w[1];
758                 ctx->x[2] = X02 ^ w[2];
759                 ctx->x[3] = X03 ^ w[3];
760                 ctx->x[4] = X04 ^ w[4];
761                 ctx->x[5] = X05 ^ w[5];
762                 ctx->x[6] = X06 ^ w[6];
763                 ctx->x[7] = X07 ^ w[7];
764                 ctx->x[8] = X08 ^ w[8];
765                 ctx->x[9] = X09 ^ w[9];
766                 ctx->x[10] = X10 ^ w[10];
767                 ctx->x[11] = X11 ^ w[11];
768                 ctx->x[12] = X12 ^ w[12];
769                 ctx->x[13] = X13 ^ w[13];
770                 ctx->x[14] = X14 ^ w[14];
771                 ctx->x[15] = X15 ^ w[15];
772
773                 ts[1] &= ~SKEIN_T1_FLAG_FIRST;
774                 blk_ptr += SKEIN_1024_BLOCK_BYTES;
775         } while (--blk_cnt);
776         ctx->h.tweak[0] = ts[0];
777         ctx->h.tweak[1] = ts[1];
778 }
779
780 #if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
781 size_t skein_1024_process_block_code_size(void)
782 {
783         return ((u8 *)skein_1024_process_block_code_size) -
784                 ((u8 *)skein_1024_process_block);
785 }
786
787 unsigned int skein_1024_unroll_cnt(void)
788 {
789         return SKEIN_UNROLL_1024;
790 }
791 #endif
792 #endif