 * Copyright (c) 2013-2019
 * Frank Denis <j at pureftpd dot org>
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
30 memset(&h[0], 0, 10 * sizeof h[0]);
42 memset(&h[2], 0, 8 * sizeof h[0]);
/*
 h = f + g
 Can overlap h with f or g.

 Preconditions:
   |f| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
   |g| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.

 Postconditions:
   |h| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
 */

/* NOTE(review): identical redeclaration of the project typedef (valid C11);
   drop if the typedef is already in scope and the project targets pre-C11. */
typedef int32_t fe25519[10];

static void
fe25519_add(fe25519 h, const fe25519 f, const fe25519 g)
{
    /* Limb-wise addition; no carry propagation is needed because the
       precondition bounds guarantee each sum fits in an int32_t. */
    int32_t h0 = f[0] + g[0];
    int32_t h1 = f[1] + g[1];
    int32_t h2 = f[2] + g[2];
    int32_t h3 = f[3] + g[3];
    int32_t h4 = f[4] + g[4];
    int32_t h5 = f[5] + g[5];
    int32_t h6 = f[6] + g[6];
    int32_t h7 = f[7] + g[7];
    int32_t h8 = f[8] + g[8];
    int32_t h9 = f[9] + g[9];

    h[0] = h0;
    h[1] = h1;
    h[2] = h2;
    h[3] = h3;
    h[4] = h4;
    h[5] = h5;
    h[6] = h6;
    h[7] = h7;
    h[8] = h8;
    h[9] = h9;
}
/*
 h = f - g
 Can overlap h with f or g.

 Preconditions:
   |f| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
   |g| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.

 Postconditions:
   |h| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
 */

/* NOTE(review): identical redeclaration of the project typedef (valid C11). */
typedef int32_t fe25519[10];

static void
fe25519_sub(fe25519 h, const fe25519 f, const fe25519 g)
{
    /* Limb-wise subtraction; negative limbs are fine in this signed
       redundant representation, so no reduction is performed here. */
    int32_t h0 = f[0] - g[0];
    int32_t h1 = f[1] - g[1];
    int32_t h2 = f[2] - g[2];
    int32_t h3 = f[3] - g[3];
    int32_t h4 = f[4] - g[4];
    int32_t h5 = f[5] - g[5];
    int32_t h6 = f[6] - g[6];
    int32_t h7 = f[7] - g[7];
    int32_t h8 = f[8] - g[8];
    int32_t h9 = f[9] - g[9];

    h[0] = h0;
    h[1] = h1;
    h[2] = h2;
    h[3] = h3;
    h[4] = h4;
    h[5] = h5;
    h[6] = h6;
    h[7] = h7;
    h[8] = h8;
    h[9] = h9;
}
/*
 h = -f

 Preconditions:
   |f| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.

 Postconditions:
   |h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
 */

/* NOTE(review): identical redeclaration of the project typedef (valid C11). */
typedef int32_t fe25519[10];

static void
fe25519_neg(fe25519 h, const fe25519 f)
{
    /* Limb-wise negation; valid because limbs are signed and bounded
       well below INT32_MAX, so -f[i] cannot overflow. */
    int32_t h0 = -f[0];
    int32_t h1 = -f[1];
    int32_t h2 = -f[2];
    int32_t h3 = -f[3];
    int32_t h4 = -f[4];
    int32_t h5 = -f[5];
    int32_t h6 = -f[6];
    int32_t h7 = -f[7];
    int32_t h8 = -f[8];
    int32_t h9 = -f[9];

    h[0] = h0;
    h[1] = h1;
    h[2] = h2;
    h[3] = h3;
    h[4] = h4;
    h[5] = h5;
    h[6] = h6;
    h[7] = h7;
    h[8] = h8;
    h[9] = h9;
}
/*
 Replace (f,g) with (g,g) if b == 1;
 replace (f,g) with (f,g) if b == 0.

 Preconditions: b in {0,1}.

 Constant-time: selection is done with a bitmask rather than a branch,
 so the execution trace does not depend on the secret bit b.
 */

/* NOTE(review): identical redeclaration of the project typedef (valid C11). */
typedef int32_t fe25519[10];

static void
fe25519_cmov(fe25519 f, const fe25519 g, unsigned int b)
{
    /* mask is all-ones when b == 1, all-zeros when b == 0 */
    const uint32_t mask = (uint32_t) (-(int32_t) b);

    int32_t f0 = f[0];
    int32_t f1 = f[1];
    int32_t f2 = f[2];
    int32_t f3 = f[3];
    int32_t f4 = f[4];
    int32_t f5 = f[5];
    int32_t f6 = f[6];
    int32_t f7 = f[7];
    int32_t f8 = f[8];
    int32_t f9 = f[9];

    int32_t x0 = f0 ^ g[0];
    int32_t x1 = f1 ^ g[1];
    int32_t x2 = f2 ^ g[2];
    int32_t x3 = f3 ^ g[3];
    int32_t x4 = f4 ^ g[4];
    int32_t x5 = f5 ^ g[5];
    int32_t x6 = f6 ^ g[6];
    int32_t x7 = f7 ^ g[7];
    int32_t x8 = f8 ^ g[8];
    int32_t x9 = f9 ^ g[9];

    x0 &= mask;
    x1 &= mask;
    x2 &= mask;
    x3 &= mask;
    x4 &= mask;
    x5 &= mask;
    x6 &= mask;
    x7 &= mask;
    x8 &= mask;
    x9 &= mask;

    /* f ^ (f ^ g) == g when mask is set; f ^ 0 == f otherwise */
    f[0] = f0 ^ x0;
    f[1] = f1 ^ x1;
    f[2] = f2 ^ x2;
    f[3] = f3 ^ x3;
    f[4] = f4 ^ x4;
    f[5] = f5 ^ x5;
    f[6] = f6 ^ x6;
    f[7] = f7 ^ x7;
    f[8] = f8 ^ x8;
    f[9] = f9 ^ x9;
}
/*
 Swap (f,g) if b == 1; leave (f,g) unchanged if b == 0.

 Preconditions: b in {0,1}.

 Constant-time: swapping is done with a bitmask rather than a branch,
 so the execution trace does not depend on the secret bit b.
 */

/* NOTE(review): identical redeclaration of the project typedef (valid C11). */
typedef int32_t fe25519[10];

static void
fe25519_cswap(fe25519 f, fe25519 g, unsigned int b)
{
    /* mask is all-ones when b == 1, all-zeros when b == 0 */
    const uint32_t mask = (uint32_t) (-(int64_t) b);

    int32_t f0 = f[0];
    int32_t f1 = f[1];
    int32_t f2 = f[2];
    int32_t f3 = f[3];
    int32_t f4 = f[4];
    int32_t f5 = f[5];
    int32_t f6 = f[6];
    int32_t f7 = f[7];
    int32_t f8 = f[8];
    int32_t f9 = f[9];

    int32_t g0 = g[0];
    int32_t g1 = g[1];
    int32_t g2 = g[2];
    int32_t g3 = g[3];
    int32_t g4 = g[4];
    int32_t g5 = g[5];
    int32_t g6 = g[6];
    int32_t g7 = g[7];
    int32_t g8 = g[8];
    int32_t g9 = g[9];

    int32_t x0 = f0 ^ g0;
    int32_t x1 = f1 ^ g1;
    int32_t x2 = f2 ^ g2;
    int32_t x3 = f3 ^ g3;
    int32_t x4 = f4 ^ g4;
    int32_t x5 = f5 ^ g5;
    int32_t x6 = f6 ^ g6;
    int32_t x7 = f7 ^ g7;
    int32_t x8 = f8 ^ g8;
    int32_t x9 = f9 ^ g9;

    x0 &= mask;
    x1 &= mask;
    x2 &= mask;
    x3 &= mask;
    x4 &= mask;
    x5 &= mask;
    x6 &= mask;
    x7 &= mask;
    x8 &= mask;
    x9 &= mask;

    f[0] = f0 ^ x0;
    f[1] = f1 ^ x1;
    f[2] = f2 ^ x2;
    f[3] = f3 ^ x3;
    f[4] = f4 ^ x4;
    f[5] = f5 ^ x5;
    f[6] = f6 ^ x6;
    f[7] = f7 ^ x7;
    f[8] = f8 ^ x8;
    f[9] = f9 ^ x9;

    g[0] = g0 ^ x0;
    g[1] = g1 ^ x1;
    g[2] = g2 ^ x2;
    g[3] = g3 ^ x3;
    g[4] = g4 ^ x4;
    g[5] = g5 ^ x5;
    g[6] = g6 ^ x6;
    g[7] = g7 ^ x7;
    g[8] = g8 ^ x8;
    g[9] = g9 ^ x9;
}
/*
 h = f
 */

/* NOTE(review): identical redeclaration of the project typedef (valid C11). */
typedef int32_t fe25519[10];

static void
fe25519_copy(fe25519 h, const fe25519 f)
{
    int32_t f0 = f[0];
    int32_t f1 = f[1];
    int32_t f2 = f[2];
    int32_t f3 = f[3];
    int32_t f4 = f[4];
    int32_t f5 = f[5];
    int32_t f6 = f[6];
    int32_t f7 = f[7];
    int32_t f8 = f[8];
    int32_t f9 = f[9];

    h[0] = f0;
    h[1] = f1;
    h[2] = f2;
    h[3] = f3;
    h[4] = f4;
    h[5] = f5;
    h[6] = f6;
    h[7] = f7;
    h[8] = f8;
    h[9] = f9;
}
/*
 return 1 if f is in {1,3,5,...,q-2}
 return 0 if f is in {0,2,4,...,q-1}

 Preconditions:
   |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
 */

/* NOTE(review): identical redeclaration of the project typedef (valid C11). */
typedef int32_t fe25519[10];

static int
fe25519_isnegative(const fe25519 f)
{
    unsigned char s[32];

    /* Serialize to the canonical (fully reduced) little-endian form,
       then test the parity bit of the low byte. */
    fe25519_tobytes(s, f);

    return s[0] & 1;
}
/*
 return 1 if f == 0
 return 0 if f != 0

 Preconditions:
   |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
 */

/* NOTE(review): identical redeclaration of the project typedef (valid C11). */
typedef int32_t fe25519[10];

static int
fe25519_iszero(const fe25519 f)
{
    unsigned char s[32];

    /* Serialize to canonical form first: the redundant limb representation
       has many encodings of zero, but the byte form is unique. */
    fe25519_tobytes(s, f);

    /* sodium_is_zero is a constant-time all-bytes-zero check. */
    return sodium_is_zero(s, 32);
}
/*
 h = f * g
 Can overlap h with f or g.

 Preconditions:
   |f| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc.
   |g| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc.

 Postconditions:
   |h| bounded by 1.01*2^25,1.01*2^24,1.01*2^25,1.01*2^24,etc.
 */

/*
 Notes on implementation strategy:

 Using schoolbook multiplication.
 Karatsuba would save a little in some cost models.

 Most multiplications by 2 and 19 are 32-bit precomputations;
 cheaper than 64-bit postcomputations.

 There is one remaining multiplication by 19 in the carry chain;
 one *19 precomputation can be merged into this,
 but the resulting data flow is considerably less clean.

 There are 12 carries below.
 10 of them are 2-way parallelizable and vectorizable.
 Can get away with 11 carries, but then data flow is much deeper.

 With tighter constraints on inputs can squeeze carries into int32.
 */

/* NOTE(review): identical redeclaration of the project typedef (valid C11). */
typedef int32_t fe25519[10];

static void
fe25519_mul(fe25519 h, const fe25519 f, const fe25519 g)
{
    int32_t f0 = f[0];
    int32_t f1 = f[1];
    int32_t f2 = f[2];
    int32_t f3 = f[3];
    int32_t f4 = f[4];
    int32_t f5 = f[5];
    int32_t f6 = f[6];
    int32_t f7 = f[7];
    int32_t f8 = f[8];
    int32_t f9 = f[9];

    int32_t g0 = g[0];
    int32_t g1 = g[1];
    int32_t g2 = g[2];
    int32_t g3 = g[3];
    int32_t g4 = g[4];
    int32_t g5 = g[5];
    int32_t g6 = g[6];
    int32_t g7 = g[7];
    int32_t g8 = g[8];
    int32_t g9 = g[9];

    /* *19 folds the 2^255 wraparound (2^255 = 19 mod p) into the
       precomputation; *2 accounts for the mixed 26/25-bit radix. */
    int32_t g1_19 = 19 * g1; /* 1.959375*2^29 */
    int32_t g2_19 = 19 * g2; /* 1.959375*2^30; still ok */
    int32_t g3_19 = 19 * g3;
    int32_t g4_19 = 19 * g4;
    int32_t g5_19 = 19 * g5;
    int32_t g6_19 = 19 * g6;
    int32_t g7_19 = 19 * g7;
    int32_t g8_19 = 19 * g8;
    int32_t g9_19 = 19 * g9;
    int32_t f1_2  = 2 * f1;
    int32_t f3_2  = 2 * f3;
    int32_t f5_2  = 2 * f5;
    int32_t f7_2  = 2 * f7;
    int32_t f9_2  = 2 * f9;

    int64_t f0g0    = f0 * (int64_t) g0;
    int64_t f0g1    = f0 * (int64_t) g1;
    int64_t f0g2    = f0 * (int64_t) g2;
    int64_t f0g3    = f0 * (int64_t) g3;
    int64_t f0g4    = f0 * (int64_t) g4;
    int64_t f0g5    = f0 * (int64_t) g5;
    int64_t f0g6    = f0 * (int64_t) g6;
    int64_t f0g7    = f0 * (int64_t) g7;
    int64_t f0g8    = f0 * (int64_t) g8;
    int64_t f0g9    = f0 * (int64_t) g9;
    int64_t f1g0    = f1 * (int64_t) g0;
    int64_t f1g1_2  = f1_2 * (int64_t) g1;
    int64_t f1g2    = f1 * (int64_t) g2;
    int64_t f1g3_2  = f1_2 * (int64_t) g3;
    int64_t f1g4    = f1 * (int64_t) g4;
    int64_t f1g5_2  = f1_2 * (int64_t) g5;
    int64_t f1g6    = f1 * (int64_t) g6;
    int64_t f1g7_2  = f1_2 * (int64_t) g7;
    int64_t f1g8    = f1 * (int64_t) g8;
    int64_t f1g9_38 = f1_2 * (int64_t) g9_19;
    int64_t f2g0    = f2 * (int64_t) g0;
    int64_t f2g1    = f2 * (int64_t) g1;
    int64_t f2g2    = f2 * (int64_t) g2;
    int64_t f2g3    = f2 * (int64_t) g3;
    int64_t f2g4    = f2 * (int64_t) g4;
    int64_t f2g5    = f2 * (int64_t) g5;
    int64_t f2g6    = f2 * (int64_t) g6;
    int64_t f2g7    = f2 * (int64_t) g7;
    int64_t f2g8_19 = f2 * (int64_t) g8_19;
    int64_t f2g9_19 = f2 * (int64_t) g9_19;
    int64_t f3g0    = f3 * (int64_t) g0;
    int64_t f3g1_2  = f3_2 * (int64_t) g1;
    int64_t f3g2    = f3 * (int64_t) g2;
    int64_t f3g3_2  = f3_2 * (int64_t) g3;
    int64_t f3g4    = f3 * (int64_t) g4;
    int64_t f3g5_2  = f3_2 * (int64_t) g5;
    int64_t f3g6    = f3 * (int64_t) g6;
    int64_t f3g7_38 = f3_2 * (int64_t) g7_19;
    int64_t f3g8_19 = f3 * (int64_t) g8_19;
    int64_t f3g9_38 = f3_2 * (int64_t) g9_19;
    int64_t f4g0    = f4 * (int64_t) g0;
    int64_t f4g1    = f4 * (int64_t) g1;
    int64_t f4g2    = f4 * (int64_t) g2;
    int64_t f4g3    = f4 * (int64_t) g3;
    int64_t f4g4    = f4 * (int64_t) g4;
    int64_t f4g5    = f4 * (int64_t) g5;
    int64_t f4g6_19 = f4 * (int64_t) g6_19;
    int64_t f4g7_19 = f4 * (int64_t) g7_19;
    int64_t f4g8_19 = f4 * (int64_t) g8_19;
    int64_t f4g9_19 = f4 * (int64_t) g9_19;
    int64_t f5g0    = f5 * (int64_t) g0;
    int64_t f5g1_2  = f5_2 * (int64_t) g1;
    int64_t f5g2    = f5 * (int64_t) g2;
    int64_t f5g3_2  = f5_2 * (int64_t) g3;
    int64_t f5g4    = f5 * (int64_t) g4;
    int64_t f5g5_38 = f5_2 * (int64_t) g5_19;
    int64_t f5g6_19 = f5 * (int64_t) g6_19;
    int64_t f5g7_38 = f5_2 * (int64_t) g7_19;
    int64_t f5g8_19 = f5 * (int64_t) g8_19;
    int64_t f5g9_38 = f5_2 * (int64_t) g9_19;
    int64_t f6g0    = f6 * (int64_t) g0;
    int64_t f6g1    = f6 * (int64_t) g1;
    int64_t f6g2    = f6 * (int64_t) g2;
    int64_t f6g3    = f6 * (int64_t) g3;
    int64_t f6g4_19 = f6 * (int64_t) g4_19;
    int64_t f6g5_19 = f6 * (int64_t) g5_19;
    int64_t f6g6_19 = f6 * (int64_t) g6_19;
    int64_t f6g7_19 = f6 * (int64_t) g7_19;
    int64_t f6g8_19 = f6 * (int64_t) g8_19;
    int64_t f6g9_19 = f6 * (int64_t) g9_19;
    int64_t f7g0    = f7 * (int64_t) g0;
    int64_t f7g1_2  = f7_2 * (int64_t) g1;
    int64_t f7g2    = f7 * (int64_t) g2;
    int64_t f7g3_38 = f7_2 * (int64_t) g3_19;
    int64_t f7g4_19 = f7 * (int64_t) g4_19;
    int64_t f7g5_38 = f7_2 * (int64_t) g5_19;
    int64_t f7g6_19 = f7 * (int64_t) g6_19;
    int64_t f7g7_38 = f7_2 * (int64_t) g7_19;
    int64_t f7g8_19 = f7 * (int64_t) g8_19;
    int64_t f7g9_38 = f7_2 * (int64_t) g9_19;
    int64_t f8g0    = f8 * (int64_t) g0;
    int64_t f8g1    = f8 * (int64_t) g1;
    int64_t f8g2_19 = f8 * (int64_t) g2_19;
    int64_t f8g3_19 = f8 * (int64_t) g3_19;
    int64_t f8g4_19 = f8 * (int64_t) g4_19;
    int64_t f8g5_19 = f8 * (int64_t) g5_19;
    int64_t f8g6_19 = f8 * (int64_t) g6_19;
    int64_t f8g7_19 = f8 * (int64_t) g7_19;
    int64_t f8g8_19 = f8 * (int64_t) g8_19;
    int64_t f8g9_19 = f8 * (int64_t) g9_19;
    int64_t f9g0    = f9 * (int64_t) g0;
    int64_t f9g1_38 = f9_2 * (int64_t) g1_19;
    int64_t f9g2_19 = f9 * (int64_t) g2_19;
    int64_t f9g3_38 = f9_2 * (int64_t) g3_19;
    int64_t f9g4_19 = f9 * (int64_t) g4_19;
    int64_t f9g5_38 = f9_2 * (int64_t) g5_19;
    int64_t f9g6_19 = f9 * (int64_t) g6_19;
    int64_t f9g7_38 = f9_2 * (int64_t) g7_19;
    int64_t f9g8_19 = f9 * (int64_t) g8_19;
    int64_t f9g9_38 = f9_2 * (int64_t) g9_19;

    int64_t h0 = f0g0 + f1g9_38 + f2g8_19 + f3g7_38 + f4g6_19 + f5g5_38 +
                 f6g4_19 + f7g3_38 + f8g2_19 + f9g1_38;
    int64_t h1 = f0g1 + f1g0 + f2g9_19 + f3g8_19 + f4g7_19 + f5g6_19 + f6g5_19 +
                 f7g4_19 + f8g3_19 + f9g2_19;
    int64_t h2 = f0g2 + f1g1_2 + f2g0 + f3g9_38 + f4g8_19 + f5g7_38 + f6g6_19 +
                 f7g5_38 + f8g4_19 + f9g3_38;
    int64_t h3 = f0g3 + f1g2 + f2g1 + f3g0 + f4g9_19 + f5g8_19 + f6g7_19 +
                 f7g6_19 + f8g5_19 + f9g4_19;
    int64_t h4 = f0g4 + f1g3_2 + f2g2 + f3g1_2 + f4g0 + f5g9_38 + f6g8_19 +
                 f7g7_38 + f8g6_19 + f9g5_38;
    int64_t h5 = f0g5 + f1g4 + f2g3 + f3g2 + f4g1 + f5g0 + f6g9_19 + f7g8_19 +
                 f8g7_19 + f9g6_19;
    int64_t h6 = f0g6 + f1g5_2 + f2g4 + f3g3_2 + f4g2 + f5g1_2 + f6g0 +
                 f7g9_38 + f8g8_19 + f9g7_38;
    int64_t h7 = f0g7 + f1g6 + f2g5 + f3g4 + f4g3 + f5g2 + f6g1 + f7g0 +
                 f8g9_19 + f9g8_19;
    int64_t h8 = f0g8 + f1g7_2 + f2g6 + f3g5_2 + f4g4 + f5g3_2 + f6g2 + f7g1_2 +
                 f8g0 + f9g9_38;
    int64_t h9 =
        f0g9 + f1g8 + f2g7 + f3g6 + f4g5 + f5g4 + f6g3 + f7g2 + f8g1 + f9g0;

    int64_t carry0, carry1, carry2, carry3, carry4, carry5, carry6, carry7,
        carry8, carry9;

    /*
     |h0| <= (1.65*1.65*2^52*(1+19+19+19+19)+1.65*1.65*2^50*(38+38+38+38+38))
       i.e. |h0| <= 1.4*2^60; narrower ranges for h2, h4, h6, h8
     |h1| <= (1.65*1.65*2^51*(1+1+19+19+19+19+19+19+19+19))
       i.e. |h1| <= 1.7*2^59; narrower ranges for h3, h5, h7, h9
     */

    /* Signed rounding carry: the (uint64_t) shift avoids UB from
       left-shifting a signed value. */
    carry0 = (h0 + (int64_t)(1L << 25)) >> 26;
    h1 += carry0;
    h0 -= carry0 * ((uint64_t) 1L << 26);
    carry4 = (h4 + (int64_t)(1L << 25)) >> 26;
    h5 += carry4;
    h4 -= carry4 * ((uint64_t) 1L << 26);
    /* |h0| <= 2^25 */
    /* |h4| <= 2^25 */

    carry1 = (h1 + (int64_t)(1L << 24)) >> 25;
    h2 += carry1;
    h1 -= carry1 * ((uint64_t) 1L << 25);
    carry5 = (h5 + (int64_t)(1L << 24)) >> 25;
    h6 += carry5;
    h5 -= carry5 * ((uint64_t) 1L << 25);
    /* |h1| <= 2^24; from now on fits into int32 */
    /* |h5| <= 2^24; from now on fits into int32 */

    carry2 = (h2 + (int64_t)(1L << 25)) >> 26;
    h3 += carry2;
    h2 -= carry2 * ((uint64_t) 1L << 26);
    carry6 = (h6 + (int64_t)(1L << 25)) >> 26;
    h7 += carry6;
    h6 -= carry6 * ((uint64_t) 1L << 26);
    /* |h2| <= 2^25; from now on fits into int32 unchanged */
    /* |h6| <= 2^25; from now on fits into int32 unchanged */

    carry3 = (h3 + (int64_t)(1L << 24)) >> 25;
    h4 += carry3;
    h3 -= carry3 * ((uint64_t) 1L << 25);
    carry7 = (h7 + (int64_t)(1L << 24)) >> 25;
    h8 += carry7;
    h7 -= carry7 * ((uint64_t) 1L << 25);
    /* |h3| <= 2^24; from now on fits into int32 unchanged */
    /* |h7| <= 2^24; from now on fits into int32 unchanged */

    carry4 = (h4 + (int64_t)(1L << 25)) >> 26;
    h5 += carry4;
    h4 -= carry4 * ((uint64_t) 1L << 26);
    carry8 = (h8 + (int64_t)(1L << 25)) >> 26;
    h9 += carry8;
    h8 -= carry8 * ((uint64_t) 1L << 26);
    /* |h4| <= 2^25; from now on fits into int32 unchanged */
    /* |h8| <= 2^25; from now on fits into int32 unchanged */

    carry9 = (h9 + (int64_t)(1L << 24)) >> 25;
    h0 += carry9 * 19; /* 2^255 wraps around to 19 (mod p) */
    h9 -= carry9 * ((uint64_t) 1L << 25);
    /* |h9| <= 2^24; from now on fits into int32 unchanged */

    carry0 = (h0 + (int64_t)(1L << 25)) >> 26;
    h1 += carry0;
    h0 -= carry0 * ((uint64_t) 1L << 26);
    /* |h0| <= 2^25; from now on fits into int32 unchanged */

    h[0] = (int32_t) h0;
    h[1] = (int32_t) h1;
    h[2] = (int32_t) h2;
    h[3] = (int32_t) h3;
    h[4] = (int32_t) h4;
    h[5] = (int32_t) h5;
    h[6] = (int32_t) h6;
    h[7] = (int32_t) h7;
    h[8] = (int32_t) h8;
    h[9] = (int32_t) h9;
}
/*
 h = f * f
 Can overlap h with f.

 Preconditions:
   |f| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc.

 Postconditions:
   |h| bounded by 1.01*2^25,1.01*2^24,1.01*2^25,1.01*2^24,etc.

 See fe25519_mul for discussion of implementation strategy;
 squaring merges symmetric cross terms (fi*fj + fj*fi) up front.
 */

/* NOTE(review): identical redeclaration of the project typedef (valid C11). */
typedef int32_t fe25519[10];

static void
fe25519_sq(fe25519 h, const fe25519 f)
{
    int32_t f0 = f[0];
    int32_t f1 = f[1];
    int32_t f2 = f[2];
    int32_t f3 = f[3];
    int32_t f4 = f[4];
    int32_t f5 = f[5];
    int32_t f6 = f[6];
    int32_t f7 = f[7];
    int32_t f8 = f[8];
    int32_t f9 = f[9];

    int32_t f0_2  = 2 * f0;
    int32_t f1_2  = 2 * f1;
    int32_t f2_2  = 2 * f2;
    int32_t f3_2  = 2 * f3;
    int32_t f4_2  = 2 * f4;
    int32_t f5_2  = 2 * f5;
    int32_t f6_2  = 2 * f6;
    int32_t f7_2  = 2 * f7;
    int32_t f5_38 = 38 * f5; /* 1.959375*2^30 */
    int32_t f6_19 = 19 * f6; /* 1.959375*2^30 */
    int32_t f7_38 = 38 * f7; /* 1.959375*2^30 */
    int32_t f8_19 = 19 * f8; /* 1.959375*2^30 */
    int32_t f9_38 = 38 * f9; /* 1.959375*2^30 */

    int64_t f0f0    = f0 * (int64_t) f0;
    int64_t f0f1_2  = f0_2 * (int64_t) f1;
    int64_t f0f2_2  = f0_2 * (int64_t) f2;
    int64_t f0f3_2  = f0_2 * (int64_t) f3;
    int64_t f0f4_2  = f0_2 * (int64_t) f4;
    int64_t f0f5_2  = f0_2 * (int64_t) f5;
    int64_t f0f6_2  = f0_2 * (int64_t) f6;
    int64_t f0f7_2  = f0_2 * (int64_t) f7;
    int64_t f0f8_2  = f0_2 * (int64_t) f8;
    int64_t f0f9_2  = f0_2 * (int64_t) f9;
    int64_t f1f1_2  = f1_2 * (int64_t) f1;
    int64_t f1f2_2  = f1_2 * (int64_t) f2;
    int64_t f1f3_4  = f1_2 * (int64_t) f3_2;
    int64_t f1f4_2  = f1_2 * (int64_t) f4;
    int64_t f1f5_4  = f1_2 * (int64_t) f5_2;
    int64_t f1f6_2  = f1_2 * (int64_t) f6;
    int64_t f1f7_4  = f1_2 * (int64_t) f7_2;
    int64_t f1f8_2  = f1_2 * (int64_t) f8;
    int64_t f1f9_76 = f1_2 * (int64_t) f9_38;
    int64_t f2f2    = f2 * (int64_t) f2;
    int64_t f2f3_2  = f2_2 * (int64_t) f3;
    int64_t f2f4_2  = f2_2 * (int64_t) f4;
    int64_t f2f5_2  = f2_2 * (int64_t) f5;
    int64_t f2f6_2  = f2_2 * (int64_t) f6;
    int64_t f2f7_2  = f2_2 * (int64_t) f7;
    int64_t f2f8_38 = f2_2 * (int64_t) f8_19;
    int64_t f2f9_38 = f2 * (int64_t) f9_38;
    int64_t f3f3_2  = f3_2 * (int64_t) f3;
    int64_t f3f4_2  = f3_2 * (int64_t) f4;
    int64_t f3f5_4  = f3_2 * (int64_t) f5_2;
    int64_t f3f6_2  = f3_2 * (int64_t) f6;
    int64_t f3f7_76 = f3_2 * (int64_t) f7_38;
    int64_t f3f8_38 = f3_2 * (int64_t) f8_19;
    int64_t f3f9_76 = f3_2 * (int64_t) f9_38;
    int64_t f4f4    = f4 * (int64_t) f4;
    int64_t f4f5_2  = f4_2 * (int64_t) f5;
    int64_t f4f6_38 = f4_2 * (int64_t) f6_19;
    int64_t f4f7_38 = f4 * (int64_t) f7_38;
    int64_t f4f8_38 = f4_2 * (int64_t) f8_19;
    int64_t f4f9_38 = f4 * (int64_t) f9_38;
    int64_t f5f5_38 = f5 * (int64_t) f5_38;
    int64_t f5f6_38 = f5_2 * (int64_t) f6_19;
    int64_t f5f7_76 = f5_2 * (int64_t) f7_38;
    int64_t f5f8_38 = f5_2 * (int64_t) f8_19;
    int64_t f5f9_76 = f5_2 * (int64_t) f9_38;
    int64_t f6f6_19 = f6 * (int64_t) f6_19;
    int64_t f6f7_38 = f6 * (int64_t) f7_38;
    int64_t f6f8_38 = f6_2 * (int64_t) f8_19;
    int64_t f6f9_38 = f6 * (int64_t) f9_38;
    int64_t f7f7_38 = f7 * (int64_t) f7_38;
    int64_t f7f8_38 = f7_2 * (int64_t) f8_19;
    int64_t f7f9_76 = f7_2 * (int64_t) f9_38;
    int64_t f8f8_19 = f8 * (int64_t) f8_19;
    int64_t f8f9_38 = f8 * (int64_t) f9_38;
    int64_t f9f9_38 = f9 * (int64_t) f9_38;

    int64_t h0 = f0f0 + f1f9_76 + f2f8_38 + f3f7_76 + f4f6_38 + f5f5_38;
    int64_t h1 = f0f1_2 + f2f9_38 + f3f8_38 + f4f7_38 + f5f6_38;
    int64_t h2 = f0f2_2 + f1f1_2 + f3f9_76 + f4f8_38 + f5f7_76 + f6f6_19;
    int64_t h3 = f0f3_2 + f1f2_2 + f4f9_38 + f5f8_38 + f6f7_38;
    int64_t h4 = f0f4_2 + f1f3_4 + f2f2 + f5f9_76 + f6f8_38 + f7f7_38;
    int64_t h5 = f0f5_2 + f1f4_2 + f2f3_2 + f6f9_38 + f7f8_38;
    int64_t h6 = f0f6_2 + f1f5_4 + f2f4_2 + f3f3_2 + f7f9_76 + f8f8_19;
    int64_t h7 = f0f7_2 + f1f6_2 + f2f5_2 + f3f4_2 + f8f9_38;
    int64_t h8 = f0f8_2 + f1f7_4 + f2f6_2 + f3f5_4 + f4f4 + f9f9_38;
    int64_t h9 = f0f9_2 + f1f8_2 + f2f7_2 + f3f6_2 + f4f5_2;

    int64_t carry0, carry1, carry2, carry3, carry4, carry5, carry6, carry7,
        carry8, carry9;

    /* Same 12-step carry chain as fe25519_mul. */
    carry0 = (h0 + (int64_t)(1L << 25)) >> 26;
    h1 += carry0;
    h0 -= carry0 * ((uint64_t) 1L << 26);
    carry4 = (h4 + (int64_t)(1L << 25)) >> 26;
    h5 += carry4;
    h4 -= carry4 * ((uint64_t) 1L << 26);

    carry1 = (h1 + (int64_t)(1L << 24)) >> 25;
    h2 += carry1;
    h1 -= carry1 * ((uint64_t) 1L << 25);
    carry5 = (h5 + (int64_t)(1L << 24)) >> 25;
    h6 += carry5;
    h5 -= carry5 * ((uint64_t) 1L << 25);

    carry2 = (h2 + (int64_t)(1L << 25)) >> 26;
    h3 += carry2;
    h2 -= carry2 * ((uint64_t) 1L << 26);
    carry6 = (h6 + (int64_t)(1L << 25)) >> 26;
    h7 += carry6;
    h6 -= carry6 * ((uint64_t) 1L << 26);

    carry3 = (h3 + (int64_t)(1L << 24)) >> 25;
    h4 += carry3;
    h3 -= carry3 * ((uint64_t) 1L << 25);
    carry7 = (h7 + (int64_t)(1L << 24)) >> 25;
    h8 += carry7;
    h7 -= carry7 * ((uint64_t) 1L << 25);

    carry4 = (h4 + (int64_t)(1L << 25)) >> 26;
    h5 += carry4;
    h4 -= carry4 * ((uint64_t) 1L << 26);
    carry8 = (h8 + (int64_t)(1L << 25)) >> 26;
    h9 += carry8;
    h8 -= carry8 * ((uint64_t) 1L << 26);

    carry9 = (h9 + (int64_t)(1L << 24)) >> 25;
    h0 += carry9 * 19; /* 2^255 wraps around to 19 (mod p) */
    h9 -= carry9 * ((uint64_t) 1L << 25);

    carry0 = (h0 + (int64_t)(1L << 25)) >> 26;
    h1 += carry0;
    h0 -= carry0 * ((uint64_t) 1L << 26);

    h[0] = (int32_t) h0;
    h[1] = (int32_t) h1;
    h[2] = (int32_t) h2;
    h[3] = (int32_t) h3;
    h[4] = (int32_t) h4;
    h[5] = (int32_t) h5;
    h[6] = (int32_t) h6;
    h[7] = (int32_t) h7;
    h[8] = (int32_t) h8;
    h[9] = (int32_t) h9;
}
/*
 h = 2 * f * f
 Can overlap h with f.

 Preconditions:
   |f| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc.

 Postconditions:
   |h| bounded by 1.01*2^25,1.01*2^24,1.01*2^25,1.01*2^24,etc.

 Identical to fe25519_sq except each hi is doubled before the carry chain.
 */

/* NOTE(review): identical redeclaration of the project typedef (valid C11). */
typedef int32_t fe25519[10];

static void
fe25519_sq2(fe25519 h, const fe25519 f)
{
    int32_t f0 = f[0];
    int32_t f1 = f[1];
    int32_t f2 = f[2];
    int32_t f3 = f[3];
    int32_t f4 = f[4];
    int32_t f5 = f[5];
    int32_t f6 = f[6];
    int32_t f7 = f[7];
    int32_t f8 = f[8];
    int32_t f9 = f[9];

    int32_t f0_2  = 2 * f0;
    int32_t f1_2  = 2 * f1;
    int32_t f2_2  = 2 * f2;
    int32_t f3_2  = 2 * f3;
    int32_t f4_2  = 2 * f4;
    int32_t f5_2  = 2 * f5;
    int32_t f6_2  = 2 * f6;
    int32_t f7_2  = 2 * f7;
    int32_t f5_38 = 38 * f5; /* 1.959375*2^30 */
    int32_t f6_19 = 19 * f6; /* 1.959375*2^30 */
    int32_t f7_38 = 38 * f7; /* 1.959375*2^30 */
    int32_t f8_19 = 19 * f8; /* 1.959375*2^30 */
    int32_t f9_38 = 38 * f9; /* 1.959375*2^30 */

    int64_t f0f0    = f0 * (int64_t) f0;
    int64_t f0f1_2  = f0_2 * (int64_t) f1;
    int64_t f0f2_2  = f0_2 * (int64_t) f2;
    int64_t f0f3_2  = f0_2 * (int64_t) f3;
    int64_t f0f4_2  = f0_2 * (int64_t) f4;
    int64_t f0f5_2  = f0_2 * (int64_t) f5;
    int64_t f0f6_2  = f0_2 * (int64_t) f6;
    int64_t f0f7_2  = f0_2 * (int64_t) f7;
    int64_t f0f8_2  = f0_2 * (int64_t) f8;
    int64_t f0f9_2  = f0_2 * (int64_t) f9;
    int64_t f1f1_2  = f1_2 * (int64_t) f1;
    int64_t f1f2_2  = f1_2 * (int64_t) f2;
    int64_t f1f3_4  = f1_2 * (int64_t) f3_2;
    int64_t f1f4_2  = f1_2 * (int64_t) f4;
    int64_t f1f5_4  = f1_2 * (int64_t) f5_2;
    int64_t f1f6_2  = f1_2 * (int64_t) f6;
    int64_t f1f7_4  = f1_2 * (int64_t) f7_2;
    int64_t f1f8_2  = f1_2 * (int64_t) f8;
    int64_t f1f9_76 = f1_2 * (int64_t) f9_38;
    int64_t f2f2    = f2 * (int64_t) f2;
    int64_t f2f3_2  = f2_2 * (int64_t) f3;
    int64_t f2f4_2  = f2_2 * (int64_t) f4;
    int64_t f2f5_2  = f2_2 * (int64_t) f5;
    int64_t f2f6_2  = f2_2 * (int64_t) f6;
    int64_t f2f7_2  = f2_2 * (int64_t) f7;
    int64_t f2f8_38 = f2_2 * (int64_t) f8_19;
    int64_t f2f9_38 = f2 * (int64_t) f9_38;
    int64_t f3f3_2  = f3_2 * (int64_t) f3;
    int64_t f3f4_2  = f3_2 * (int64_t) f4;
    int64_t f3f5_4  = f3_2 * (int64_t) f5_2;
    int64_t f3f6_2  = f3_2 * (int64_t) f6;
    int64_t f3f7_76 = f3_2 * (int64_t) f7_38;
    int64_t f3f8_38 = f3_2 * (int64_t) f8_19;
    int64_t f3f9_76 = f3_2 * (int64_t) f9_38;
    int64_t f4f4    = f4 * (int64_t) f4;
    int64_t f4f5_2  = f4_2 * (int64_t) f5;
    int64_t f4f6_38 = f4_2 * (int64_t) f6_19;
    int64_t f4f7_38 = f4 * (int64_t) f7_38;
    int64_t f4f8_38 = f4_2 * (int64_t) f8_19;
    int64_t f4f9_38 = f4 * (int64_t) f9_38;
    int64_t f5f5_38 = f5 * (int64_t) f5_38;
    int64_t f5f6_38 = f5_2 * (int64_t) f6_19;
    int64_t f5f7_76 = f5_2 * (int64_t) f7_38;
    int64_t f5f8_38 = f5_2 * (int64_t) f8_19;
    int64_t f5f9_76 = f5_2 * (int64_t) f9_38;
    int64_t f6f6_19 = f6 * (int64_t) f6_19;
    int64_t f6f7_38 = f6 * (int64_t) f7_38;
    int64_t f6f8_38 = f6_2 * (int64_t) f8_19;
    int64_t f6f9_38 = f6 * (int64_t) f9_38;
    int64_t f7f7_38 = f7 * (int64_t) f7_38;
    int64_t f7f8_38 = f7_2 * (int64_t) f8_19;
    int64_t f7f9_76 = f7_2 * (int64_t) f9_38;
    int64_t f8f8_19 = f8 * (int64_t) f8_19;
    int64_t f8f9_38 = f8 * (int64_t) f9_38;
    int64_t f9f9_38 = f9 * (int64_t) f9_38;

    int64_t h0 = f0f0 + f1f9_76 + f2f8_38 + f3f7_76 + f4f6_38 + f5f5_38;
    int64_t h1 = f0f1_2 + f2f9_38 + f3f8_38 + f4f7_38 + f5f6_38;
    int64_t h2 = f0f2_2 + f1f1_2 + f3f9_76 + f4f8_38 + f5f7_76 + f6f6_19;
    int64_t h3 = f0f3_2 + f1f2_2 + f4f9_38 + f5f8_38 + f6f7_38;
    int64_t h4 = f0f4_2 + f1f3_4 + f2f2 + f5f9_76 + f6f8_38 + f7f7_38;
    int64_t h5 = f0f5_2 + f1f4_2 + f2f3_2 + f6f9_38 + f7f8_38;
    int64_t h6 = f0f6_2 + f1f5_4 + f2f4_2 + f3f3_2 + f7f9_76 + f8f8_19;
    int64_t h7 = f0f7_2 + f1f6_2 + f2f5_2 + f3f4_2 + f8f9_38;
    int64_t h8 = f0f8_2 + f1f7_4 + f2f6_2 + f3f5_4 + f4f4 + f9f9_38;
    int64_t h9 = f0f9_2 + f1f8_2 + f2f7_2 + f3f6_2 + f4f5_2;

    int64_t carry0, carry1, carry2, carry3, carry4, carry5, carry6, carry7,
        carry8, carry9;

    /* The "2 *" of 2*f*f, applied before carrying. */
    h0 += h0;
    h1 += h1;
    h2 += h2;
    h3 += h3;
    h4 += h4;
    h5 += h5;
    h6 += h6;
    h7 += h7;
    h8 += h8;
    h9 += h9;

    /* Same 12-step carry chain as fe25519_mul. */
    carry0 = (h0 + (int64_t)(1L << 25)) >> 26;
    h1 += carry0;
    h0 -= carry0 * ((uint64_t) 1L << 26);
    carry4 = (h4 + (int64_t)(1L << 25)) >> 26;
    h5 += carry4;
    h4 -= carry4 * ((uint64_t) 1L << 26);

    carry1 = (h1 + (int64_t)(1L << 24)) >> 25;
    h2 += carry1;
    h1 -= carry1 * ((uint64_t) 1L << 25);
    carry5 = (h5 + (int64_t)(1L << 24)) >> 25;
    h6 += carry5;
    h5 -= carry5 * ((uint64_t) 1L << 25);

    carry2 = (h2 + (int64_t)(1L << 25)) >> 26;
    h3 += carry2;
    h2 -= carry2 * ((uint64_t) 1L << 26);
    carry6 = (h6 + (int64_t)(1L << 25)) >> 26;
    h7 += carry6;
    h6 -= carry6 * ((uint64_t) 1L << 26);

    carry3 = (h3 + (int64_t)(1L << 24)) >> 25;
    h4 += carry3;
    h3 -= carry3 * ((uint64_t) 1L << 25);
    carry7 = (h7 + (int64_t)(1L << 24)) >> 25;
    h8 += carry7;
    h7 -= carry7 * ((uint64_t) 1L << 25);

    carry4 = (h4 + (int64_t)(1L << 25)) >> 26;
    h5 += carry4;
    h4 -= carry4 * ((uint64_t) 1L << 26);
    carry8 = (h8 + (int64_t)(1L << 25)) >> 26;
    h9 += carry8;
    h8 -= carry8 * ((uint64_t) 1L << 26);

    carry9 = (h9 + (int64_t)(1L << 24)) >> 25;
    h0 += carry9 * 19; /* 2^255 wraps around to 19 (mod p) */
    h9 -= carry9 * ((uint64_t) 1L << 25);

    carry0 = (h0 + (int64_t)(1L << 25)) >> 26;
    h1 += carry0;
    h0 -= carry0 * ((uint64_t) 1L << 26);

    h[0] = (int32_t) h0;
    h[1] = (int32_t) h1;
    h[2] = (int32_t) h2;
    h[3] = (int32_t) h3;
    h[4] = (int32_t) h4;
    h[5] = (int32_t) h5;
    h[6] = (int32_t) h6;
    h[7] = (int32_t) h7;
    h[8] = (int32_t) h8;
    h[9] = (int32_t) h9;
}
/*
 h = f * n (multiplication of a field element by a small scalar n)
 Can overlap h with f.
 */

/* NOTE(review): identical redeclaration of the project typedef (valid C11). */
typedef int32_t fe25519[10];

static void
fe25519_scalar_product(fe25519 h, const fe25519 f, uint32_t n)
{
    int64_t sn = (int64_t) n;
    int32_t f0 = f[0];
    int32_t f1 = f[1];
    int32_t f2 = f[2];
    int32_t f3 = f[3];
    int32_t f4 = f[4];
    int32_t f5 = f[5];
    int32_t f6 = f[6];
    int32_t f7 = f[7];
    int32_t f8 = f[8];
    int32_t f9 = f[9];

    int64_t h0 = f0 * sn;
    int64_t h1 = f1 * sn;
    int64_t h2 = f2 * sn;
    int64_t h3 = f3 * sn;
    int64_t h4 = f4 * sn;
    int64_t h5 = f5 * sn;
    int64_t h6 = f6 * sn;
    int64_t h7 = f7 * sn;
    int64_t h8 = f8 * sn;
    int64_t h9 = f9 * sn;

    int64_t carry0, carry1, carry2, carry3, carry4, carry5, carry6, carry7,
        carry8, carry9;

    /* One rounding-carry pass over the odd limbs (25-bit) ... */
    carry9 = (h9 + ((int64_t) 1 << 24)) >> 25;
    h0 += carry9 * 19; /* 2^255 wraps around to 19 (mod p) */
    h9 -= carry9 * ((int64_t) 1 << 25);
    carry1 = (h1 + ((int64_t) 1 << 24)) >> 25;
    h2 += carry1;
    h1 -= carry1 * ((int64_t) 1 << 25);
    carry3 = (h3 + ((int64_t) 1 << 24)) >> 25;
    h4 += carry3;
    h3 -= carry3 * ((int64_t) 1 << 25);
    carry5 = (h5 + ((int64_t) 1 << 24)) >> 25;
    h6 += carry5;
    h5 -= carry5 * ((int64_t) 1 << 25);
    carry7 = (h7 + ((int64_t) 1 << 24)) >> 25;
    h8 += carry7;
    h7 -= carry7 * ((int64_t) 1 << 25);

    /* ... then over the even limbs (26-bit). */
    carry0 = (h0 + ((int64_t) 1 << 25)) >> 26;
    h1 += carry0;
    h0 -= carry0 * ((int64_t) 1 << 26);
    carry2 = (h2 + ((int64_t) 1 << 25)) >> 26;
    h3 += carry2;
    h2 -= carry2 * ((int64_t) 1 << 26);
    carry4 = (h4 + ((int64_t) 1 << 25)) >> 26;
    h5 += carry4;
    h4 -= carry4 * ((int64_t) 1 << 26);
    carry6 = (h6 + ((int64_t) 1 << 25)) >> 26;
    h7 += carry6;
    h6 -= carry6 * ((int64_t) 1 << 26);
    carry8 = (h8 + ((int64_t) 1 << 25)) >> 26;
    h9 += carry8;
    h8 -= carry8 * ((int64_t) 1 << 26);

    h[0] = (int32_t) h0;
    h[1] = (int32_t) h1;
    h[2] = (int32_t) h2;
    h[3] = (int32_t) h3;
    h[4] = (int32_t) h4;
    h[5] = (int32_t) h5;
    h[6] = (int32_t) h6;
    h[7] = (int32_t) h7;
    h[8] = (int32_t) h8;
    h[9] = (int32_t) h9;
}