include/linux/reciprocal_div.h

   1 /* SPDX-License-Identifier: GPL-2.0 */
   2 #ifndef _LINUX_RECIPROCAL_DIV_H
   3 #define _LINUX_RECIPROCAL_DIV_H
   4
   5 #include <linux/types.h>
   6
   7 /*
   8  * This algorithm is based on the paper "Division by Invariant
   9  * Integers Using Multiplication" by Torbjörn Granlund and Peter
  10  * L. Montgomery.
  11  *
  12  * The assembler implementation from Agner Fog, which this code is
  13  * based on, can be found here:
  14  * http://www.agner.org/optimize/asmlib.zip
  15  *
  16  * This optimization for A/B is helpful if the divisor B is mostly
  17  * runtime invariant. The reciprocal of B is calculated in the
  18  * slow-path with reciprocal_value(). The fast-path can then just use
  19  * a much faster multiplication operation with a variable dividend A
  20  * to calculate the division A/B.
  21  */
  22
  23 struct reciprocal_value {
  24         u32 m;
  25         u8 sh1, sh2;
  26 };
  27
  28 /* "reciprocal_value" and "reciprocal_divide" together implement the basic
  29  * version of the algorithm described in Figure 4.1 of the paper.
  30  */
  31 struct reciprocal_value reciprocal_value(u32 d);
  32
  33 static inline u32 reciprocal_divide(u32 a, struct reciprocal_value R)
  34 {
  35         u32 t = (u32)(((u64)a * R.m) >> 32);
  36         return (t + ((a - t) >> R.sh1)) >> R.sh2;
  37 }
  38
  39 struct reciprocal_value_adv {
  40         u32 m;
  41         u8 sh, exp;
  42         bool is_wide_m;
  43 };
  44
  45 /* "reciprocal_value_adv" implements the advanced version of the algorithm
  46  * described in Figure 4.2 of the paper except when "divisor > (1U << 31)" whose
  47  * ceil(log2(d)) result will be 32 which then requires u128 divide on host. The
  48  * exception case could be easily handled before calling "reciprocal_value_adv".
  49  *
  50  * The advanced version requires more complex calculation to get the reciprocal
  51  * multiplier and other control variables, but then could reduce the required
  52  * emulation operations.
  53  *
  54  * It makes no sense to use this advanced version for host divide emulation,
  55  * those extra complexities for calculating multiplier etc could completely
  56  * waive our saving on emulation operations.
  57  *
  58  * However, it makes sense to use it for JIT divide code generation for which
  59  * we are willing to trade performance of JITed code with that of host. As shown
  60  * by the following pseudo code, the required emulation operations could go down
  61  * from 6 (the basic version) to 3 or 4.
  62  *
  63  * To use the result of "reciprocal_value_adv", suppose we want to calculate
  64  * n/d, the pseudo C code will be:
  65  *
  66  *   struct reciprocal_value_adv rvalue;
  67  *   u8 pre_shift, exp;
  68  *
  69  *   // handle exception case.
  70  *   if (d >= (1U << 31)) {
  71  *     result = n >= d;
  72  *     return;
  73  *   }
  74  *
  75  *   rvalue = reciprocal_value_adv(d, 32)
  76  *   exp = rvalue.exp;
  77  *   if (rvalue.is_wide_m && !(d & 1)) {
  78  *     // floor(log2(d & (2^32 -d)))
  79  *     pre_shift = fls(d & -d) - 1;
  80  *     rvalue = reciprocal_value_adv(d >> pre_shift, 32 - pre_shift);
  81  *   } else {
  82  *     pre_shift = 0;
  83  *   }
  84  *
  85  *   // code generation starts.
  86  *   if (imm == 1U << exp) {
  87  *     result = n >> exp;
  88  *   } else if (rvalue.is_wide_m) {
  89  *     // pre_shift must be zero when reached here.
  90  *     t = (n * rvalue.m) >> 32;
  91  *     result = n - t;
  92  *     result >>= 1;
  93  *     result += t;
  94  *     result >>= rvalue.sh - 1;
  95  *   } else {
  96  *     if (pre_shift)
  97  *       result = n >> pre_shift;
  98  *     result = ((u64)result * rvalue.m) >> 32;
  99  *     result >>= rvalue.sh;
 100  *   }
 101  */
 102 struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec);
 103
 104 #endif /* _LINUX_RECIPROCAL_DIV_H */