3 // Copyright (C) 2000, 2001, Intel Corporation
4 // All rights reserved.
6 // Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
7 // and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
9 // Redistribution and use in source and binary forms, with or without
10 // modification, are permitted provided that the following conditions are
13 // * Redistributions of source code must retain the above copyright
14 // notice, this list of conditions and the following disclaimer.
16 // * Redistributions in binary form must reproduce the above copyright
17 // notice, this list of conditions and the following disclaimer in the
18 // documentation and/or other materials provided with the distribution.
20 // * The name of Intel Corporation may not be used to endorse or promote
21 // products derived from this software without specific prior written
24 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
25 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
26 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
27 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
28 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
29 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
30 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
31 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
32 // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
33 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
34 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 // Intel Corporation is the author of this code, and requests that all
37 // problem reports or change requests be submitted to it directly at
38 // http://developer.intel.com/opensource.
41 //==============================================================
42 // 2/02/00 Initial version
43 // 4/04/00 Unwind support added
44 // 8/15/00 Bundle added after call to __libm_error_support to properly
45 // set [the previously overwritten] GR_Parameter_RESULT.
47 // *********************************************************************
49 // Function: log1p(x) = ln(x+1), for double precision x values
51 // *********************************************************************
53 // Accuracy: Very accurate for double precision values
55 // *********************************************************************
59 // Floating-Point Registers: f8 (Input and Return Value)
62 // General Purpose Registers:
64 // r54-r57 (Used to pass arguments to error handling routine)
66 // Predicate Registers: p6-p15
68 // *********************************************************************
70 // IEEE Special Conditions:
72 // Denormal fault raised on denormal inputs
73 // Overflow exceptions cannot occur
74 // Underflow exceptions raised when appropriate for log1p
75 // (Error Handling Routine called for underflow)
76 // Inexact raised when appropriate by algorithm
84 // log1p(EM_special Values) = QNaN
86 // *********************************************************************
88 // Computation is based on the following kernel.
90 // ker_log_64( in_FR : X,
93 // in_GR : Expo_Range,
101 // The method consists of three cases.
103 // If |X+Em1| < 2^(-80) use case log1p_small;
104 // elseif |X+Em1| < 2^(-7) use case log_near1;
105 // else use case log_regular;
109 // log( 1 + (X+Em1) ) can be approximated by (X+Em1).
113 // log( 1 + (X+Em1) ) can be approximated by a simple polynomial
114 // in W = X+Em1. This polynomial resembles the truncated Taylor
115 // series W - W^/2 + W^3/3 - ...
119 // Here we use a table lookup method. The basic idea is that in
120 // order to compute log(Arg) for an argument Arg in [1,2), we
121 // construct a value G such that G*Arg is close to 1 and that
122 // log(1/G) is obtainable easily from a table of values calculated
125 // log(Arg) = log(1/G) + log(G*Arg)
126 // = log(1/G) + log(1 + (G*Arg - 1))
128 // Because |G*Arg - 1| is small, the second term on the right hand
129 // side can be approximated by a short polynomial. We elaborate
130 // this method in four steps.
132 // Step 0: Initialization
134 // We need to calculate log( E + X ). Obtain N, S_hi, S_lo such that
136 // E + X = 2^N * ( S_hi + S_lo ) exactly
138 // where S_hi in [1,2) and S_lo is a correction to S_hi in the sense
139 // that |S_lo| <= ulp(S_hi).
141 // Step 1: Argument Reduction
143 // Based on S_hi, obtain G_1, G_2, G_3 from a table and calculate
145 // G := G_1 * G_2 * G_3
146 // r := (G * S_hi - 1) + G * S_lo
148 // These G_j's have the property that the product is exactly
149 // representable and that |r| < 2^(-12) as a result.
151 // Step 2: Approximation
154 // log(1 + r) is approximated by a short polynomial poly(r).
156 // Step 3: Reconstruction
159 // Finally, log( E + X ) is given by
161 // log( E + X ) = log( 2^N * (S_hi + S_lo) )
162 // ~=~ N*log(2) + log(1/G) + log(1 + r)
163 // ~=~ N*log(2) + log(1/G) + poly(r).
165 // **** Algorithm ****
169 // Although log(1 + (X+Em1)) is basically X+Em1, we would like to
170 // preserve the inexactness nature as well as consistent behavior
171 // under different rounding modes. Note that this case can only be
172 // taken if E is set to be 1.0. In this case, Em1 is zero, and that
173 // X can be very tiny and thus the final result can possibly underflow.
174 // Thus, we compare X against a threshold that is dependent on the
175 // input Expo_Range. If |X| is smaller than this threshold, we set
178 // The result is returned as Y_hi, Y_lo, and in the case of SAFE
179 // is FALSE, an additional value Scale is also returned.
182 // Threshold := Threshold_Table( Expo_Range )
183 // Tiny := Tiny_Table( Expo_Range )
185 // If ( |W| > Threshold ) then
196 // One may think that Y_lo should be -W*W/2; however, it does not matter
197 // as Y_lo will be rounded off completely except for the correct effect in
198 // directed rounding. Clearly -W*W is simplier to compute. Moreover,
199 // because of the difference in exponent value, Y_hi + Y_lo or
200 // Y_hi + Scale*Y_lo is always inexact.
204 // Here we compute a simple polynomial. To exploit parallelism, we split
205 // the polynomial into two portions.
211 // Y_hi := W + Wsq*(P_1 + W*(P_2 + W*(P_3 + W*P_4))
212 // Y_lo := W6*(P_5 + W*(P_6 + W*(P_7 + W*P_8)))
213 // set lsb(Y_lo) to be 1
217 // We present the algorithm in four steps.
219 // Step 0. Initialization
220 // ----------------------
223 // N := unbaised exponent of Z
224 // S_hi := 2^(-N) * Z
225 // S_lo := 2^(-N) * { (max(X,E)-Z) + min(X,E) }
227 // Note that S_lo is always 0 for the case E = 0.
229 // Step 1. Argument Reduction
230 // --------------------------
234 // Z = 2^N * S_hi = 2^N * 1.d_1 d_2 d_3 ... d_63
236 // We obtain G_1, G_2, G_3 by the following steps.
239 // Define X_0 := 1.d_1 d_2 ... d_14. This is extracted
242 // Define A_1 := 1.d_1 d_2 d_3 d_4. This is X_0 truncated
245 // Define index_1 := [ d_1 d_2 d_3 d_4 ].
247 // Fetch Z_1 := (1/A_1) rounded UP in fixed point with
248 // fixed point lsb = 2^(-15).
249 // Z_1 looks like z_0.z_1 z_2 ... z_15
250 // Note that the fetching is done using index_1.
251 // A_1 is actually not needed in the implementation
252 // and is used here only to explain how is the value
255 // Fetch G_1 := (1/A_1) truncated to 21 sig. bits.
256 // floating pt. Again, fetching is done using index_1. A_1
257 // explains how G_1 is defined.
259 // Calculate X_1 := X_0 * Z_1 truncated to lsb = 2^(-14)
260 // = 1.0 0 0 0 d_5 ... d_14
261 // This is accomplised by integer multiplication.
262 // It is proved that X_1 indeed always begin
263 // with 1.0000 in fixed point.
266 // Define A_2 := 1.0 0 0 0 d_5 d_6 d_7 d_8. This is X_1
267 // truncated to lsb = 2^(-8). Similar to A_1,
268 // A_2 is not needed in actual implementation. It
269 // helps explain how some of the values are defined.
271 // Define index_2 := [ d_5 d_6 d_7 d_8 ].
273 // Fetch Z_2 := (1/A_2) rounded UP in fixed point with
274 // fixed point lsb = 2^(-15). Fetch done using index_2.
275 // Z_2 looks like z_0.z_1 z_2 ... z_15
277 // Fetch G_2 := (1/A_2) truncated to 21 sig. bits.
280 // Calculate X_2 := X_1 * Z_2 truncated to lsb = 2^(-14)
281 // = 1.0 0 0 0 0 0 0 0 d_9 d_10 ... d_14
282 // This is accomplised by integer multiplication.
283 // It is proved that X_2 indeed always begin
284 // with 1.00000000 in fixed point.
287 // Define A_3 := 1.0 0 0 0 0 0 0 0 d_9 d_10 d_11 d_12 d_13 1.
288 // This is 2^(-14) + X_2 truncated to lsb = 2^(-13).
290 // Define index_3 := [ d_9 d_10 d_11 d_12 d_13 ].
292 // Fetch G_3 := (1/A_3) truncated to 21 sig. bits.
293 // floating pt. Fetch is done using index_3.
295 // Compute G := G_1 * G_2 * G_3.
297 // This is done exactly since each of G_j only has 21 sig. bits.
301 // r := (G*S_hi - 1) + G*S_lo using 2 FMA operations.
303 // thus, r approximates G*(S_hi+S_lo) - 1 to within a couple of
307 // Step 2. Approximation
308 // ---------------------
310 // This step computes an approximation to log( 1 + r ) where r is the
311 // reduced argument just obtained. It is proved that |r| <= 1.9*2^(-13);
312 // thus log(1+r) can be approximated by a short polynomial:
314 // log(1+r) ~=~ poly = r + Q1 r^2 + ... + Q4 r^5
317 // Step 3. Reconstruction
318 // ----------------------
320 // This step computes the desired result of log(X+E):
322 // log(X+E) = log( 2^N * (S_hi + S_lo) )
323 // = N*log(2) + log( S_hi + S_lo )
324 // = N*log(2) + log(1/G) +
325 // log(1 + C*(S_hi+S_lo) - 1 )
327 // log(2), log(1/G_j) are stored as pairs of (single,double) numbers:
328 // log2_hi, log2_lo, log1byGj_hi, log1byGj_lo. The high parts are
329 // single-precision numbers and the low parts are double precision
330 // numbers. These have the property that
332 // N*log2_hi + SUM ( log1byGj_hi )
334 // is computable exactly in double-extended precision (64 sig. bits).
337 // Y_hi := N*log2_hi + SUM ( log1byGj_hi )
338 // Y_lo := poly_hi + [ poly_lo +
339 // ( SUM ( log1byGj_lo ) + N*log2_lo ) ]
340 // set lsb(Y_lo) to be 1
343 #include "libm_support.h"
351 // P_7, P_6, P_5, P_4, P_3, P_2, and P_1
355 ASM_TYPE_DIRECTIVE(Constants_P,@object)
356 data4 0xEFD62B15,0xE3936754,0x00003FFB,0x00000000
357 data4 0xA5E56381,0x8003B271,0x0000BFFC,0x00000000
358 data4 0x73282DB0,0x9249248C,0x00003FFC,0x00000000
359 data4 0x47305052,0xAAAAAA9F,0x0000BFFC,0x00000000
360 data4 0xCCD17FC9,0xCCCCCCCC,0x00003FFC,0x00000000
361 data4 0x00067ED5,0x80000000,0x0000BFFD,0x00000000
362 data4 0xAAAAAAAA,0xAAAAAAAA,0x00003FFD,0x00000000
363 data4 0xFFFFFFFE,0xFFFFFFFF,0x0000BFFD,0x00000000
364 ASM_SIZE_DIRECTIVE(Constants_P)
366 // log2_hi, log2_lo, Q_4, Q_3, Q_2, and Q_1
370 ASM_TYPE_DIRECTIVE(Constants_Q,@object)
371 data4 0x00000000,0xB1721800,0x00003FFE,0x00000000
372 data4 0x4361C4C6,0x82E30865,0x0000BFE2,0x00000000
373 data4 0x328833CB,0xCCCCCAF2,0x00003FFC,0x00000000
374 data4 0xA9D4BAFB,0x80000077,0x0000BFFD,0x00000000
375 data4 0xAAABE3D2,0xAAAAAAAA,0x00003FFD,0x00000000
376 data4 0xFFFFDAB7,0xFFFFFFFF,0x0000BFFD,0x00000000
377 ASM_SIZE_DIRECTIVE(Constants_Q)
379 // Z1 - 16 bit fixed, G1 and H1 - IEEE single
383 ASM_TYPE_DIRECTIVE(Constants_Z_G_H_h1,@object)
384 data4 0x00008000,0x3F800000,0x00000000,0x00000000,0x00000000,0x00000000
385 data4 0x00007879,0x3F70F0F0,0x3D785196,0x00000000,0x617D741C,0x3DA163A6
386 data4 0x000071C8,0x3F638E38,0x3DF13843,0x00000000,0xCBD3D5BB,0x3E2C55E6
387 data4 0x00006BCB,0x3F579430,0x3E2FF9A0,0x00000000,0xD86EA5E7,0xBE3EB0BF
388 data4 0x00006667,0x3F4CCCC8,0x3E647FD6,0x00000000,0x86B12760,0x3E2E6A8C
389 data4 0x00006187,0x3F430C30,0x3E8B3AE7,0x00000000,0x5C0739BA,0x3E47574C
390 data4 0x00005D18,0x3F3A2E88,0x3EA30C68,0x00000000,0x13E8AF2F,0x3E20E30F
391 data4 0x0000590C,0x3F321640,0x3EB9CEC8,0x00000000,0xF2C630BD,0xBE42885B
392 data4 0x00005556,0x3F2AAAA8,0x3ECF9927,0x00000000,0x97E577C6,0x3E497F34
393 data4 0x000051EC,0x3F23D708,0x3EE47FC5,0x00000000,0xA6B0A5AB,0x3E3E6A6E
394 data4 0x00004EC5,0x3F1D89D8,0x3EF8947D,0x00000000,0xD328D9BE,0xBDF43E3C
395 data4 0x00004BDB,0x3F17B420,0x3F05F3A1,0x00000000,0x0ADB090A,0x3E4094C3
396 data4 0x00004925,0x3F124920,0x3F0F4303,0x00000000,0xFC1FE510,0xBE28FBB2
397 data4 0x0000469F,0x3F0D3DC8,0x3F183EBF,0x00000000,0x10FDE3FA,0x3E3A7895
398 data4 0x00004445,0x3F088888,0x3F20EC80,0x00000000,0x7CC8C98F,0x3E508CE5
399 data4 0x00004211,0x3F042108,0x3F29516A,0x00000000,0xA223106C,0xBE534874
400 ASM_SIZE_DIRECTIVE(Constants_Z_G_H_h1)
402 // Z2 - 16 bit fixed, G2 and H2 - IEEE single
406 ASM_TYPE_DIRECTIVE(Constants_Z_G_H_h2,@object)
407 data4 0x00008000,0x3F800000,0x00000000,0x00000000,0x00000000,0x00000000
408 data4 0x00007F81,0x3F7F00F8,0x3B7F875D,0x00000000,0x22C42273,0x3DB5A116
409 data4 0x00007F02,0x3F7E03F8,0x3BFF015B,0x00000000,0x21F86ED3,0x3DE620CF
410 data4 0x00007E85,0x3F7D08E0,0x3C3EE393,0x00000000,0x484F34ED,0xBDAFA07E
411 data4 0x00007E08,0x3F7C0FC0,0x3C7E0586,0x00000000,0x3860BCF6,0xBDFE07F0
412 data4 0x00007D8D,0x3F7B1880,0x3C9E75D2,0x00000000,0xA78093D6,0x3DEA370F
413 data4 0x00007D12,0x3F7A2328,0x3CBDC97A,0x00000000,0x72A753D0,0x3DFF5791
414 data4 0x00007C98,0x3F792FB0,0x3CDCFE47,0x00000000,0xA7EF896B,0x3DFEBE6C
415 data4 0x00007C20,0x3F783E08,0x3CFC15D0,0x00000000,0x409ECB43,0x3E0CF156
416 data4 0x00007BA8,0x3F774E38,0x3D0D874D,0x00000000,0xFFEF71DF,0xBE0B6F97
417 data4 0x00007B31,0x3F766038,0x3D1CF49B,0x00000000,0x5D59EEE8,0xBE080483
418 data4 0x00007ABB,0x3F757400,0x3D2C531D,0x00000000,0xA9192A74,0x3E1F91E9
419 data4 0x00007A45,0x3F748988,0x3D3BA322,0x00000000,0xBF72A8CD,0xBE139A06
420 data4 0x000079D1,0x3F73A0D0,0x3D4AE46F,0x00000000,0xF8FBA6CF,0x3E1D9202
421 data4 0x0000795D,0x3F72B9D0,0x3D5A1756,0x00000000,0xBA796223,0xBE1DCCC4
422 data4 0x000078EB,0x3F71D488,0x3D693B9D,0x00000000,0xB6B7C239,0xBE049391
423 ASM_SIZE_DIRECTIVE(Constants_Z_G_H_h2)
425 // G3 and H3 - IEEE single and h3 -IEEE double
429 ASM_TYPE_DIRECTIVE(Constants_Z_G_H_h3,@object)
430 data4 0x3F7FFC00,0x38800100,0x562224CD,0x3D355595
431 data4 0x3F7FF400,0x39400480,0x06136FF6,0x3D8200A2
432 data4 0x3F7FEC00,0x39A00640,0xE8DE9AF0,0x3DA4D68D
433 data4 0x3F7FE400,0x39E00C41,0xB10238DC,0xBD8B4291
434 data4 0x3F7FDC00,0x3A100A21,0x3B1952CA,0xBD89CCB8
435 data4 0x3F7FD400,0x3A300F22,0x1DC46826,0xBDB10707
436 data4 0x3F7FCC08,0x3A4FF51C,0xF43307DB,0x3DB6FCB9
437 data4 0x3F7FC408,0x3A6FFC1D,0x62DC7872,0xBD9B7C47
438 data4 0x3F7FBC10,0x3A87F20B,0x3F89154A,0xBDC3725E
439 data4 0x3F7FB410,0x3A97F68B,0x62B9D392,0xBD93519D
440 data4 0x3F7FAC18,0x3AA7EB86,0x0F21BD9D,0x3DC18441
441 data4 0x3F7FA420,0x3AB7E101,0x2245E0A6,0xBDA64B95
442 data4 0x3F7F9C20,0x3AC7E701,0xAABB34B8,0x3DB4B0EC
443 data4 0x3F7F9428,0x3AD7DD7B,0x6DC40A7E,0x3D992337
444 data4 0x3F7F8C30,0x3AE7D474,0x4F2083D3,0x3DC6E17B
445 data4 0x3F7F8438,0x3AF7CBED,0x811D4394,0x3DAE314B
446 data4 0x3F7F7C40,0x3B03E1F3,0xB08F2DB1,0xBDD46F21
447 data4 0x3F7F7448,0x3B0BDE2F,0x6D34522B,0xBDDC30A4
448 data4 0x3F7F6C50,0x3B13DAAA,0xB1F473DB,0x3DCB0070
449 data4 0x3F7F6458,0x3B1BD766,0x6AD282FD,0xBDD65DDC
450 data4 0x3F7F5C68,0x3B23CC5C,0xF153761A,0xBDCDAB83
451 data4 0x3F7F5470,0x3B2BC997,0x341D0F8F,0xBDDADA40
452 data4 0x3F7F4C78,0x3B33C711,0xEBC394E8,0x3DCD1BD7
453 data4 0x3F7F4488,0x3B3BBCC6,0x52E3E695,0xBDC3532B
454 data4 0x3F7F3C90,0x3B43BAC0,0xE846B3DE,0xBDA3961E
455 data4 0x3F7F34A0,0x3B4BB0F4,0x785778D4,0xBDDADF06
456 data4 0x3F7F2CA8,0x3B53AF6D,0xE55CE212,0x3DCC3ED1
457 data4 0x3F7F24B8,0x3B5BA620,0x9E382C15,0xBDBA3103
458 data4 0x3F7F1CC8,0x3B639D12,0x5C5AF197,0x3D635A0B
459 data4 0x3F7F14D8,0x3B6B9444,0x71D34EFC,0xBDDCCB19
460 data4 0x3F7F0CE0,0x3B7393BC,0x52CD7ADA,0x3DC74502
461 data4 0x3F7F04F0,0x3B7B8B6D,0x7D7F2A42,0xBDB68F17
462 ASM_SIZE_DIRECTIVE(Constants_Z_G_H_h3)
465 // Exponent Thresholds and Tiny Thresholds
466 // for 8, 11, 15, and 17 bit exponents
470 // 0 (8 bits) 2^(-126)
471 // 1 (11 bits) 2^(-1022)
472 // 2 (15 bits) 2^(-16382)
473 // 3 (17 bits) 2^(-16382)
479 // 0 (8 bits) 2^(-16382)
480 // 1 (11 bits) 2^(-16382)
481 // 2 (15 bits) 2^(-16382)
482 // 3 (17 bits) 2^(-16382)
487 ASM_TYPE_DIRECTIVE(Constants_Threshold,@object)
488 data4 0x00000000,0x80000000,0x00003F81,0x00000000
489 data4 0x00000000,0x80000000,0x00000001,0x00000000
490 data4 0x00000000,0x80000000,0x00003C01,0x00000000
491 data4 0x00000000,0x80000000,0x00000001,0x00000000
492 data4 0x00000000,0x80000000,0x00000001,0x00000000
493 data4 0x00000000,0x80000000,0x00000001,0x00000000
494 data4 0x00000000,0x80000000,0x00000001,0x00000000
495 data4 0x00000000,0x80000000,0x00000001,0x00000000
496 ASM_SIZE_DIRECTIVE(Constants_Threshold)
500 ASM_TYPE_DIRECTIVE(Constants_1_by_LN10,@object)
501 data4 0x37287195,0xDE5BD8A9,0x00003FFD,0x00000000
502 data4 0xACCF70C8,0xD56EAABE,0x00003FBD,0x00000000
503 ASM_SIZE_DIRECTIVE(Constants_1_by_LN10)
578 FR_Output_X_tmp = f99
606 GR_Parameter_RESULT = r56
608 GR_Parameter_TAG = r57
622 alloc r32 = ar.pfs,0,22,4,0
623 (p0) fsub.s1 FR_Neg_One = f0,f1
624 (p0) cmp.eq.unc p7, p0 = r0, r0
628 (p0) cmp.ne.unc p14, p0 = r0, r0
629 (p0) fnorm.s1 FR_X_Prime = FR_Input_X
630 (p0) cmp.eq.unc p15, p0 = r0, r0 ;;
635 (p0) fclass.m.unc p6, p0 = FR_Input_X, 0x1E3
642 (p0) fclass.nm.unc p10, p0 = FR_Input_X, 0x1FF
649 (p0) fcmp.eq.unc.s1 p9, p0 = FR_Input_X, f0
655 (p0) fadd FR_Em1 = f0,f0
661 (p0) fadd FR_E = f0,f1
667 (p0) fcmp.eq.unc.s1 p8, p0 = FR_Input_X, FR_Neg_One
673 (p0) fcmp.lt.unc.s1 p13, p0 = FR_Input_X, FR_Neg_One
682 (p0) fadd.s1 FR_Z = FR_X_Prime, FR_E
688 (p0) movl GR_Table_Scale = 0x0000000000000018 ;;
694 // Create E = 1 and Em1 = 0
695 // Check for X == 0, meaning log(1+0)
696 // Check for X < -1, meaning log(negative)
697 // Check for X == -1, meaning log(0)
699 // Identify NatVals, NaNs, Infs.
700 // Identify EM unsupporteds.
701 // Identify Negative values - us S1 so as
702 // not to raise denormal operand exception
703 // Set p15 to true for log1p
704 // Set p14 to false for log1p
705 // Set p7 true for log and log1p
707 (p0) addl GR_Table_Base = @ltoff(Constants_Z_G_H_h1#),gp
713 (p0) fmax.s1 FR_AA = FR_X_Prime, FR_E
718 ld8 GR_Table_Base = [GR_Table_Base]
719 (p0) fmin.s1 FR_BB = FR_X_Prime, FR_E
725 (p0) fadd.s1 FR_W = FR_X_Prime, FR_Em1
727 // Begin load of constants base
728 // FR_Z = Z = |x| + E
729 // FR_W = W = |x| + Em1
733 (p6) br.cond.spnt L(LOG_64_special) ;;
739 (p10) br.cond.spnt L(LOG_64_unsupported) ;;
745 (p13) br.cond.spnt L(LOG_64_negative) ;;
749 (p0) getf.sig GR_signif = FR_Z
751 (p9) br.cond.spnt L(LOG_64_one) ;;
757 (p8) br.cond.spnt L(LOG_64_zero) ;;
761 (p0) getf.exp GR_N = FR_Z
763 // Raise possible denormal operand exception
766 // This function computes ln( x + e )
767 // Input FR 1: FR_X = FR_Input_X
768 // Input FR 2: FR_E = FR_E
769 // Input FR 3: FR_Em1 = FR_Em1
770 // Input GR 1: GR_Expo_Range = GR_Expo_Range = 1
771 // Output FR 4: FR_Y_hi
772 // Output FR 5: FR_Y_lo
773 // Output FR 6: FR_Scale
774 // Output PR 7: PR_Safe
776 (p0) fsub.s1 FR_S_lo = FR_AA, FR_Z
778 // signif = getf.sig(Z)
781 (p0) extr.u GR_Table_ptr = GR_signif, 59, 4 ;;
786 (p0) fmerge.se FR_S_hi = f1,FR_Z
787 (p0) extr.u GR_X_0 = GR_signif, 49, 15
792 (p0) addl GR_Table_Base1 = @ltoff(Constants_Z_G_H_h2#),gp
798 ld8 GR_Table_Base1 = [GR_Table_Base1]
799 (p0) movl GR_Bias = 0x000000000000FFFF ;;
804 (p0) fabs FR_abs_W = FR_W
805 (p0) pmpyshr2.u GR_Table_ptr = GR_Table_ptr,GR_Table_Scale,0
811 // Branch out for special input values
813 (p0) fcmp.lt.unc.s0 p8, p0 = FR_Input_X, f0
820 // X_0 = extr.u(signif,49,15)
821 // Index1 = extr.u(signif,59,4)
823 (p0) fadd.s1 FR_S_lo = FR_S_lo, FR_BB
831 // Offset_to_Z1 = 24 * Index1
832 // For performance, don't use result
833 // for 3 or 4 cycles.
835 (p0) add GR_Table_ptr = GR_Table_ptr, GR_Table_Base ;;
838 // Add Base to Offset for Z1
842 (p0) ld4 GR_Z_1 = [GR_Table_ptr],4 ;;
843 (p0) ldfs FR_G = [GR_Table_ptr],4
848 (p0) ldfs FR_H = [GR_Table_ptr],8 ;;
849 (p0) ldfd FR_h = [GR_Table_ptr],0
850 (p0) pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15
854 // Get Base of Table2
858 (p0) getf.exp GR_M = FR_abs_W
867 // M = getf.exp(abs_W)
869 // X_1 = pmpyshr2(X_0,Z_1,15)
871 (p0) sub GR_M = GR_M, GR_Bias ;;
880 (p0) cmp.gt.unc p11, p0 = -80, GR_M
881 (p0) cmp.gt.unc p12, p0 = -7, GR_M ;;
882 (p0) extr.u GR_Index2 = GR_X_1, 6, 4 ;;
888 // if -80 > M, set p11
889 // Index2 = extr.u(X_1,6,4)
890 // if -7 > M, set p12
893 (p0) pmpyshr2.u GR_Index2 = GR_Index2,GR_Table_Scale,0
894 (p11) br.cond.spnt L(log1p_small) ;;
900 (p12) br.cond.spnt L(log1p_near) ;;
904 (p0) sub GR_N = GR_N, GR_Bias
906 // poly_lo = r * poly_lo
908 (p0) add GR_Perturb = 0x1, r0 ;;
909 (p0) sub GR_ScaleN = GR_Bias, GR_N
913 (p0) setf.sig FR_float_N = GR_N
916 // Prepare Index2 - pmpyshr2.u(X_1,Z_2,15)
919 // Branch for -80 > M
921 (p0) add GR_Index2 = GR_Index2, GR_Table_Base1
925 (p0) setf.exp FR_two_negN = GR_ScaleN
927 (p0) addl GR_Table_Base = @ltoff(Constants_Z_G_H_h3#),gp
931 // Index2 points to Z2
936 (p0) ld4 GR_Z_2 = [GR_Index2],4
937 ld8 GR_Table_Base = [GR_Table_Base]
944 // Tablebase points to Table3
948 (p0) ldfs FR_G_tmp = [GR_Index2],4 ;;
951 // pmpyshr2 X_2= (X_1,Z_2,15)
952 // float_N = setf.sig(N)
955 (p0) ldfs FR_H_tmp = [GR_Index2],8
960 // two_negN = setf.exp(scaleN)
965 (p0) ldfd FR_h_tmp = [GR_Index2],0
967 (p0) pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 ;;
972 (p0) extr.u GR_Index3 = GR_X_2, 1, 5 ;;
977 // Index3 = extr.u(X_2,1,5)
979 (p0) shladd GR_Index3 = GR_Index3,4,GR_Table_Base
986 // float_N = fcvt.xf(float_N)
989 (p0) addl GR_Table_Base = @ltoff(Constants_Q#),gp ;;
993 ld8 GR_Table_Base = [GR_Table_Base]
999 (p0) ldfe FR_log2_hi = [GR_Table_Base],16
1000 (p0) fmpy.s1 FR_S_lo = FR_S_lo, FR_two_negN
1012 (p0) ldfe FR_log2_lo = [GR_Table_Base],16
1013 (p0) fmpy.s1 FR_G = FR_G, FR_G_tmp ;;
1017 (p0) ldfs FR_G_tmp = [GR_Index3],4
1023 (p0) ldfe FR_Q4 = [GR_Table_Base],16
1024 (p0) fadd.s1 FR_h = FR_h, FR_h_tmp ;;
1028 (p0) ldfe FR_Q3 = [GR_Table_Base],16
1029 (p0) fadd.s1 FR_H = FR_H, FR_H_tmp
1034 (p0) ldfs FR_H_tmp = [GR_Index3],4
1035 (p0) ldfe FR_Q2 = [GR_Table_Base],16
1037 // Comput Index for Table3
1038 // S_lo = S_lo * two_negN
1040 (p0) fcvt.xf FR_float_N = FR_float_N ;;
1043 // If S_lo == 0, set p8 false
1045 // Load ptr to table of polynomial coeff.
1049 (p0) ldfd FR_h_tmp = [GR_Index3],0
1050 (p0) ldfe FR_Q1 = [GR_Table_Base],0
1051 (p0) fcmp.eq.unc.s1 p0, p8 = FR_S_lo, f0 ;;
1056 (p0) fmpy.s1 FR_G = FR_G, FR_G_tmp
1062 (p0) fadd.s1 FR_H = FR_H, FR_H_tmp
1068 (p0) fms.s1 FR_r = FR_G, FR_S_hi, f1
1074 (p0) fadd.s1 FR_h = FR_h, FR_h_tmp
1080 (p0) fma.s1 FR_Y_hi = FR_float_N, FR_log2_hi, FR_H
1092 (p8) fma.s1 FR_r = FR_G, FR_S_lo, FR_r
1099 // poly_lo = r * Q4 + Q3
1102 (p0) fma.s1 FR_h = FR_float_N, FR_log2_lo, FR_h
1109 // If (S_lo!=0) r = s_lo * G + r
1111 (p0) fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3
1115 // Create a 0x00000....01
1116 // poly_lo = poly_lo * rsq + h
1120 (p0) setf.sig FR_dummy = GR_Perturb
1121 (p0) fmpy.s1 FR_rsq = FR_r, FR_r
1128 // h = N * log2_lo + h
1129 // Y_hi = n * log2_hi + H
1131 (p0) fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2
1137 (p0) fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r
1144 // poly_lo = r * poly_o + Q2
1145 // poly_hi = Q1 * rsq + r
1147 (p0) fmpy.s1 FR_poly_lo = FR_poly_lo, FR_r
1153 (p0) fma.s1 FR_poly_lo = FR_poly_lo, FR_rsq, FR_h
1159 (p0) fadd.s1 FR_Y_lo = FR_poly_hi, FR_poly_lo
1161 // Create the FR for a binary "or"
1162 // Y_lo = poly_hi + poly_lo
1164 // (p0) for FR_dummy = FR_Y_lo,FR_dummy ;;
1166 // Turn the lsb of Y_lo ON
1168 // (p0) fmerge.se FR_Y_lo = FR_Y_lo,FR_dummy ;;
1170 // Merge the new lsb into Y_lo, for alone doesn't
1172 (p0) br.cond.sptk L(LOG_main) ;;
1181 // /*******************************************************/
1182 // /*********** Branch log1p_near ************************/
1183 // /*******************************************************/
1184 (p0) addl GR_Table_Base = @ltoff(Constants_P#),gp ;;
1187 // Load base address of poly. coeff.
1191 ld8 GR_Table_Base = [GR_Table_Base]
1196 (p0) add GR_Table_ptr = 0x40,GR_Table_Base
1198 // Address tables with separate pointers
1200 (p0) ldfe FR_P8 = [GR_Table_Base],16
1205 (p0) ldfe FR_P4 = [GR_Table_ptr],16
1210 (p0) ldfe FR_P7 = [GR_Table_Base],16
1215 (p0) ldfe FR_P3 = [GR_Table_ptr],16
1220 (p0) ldfe FR_P6 = [GR_Table_Base],16
1221 (p0) fmpy.s1 FR_wsq = FR_W, FR_W ;;
1225 (p0) ldfe FR_P2 = [GR_Table_ptr],16
1232 (p0) fma.s1 FR_Y_hi = FR_W, FR_P4, FR_P3
1239 // Y_hi = p4 * w + p3
1243 (p0) ldfe FR_P5 = [GR_Table_Base],16
1244 (p0) fma.s1 FR_Y_lo = FR_W, FR_P8, FR_P7
1249 (p0) ldfe FR_P1 = [GR_Table_ptr],16
1253 // Y_lo = p8 * w + P7
1255 (p0) fmpy.s1 FR_w4 = FR_wsq, FR_wsq
1261 (p0) fma.s1 FR_Y_hi = FR_W, FR_Y_hi, FR_P2
1267 (p0) fma.s1 FR_Y_lo = FR_W, FR_Y_lo, FR_P6
1268 (p0) add GR_Perturb = 0x1, r0 ;;
1275 // Y_hi = y_hi * w + p2
1276 // Y_lo = y_lo * w + p6
1277 // Create perturbation bit
1279 (p0) fmpy.s1 FR_w6 = FR_w4, FR_wsq
1285 (p0) fma.s1 FR_Y_hi = FR_W, FR_Y_hi, FR_P1
1289 // Y_hi = y_hi * w + p1
1294 (p0) setf.sig FR_Q4 = GR_Perturb
1295 (p0) fma.s1 FR_Y_lo = FR_W, FR_Y_lo, FR_P5
1301 (p0) fma.s1 FR_Y_hi = FR_wsq,FR_Y_hi, FR_W
1308 // Y_hi = y_hi * wsq + w
1309 // Y_lo = y_lo * w + p5
1311 (p0) fmpy.s1 FR_Y_lo = FR_w6, FR_Y_lo
1315 // (p0) for FR_dummy = FR_Y_lo,FR_dummy ;;
1317 // Set lsb on: Taken out to improve performance
1319 // (p0) fmerge.se FR_Y_lo = FR_Y_lo,FR_dummy ;;
1321 // Make sure it's on in Y_lo also. Taken out to improve
1324 (p0) br.cond.sptk L(LOG_main) ;;
1333 // /*******************************************************/
1334 // /*********** Branch log1p_small ***********************/
1335 // /*******************************************************/
1336 (p0) addl GR_Table_Base = @ltoff(Constants_Threshold#),gp
1341 (p0) mov FR_Em1 = FR_W
1342 (p0) cmp.eq.unc p7, p0 = r0, r0 ;;
1346 ld8 GR_Table_Base = [GR_Table_Base]
1347 (p0) movl GR_Expo_Range = 0x0000000000000002 ;;
1351 // Set Expo_Range = 0 for single
1352 // Set Expo_Range = 2 for double
1353 // Set Expo_Range = 4 for double-extended
1357 (p0) shladd GR_Table_Base = GR_Expo_Range,4,GR_Table_Base ;;
1358 (p0) ldfe FR_Threshold = [GR_Table_Base],16
1364 (p0) movl GR_Bias = 0x000000000000FF9B ;;
1368 (p0) ldfe FR_Tiny = [GR_Table_Base],0
1375 (p0) fcmp.gt.unc.s1 p13, p12 = FR_abs_W, FR_Threshold
1381 (p13) fnmpy.s1 FR_Y_lo = FR_W, FR_W
1387 (p13) fadd FR_SCALE = f0, f1
1393 (p12) fsub.s1 FR_Y_lo = f0, FR_Tiny
1394 (p12) cmp.ne.unc p7, p0 = r0, r0
1398 (p12) setf.exp FR_SCALE = GR_Bias
1404 // Set p7 to SAFE = FALSE
1405 // Set Scale = 2^-100
1409 (p0) fma.d.s0 FR_Input_X = FR_Y_lo,FR_SCALE,FR_Y_hi
1418 (p0) fmpy.d.s0 FR_Input_X = FR_Input_X, f0
1424 // Raise divide by zero for +/-0 input.
1429 (p0) mov GR_Parameter_TAG = 140
1431 // If we have log1p(0), return -Inf.
1433 (p0) fsub.s0 FR_Output_X_tmp = f0, f1
1438 (p0) frcpa.s0 FR_Output_X_tmp, p8 = FR_Output_X_tmp, f0
1439 (p0) br.cond.sptk L(LOG_ERROR_Support) ;;
1447 // Return -Inf or value from handler.
1449 (p0) fclass.m.unc p7, p0 = FR_Input_X, 0x1E1
1455 // Check for Natval, QNan, SNaN, +Inf
1457 (p7) fmpy.d.s0 f8 = FR_Input_X, f1
1459 // For SNaN raise invalid and return QNaN.
1460 // For QNaN raise invalid and return QNaN.
1461 // For +Inf return +Inf.
1468 // For -Inf raise invalid and return QNaN.
1472 (p0) mov GR_Parameter_TAG = 141
1473 (p0) fmpy.d.s0 FR_Output_X_tmp = FR_Input_X, f0
1474 (p0) br.cond.sptk L(LOG_ERROR_Support) ;;
1478 // Report that log1p(-Inf) computed
1481 L(LOG_64_unsupported):
1484 // Return generated NaN or other value .
1489 (p0) fmpy.d.s0 FR_Input_X = FR_Input_X, f0
1490 (p0) br.ret.sptk b0 ;;
1498 // Deal with x < 0 in a special way
1500 (p0) frcpa.s0 FR_Output_X_tmp, p8 = f0, f0
1502 // Deal with x < 0 in a special way - raise
1503 // invalid and produce QNaN indefinite.
1505 (p0) mov GR_Parameter_TAG = 141
1509 ASM_SIZE_DIRECTIVE(log1p)
1511 .proc __libm_error_region
1512 __libm_error_region:
1513 L(LOG_ERROR_Support):
1518 add GR_Parameter_Y=-32,sp // Parameter 2 value
1520 .save ar.pfs,GR_SAVE_PFS
1521 mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
1525 add sp=-64,sp // Create new stack
1527 mov GR_SAVE_GP=gp // Save gp
1533 stfd [GR_Parameter_Y] = f0,16 // STORE Parameter 2 on stack
1534 add GR_Parameter_X = 16,sp // Parameter 1 address
1535 .save b0, GR_SAVE_B0
1536 mov GR_SAVE_B0=b0 // Save b0
1542 stfd [GR_Parameter_X] =FR_Input_X // STORE Parameter 1 on stack
1543 add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
1547 stfd [GR_Parameter_Y] = FR_Output_X_tmp // STORE Parameter 3 on stack
1548 add GR_Parameter_Y = -16,GR_Parameter_Y
1549 br.call.sptk b0=__libm_error_support# // Call error handling function
1554 add GR_Parameter_RESULT = 48,sp
1559 ldfd FR_Input_X = [GR_Parameter_RESULT] // Get return result off stack
1561 add sp = 64,sp // Restore stack pointer
1562 mov b0 = GR_SAVE_B0 // Restore return address
1565 mov gp = GR_SAVE_GP // Restore gp
1566 mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
1570 .endp __libm_error_region
1571 ASM_SIZE_DIRECTIVE(__libm_error_region)
1573 .proc __libm_LOG_main
1578 // kernel_log_64 computes ln(X + E)
1583 (p7) fadd.d.s0 FR_Input_X = FR_Y_lo,FR_Y_hi
1590 (p14) addl GR_Table_Base = @ltoff(Constants_1_by_LN10#),gp ;;
1595 (p14) ld8 GR_Table_Base = [GR_Table_Base]
1600 (p14) ldfe FR_1LN10_hi = [GR_Table_Base],16 ;;
1601 (p14) ldfe FR_1LN10_lo = [GR_Table_Base]
1607 (p14) fmpy.s1 FR_Output_X_tmp = FR_Y_lo,FR_1LN10_hi
1613 (p14) fma.s1 FR_Output_X_tmp = FR_Y_hi,FR_1LN10_lo,FR_Output_X_tmp
1619 (p14) fma.d.s0 FR_Input_X = FR_Y_hi,FR_1LN10_hi,FR_Output_X_tmp
1620 (p0) br.ret.sptk b0 ;;
1622 .endp __libm_LOG_main
1623 ASM_SIZE_DIRECTIVE(__libm_LOG_main)
1626 .type __libm_error_support#,@function
1627 .global __libm_error_support#