3 // Copyright (C) 2000, 2001, Intel Corporation
4 // All rights reserved.
6 // Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
7 // and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
9 // Redistribution and use in source and binary forms, with or without
10 // modification, are permitted provided that the following conditions are
13 // * Redistributions of source code must retain the above copyright
14 // notice, this list of conditions and the following disclaimer.
16 // * Redistributions in binary form must reproduce the above copyright
17 // notice, this list of conditions and the following disclaimer in the
18 // documentation and/or other materials provided with the distribution.
20 // * The name of Intel Corporation may not be used to endorse or promote
21 // products derived from this software without specific prior written
24 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
25 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
26 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
27 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
28 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
29 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
30 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
31 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
32 // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
33 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
34 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 // Intel Corporation is the author of this code, and requests that all
37 // problem reports or change requests be submitted to it directly at
38 // http://developer.intel.com/opensource.
41 //==============================================================
42 // 2/02/00 Initial version
43 // 2/07/00 Modified calculation of acos_corr to correct acosl
44 // 4/04/00 Unwind support added
45 // 8/15/00 Bundle added after call to __libm_error_support to properly
46 // set [the previously overwritten] GR_Parameter_RESULT.
47 // 12/20/00 Set denormal flag properly.
50 //==============================================================
51 // double-extended = acosl (double-extended)
52 // input floating point f8
53 // output floating point f8
56 //==============================================================
58 // predicate registers used:
61 // floating-point registers used:
62 // f8 has input, then output
63 // f8 -> f15, f32 ->f99
65 // general registers used:
68 // Overview of operation
69 //==============================================================
70 // There are three computational paths (|x| >= 1 and NaN inputs branch to ACOS_ERROR_RETURN instead)
71 // 1. |x| < 2^-25 ACOS_TINY
72 // 2. 2^-25 <= |x| < 1/4 ACOS_POLY
73 // 3. 1/4 <= |x| < 1 ACOS_ATAN
75 #include "libm_support.h"
78 //==============================================================
80 // f8 is input, but acos_V must be put in f8
81 // when __libm_atan2_reg is called, f8 must get V
82 // f9 gets U when __libm_atan2_reg is called
85 // __libm_atan2_reg returns
94 // When we call __libm_atan2_reg, we must save
102 // The rest of the assembly macros
153 acos_ABS_NORM_f8 = f72 // |x|: normalized input with its sign cleared (written by fmerge.s in acosl)
171 acos_pi_by_2_hi = f85 // high part of pi/2, first entry loaded from acos_coefficients
172 acos_pi_by_2_lo = f86 // low part of pi/2, second entry loaded from acos_coefficients
173 acos_xmpi_by_2_lo = f87 // x - pi_by_2_lo, intermediate of the ACOS_TINY path
195 acos_GR_17_ones = r33 // set to 0x1ffff; ANDed with the getf.exp result to drop the sign bit
196 acos_GR_16_ones = r34 // set to 0xffff; subtracted from the biased exponent
197 acos_GR_signexp_f8 = r35 // sign and biased exponent of the normalized input (from getf.exp)
199 acos_GR_true_exp = r37 // unbiased exponent; compared against -26, -3 and -1 to pick the path
206 // r40 walks the table of coefficients (acos_coefficients); every ldfe post-increments it by 16
211 GR_Parameter_RESULT = r46 // address of the result slot, set up before calling __libm_error_support
212 GR_Parameter_TAG = r47 // error tag, set to 57 before calling __libm_error_support
216 // A true exponent of -40 (decimal) is
217 // : -40 + register_bias
218 // : -0x28 + 0xffff = 0xffd7
220 // A true exponent of 1 is
221 // : 1 + register_bias
222 // : 0x1 + 0xffff = 0x10000
225 //==============================================================
236 ASM_TYPE_DIRECTIVE(acos_coefficients,@object) // Constant table read front to back by acosl with ldfe, 16 bytes per entry
237 data8 0xc90fdaa22168c234, 0x00003FFF // pi_by_2_hi: 64-bit significand first, then the sign/exponent word
238 data8 0xc4c6628b80dc1cd1, 0x00003FBF // pi_by_2_lo: low-order correction added to pi_by_2_hi
239 data8 0xc90fdaa22168c234, 0x00004000 // pi_hi: same significand as pi_by_2_hi, exponent one higher
240 data8 0xc4c6628b80dc1cd1, 0x00003FC0 // pi_lo: same significand as pi_by_2_lo, exponent one higher
242 data8 0xBB08911F2013961E, 0x00003FF8 // A10: ACOS_POLY coefficient of x^21 (entries are in load order, A10 first)
243 data8 0x981F1095A23A87D3, 0x00003FF8 // A9: ACOS_POLY coefficient of x^19
244 data8 0xBDF09C6C4177BCC6, 0x00003FF8 // A8: ACOS_POLY coefficient of x^17
245 data8 0xE4C3A60B049ACCEA, 0x00003FF8 // A7: ACOS_POLY coefficient of x^15
246 data8 0x8E2789F4E8A8F1AD, 0x00003FF9 // A6: ACOS_POLY coefficient of x^13
247 data8 0xB745D09B2B0E850B, 0x00003FF9 // A5: ACOS_POLY coefficient of x^11
248 data8 0xF8E38E3BC4C50920, 0x00003FF9 // A4: ACOS_POLY coefficient of x^9
249 data8 0xB6DB6DB6D89FCD81, 0x00003FFA // A3: ACOS_POLY coefficient of x^7
250 data8 0x99999999999AF376, 0x00003FFB // A2: ACOS_POLY coefficient of x^5
251 data8 0xAAAAAAAAAAAAAA71, 0x00003FFC // A1: ACOS_POLY coefficient of x^3 (last entry loaded)
252 ASM_SIZE_DIRECTIVE(acos_coefficients) // closes the acos_coefficients object
257 ASM_TYPE_DIRECTIVE(acosl#,@function) // acosl: long double arc cosine, argument and result both in f8
266 // Entry: normalize f8, extract its true exponent, load all constants, then pick one of four cases
268 alloc r32 = ar.pfs,1,11,4,0 // register frame: 1 input, 11 locals, 4 outputs, no rotating registers
269 (p0) fnorm.s1 acos_NORM_f8 = f8 // normalized copy of the input; f8 itself is left untouched here
270 (p0) mov acos_GR_17_ones = 0x1ffff // mask for the exponent bits of the getf.exp result
274 (p0) mov acos_GR_16_ones = 0xffff // exponent bias of the register format
275 (p0) addl r40 = @ltoff(acos_coefficients), gp // linkage-table slot of the table; NOTE(review): the load of the table address itself is not visible in this view
280 // Set denormal flag on denormal input with fcmp
283 fcmp.eq p6,p0 = f8,f0 // issued on the raw input for its flag side effect; p6 is recomputed by the cmp.ge.unc below
289 // Load the constants pi_by_2 and pi.
290 // Each is stored as hi and lo values
291 // Also load the coefficients for ACOS_POLY
294 (p0) ldfe acos_pi_by_2_hi = [r40],16 ;; // every ldfe advances r40 by 16 to the next table entry
295 (p0) ldfe acos_pi_by_2_lo = [r40],16
300 (p0) ldfe acos_pi_hi = [r40],16 ;;
301 (p0) ldfe acos_pi_lo = [r40],16
306 (p0) ldfe acos_A10 = [r40],16 ;;
307 (p0) ldfe acos_A9 = [r40],16
311 // Take the absolute value of f8
314 (p0) getf.exp acos_GR_signexp_f8 = acos_NORM_f8 // sign bit plus biased exponent of the normalized input
315 (p0) fmerge.s acos_ABS_NORM_f8 = f0, acos_NORM_f8 // sign taken from f0 (+0), everything else from the input: |x|
319 (p0) ldfe acos_A8 = [r40],16
321 (p0) and acos_GR_exp = acos_GR_signexp_f8, acos_GR_17_ones ;; // drop the sign bit, keep the biased exponent
324 // case 1: |x| < 2^-25 ==> p6 ACOS_TINY
325 // case 2: 2^-25 <= |x| < 2^-2 ==> p8 ACOS_POLY
326 // case 3: 2^-2 <= |x| < 1 ==> p9 ACOS_ATAN
327 // case 4: 1 <= |x| ==> p11 ACOS_ERROR_RETURN
328 // Admittedly |x| = 1 is not an error but this is where that case is handled
332 (p0) ldfe acos_A7 = [r40],16
333 (p0) sub acos_GR_true_exp = acos_GR_exp, acos_GR_16_ones ;; // remove the bias to get the true exponent
334 (p0) cmp.ge.unc p6, p7 = -26, acos_GR_true_exp ;; // p6: true exponent <= -26 (tiny); p7: everything else
338 (p0) ldfe acos_A6 = [r40],16
339 (p7) cmp.ge.unc p8, p9 = -3, acos_GR_true_exp ;; // among p7: p8 if true exponent <= -3, else p9
340 (p9) cmp.ge.unc p10, p11 = -1, acos_GR_true_exp // among p9: p10 if true exponent <= -1, else p11 (|x| >= 1)
344 (p0) ldfe acos_A5 = [r40],16 ;;
345 (p0) ldfe acos_A4 = [r40],16
350 (p0) ldfe acos_A3 = [r40],16 ;;
351 (p0) ldfe acos_A2 = [r40],16
355 // ACOS_ERROR_RETURN ==> p11 is true
358 (p0) ldfe acos_A1 = [r40],16 // last table entry
360 (p11) br.spnt L(ACOS_ERROR_RETURN) ;; // |x| >= 1 leaves the main flow here
363 // ACOS_TINY ==> p6 is true
364 // case 1: |x| < 2^-25; result is pi_by_2_hi - (x - pi_by_2_lo)
367 (p6) fms.s1 acos_xmpi_by_2_lo = acos_NORM_f8,f1, acos_pi_by_2_lo // x*1 - pi_by_2_lo
373 (p6) fms.s0 f8 = acos_pi_by_2_hi,f1, acos_xmpi_by_2_lo // final result, computed in status field 0
374 (p6) br.ret.spnt b0 ;; // return for the tiny case
379 // ACOS_POLY ==> p8 is true
380 // case 2: 2^-25 <= |x| < 2^-2; result is W - (x*P(x^2) - Ww) with W = pi_by_2_hi - x
383 (p8) fms.s1 acos_W = acos_pi_by_2_hi, f1, acos_NORM_f8 // W = pi_by_2_hi - x (rounded)
389 (p8) fma.s1 acos_X2 = f8,f8, f0 // X2 = x*x, taken from f8 rather than the normalized copy
395 (p8) fms.s1 acos_Ww = acos_pi_by_2_hi, f1, acos_W // Ww = pi_by_2_hi - W
401 (p8) fma.s1 acos_X4 = acos_X2,acos_X2, f0 // X4 = X2*X2
407 (p8) fms.s1 acos_Ww = acos_Ww, f1, acos_NORM_f8 // Ww = (pi_by_2_hi - W) - x, the part lost when W was rounded
413 (p8) fma.s1 acos_P810 = acos_X4, acos_A10, acos_A8 // start of the even-index chain (A10, A8, ... A2)
417 // acos_P79 = X4*A9 + A7
418 // acos_P810 = X4*A10 + A8
421 (p8) fma.s1 acos_P79 = acos_X4, acos_A9, acos_A7 // start of the odd-index chain (A9, A7, ... A1)
427 (p8) fma.s1 acos_Ww = acos_Ww, f1, acos_pi_by_2_lo // fold pi_by_2_lo into the low-order term
433 (p8) fma.s1 acos_P610 = acos_X4, acos_P810, acos_A6
438 // acos_P59 = X4*(X4*A9 + A7) + A5
439 // acos_P610 = X4*(X4*A10 + A8) + A6
442 (p8) fma.s1 acos_P59 = acos_X4, acos_P79, acos_A5
448 (p8) fma.s1 acos_P410 = acos_X4, acos_P610, acos_A4
452 // acos_P39 = X4*(X4*(X4*A9 + A7) + A5) + A3
453 // acos_P410 = X4*(X4*(X4*A10 + A8) + A6) + A4
456 (p8) fma.s1 acos_P39 = acos_X4, acos_P59, acos_A3
462 (p8) fma.s1 acos_P210 = acos_X4, acos_P410, acos_A2
466 // acos_P19 = X4*(X4*(X4*(X4*A9 + A7) + A5) + A3) + A1 = P1
467 // acos_P210 = X4*(X4*(X4*(X4*A10 + A8) + A6) + A4) + A2 = P2
470 (p8) fma.s1 acos_P19 = acos_X4, acos_P39, acos_A1
474 // acos_P1P2 = Xsq*P2 + P1
475 // acos_P1P2 = Xsq*(Xsq*P2 + P1)
478 (p8) fma.s1 acos_P1P2 = acos_X2, acos_P210, acos_P19 // merge the two chains: X2*P2 + P1
484 (p8) fma.s1 acos_P1P2 = acos_X2, acos_P1P2, f0 // multiply by X2 once more
490 (p8) fms.s1 acos_xPmw = acos_NORM_f8, acos_P1P2, acos_Ww // x*P1P2 - Ww
496 (p8) fms.s0 f8 = acos_W, f1, acos_xPmw // final result W - (x*P1P2 - Ww), computed in status field 0
497 (p8) br.ret.spnt b0 ;; // return for the polynomial case
502 // ACOS_ATAN: reached by fall-through once the p11 branch and the p6/p8 returns were not taken
503 // case 3: 2^-2 <= |x| < 1 ==> p9 ACOS_ATAN
505 // Step 1.1: Get A,B and a,b (A = 1 - |x|, B = 1 + |x|; a, b are their rounding errors)
508 // Note also that we will use acos_corr (f13)
512 // Call __libm_atan2_reg
516 (p0) mov acos_GR_fffe = 0xfffe // exponent field one below the 0xffff bias
517 (p0) fma.s1 acos_B = f1,f1, acos_ABS_NORM_f8 // B = 1 + |x|
518 (p0) mov GR_SAVE_B0 = b0 ;; // keep the return address; a br.call follows later
522 (p0) mov GR_SAVE_GP = gp // keep gp across that call
524 (p0) fms.s1 acos_A = f1,f1, acos_ABS_NORM_f8 // A = 1 - |x|
528 (p0) setf.exp acos_HALF = acos_GR_fffe // floating-point constant built from exponent 0xfffe, used as 1/2 below
535 (p0) fms.s1 acos_1mB = f1,f1, acos_B // 1 - B
539 // We want atan2(V,U)
540 // so put V in f8 and U in f9
541 // but save X in acos_X
545 (p0) fmerge.se acos_X = f8, f8 // plain copy of the original input; f8 is overwritten with V below
550 /////////////////////////
552 /////////////////////////
556 (p0) frsqrta.s1 acos_y0,p8 = acos_B // y0 = initial approximation of 1/sqrt(B)
562 (p0) fms.s1 acos_1mA = f1,f1, acos_A // 1 - A
568 (p0) fma.s1 acos_Bb = acos_1mB,f1, acos_ABS_NORM_f8 // b = (1 - B) + |x|
574 (p0) fma.s1 acos_Hh = acos_HALF, acos_B, f0 // Hh = B/2
580 (p0) fma.s1 acos_t1 = acos_y0, acos_y0, f0 // t1 = y0^2
586 (p0) fms.s1 acos_Aa = acos_1mA,f1, acos_ABS_NORM_f8 // a = (1 - A) - |x|
592 (p0) fnma.s1 acos_t2 = acos_t1, acos_Hh, acos_HALF // t2 = 1/2 - (B/2)*y0^2
598 (p0) fma.s1 acos_y1 = acos_t2, acos_y0, acos_y0 // y1 = y0 + t2*y0, first refinement for B
604 /////////////////////////
606 /////////////////////////
609 (p0) frsqrta.s1 acos_y0,p8 = acos_A // y0 is reused: initial approximation of 1/sqrt(A)
615 (p0) fma.s1 acos_t3 = acos_y1, acos_Hh, f0 // t3 = y1*(B/2)
621 (p0) fma.s1 acos_t1 = acos_y0, acos_y0, f0 // t1 = y0^2, now for A
627 (p0) fnma.s1 acos_t4 = acos_t3, acos_y1, acos_HALF // t4 = 1/2 - (B/2)*y1^2
633 (p0) fma.s1 acos_y2 = acos_t4, acos_y1, acos_y1 // y2 = y1 + t4*y1, second refinement for B
639 (p0) fma.s1 acos_S = acos_B, acos_y2, f0 // S = B*y2, approximates sqrt(B)
645 (p0) fma.s1 acos_H = acos_y2, acos_HALF, f0 // H = y2/2
651 (p0) fma.s1 acos_t5 = acos_Hh, acos_y2, f0 // t5 = (B/2)*y2; no later use is visible in this view
657 (p0) fma.s1 acos_Hh = acos_HALF, acos_A, f0 // Hh is reused: A/2
663 (p0) fnma.s1 acos_Dd = acos_S, acos_S, acos_B // Dd = B - S*S
669 (p0) fnma.s1 acos_t2 = acos_t1, acos_Hh, acos_HALF // t2 = 1/2 - (A/2)*y0^2
675 (p0) fma.s1 acos_U = acos_Dd, acos_H, acos_S // U = S + Dd*H, the refined sqrt(B)
681 (p0) fma.s1 acos_y1 = acos_t2, acos_y0, acos_y0 // first refinement for A
687 (p0) fma.s1 acos_2U = acos_U, f1, acos_U // 2U = U + U
693 (p0) fma.s1 acos_t3 = acos_y1, acos_Hh, f0 // t3 = y1*(A/2)
699 // sqrt(A + a) = V + v
700 // sqrt(B + b) = U + u
702 /////////////////////////
704 /////////////////////////
706 // acos_BmUU = B - UU
707 // acos_BmUUpb = (B - UU) + b
711 (p0) fnma.s1 acos_BmUU = acos_U, acos_U, acos_B // B - U*U
717 (p0) fmerge.se f9 = acos_U, acos_U // f9 = U, second argument for __libm_atan2_reg
723 (p0) fnma.s1 acos_t4 = acos_t3, acos_y1, acos_HALF // t4 = 1/2 - (A/2)*y1^2
727 // acos_1d2U = frcpa(2U)
730 (p0) frcpa.s1 acos_1d2U,p9 = f1, acos_2U // approximate 1/(2U); the p9 output is not consulted afterwards in this view
736 (p0) fma.s1 acos_BmUUpb = acos_BmUU, f1, acos_Bb // (B - U*U) + b
742 (p0) fma.s1 acos_y2 = acos_t4, acos_y1, acos_y1 // second refinement for A
748 // acos_Uu = ((B - UU) + b) * frcpa(2U)
749 (p0) fma.s1 acos_Uu = acos_BmUUpb, acos_1d2U, f0 // u, the low-order part of sqrt(B + b)
755 (p0) fma.s1 acos_S = acos_A, acos_y2, f0 // S = A*y2, approximates sqrt(A)
761 (p0) fma.s1 acos_H = acos_y2, acos_HALF, f0 // H = y2/2
767 (p0) fma.s1 acos_t5 = acos_Hh, acos_y2, f0 // t5 = (A/2)*y2; no later use is visible in this view
773 (p0) fnma.s1 acos_Dd = acos_S, acos_S, acos_A // Dd = A - S*S
779 (p0) fma.s1 acos_V = acos_Dd, acos_H, acos_S // V = S + Dd*H, the refined sqrt(A)
785 (p0) fma.s1 acos_2V = acos_V, f1, acos_V // 2V = V + V
790 /////////////////////////
791 // Calculate the correction, acos_corr
792 /////////////////////////
793 // acos_corr = U*v - (V*u)
797 (p0) fma.s1 acos_Vu = acos_V,acos_Uu, f0 // V*u
801 /////////////////////////
803 /////////////////////////
804 // acos_AmVV = A - VV
805 // acos_AmVVpa = (A - VV) + a
809 (p0) fnma.s1 acos_AmVV = acos_V, acos_V, acos_A // A - V*V
815 (p0) fmerge.se f8 = acos_V, acos_V // f8 = V, first argument for __libm_atan2_reg
821 (p0) fma.s1 acos_AmVVpa = acos_AmVV, f1, acos_Aa // (A - V*V) + a
825 // acos_1d2V = frcpa(2V)
828 (p0) frcpa.s1 acos_1d2V,p9 = f1, acos_2V // approximate 1/(2V)
832 // acos_Vv = ((A - VV) + a) * frcpa(2V)
835 (p0) fma.s1 acos_Vv = acos_AmVVpa, acos_1d2V, f0 // v, the low-order part of sqrt(A + a)
841 (p0) fma.s1 acos_Uv = acos_U,acos_Vv, f0 // U*v; combined with V*u into acos_corr after this block
847 ASM_SIZE_DIRECTIVE(acosl#) // closes acosl; the instructions after it carry on with the same registers
856 .save ar.pfs,GR_SAVE_PFS // unwind annotation for the save on the next line
857 mov GR_SAVE_PFS=ar.pfs // keep ar.pfs across the call below
871 (p0) fms.s1 acos_corr = acos_Uv,f1, acos_Vu // corr = U*v - V*u
872 (p0) br.call.sptk.many b0=__libm_atan2_reg# ;; // called with V in f8 and U in f9; NOTE(review): acos_Z_hi, acos_Z_lo and acos_s_lo used below are presumably its outputs - confirm
876 // p6 ==> X is negative
877 // p7 ==> x is positive
878 // We know that |X| >= 1/4
881 (p0) mov gp = GR_SAVE_GP // restore gp saved before the call
882 (p0) fcmp.lt.unc p6,p7 = acos_X , f0 // sign test on the saved original input
883 (p0) mov b0 = GR_SAVE_B0 ;; // restore the caller's return address
886 // acos_2_Z_hi = 2 * acos_Z_hi
887 // acos_s_lo_Z_lo = s_lo * Z_lo
891 (p0) fma.s1 acos_2_Z_hi = acos_Z_hi, f1, acos_Z_hi // Z_hi + Z_hi
892 (p0) mov ar.pfs = GR_SAVE_PFS // restore ar.pfs
897 (p0) fma.s1 acos_s_lo_Z_lo = acos_s_lo, acos_Z_lo, f0 // s_lo * Z_lo
901 // 2 is a constant needed later
904 (p0) fma.s1 acos_2 = f1,f1,f1 // 1*1 + 1 = 2
909 // acos_result_lo = 2(s_lo * Z_lo) + corr
910 // f8 = (2*Z_hi) + (2(s_lo * Z_lo) + corr)
914 (p7) fma.s1 acos_result_lo = acos_s_lo_Z_lo, acos_2, acos_corr // x > 0: low part, corr is added
920 (p7) fma.s0 f8 = acos_2_Z_hi, f1, acos_result_lo // x > 0: final result, computed in status field 0
924 // acos_result_lo = (pi_lo - corr)
925 // acos_result_lo = (pi_lo - corr) + acos_Ww
928 (p6) fms.s1 acos_result_lo = acos_pi_lo, f1, acos_corr // x < 0: pi_lo - corr
933 // acos_W = pi_hi - 2 * Z_hi
936 (p6) fnma.s1 acos_W = acos_2, acos_Z_hi, acos_pi_hi // x < 0: high part of the result
940 // acos_Ww = pi_hi - W
941 // acos_Ww = (pi_hi - W) - (2 * Z_hi)
944 (p6) fms.s1 acos_Ww = acos_pi_hi, f1, acos_W // pi_hi - W
950 (p6) fms.s1 acos_Ww = acos_Ww, f1, acos_2_Z_hi // subtract 2*Z_hi: what was lost when W was rounded
956 (p6) fma.s1 acos_result_lo = acos_result_lo, f1, acos_Ww // (pi_lo - corr) + Ww
960 // acos_Z_lo = ((pi_lo - corr) + acos_Ww) - 2 * (s_lo * Z_lo)
963 (p6) fnma.s1 acos_Z_lo = acos_s_lo_Z_lo, acos_2, acos_result_lo // acos_Z_lo is reused as the combined low part
969 (p6) fma.s0 f8 = acos_W, f1, acos_Z_lo // x < 0: final result, computed in status field 0
970 (p0) br.ret.sptk b0 ;; // common return for both signs
973 ASM_SIZE_DIRECTIVE(__libm_callout) // closes __libm_callout
980 (p0) fma.s0 f8 = f8,f1,f0 // f8 = f8*1 + 0 in status field 0; NOTE(review): no label for or branch to this bundle is visible in this view
981 (p0) br.ret.sptk b0 ;; // return f8
984 L(ACOS_ERROR_RETURN):
985 // Save ar.pfs, b0, and gp; restore on exit
987 // qnan snan inf norm unorm 0 -+
988 // 1 1 0 0 0 0 11 = 0xc3
990 // Reached when |X| >= 1 (p11 in acosl); X = +- 1 is handled first
991 // What should we return?
993 // If X is -1, return pi (pi_hi + pi_lo); if X is +1, return +0
998 (p0) fcmp.eq.unc p6,p7 = acos_ABS_NORM_f8,f1 // p6: |x| == 1
1004 (p6) fcmp.lt.unc p8,p9 = f8,f0 // within p6: p8 if x < 0 (x = -1), p9 otherwise (x = +1)
1010 (p8) fma.s0 f8 = acos_pi_hi, f1, acos_pi_lo // x = -1: result is pi_hi + pi_lo
1016 (p9) fmerge.s f8 = f8,f0 // x = +1: sign of f8 combined with the magnitude of f0, i.e. +0
1017 (p6) br.ret.spnt b0 ;; // |x| == 1 is done
1020 // If X is a NAN, leave
1023 (p0) fclass.m.unc p12,p0 = f8, 0xc3 // p12: f8 is a NaN (class mask 0xc3 as decoded above)
1029 (p12) fma.s0 f8 = f8,f1,f0 // pass the NaN through an arithmetic operation in status field 0
1030 (p12) br.ret.spnt b0 ;; // return the NaN
1034 (p0) mov GR_Parameter_TAG = 57 // error tag used for acosl; remaining inputs have |x| > 1
1035 (p0) frcpa f10, p6 = f0, f0 // reciprocal approximation of 0/0; f10 is stored as parameter 3 in __libm_error_region
1040 ASM_SIZE_DIRECTIVE(SPECIAL) // closes SPECIAL; NOTE(review): execution presumably continues into __libm_error_region - confirm
1042 .proc __libm_error_region
1043 __libm_error_region: // Builds a 64-byte stack frame, calls __libm_error_support, returns the value it leaves in the result slot
1047 add GR_Parameter_Y=-32,sp // Address of the parameter 2 slot: 32 bytes below the incoming sp
1049 .save ar.pfs,GR_SAVE_PFS
1050 mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
1054 add sp=-64,sp // Create new stack frame of 64 bytes
1056 mov GR_SAVE_GP=gp // Save gp
1062 stfe [GR_Parameter_Y] = f1,16 // Store f1 as Parameter 2 on stack, then advance the pointer by 16
1063 add GR_Parameter_X = 16,sp // Parameter 1 address
1064 .save b0, GR_SAVE_B0
1065 mov GR_SAVE_B0=b0 // Save b0
1071 stfe [GR_Parameter_X] = f8 // Store Parameter 1 (the input) on stack
1072 add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address (16 bytes above parameter 2)
1073 nop.b 0 // no branch in this slot
1076 stfe [GR_Parameter_Y] = f10 // Store Parameter 3 on stack
1077 add GR_Parameter_Y = -16,GR_Parameter_Y // Point back at the parameter 2 slot
1078 br.call.sptk b0=__libm_error_support# // Call error handling function
1083 add GR_Parameter_RESULT = 48,sp // Recompute the result address; the register was overwritten by the call
1088 ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack
1090 add sp = 64,sp // Restore stack pointer
1091 mov b0 = GR_SAVE_B0 // Restore return address
1095 mov gp = GR_SAVE_GP // Restore gp
1096 mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
1097 br.ret.sptk b0 // Return
1100 .endp __libm_error_region
1101 ASM_SIZE_DIRECTIVE(__libm_error_region)
1103 .type __libm_error_support#,@function // called from __libm_error_region; not defined in the visible part of this file
1104 .global __libm_error_support#
1106 .type __libm_atan2_reg#,@function // called from __libm_callout; not defined in the visible part of this file
1107 .global __libm_atan2_reg#