Sat Oct 14 02:52:36 1995 Ulrich Drepper <drepper@ipd.info.uni-karlsruhe.de>

author Roland McGrath <roland@gnu.org>

Mon, 16 Oct 1995 01:37:51 +0000 (01:37 +0000)

committer Roland McGrath <roland@gnu.org>

Mon, 16 Oct 1995 01:37:51 +0000 (01:37 +0000)
author Roland McGrath <roland@gnu.org>
Mon, 16 Oct 1995 01:37:51 +0000 (01:37 +0000)
committer Roland McGrath <roland@gnu.org>
Mon, 16 Oct 1995 01:37:51 +0000 (01:37 +0000)
diff --git a/.cvsignore b/.cvsignore

index cf479d2b3b843683b1abc383f48e552822ee4412..ff3f67f64644a1cd7f03973110fe541b0e49a0d1 100644 (file)
--- a/.cvsignore
+++ b/.cvsignore
@@ -5,7 +5,7 @@ glibc-*
  
  configparms 
  
-sun4 i386 i386-gnuelf hp300-netbsd hp300 i486-linux
+sun[43]* i[345]86* hp300*
  
  ieeetest hppa-sysdeps regex
  
diff --git a/ChangeLog b/ChangeLog

index 618dd3e438f0980d884140d4db07dc8c53c88c0f..d8a781c0f5ca5b36649db4a904c427178a60d8e6 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,43 @@
+Sat Oct 14 02:52:36 1995  Ulrich Drepper  <drepper@ipd.info.uni-karlsruhe.de>
+
+       * malloc/malloc.c (_malloc_internal): Performance fix.  Move
+       if statement out of loop.
+
+       * stdio/_itoa.c, stdio/_itoa.h: Complete rewrite.  Much faster
+       implementation using GMP functions.  Contributed by
+       Torbjorn Granlund and Ulrich Drepper.
+
+       * stdio/test_rdwr.c: Include <errno.h>.
+
+       * sysdeps/i386/i586/Implies: New file.
+
+       New highly optimized string functions for i[345]86.
+       * sysdeps/i386/memchr.S, sysdeps/i386/memcmp.S: New files.
+        * sysdeps/i386/stpcpy.S, sysdeps/i386/stpncpy.S: New files.
+        * sysdeps/i386/strchr.S, sysdeps/i386/strcspn.S: New files.
+        * sysdeps/i386/strpbrk.S, sysdeps/i386/strrchr.S: New files.
+        * sysdeps/i386/strspn.S, sysdeps/i386/i486/strcat.S: New files.
+        * sysdeps/i386/i486/strlen.S, sysdeps/i386/i586/strchr.S: New files.
+        * sysdeps/i386/i586/strlen.S: New file.
+       * sysdeps/i386/memchr.c: Removed.  There is now an assembler version.
+
+       * sysdeps/i386/i586/memcopy.h (WORD_COPY_BWD): Parameters did
+       not correspond to used values.
+
+       * sysdeps/unix/sysv/linux/nfs/nfs.h: New file.  Simply a wrapper
+        around a kernel header file.
+       * sysdeps/unix/sysv/linux/Dist: Add it.
+       * sysdeps/unix/sysv/linux/Makefile [$(subdir)=sunrpc] (headers):
+       Likewise.
+
+       * sysdeps/unix/sysv/linux/local_lim.h: Rewrite.  Instead of
+        defining the limits ourselves we use a kernel header file.
+
+       * sysdeps/unix/sysv/linux/i386/sysdep.h (DO_CALL): Optimize system
+        call handler for i586.
+
+       * sysdeps/unix/sysv/linux/sys/param.h: Add copyright and clean up.
+
  Wed Oct 11 00:00:00 1995  Roland McGrath  <roland@churchy.gnu.ai.mit.edu>
  
         * sysdeps/i386/dl-machine.h (elf_machine_rel): Use +=, not =, to
diff --git a/configure.in b/configure.in

index 05191befa52c9dd9da1c544086ce9d16304e6b89..e7d4ecbb39c22e20f10709a37b4c7ac1a3f64a4c 100644 (file)
--- a/configure.in
+++ b/configure.in
@@ -82,22 +82,18 @@ changequote(,)dnl
  # Expand the configuration machine name into a subdirectory by architecture
  # type and particular chip.
  case "$machine" in
-i[345]86)
-  machine=i386/$machine ;;
-sparc[6789])
-  machine=sparc/$machine ;;
-m68k)
-  machine=m68k/m68020 ;;
-m680?0)
-  machine=m68k/$machine ;;
-m88k)
-  machine=m88k/m88100 ;;
-m88???)
-  machine=m88k/$machine ;;
-mips64*)
-  machine=mips/mips64/$machine ;;
-mips*)
-  machine=mips/$machine ;;
+a29k | am29000)        machine=a29k ;;
+alpha*)                machine=alpha/$machine ;;
+hppa*)         machine=hppa/$machine ;;
+i[345]86)      machine=i386/$machine ;;
+m680?0)                machine=m68k/$machine ;;
+m68k)          machine=m68k/m68020 ;;
+m88???)                machine=m88k/$machine ;;
+m88k)          machine=m88k/m88100 ;;
+mips*)         machine=mips/$machine ;;
+mips64*)       machine=mips/mips64/$machine ;;
+sparc[6789])   machine=sparc/$machine ;;
+supersparc)    machine=sparc/sparc8 ;;
  esac
  
  # Make sco3.2v4 become sco3.2.4 and sunos4.1.1_U1 become sunos4.1.1.U1.
diff --git a/hurd/Makefile b/hurd/Makefile

index 582f37b3fb29feefece7ee2d4bd0f4090fdeebf0..53b73487f86abe516a83b0decfe2994346ecb6dc 100644 (file)
--- a/hurd/Makefile
+++ b/hurd/Makefile
@@ -26,7 +26,7 @@ include ../Makeconfig
  
  headers = hurd.h $(interface-headers) \
           $(addprefix hurd/,fd.h id.h port.h signal.h userlink.h \
-                           resource.h threadvar.h)
+                           resource.h threadvar.h lookup.h)
  
  distribute := hurdstartup.h hurdfault.h intr-rpc.defs STATUS
  
@@ -44,7 +44,7 @@ routines = hurdstartup hurdinit \
            setauth \
            pid2task task2pid \
            getuids setuids getumask fchroot \
-          hurdsock hurdauth invoke-trans \
+          hurdsock hurdauth \
            privports \
            msgportdemux \
            fopenport \
diff --git a/hurd/hurd.h b/hurd/hurd.h

index acad15b8c4407fa10fbaf0beffec58c22611c4d8..17b5c45d89ecf5a1747236527619401ecd6e0bff 100644 (file)
--- a/hurd/hurd.h
+++ b/hurd/hurd.h
@@ -77,11 +77,16 @@ extern struct hurd_port *_hurd_ports;
  extern unsigned int _hurd_nports;
  extern volatile mode_t _hurd_umask;
  
-/* Shorthand macro for referencing _hurd_ports (see <hurd/port.h>).  */
+/* Shorthand macro for internal library code referencing _hurd_ports (see
+   <hurd/port.h>).  */
  
  #define        __USEPORT(which, expr) \
    HURD_PORT_USE (&_hurd_ports[INIT_PORT_##which], (expr))
  
+/* Function version of __USEPORT: calls OPERATE with a send right.  */
+
+extern error_t _hurd_ports_use (int which, error_t (*operate) (mach_port_t));
+
  
  /* Base address and size of the initial stack set up by the exec server.
     If using cthreads, this stack is deallocated in startup.
@@ -150,52 +155,6 @@ extern int setcttyid (mach_port_t);
  extern int __setauth (auth_t), setauth (auth_t);
  
  
-/* Split FILE into a directory and a name within the directory.  Look up a
-   port for the directory and store it in *DIR; store in *NAME a pointer
-   into FILE where the name within directory begins.  The directory lookup
-   uses CRDIR for the root directory and CWDIR for the current directory.
-   Returns zero on success or an error code.  */
-
-extern error_t __hurd_file_name_split (file_t crdir, file_t cwdir,
-                                      const char *file,
-                                      file_t *dir, char **name);
-extern error_t hurd_file_name_split (file_t crdir, file_t cwdir,
-                                    const char *file,
-                                    file_t *dir, char **name);
-
-/* Open a port to FILE with the given FLAGS and MODE (see <fcntl.h>).
-   The file lookup uses CRDIR for the root directory and CWDIR for the
-   current directory.  If successful, returns zero and store the port
-   to FILE in *PORT; otherwise returns an error code. */
-
-extern error_t __hurd_file_name_lookup (file_t crdir, file_t cwdir,
-                                       const char *file,
-                                       int flags, mode_t mode,
-                                       file_t *port);
-extern error_t hurd_file_name_lookup (file_t crdir, file_t cwdir,
-                                     const char *filename,
-                                     int flags, mode_t mode,
-                                     file_t *port);
-
-/* Process the values returned by `dir_lookup' et al, and loop doing
-   `dir_lookup' calls until one returns FS_RETRY_NONE.  CRDIR is the
-   root directory used for things like symlinks to absolute file names; the
-   other arguments should be those just passed to and/or returned from
-   `dir_lookup', `fsys_getroot', or `file_invoke_translator'.  This
-   function consumes the reference in *RESULT even if it returns an error.  */
-
-extern error_t __hurd_file_name_lookup_retry (file_t crdir,
-                                             enum retry_type doretry,
-                                             char retryname[1024],
-                                             int flags, mode_t mode,
-                                             file_t *result);
-extern error_t hurd_file_name_lookup_retry (file_t crdir,
-                                           enum retry_type doretry,
-                                           char retryname[1024],
-                                           int flags, mode_t mode,
-                                           file_t *result);
-
-
  /* Split FILE into a directory and a name within the directory.  The
     directory lookup uses the current root and working directory.  If
     successful, stores in *NAME a pointer into FILE where the name
@@ -213,15 +172,15 @@ extern file_t file_name_split (const char *file, char **name);
  extern file_t __file_name_lookup (const char *file, int flags, mode_t mode);
  extern file_t file_name_lookup (const char *file, int flags, mode_t mode);
  
-/* Invoke any translator set on the node FILE represents, and return in
-   *TRANSLATED a port to the translated node.  FLAGS are as for
-   `dir_lookup' et al, but the returned port will not necessarily have
-   any more access rights than FILE does.  */
+/* Open a port to FILE with the given FLAGS and MODE (see <fcntl.h>).  The
+   file lookup uses the current root directory, but uses STARTDIR as the
+   "working directory" for file relative names.  Returns a port to the file
+   if successful; otherwise sets `errno' and returns MACH_PORT_NULL.  */
  
-extern error_t __hurd_invoke_translator (file_t file, int flags,
-                                        file_t *translated);
-extern error_t hurd_invoke_translator (file_t file, int flags,
-                                      file_t *translated);
+extern file_t __file_name_lookup_under (file_t startdir, const char *file,
+                                       int flags, mode_t mode);
+extern file_t file_name_lookup_under (file_t startdir, const char *file,
+                                     int flags, mode_t mode);
  
  
  /* Open a file descriptor on a port.  FLAGS are as for `open'; flags
diff --git a/hurd/hurdinit.c b/hurd/hurdinit.c

index af892112b424278fc0386bfb4977cb3147d4a114..409d2d19a0ec9319d7cda5638ce6f8773bfe0df0 100644 (file)
--- a/hurd/hurdinit.c
+++ b/hurd/hurdinit.c
@@ -31,6 +31,12 @@ struct hurd_port *_hurd_ports;
  unsigned int _hurd_nports;
  mode_t _hurd_umask;
  
+error_t
+_hurd_ports_use (int which, error_t (*operate) (mach_port_t))
+{
+  return HURD_PORT_USE (&_hurd_ports[which], (*operate) (port));
+}
+
  void _hurd_proc_init (char **argv);
  
  DEFINE_HOOK (_hurd_subinit, (void));
diff --git a/stdio/_itoa.c b/stdio/_itoa.c

index 19e732dcfed5698953dceb6e164afb65af22aaba..caa8179624eba7100826b1d0bbb3862ea924b988 100644 (file)
--- a/stdio/_itoa.c
+++ b/stdio/_itoa.c
@@ -1,6 +1,8 @@
  /* Internal function for converting integers to ASCII.
  Copyright (C) 1994, 1995 Free Software Foundation, Inc.
  This file is part of the GNU C Library.
+Contributed by Torbjorn Granlund <tege@matematik.su.se>
+and Ulrich Drepper <drepper@gnu.ai.mit.edu>.
  
  The GNU C Library is free software; you can redistribute it and/or
  modify it under the terms of the GNU Library General Public License as
@@ -17,13 +19,400 @@ License along with the GNU C Library; see the file COPYING.LIB.  If
  not, write to the Free Software Foundation, Inc., 675 Mass Ave,
  Cambridge, MA 02139, USA.  */
  
+#include <gmp-mparam.h>
+#include "../stdlib/gmp.h"
+#include "../stdlib/gmp-impl.h"
+#include "../stdlib/longlong.h"
+
+#include "_itoa.h"
+
+
+/* Canonize environment.  For some architectures not all values might
+   be defined in the GMP header files.  */
+#ifndef UMUL_TIME
+# define UMUL_TIME 1
+#endif
+#ifndef UDIV_TIME
+# define UDIV_TIME 1
+#endif
+
+/* Control memory layout.  */
+#ifdef PACK
+# undef PACK
+# define PACK __attribute__ ((packed))
+#else
+# define PACK
+#endif
+
+
+/* Declare local types.  */
+struct base_table_t
+{
+#if (UDIV_TIME > 2 * UMUL_TIME)
+  mp_limb base_multiplier;
+#endif
+  char flag;
+  char post_shift;
+#if BITS_PER_MP_LIMB == 32
+  struct
+    {
+      char normalization_steps;
+      char ndigits;
+      mp_limb base PACK;
+#if UDIV_TIME > 2 * UMUL_TIME
+      mp_limb base_ninv PACK;
+#endif
+    } big;
+#endif
+};
+
+/* To reduce the memory needed we include some fields of the tables
+   only conditionally.  */
+#if BITS_PER_MP_LIMB == 32
+# if UDIV_TIME > 2 * UMUL_TIME
+#  define SEL1(X) X,
+#  define SEL2(X) ,X
+# else
+#  define SEL1(X)
+#  define SEL2(X)
+# endif
+#endif
+
+
+/* Local variables.  */
+static const struct base_table_t base_table[] =
+{
+#if BITS_PER_MP_LIMB == 64
+  /*  2 */ {0ul, 1, 1},
+  /*  3 */ {0xaaaaaaaaaaaaaaabul, 0, 1},
+  /*  4 */ {0ul, 1, 2},
+  /*  5 */ {0xcccccccccccccccdul, 0, 2},
+  /*  6 */ {0xaaaaaaaaaaaaaaabul, 0, 2},
+  /*  7 */ {0x2492492492492493ul, 1, 3},
+  /*  8 */ {0ul, 1, 3},
+  /*  9 */ {0xe38e38e38e38e38ful, 0, 3},
+  /* 10 */ {0xcccccccccccccccdul, 0, 3},
+  /* 11 */ {0x2e8ba2e8ba2e8ba3ul, 0, 1},
+  /* 12 */ {0xaaaaaaaaaaaaaaabul, 0, 3},
+  /* 13 */ {0x4ec4ec4ec4ec4ec5ul, 0, 2},
+  /* 14 */ {0x2492492492492493ul, 1, 4},
+  /* 15 */ {0x8888888888888889ul, 0, 3},
+  /* 16 */ {0ul, 1, 4},
+  /* 17 */ {0xf0f0f0f0f0f0f0f1ul, 0, 4},
+  /* 18 */ {0xe38e38e38e38e38ful, 0, 4},
+  /* 19 */ {0xd79435e50d79435ful, 0, 4},
+  /* 20 */ {0xcccccccccccccccdul, 0, 4},
+  /* 21 */ {0x8618618618618619ul, 1, 5},
+  /* 22 */ {0x2e8ba2e8ba2e8ba3ul, 0, 2},
+  /* 23 */ {0x642c8590b21642c9ul, 1, 5},
+  /* 24 */ {0xaaaaaaaaaaaaaaabul, 0, 4},
+  /* 25 */ {0x47ae147ae147ae15ul, 1, 5},
+  /* 26 */ {0x4ec4ec4ec4ec4ec5ul, 0, 3},
+  /* 27 */ {0x97b425ed097b425ful, 0, 4},
+  /* 28 */ {0x2492492492492493ul, 1, 5},
+  /* 29 */ {0x1a7b9611a7b9611bul, 1, 5},
+  /* 30 */ {0x8888888888888889ul, 0, 4},
+  /* 31 */ {0x0842108421084211ul, 1, 5},
+  /* 32 */ {0ul, 1, 5},
+  /* 33 */ {0x0f83e0f83e0f83e1ul, 0, 1},
+  /* 34 */ {0xf0f0f0f0f0f0f0f1ul, 0, 5},
+  /* 35 */ {0xea0ea0ea0ea0ea0ful, 0, 5},
+  /* 36 */ {0xe38e38e38e38e38ful, 0, 5}
+#endif
+#if BITS_PER_MP_LIMB == 32
+  /*  2 */ {SEL1(0ul) 1, 1, {0, 31, 0x80000000ul SEL2(0xfffffffful)}},
+  /*  3 */ {SEL1(0xaaaaaaabul) 0, 1, {0, 20, 0xcfd41b91ul SEL2(0x3b563c24ul)}},
+  /*  4 */ {SEL1(0ul) 1, 2, {1, 15, 0x40000000ul SEL2(0xfffffffful)}},
+  /*  5 */ {SEL1(0xcccccccdul) 0, 2, {1, 13, 0x48c27395ul SEL2(0xc25c2684ul)}},
+  /*  6 */ {SEL1(0xaaaaaaabul) 0, 2, {0, 12, 0x81bf1000ul SEL2(0xf91bd1b6ul)}},
+  /*  7 */ {SEL1(0x24924925ul) 1, 3, {1, 11, 0x75db9c97ul SEL2(0x1607a2cbul)}},
+  /*  8 */ {SEL1(0ul) 1, 3, {1, 10, 0x40000000ul SEL2(0xfffffffful)}},
+  /*  9 */ {SEL1(0x38e38e39ul) 0, 1, {0, 10, 0xcfd41b91ul SEL2(0x3b563c24ul)}},
+  /* 10 */ {SEL1(0xcccccccdul) 0, 3, {2, 9, 0x3b9aca00ul SEL2(0x12e0be82ul)}},
+  /* 11 */ {SEL1(0xba2e8ba3ul) 0, 3, {0, 9, 0x8c8b6d2bul SEL2(0xd24cde04ul)}},
+  /* 12 */ {SEL1(0xaaaaaaabul) 0, 3, {3, 8, 0x19a10000ul SEL2(0x3fa39ab5ul)}},
+  /* 13 */ {SEL1(0x4ec4ec4ful) 0, 2, {2, 8, 0x309f1021ul SEL2(0x50f8ac5ful)}},
+  /* 14 */ {SEL1(0x24924925ul) 1, 4, {1, 8, 0x57f6c100ul SEL2(0x74843b1eul)}},
+  /* 15 */ {SEL1(0x88888889ul) 0, 3, {0, 8, 0x98c29b81ul SEL2(0xad0326c2ul)}},
+  /* 16 */ {SEL1(0ul) 1, 4, {3, 7, 0x10000000ul SEL2(0xfffffffful)}},
+  /* 17 */ {SEL1(0xf0f0f0f1ul) 0, 4, {3, 7, 0x18754571ul SEL2(0x4ef0b6bdul)}},
+  /* 18 */ {SEL1(0x38e38e39ul) 0, 2, {2, 7, 0x247dbc80ul SEL2(0xc0fc48a1ul)}},
+  /* 19 */ {SEL1(0xaf286bcbul) 1, 5, {2, 7, 0x3547667bul SEL2(0x33838942ul)}},
+  /* 20 */ {SEL1(0xcccccccdul) 0, 4, {1, 7, 0x4c4b4000ul SEL2(0xad7f29abul)}},
+  /* 21 */ {SEL1(0x86186187ul) 1, 5, {1, 7, 0x6b5a6e1dul SEL2(0x313c3d15ul)}},
+  /* 22 */ {SEL1(0xba2e8ba3ul) 0, 4, {0, 7, 0x94ace180ul SEL2(0xb8cca9e0ul)}},
+  /* 23 */ {SEL1(0xb21642c9ul) 0, 4, {0, 7, 0xcaf18367ul SEL2(0x42ed6de9ul)}},
+  /* 24 */ {SEL1(0xaaaaaaabul) 0, 4, {4, 6, 0x0b640000ul SEL2(0x67980e0bul)}},
+  /* 25 */ {SEL1(0x51eb851ful) 0, 3, {4, 6, 0x0e8d4a51ul SEL2(0x19799812ul)}},
+  /* 26 */ {SEL1(0x4ec4ec4ful) 0, 3, {3, 6, 0x1269ae40ul SEL2(0xbce85396ul)}},
+  /* 27 */ {SEL1(0x2f684bdbul) 1, 5, {3, 6, 0x17179149ul SEL2(0x62c103a9ul)}},
+  /* 28 */ {SEL1(0x24924925ul) 1, 5, {3, 6, 0x1cb91000ul SEL2(0x1d353d43ul)}},
+  /* 29 */ {SEL1(0x8d3dcb09ul) 0, 4, {2, 6, 0x23744899ul SEL2(0xce1deceaul)}},
+  /* 30 */ {SEL1(0x88888889ul) 0, 4, {2, 6, 0x2b73a840ul SEL2(0x790fc511ul)}},
+  /* 31 */ {SEL1(0x08421085ul) 1, 5, {2, 6, 0x34e63b41ul SEL2(0x35b865a0ul)}},
+  /* 32 */ {SEL1(0ul) 1, 5, {1, 6, 0x40000000ul SEL2(0xfffffffful)}},
+  /* 33 */ {SEL1(0x3e0f83e1ul) 0, 3, {1, 6, 0x4cfa3cc1ul SEL2(0xa9aed1b3ul)}},
+  /* 34 */ {SEL1(0xf0f0f0f1ul) 0, 5, {1, 6, 0x5c13d840ul SEL2(0x63dfc229ul)}},
+  /* 35 */ {SEL1(0xd41d41d5ul) 1, 6, {1, 6, 0x6d91b519ul SEL2(0x2b0fee30ul)}},
+  /* 36 */ {SEL1(0x38e38e39ul) 0, 3, {0, 6, 0x81bf1000ul SEL2(0xf91bd1b6ul)}}
+#endif
+};
+
  /* Lower-case digits.  */
-const char _itoa_lower_digits[] = "0123456789abcdefghijklmnopqrstuvwxyz";
+static const char _itoa_lower_digits[]
+       = "0123456789abcdefghijklmnopqrstuvwxyz";
  /* Upper-case digits.  */
-const char _itoa_upper_digits[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+static const char _itoa_upper_digits[]
+       = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
  
-/* Cause _itoa.h to define _itoa as a real function instead of an
-   `extern inline'.  */
-#define _EXTERN_INLINE /* empty */
  
-#include "_itoa.h"
+char *
+_itoa (value, buflim, base, upper_case)
+     unsigned long long int value;
+     char *buflim;
+     unsigned int base;
+     int upper_case;
+{
+  const char *digits = upper_case ? _itoa_upper_digits : _itoa_lower_digits;
+  char *bp = buflim;
+  const struct base_table_t *brec = &base_table[base - 2];
+  /* Digits are stored backwards from BUFLIM; BASE must be in 2..36.  */
+  switch (base)
+    {
+#define RUN_2N(BITS)                                                     \
+      do                                                                 \
+        {                                                                \
+         /* `unsigned long long int' always has 64 bits.  */             \
+         mp_limb work_hi = value >> (64 - BITS_PER_MP_LIMB);             \
+                                                                         \
+         if (BITS_PER_MP_LIMB == 32)                                     \
+           if (work_hi != 0)                                             \
+             {                                                           \
+               mp_limb work_lo;                                          \
+               int cnt;                                                  \
+                                                                         \
+               work_lo = value & 0xfffffffful;                           \
+               for (cnt = BITS_PER_MP_LIMB / BITS; cnt > 0; --cnt)       \
+                 {                                                       \
+                   *--bp = digits[work_lo & ((1ul << BITS) - 1)];        \
+                   work_lo >>= BITS;                                     \
+                 }                                                       \
+               if (BITS_PER_MP_LIMB % BITS != 0)                         \
+                 {                                                       \
+                   work_lo |= ((work_hi                                  \
+                                & ((1 << BITS - BITS_PER_MP_LIMB % BITS) \
+                                   - 1))                                 \
+                               << BITS_PER_MP_LIMB % BITS);              \
+                   *--bp = digits[work_lo];                              \
+                   work_hi >>= BITS - BITS_PER_MP_LIMB % BITS;           \
+                 }                                                       \
+             }                                                           \
+           else                                                          \
+             work_hi = value & 0xfffffffful;                             \
+         do                                                              \
+           {                                                             \
+             *--bp = digits[work_hi & ((1 << BITS) - 1)];                \
+             work_hi >>= BITS;                                           \
+           }                                                             \
+         while (work_hi != 0);                                           \
+       }                                                                 \
+      while (0)
+    case 8:
+      RUN_2N (3);
+      break;
+
+    case 16:
+      RUN_2N (4);
+      break;
+
+    default:
+      {
+#if BITS_PER_MP_LIMB == 64
+       mp_limb base_multiplier = brec->base_multiplier;
+       if (brec->flag)
+         while (value != 0)
+           {
+             mp_limb quo, rem, x, dummy;
+
+             umul_ppmm (x, dummy, value, base_multiplier);
+             quo = (x + ((value - x) >> 1)) >> (brec->post_shift - 1);
+             rem = value - quo * base;
+             *--bp = digits[rem];
+             value = quo;
+           }
+       else
+         while (value != 0)
+           {
+             mp_limb quo, rem, x, dummy;
+
+             umul_ppmm (x, dummy, value, base_multiplier);
+             quo = x >> brec->post_shift;
+             rem = value - quo * base;
+             *--bp = digits[rem];
+             value = quo;
+           }
+#endif
+#if BITS_PER_MP_LIMB == 32
+       mp_limb t[3];
+       int n;
+
+       /* First convert VALUE to 1-3 words in base brec->big.base.
+          Optimize for frequent cases of 32 bit numbers.  */
+       if ((mp_limb) (value >> 32) >= 1)
+         {
+           int big_normalization_steps = brec->big.normalization_steps;
+           mp_limb big_base_norm = brec->big.base << big_normalization_steps;
+
+           if ((mp_limb) (value >> 32) >= brec->big.base)
+             {
+               mp_limb x1hi, x1lo, r;
+               /* If you want to optimize this, take advantage of
+                  that the quotient in the first udiv_qrnnd will
+                  always be very small.  It might be faster just to
+                  subtract in a tight loop.  udiv_qrnnd_preinv needs the
+                  inverse of BIG_BASE_NORM (big.base_ninv) as last arg.  */
+#if UDIV_TIME > 2 * UMUL_TIME
+               mp_limb x, xh, xl;
+
+               if (big_normalization_steps == 0)
+                 xh = 0;
+               else
+                 xh = (mp_limb) (value >> 64 - big_normalization_steps);
+               xl = (mp_limb) (value >> 32 - big_normalization_steps);
+               udiv_qrnnd_preinv (x1hi, r, xh, xl, big_base_norm,
+                                  brec->big.base_ninv);
+
+               xl = ((mp_limb) value) << big_normalization_steps;
+               udiv_qrnnd_preinv (x1lo, x, r, xl, big_base_norm,
+                                  brec->big.base_ninv);
+               t[2] = x >> big_normalization_steps;
+
+               if (big_normalization_steps == 0)
+                 xh = x1hi;
+               else
+                 xh = ((x1hi << big_normalization_steps)
+                       | (x1lo >> 32 - big_normalization_steps));
+               xl = x1lo << big_normalization_steps;
+               udiv_qrnnd_preinv (t[0], x, xh, xl, big_base_norm,
+                                  brec->big.base_ninv);
+               t[1] = x >> big_normalization_steps;
+#elif UDIV_NEEDS_NORMALIZATION
+               mp_limb x, xh, xl;
+
+               if (big_normalization_steps == 0)
+                 xh = 0;
+               else
+                 xh = (mp_limb) (value >> 64 - big_normalization_steps);
+               xl = (mp_limb) (value >> 32 - big_normalization_steps);
+               udiv_qrnnd (x1hi, r, xh, xl, big_base_norm);
+
+               xl = ((mp_limb) value) << big_normalization_steps;
+               udiv_qrnnd (x1lo, x, r, xl, big_base_norm);
+               t[2] = x >> big_normalization_steps;
+
+               if (big_normalization_steps == 0)
+                 xh = x1hi;
+               else
+                 xh = ((x1hi << big_normalization_steps)
+                       | (x1lo >> 32 - big_normalization_steps));
+               xl = x1lo << big_normalization_steps;
+               udiv_qrnnd (t[0], x, xh, xl, big_base_norm);
+               t[1] = x >> big_normalization_steps;
+#else
+               udiv_qrnnd (x1hi, r, 0, (mp_limb) (value >> 32),
+                           brec->big.base);
+               udiv_qrnnd (x1lo, t[2], r, (mp_limb) value, brec->big.base);
+               udiv_qrnnd (t[0], t[1], x1hi, x1lo, brec->big.base);
+#endif
+               n = 3;
+             }
+           else
+             {
+#if (UDIV_TIME > 2 * UMUL_TIME)
+               mp_limb x;
+
+               value <<= brec->big.normalization_steps;
+               udiv_qrnnd_preinv (t[0], x, (mp_limb) (value >> 32),
+                                  (mp_limb) value, big_base_norm,
+                                  brec->big.base_ninv);
+               t[1] = x >> brec->big.normalization_steps;
+#elif UDIV_NEEDS_NORMALIZATION
+               mp_limb x;
+
+               value <<= big_normalization_steps;
+               udiv_qrnnd (t[0], x, (mp_limb) (value >> 32),
+                           (mp_limb) value, big_base_norm);
+               t[1] = x >> big_normalization_steps;
+#else
+               udiv_qrnnd (t[0], t[1], (mp_limb) (value >> 32),
+                           (mp_limb) value, brec->big.base);
+#endif
+               n = 2;
+             }
+         }
+       else
+         {
+           t[0] = value;
+           n = 1;
+         }
+
+       /* Convert the 1-3 words in t[], word by word, to ASCII.  */
+       do
+         {
+           mp_limb ti = t[--n];
+           int ndig_for_this_limb = 0;
+
+#if UDIV_TIME > 2 * UMUL_TIME
+           mp_limb base_multiplier = brec->base_multiplier;
+           if (brec->flag)
+             while (ti != 0)
+               {
+                 mp_limb quo, rem, x, dummy;
+
+                 umul_ppmm (x, dummy, ti, base_multiplier);
+                 quo = (x + ((ti - x) >> 1)) >> (brec->post_shift - 1);
+                 rem = ti - quo * base;
+                 *--bp = digits[rem];
+                 ti = quo;
+                 ++ndig_for_this_limb;
+               }
+           else
+             while (ti != 0)
+               {
+                 mp_limb quo, rem, x, dummy;
+
+                 umul_ppmm (x, dummy, ti, base_multiplier);
+                 quo = x >> brec->post_shift;
+                 rem = ti - quo * base;
+                 *--bp = digits[rem];
+                 ti = quo;
+                 ++ndig_for_this_limb;
+               }
+#else
+           while (ti != 0)
+             {
+               mp_limb quo, rem;
+
+               quo = ti / base;
+               rem = ti % base;
+               *--bp = digits[rem];
+               ti = quo;
+               ++ndig_for_this_limb;
+             }
+#endif
+           /* If this wasn't the most significant word, pad with zeros.  */
+           if (n != 0)
+             while (ndig_for_this_limb < brec->big.ndigits)
+               {
+                 *--bp = '0';
+                 ++ndig_for_this_limb;
+               }
+         }
+       while (n != 0);
+#endif
+      }
+      break;
+    }
+
+  return bp;
+}
diff --git a/stdio/_itoa.h b/stdio/_itoa.h

index 81240507b12805dfff80dd1e4dd3ad3dfa946d07..ab3d1d1d3a429fd102373d61745964918de90e96 100644 (file)
--- a/stdio/_itoa.h
+++ b/stdio/_itoa.h
@@ -21,8 +21,6 @@ Cambridge, MA 02139, USA.  */
  #define _ITOA_H
  #include <sys/cdefs.h>
  
-extern const char _itoa_lower_digits[], _itoa_upper_digits[];
-
  /* Convert VALUE into ASCII in base BASE (2..36).
     Write backwards starting the character just before BUFLIM.
     Return the address of the first (left-to-right) character in the number.
@@ -31,28 +29,4 @@ extern const char _itoa_lower_digits[], _itoa_upper_digits[];
  extern char *_itoa __P ((unsigned long long int value, char *buflim,
                          unsigned int base, int upper_case));
  
-#ifndef _EXTERN_INLINE
-#define _EXTERN_INLINE extern __inline 
-#endif
-
-_EXTERN_INLINE
-char *
-_itoa (unsigned long long int value, char *buflim,
-       unsigned int base, int upper_case)
-{
-  /* Base-36 digits for numbers.  */
-  const char *digits = upper_case ? _itoa_upper_digits : _itoa_lower_digits;
-
-  register char *bp = buflim;
-
-  while (value > 0)
-    {
-      *--bp = digits[value % base];
-      value /= base;
-    }
-
-  return bp;
-}
-
-
  #endif /* itoa.h */
diff --git a/stdio/test_rdwr.c b/stdio/test_rdwr.c

index 8e0c1dfadefedb93df950949fac56bffe26cd5a3..f987f16cd4cc69b055c6e2ab92fab3aaefe0e94c 100644 (file)
--- a/stdio/test_rdwr.c
+++ b/stdio/test_rdwr.c
@@ -17,6 +17,7 @@ not, write to the Free Software Foundation, Inc., 675 Mass Ave,
  Cambridge, MA 02139, USA.  */
  
  #include <ansidecl.h>
+#include <errno.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
diff --git a/stdlib/gmp-impl.h b/stdlib/gmp-impl.h

index ccffe7bb1e2b9fe142af35fed6a617197e3b001f..48d3af97610e6d00f6c9161d6c1efe470a8dfefa 100644 (file)
--- a/stdlib/gmp-impl.h
+++ b/stdlib/gmp-impl.h
@@ -19,11 +19,17 @@ along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
  the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
  
  #if ! defined (alloca)
-#if defined (__GNUC__) || defined (__sparc__) || defined (sparc)
+#if defined (__GNUC__)
  #define alloca __builtin_alloca
  #endif
  #endif
  
+#if ! defined (alloca)
+#if defined (__sparc__) || defined (sparc) || defined (__sgi)
+#include <alloca.h>
+#endif
+#endif
+
  #ifndef NULL
  #define NULL 0L
  #endif
@@ -168,6 +174,7 @@ void _mp_default_free ();
      else                                                               \
        ____mpn_sqr_n (prodp, up, size, tspace);                         \
    } while (0);
+#define assert(trueval) do {if (!(trueval)) abort ();} while (0)
  
  /* Structure for conversion between internal binary format and
     strings in base 2..36.  */
@@ -197,9 +204,11 @@ struct bases
  extern const struct bases __mp_bases[];
  extern mp_size_t __gmp_default_fp_limb_precision;
  
-/* Divide the two-limb number in (NH,,NL) by D, with DI being a 32 bit
-   approximation to (2**(2*BITS_PER_MP_LIMB))/D - (2**BITS_PER_MP_LIMB).
-   Put the quotient in Q and the remainder in R.  */
+/* Divide the two-limb number in (NH,,NL) by D, with DI being the largest
+   limb not larger than (2**(2*BITS_PER_MP_LIMB))/D - (2**BITS_PER_MP_LIMB).
+   If this would yield overflow, DI should be the largest possible number
+   (i.e., only ones).  For correct operation, the most significant bit of D
+   has to be set.  Put the quotient in Q and the remainder in R.  */
  #define udiv_qrnnd_preinv(q, r, nh, nl, d, di) \
    do {                                                                 \
      mp_limb _q, _ql, _r;                                               \
@@ -226,6 +235,8 @@ extern mp_size_t __gmp_default_fp_limb_precision;
      (r) = _r;                                                          \
      (q) = _q;                                                          \
    } while (0)
+/* Like udiv_qrnnd_preinv, but for any value D.  DNORM is D shifted left
+   so that its most significant bit is set.  LGUP is ceil(log2(D)).  */
  #define udiv_qrnnd_preinv2gen(q, r, nh, nl, d, di, dnorm, lgup) \
    do {                                                                 \
      mp_limb n2, n10, n1, nadj, q1;                                     \
@@ -243,6 +254,8 @@ extern mp_size_t __gmp_default_fp_limb_precision;
      (r) = _xl + ((d) & _xh);                                           \
      (q) = _xh - q1;                                                    \
    } while (0)
+/* Exactly like udiv_qrnnd_preinv, but branch-free.  It is not clear which
+   version to use.  */
  #define udiv_qrnnd_preinv2norm(q, r, nh, nl, d, di) \
    do {                                                                 \
      mp_limb n2, n10, n1, nadj, q1;                                     \
@@ -262,22 +275,49 @@ extern mp_size_t __gmp_default_fp_limb_precision;
    } while (0)
  
  #if defined (__GNUC__)
-/* Define stuff for longlong.h asm macros.  */
-#if __GNUC_NEW_ATTR_MODE_SYNTAX
-typedef unsigned int UQItype   __attribute__ ((mode ("QI")));
-typedef         int SItype     __attribute__ ((mode ("SI")));
-typedef unsigned int USItype   __attribute__ ((mode ("SI")));
-typedef                 int DItype     __attribute__ ((mode ("DI")));
-typedef unsigned int UDItype   __attribute__ ((mode ("DI")));
-#else
+/* Define stuff for longlong.h.  */
  typedef unsigned int UQItype   __attribute__ ((mode (QI)));
  typedef         int SItype     __attribute__ ((mode (SI)));
  typedef unsigned int USItype   __attribute__ ((mode (SI)));
  typedef                 int DItype     __attribute__ ((mode (DI)));
  typedef unsigned int UDItype   __attribute__ ((mode (DI)));
-#endif
+#else
+typedef unsigned char UQItype;
+typedef         long SItype;
+typedef unsigned long USItype;
  #endif
  
  typedef mp_limb UWtype;
  typedef unsigned int UHWtype;
  #define W_TYPE_SIZE BITS_PER_MP_LIMB
+
+
+#ifndef IEEE_DOUBLE_BIG_ENDIAN
+#define IEEE_DOUBLE_BIG_ENDIAN 1       /* Default unless defined beforehand.  */
+#endif
+
+#if IEEE_DOUBLE_BIG_ENDIAN
+union ieee_double_extract      /* Overlays bit-fields (64 bits total) on a double.  */
+{
+  struct
+    {
+      unsigned long sig:1;     /* 1 bit; presumably the sign.  */
+      unsigned long exp:11;    /* 11 bits; presumably the exponent.  */
+      unsigned long manh:20;   /* 20 bits; presumably the mantissa's high part.  */
+      unsigned long manl:32;   /* 32 bits; presumably the mantissa's low part.  */
+    } s;
+  double d;                    /* The same storage viewed as a double.  */
+};
+#else                          /* Same fields, declared in the reverse order.  */
+union ieee_double_extract
+{
+  struct
+    {
+      unsigned long manl:32;   /* 32 bits; presumably the mantissa's low part.  */
+      unsigned long manh:20;   /* 20 bits; presumably the mantissa's high part.  */
+      unsigned long exp:11;    /* 11 bits; presumably the exponent.  */
+      unsigned long sig:1;     /* 1 bit; presumably the sign.  */
+    } s;
+  double d;                    /* The same storage viewed as a double.  */
+};
+#endif /* IEEE_DOUBLE_BIG_ENDIAN */
diff --git a/stdlib/gmp.h b/stdlib/gmp.h

index 95c2f1bebab913241f1c7c2073a55b4f7cbee6c2..0b2cb290146481c8c397d46e9e7235e6921a9a33 100644 (file)
--- a/stdlib/gmp.h
+++ b/stdlib/gmp.h
@@ -24,13 +24,13 @@ the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
  #define __need_size_t
  #include <stddef.h>
  
-#ifdef __STDC__
+#if defined (__STDC__)
  #define __gmp_const const
  #else
  #define __gmp_const
  #endif
  
-#ifdef __GNUC__
+#if defined (__GNUC__)
  #define __gmp_inline inline
  #else
  #define __gmp_inline
@@ -40,9 +40,14 @@ the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
  typedef unsigned int           mp_limb;
  typedef int                    mp_limb_signed;
  #else
+#if _LONG_LONG_LIMB
+typedef unsigned long long int mp_limb;
+typedef long long int          mp_limb_signed;
+#else
  typedef unsigned long int      mp_limb;
  typedef long int               mp_limb_signed;
  #endif
+#endif
  
  typedef mp_limb *              mp_ptr;
  typedef __gmp_const mp_limb *  mp_srcptr;
@@ -52,9 +57,9 @@ typedef long int              mp_exp_t;
  #ifndef __MP_SMALL__
  typedef struct
  {
-  long int alloc;              /* Number of *limbs* allocated and pointed
+  mp_size_t alloc;             /* Number of *limbs* allocated and pointed
                                    to by the D field.  */
-  long int size;               /* abs(SIZE) is the number of limbs
+  mp_size_t size;              /* abs(SIZE) is the number of limbs
                                    the last field points to.  If SIZE
                                    is negative this is a negative
                                    number.  */
@@ -130,12 +135,16 @@ typedef __mpf_struct *mpf_ptr;
  typedef __gmp_const __mpq_struct *mpq_srcptr;
  typedef __mpq_struct *mpq_ptr;
  
-#ifdef __STDC__
+#if defined (__STDC__)
  #define _PROTO(x) x
  #else
  #define _PROTO(x) ()
  #endif
  
+#if defined (FILE) || defined (_STDIO_H_) || defined (__STDIO_H__) || defined (H_STDIO)
+#define _GMP_H_HAVE_FILE 1
+#endif
+
  void mp_set_memory_functions _PROTO((void *(*) (size_t),
                                      void *(*) (void *, size_t, size_t),
                                      void (*) (void *, size_t)));
@@ -165,7 +174,7 @@ unsigned long int mpz_get_ui _PROTO ((mpz_srcptr));
  mp_limb mpz_getlimbn _PROTO ((mpz_srcptr, mp_size_t));
  mp_size_t mpz_hamdist _PROTO ((mpz_srcptr, mpz_srcptr));
  void mpz_init _PROTO ((mpz_ptr));
-#ifdef FILE
+#ifdef _GMP_H_HAVE_FILE
  void mpz_inp_raw _PROTO ((mpz_ptr, FILE *));
  int mpz_inp_str _PROTO ((mpz_ptr, FILE *, int));
  #endif
@@ -180,7 +189,7 @@ void mpz_mul _PROTO ((mpz_ptr, mpz_srcptr, mpz_srcptr));
  void mpz_mul_2exp _PROTO ((mpz_ptr, mpz_srcptr, unsigned long int));
  void mpz_mul_ui _PROTO ((mpz_ptr, mpz_srcptr, unsigned long int));
  void mpz_neg _PROTO ((mpz_ptr, mpz_srcptr));
-#ifdef FILE
+#ifdef _GMP_H_HAVE_FILE
  void mpz_out_raw _PROTO ((FILE *, mpz_srcptr));
  void mpz_out_str _PROTO ((FILE *, int, mpz_srcptr));
  #endif
@@ -218,6 +227,8 @@ void mpz_tdiv_qr_ui _PROTO((mpz_ptr, mpz_ptr, mpz_srcptr, unsigned long int));
  void mpz_tdiv_r _PROTO((mpz_ptr, mpz_srcptr, mpz_srcptr));
  void mpz_tdiv_r_ui _PROTO((mpz_ptr, mpz_srcptr, unsigned long int));
  
+void mpz_array_init _PROTO ((mpz_ptr, size_t, mp_size_t));
+
  /**************** Rational (i.e. Q) routines.  ****************/
  
  void mpq_init _PROTO ((mpq_ptr));
@@ -253,7 +264,7 @@ void mpf_dump _PROTO ((mpf_srcptr));
  char *mpf_get_str _PROTO ((char *, mp_exp_t *, int, size_t, mpf_srcptr));
  void mpf_init _PROTO ((mpf_ptr));
  void mpf_init2 _PROTO ((mpf_ptr, mp_size_t));
-#ifdef FILE
+#ifdef _GMP_H_HAVE_FILE
  void mpf_inp_str _PROTO ((mpf_ptr, FILE *, int));
  #endif
  void mpf_init_set _PROTO ((mpf_ptr, mpf_srcptr));
@@ -265,7 +276,7 @@ void mpf_mul _PROTO ((mpf_ptr, mpf_srcptr, mpf_srcptr));
  void mpf_mul_2exp _PROTO ((mpf_ptr, mpf_srcptr, unsigned long int));
  void mpf_mul_ui _PROTO ((mpf_ptr, mpf_srcptr, unsigned long int));
  void mpf_neg _PROTO ((mpf_ptr, mpf_srcptr));
-#ifdef FILE
+#ifdef _GMP_H_HAVE_FILE
  void mpf_out_str _PROTO ((mpf_ptr, int, size_t, FILE *));
  #endif
  void mpf_set _PROTO ((mpf_ptr, mpf_srcptr));
@@ -335,7 +346,7 @@ mp_limb __mpn_gcd_1 _PROTO ((mp_srcptr, mp_size_t, mp_limb));
  
  
  static __gmp_inline mp_limb
-#if __STDC__
+#if defined (__STDC__)
  __mpn_add_1 (register mp_ptr res_ptr,
              register mp_srcptr s1_ptr,
              register mp_size_t s1_size,
@@ -377,7 +388,7 @@ __mpn_add_1 (res_ptr, s1_ptr, s1_size, s2_limb)
  }
  
  static __gmp_inline mp_limb
-#if __STDC__
+#if defined (__STDC__)
  __mpn_add (register mp_ptr res_ptr,
            register mp_srcptr s1_ptr,
            register mp_size_t s1_size,
@@ -406,7 +417,7 @@ __mpn_add (res_ptr, s1_ptr, s1_size, s2_ptr, s2_size)
  }
  
  static __gmp_inline mp_limb
-#if __STDC__
+#if defined (__STDC__)
  __mpn_sub_1 (register mp_ptr res_ptr,
              register mp_srcptr s1_ptr,
              register mp_size_t s1_size,
@@ -448,7 +459,7 @@ __mpn_sub_1 (res_ptr, s1_ptr, s1_size, s2_limb)
  }
  
  static __gmp_inline mp_limb
-#if __STDC__
+#if defined (__STDC__)
  __mpn_sub (register mp_ptr res_ptr,
            register mp_srcptr s1_ptr,
            register mp_size_t s1_size,
@@ -477,7 +488,7 @@ __mpn_sub (res_ptr, s1_ptr, s1_size, s2_ptr, s2_size)
  }
  
  static __gmp_inline mp_size_t
-#if __STDC__
+#if defined (__STDC__)
  __mpn_normal_size (mp_srcptr ptr, mp_size_t size)
  #else
  __mpn_normal_size (ptr, size)
@@ -512,7 +523,6 @@ __mpn_normal_size (ptr, size)
  /* Useful synonyms, but not quite compatible with GMP 1.  */
  #define mpz_div                mpz_fdiv_q
  #define mpz_divmod     mpz_fdiv_qr
-#define mpz_mod                mpz_fdiv_r
  #define mpz_div_ui     mpz_fdiv_q_ui
  #define mpz_divmod_ui  mpz_fdiv_qr_ui
  #define mpz_mod_ui     mpz_fdiv_r_ui
diff --git a/stdlib/longlong.h b/stdlib/longlong.h

index 97c469d8c0990477d3cbdc19c223eb7c84a9656e..bbb92e3af8e710082eeb0b8bcfe24b0838018572 100644 (file)
--- a/stdlib/longlong.h
+++ b/stdlib/longlong.h
@@ -97,7 +97,7 @@ the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
  #define __AND_CLOBBER_CC , "cc"
  #endif /* __GNUC__ < 2 */
  
-#if (defined (__a29k__) || defined (___AM29K__)) && W_TYPE_SIZE == 32
+#if (defined (__a29k__) || defined (_AM29K)) && W_TYPE_SIZE == 32
  #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
    __asm__ ("add %1,%4,%5
         addc %0,%2,%3"                                                  \
@@ -152,6 +152,7 @@ the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
      (pl) = __m0 * __m1;                                                        \
    } while (0)
  #define UMUL_TIME 46
+#ifndef LONGLONG_STANDALONE
  #define udiv_qrnnd(q, r, n1, n0, d) \
    do { UDItype __r;                                                    \
      (q) = __udiv_qrnnd (&__r, (n1), (n0), (d));                                \
@@ -159,12 +160,13 @@ the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
    } while (0)
  extern UDItype __udiv_qrnnd ();
  #define UDIV_TIME 220
-#endif
+#endif /* LONGLONG_STANDALONE */
+#endif /* __alpha__ */
  
  #if defined (__arm__) && W_TYPE_SIZE == 32
  #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
-  __asm__ ("adds %1,%4,%5
-       adc %0,%2,%3"                                                   \
+  __asm__ ("adds       %1, %4, %5
+       adc     %0, %2, %3"                                             \
            : "=r" ((USItype)(sh)),                                      \
              "=&r" ((USItype)(sl))                                      \
            : "%r" ((USItype)(ah)),                                      \
@@ -172,8 +174,8 @@ extern UDItype __udiv_qrnnd ();
              "%r" ((USItype)(al)),                                      \
              "rI" ((USItype)(bl)))
  #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
-  __asm__ ("subs %1,%4,%5
-       sbc %0,%2,%3"                                                   \
+  __asm__ ("subs       %1, %4, %5
+       sbc     %0, %2, %3"                                             \
            : "=r" ((USItype)(sh)),                                      \
              "=&r" ((USItype)(sl))                                      \
            : "r" ((USItype)(ah)),                                       \
@@ -181,19 +183,19 @@ extern UDItype __udiv_qrnnd ();
              "r" ((USItype)(al)),                                       \
              "rI" ((USItype)(bl)))
  #define umul_ppmm(xh, xl, a, b) \
-  __asm__ ("; Inlined umul_ppmm
-       mov     r0,%2 lsr 16
-       mov     r2,%3 lsr 16
-       bic     r1,%2,r0 lsl 16
-       bic     r2,%3,r2 lsl 16
-       mul     %1,r1,r2
-       mul     r2,r0,r2
-       mul     r1,%0,r1
-       mul     %0,r0,%0
-       adds    r1,r2,r1
-       addcs   %0,%0,0x10000
-       adds    %1,%1,r1 lsl 16
-       adc     %0,%0,r1 lsr 16"                                        \
+  __asm__ ("%@ Inlined umul_ppmm
+       mov     %|r0, %2, lsr #16
+       mov     %|r2, %3, lsr #16
+       bic     %|r1, %2, %|r0, lsl #16
+       bic     %|r2, %3, %|r2, lsl #16
+       mul     %1, %|r1, %|r2
+       mul     %|r2, %|r0, %|r2
+       mul     %|r1, %0, %|r1
+       mul     %0, %|r0, %0
+       adds    %|r1, %|r2, %|r1
+       addcs   %0, %0, #65536
+       adds    %1, %1, %|r1, lsl #16
+       adc     %0, %0, %|r1, lsr #16"                                  \
            : "=&r" ((USItype)(xh)),                                     \
              "=r" ((USItype)(xl))                                       \
            : "r" ((USItype)(a)),                                        \
@@ -296,9 +298,9 @@ extern UDItype __udiv_qrnnd ();
            struct {USItype __h, __l;} __i;                              \
           } __xx;                                                       \
      __asm__ ("xmpyu %1,%2,%0"                                          \
-            : "=x" (__xx.__ll)                                         \
-            : "x" ((USItype)(u)),                                      \
-              "x" ((USItype)(v)));                                     \
+            : "=fx" (__xx.__ll)                                        \
+            : "fx" ((USItype)(u)),                                     \
+              "fx" ((USItype)(v)));                                    \
      (wh) = __xx.__i.__h;                                               \
      (wl) = __xx.__i.__l;                                               \
    } while (0)
@@ -308,12 +310,14 @@ extern UDItype __udiv_qrnnd ();
  #define UMUL_TIME 40
  #define UDIV_TIME 80
  #endif
+#ifndef LONGLONG_STANDALONE
  #define udiv_qrnnd(q, r, n1, n0, d) \
    do { USItype __r;                                                    \
      (q) = __udiv_qrnnd (&__r, (n1), (n0), (d));                                \
      (r) = __r;                                                         \
    } while (0)
  extern USItype __udiv_qrnnd ();
+#endif /* LONGLONG_STANDALONE */
  #define count_leading_zeros(count, x) \
    do {                                                                 \
      USItype __tmp;                                                     \
@@ -419,8 +423,12 @@ extern USItype __udiv_qrnnd ();
    } while (0)
  #define count_trailing_zeros(count, x) \
    __asm__ ("bsfl %1,%0" : "=r" (count) : "rm" ((USItype)(x)))
+#ifndef UMUL_TIME
  #define UMUL_TIME 40
+#endif
+#ifndef UDIV_TIME
  #define UDIV_TIME 40
+#endif
  #endif /* 80x86 */
  
  #if defined (__i960__) && W_TYPE_SIZE == 32
@@ -442,7 +450,7 @@ extern USItype __udiv_qrnnd ();
      __w; })  
  #endif /* __i960__ */
  
-#if defined (__mc68000__) && W_TYPE_SIZE == 32
+#if (defined (__mc68000__) || defined (__mc68020__) || defined (__NeXT__) || defined(mc68020)) && W_TYPE_SIZE == 32
  #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
    __asm__ ("add%.l %5,%1
         addx%.l %3,%0"                                                  \
@@ -489,38 +497,34 @@ extern USItype __udiv_qrnnd ();
            : "=d" ((USItype)(count))                                    \
            : "od" ((USItype)(x)), "n" (0))
  #else /* not mc68020 */
-#define umul_ppmm(xh, xl, a, b) \
-  __asm__ ("| Inlined umul_ppmm
-       move%.l %2,%/d0
-       move%.l %3,%/d1
-       move%.l %/d0,%/d2
-       swap    %/d0
-       move%.l %/d1,%/d3
-       swap    %/d1
-       move%.w %/d2,%/d4
-       mulu    %/d3,%/d4
-       mulu    %/d1,%/d2
-       mulu    %/d0,%/d3
-       mulu    %/d0,%/d1
-       move%.l %/d4,%/d0
-       eor%.w  %/d0,%/d0
-       swap    %/d0
-       add%.l  %/d0,%/d2
-       add%.l  %/d3,%/d2
+#define umul_ppmmxx(xh, xl, a, b) \
+  do { USItype __umul_tmp1, __umul_tmp2;                               \
+       __asm__ ("| Inlined umul_ppmm
+       move%.l %5,%3
+       move%.l %2,%0
+       move%.w %3,%1
+       swap    %3
+       swap    %0
+       mulu    %2,%1
+       mulu    %3,%0
+       mulu    %2,%3
+       swap    %2
+       mulu    %5,%2
+       add%.l  %3,%2
         jcc     1f
-       add%.l  #65536,%/d1
-1:     swap    %/d2
-       moveq   #0,%/d0
-       move%.w %/d2,%/d0
-       move%.w %/d4,%/d2
-       move%.l %/d2,%1
-       add%.l  %/d1,%/d0
-       move%.l %/d0,%0"                                                \
-          : "=g" ((USItype)(xh)),                                      \
-            "=g" ((USItype)(xl))                                       \
-          : "g" ((USItype)(a)),                                        \
-            "g" ((USItype)(b))                                         \
-          : "d0", "d1", "d2", "d3", "d4")
+       add%.l  %#0x10000,%0
+1:     move%.l %2,%3
+       clr%.w  %2
+       swap    %2
+       swap    %3
+       clr%.w  %3
+       add%.l  %3,%1
+       addx%.l %2,%0
+       | End inlined umul_ppmm"                                        \
+             : "=&d" ((USItype)(xh)), "=&d" ((USItype)(xl)),           \
+               "=d" (__umul_tmp1), "=&d" (__umul_tmp2)                 \
+             : "%2" ((USItype)(a)), "d" ((USItype)(b)));               \
+  } while (0)
  #define UMUL_TIME 100
  #define UDIV_TIME 400
  #endif /* not mc68020 */
@@ -553,7 +557,7 @@ extern USItype __udiv_qrnnd ();
              : "r" ((USItype)(x)));                                     \
      (count) = __cbtmp ^ 31;                                            \
    } while (0)
-#if defined (__mc88110__)
+#if defined (__m88110__)
  #define umul_ppmm(wh, wl, u, v) \
    do {                                                                 \
      union {UDItype __ll;                                               \
@@ -582,10 +586,18 @@ extern USItype __udiv_qrnnd ();
  #else
  #define UMUL_TIME 17
  #define UDIV_TIME 150
-#endif /* __mc88110__ */
+#endif /* __m88110__ */
  #endif /* __m88000__ */
  
  #if defined (__mips__) && W_TYPE_SIZE == 32
+#if __GNUC__ > 2 || __GNUC_MINOR__ >= 7
+#define umul_ppmm(w1, w0, u, v) \
+  __asm__ ("multu %2,%3"                                               \
+          : "=l" ((USItype)(w0)),                                      \
+            "=h" ((USItype)(w1))                                       \
+          : "d" ((USItype)(u)),                                        \
+            "d" ((USItype)(v)))
+#else
  #define umul_ppmm(w1, w0, u, v) \
    __asm__ ("multu %2,%3
         mflo %0
@@ -594,11 +606,20 @@ extern USItype __udiv_qrnnd ();
              "=d" ((USItype)(w1))                                       \
            : "d" ((USItype)(u)),                                        \
              "d" ((USItype)(v)))
+#endif
  #define UMUL_TIME 10
  #define UDIV_TIME 100
  #endif /* __mips__ */
  
  #if (defined (__mips) && __mips >= 3) && W_TYPE_SIZE == 64
+#if __GNUC__ > 2 || __GNUC_MINOR__ >= 7
+#define umul_ppmm(w1, w0, u, v) \
+  __asm__ ("dmultu %2,%3"                                              \
+          : "=l" ((UDItype)(w0)),                                      \
+            "=h" ((UDItype)(w1))                                       \
+          : "d" ((UDItype)(u)),                                        \
+            "d" ((UDItype)(v)))
+#else
  #define umul_ppmm(w1, w0, u, v) \
    __asm__ ("dmultu %2,%3
         mflo %0
@@ -607,8 +628,9 @@ extern USItype __udiv_qrnnd ();
              "=d" ((UDItype)(w1))                                       \
            : "d" ((UDItype)(u)),                                        \
              "d" ((UDItype)(v)))
-#define UMUL_TIME 10
-#define UDIV_TIME 100
+#endif
+#define UMUL_TIME 20
+#define UDIV_TIME 140
  #endif /* __mips__ */
  
  #if defined (__ns32000__) && W_TYPE_SIZE == 32
@@ -647,7 +669,7 @@ extern USItype __udiv_qrnnd ();
    } while (0)
  #endif /* __ns32000__ */
  
-#if (defined (__powerpc__) || defined (___IBMR2__)) && W_TYPE_SIZE == 32
+#if (defined (_ARCH_PPC) || defined (_IBMR2)) && W_TYPE_SIZE == 32
  #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
    do {                                                                 \
      if (__builtin_constant_p (bh) && (bh) == 0)                                \
@@ -676,14 +698,14 @@ extern USItype __udiv_qrnnd ();
  #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
    do {                                                                 \
      if (__builtin_constant_p (ah) && (ah) == 0)                                \
-      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2"              \
+      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2"      \
                : "=r" ((USItype)(sh)),                                  \
                  "=&r" ((USItype)(sl))                                  \
                : "r" ((USItype)(bh)),                                   \
                  "rI" ((USItype)(al)),                                  \
                  "r" ((USItype)(bl)));                                  \
      else if (__builtin_constant_p (ah) && (ah) ==~(USItype) 0)         \
-      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2"              \
+      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2"      \
                : "=r" ((USItype)(sh)),                                  \
                  "=&r" ((USItype)(sl))                                  \
                : "r" ((USItype)(bh)),                                   \
@@ -716,7 +738,7 @@ extern USItype __udiv_qrnnd ();
    __asm__ ("{cntlz|cntlzw} %0,%1"                                      \
            : "=r" ((USItype)(count))                                    \
            : "r" ((USItype)(x)))
-#if defined (__powerpc__)
+#if defined (_ARCH_PPC)
  #define umul_ppmm(ph, pl, m0, m1) \
    do {                                                                 \
      USItype __m0 = (m0), __m1 = (m1);                                  \
@@ -785,16 +807,15 @@ extern USItype __udiv_qrnnd ();
              "g" ((USItype)(bh)),                                       \
              "1" ((USItype)(al)),                                       \
              "g" ((USItype)(bl)))
-/* This insn doesn't work on ancient pyramids.  */
+/* This insn works on Pyramids with AP, XP, or MI CPUs, but not with SP.  */
  #define umul_ppmm(w1, w0, u, v) \
    ({union {UDItype __ll;                                               \
            struct {USItype __h, __l;} __i;                              \
           } __xx;                                                       \
-  __xx.__i.__l = u;                                                    \
-  __asm__ ("uemul %3,%0"                                               \
-          : "=r" (__xx.__i.__h),                                       \
-            "=r" (__xx.__i.__l)                                        \
-          : "1" (__xx.__i.__l),                                        \
+  __asm__ ("movw %1,%R0
+       uemul %2,%0"                                                    \
+          : "=&r" (__xx.__ll)                                          \
+          : "g" ((USItype) (u)),                                       \
              "g" ((USItype)(v)));                                       \
    (w1) = __xx.__i.__h; (w0) = __xx.__i.__l;})
  #endif /* __pyr__ */
@@ -868,6 +889,20 @@ extern USItype __udiv_qrnnd ();
    } while (0)
  #endif
  
+#if defined (__sh2__) && W_TYPE_SIZE == 32
+#define umul_ppmm(w1, w0, u, v) \
+  __asm__ ( /* Multiply %2 by %3, then copy macl/mach out.  */         \
+       "dmulu.l        %2,%3
+       sts     macl,%1
+       sts     mach,%0"                                                \
+          : "=r" ((USItype)(w1)),      /* %0: receives mach.  */       \
+            "=r" ((USItype)(w0))       /* %1: receives macl.  */       \
+          : "r" ((USItype)(u)),        /* %2: first factor.  */        \
+            "r" ((USItype)(v))         /* %3: second factor.  */       \
+          : "macl", "mach")            /* Both are overwritten.  */
+#define UMUL_TIME 5
+#endif /* __sh2__ */
+
  #if defined (__sparc__) && W_TYPE_SIZE == 32
  #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
    __asm__ ("addcc %r4,%5,%1
@@ -901,17 +936,21 @@ extern USItype __udiv_qrnnd ();
            : "r" ((USItype)(u)),                                        \
              "r" ((USItype)(v)))
  #define UMUL_TIME 5
-/* We might want to leave this undefined for `SuperSPARC (tm)' since
-   its implementation is crippled and often traps.  */
+#ifndef SUPERSPARC     /* SuperSPARC's udiv only handles 53 bit dividends */
  #define udiv_qrnnd(q, r, n1, n0, d) \
-  __asm__ ("mov %2,%%y;nop;nop;nop;udiv %3,%4,%0;umul %0,%4,%1;sub %3,%1,%1"\
-          : "=&r" ((USItype)(q)),                                      \
-            "=&r" ((USItype)(r))                                       \
-          : "r" ((USItype)(n1)),                                       \
-            "r" ((USItype)(n0)),                                       \
-            "r" ((USItype)(d)))
+  do {                                                                 \
+    USItype __q;                                                       \
+    __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0"                    \
+            : "=r" ((USItype)(__q))                                    \
+            : "r" ((USItype)(n1)),                                     \
+              "r" ((USItype)(n0)),                                     \
+              "r" ((USItype)(d)));                                     \
+    (r) = (n0) - __q * (d);                                            \
+    (q) = __q;                                                         \
+  } while (0)
  #define UDIV_TIME 25
-#else
+#endif /* SUPERSPARC */
+#else /* ! __sparc_v8__ */
  #if defined (__sparclite__)
  /* This has hardware multiply but not divide.  It also has two additional
     instructions scan (ffs from high bit) and divscc.  */
@@ -973,9 +1012,10 @@ extern USItype __udiv_qrnnd ();
    __asm__ ("scan %1,0,%0"                                              \
            : "=r" ((USItype)(x))                                        \
            : "r" ((USItype)(count)))
-#else
-/* SPARC without integer multiplication and divide instructions.
-   (i.e. at least Sun4/20,40,60,65,75,110,260,280,330,360,380,470,490) */
+#endif /* __sparclite__ */
+#endif /* __sparc_v8__ */
+/* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd.  */
+#ifndef umul_ppmm
  #define umul_ppmm(w1, w0, u, v) \
    __asm__ ("! Inlined umul_ppmm
         wr      %%g0,%2,%%y     ! SPARC has 0-3 delay insn after a wr
@@ -1023,6 +1063,9 @@ extern USItype __udiv_qrnnd ();
              "r" ((USItype)(v))                                         \
            : "%g1", "%g2" __AND_CLOBBER_CC)
  #define UMUL_TIME 39           /* 39 instructions */
+#endif
+#ifndef udiv_qrnnd
+#ifndef LONGLONG_STANDALONE
  #define udiv_qrnnd(q, r, n1, n0, d) \
    do { USItype __r;                                                    \
      (q) = __udiv_qrnnd (&__r, (n1), (n0), (d));                                \
@@ -1030,8 +1073,8 @@ extern USItype __udiv_qrnnd ();
    } while (0)
  extern USItype __udiv_qrnnd ();
  #define UDIV_TIME 140
-#endif /* __sparclite__ */
-#endif /* __sparc_v8__ */
+#endif /* LONGLONG_STANDALONE */
+#endif /* udiv_qrnnd */
  #endif /* __sparc__ */
  
  #if defined (__vax__) && W_TYPE_SIZE == 32
@@ -1075,7 +1118,7 @@ extern USItype __udiv_qrnnd ();
      __xx.__i.__h = n1; __xx.__i.__l = n0;                              \
      __asm__ ("ediv %3,%2,%0,%1"                                                \
              : "=g" (q), "=g" (r)                                       \
-            : "g" (__n1n0.ll), "g" (d));                               \
+            : "g" (__xx.ll), "g" (d));                                 \
    } while (0)
  #endif /* __vax__ */
  
@@ -1173,11 +1216,12 @@ extern USItype __udiv_qrnnd ();
    do {                                                                 \
      UWtype __x0, __x1, __x2, __x3;                                     \
      UHWtype __ul, __vl, __uh, __vh;                                    \
+    UWtype __u = (u), __v = (v);                                       \
                                                                         \
-    __ul = __ll_lowpart (u);                                           \
-    __uh = __ll_highpart (u);                                          \
-    __vl = __ll_lowpart (v);                                           \
-    __vh = __ll_highpart (v);                                          \
+    __ul = __ll_lowpart (__u);                                         \
+    __uh = __ll_highpart (__u);                                                \
+    __vl = __ll_lowpart (__v);                                         \
+    __vh = __ll_highpart (__v);                                                \
                                                                         \
      __x0 = (UWtype) __ul * __vl;                                       \
      __x1 = (UWtype) __ul * __vh;                                       \
@@ -1194,6 +1238,17 @@ extern USItype __udiv_qrnnd ();
    } while (0)
  #endif
  
+#if !defined (umul_ppmm)
+#define smul_ppmm(w1, w0, u, v)                                                \
+  do {                                                                 \
+    UWtype __w1;                                                       \
+    UWtype __m0 = (u), __m1 = (v);                                     \
+    umul_ppmm (__w1, w0, __m0, __m1);                                  \
+    (w1) = __w1 - (-(__m0 >> (W_TYPE_SIZE - 1)) & __m1)                        \
+               - (-(__m1 >> (W_TYPE_SIZE - 1)) & __m0);                \
+  } while (0)
+#endif
+
  /* Define this unconditionally, so it can be used for debugging.  */
  #define __udiv_qrnnd_c(q, r, n1, n0, d) \
    do {                                                                 \
diff --git a/sysdeps/alpha/add_n.s b/sysdeps/alpha/add_n.s

new file mode 100644 (file)

index 0000000..e1ad460
--- /dev/null
+++ b/sysdeps/alpha/add_n.s
@@ -0,0 +1,119 @@
+ # Alpha __mpn_add_n -- Add two limb vectors of the same length > 0 and
+ # store sum in a third limb vector.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     $16
+ # s1_ptr      $17
+ # s2_ptr      $18
+ # size                $19
+
+       .set    noreorder
+       .set    noat
+.text
+       .align  3
+       .globl  __mpn_add_n
+       .ent    __mpn_add_n
+__mpn_add_n:
+       .frame  $30,0,$26,0
+
+       ldq     $3,0($17)       # $3 = first s1 limb
+       ldq     $4,0($18)       # $4 = first s2 limb
+
+       subq    $19,1,$19       # size - 1; the final limb is finished at .Lend
+       and     $19,4-1,$2      # number of limbs in first loop
+       bis     $31,$31,$0      # clear carry ($0); $31 reads as zero on Alpha
+       beq     $2,.L0          # if multiple of 4 limbs, skip first loop
+
+       subq    $19,$2,$19      # what is left is a multiple of 4 for .Loop
+
+.Loop0:        subq    $2,1,$2         # one limb per iteration, $2 iterations
+       ldq     $5,8($17)       # preload next s1 limb
+       addq    $4,$0,$4        # s2 limb + carry in
+       ldq     $6,8($18)       # preload next s2 limb
+       cmpult  $4,$0,$1        # $1 = carry out of the carry-in add
+       addq    $3,$4,$4        # + s1 limb
+       cmpult  $4,$3,$0        # $0 = carry out of the limb add
+       stq     $4,0($16)       # store sum limb
+       or      $0,$1,$0        # combined carry for the next limb
+
+       addq    $17,8,$17       # advance the three pointers by one limb
+       addq    $18,8,$18
+       bis     $5,$5,$3        # preloaded limbs become the current pair
+       bis     $6,$6,$4
+       addq    $16,8,$16
+       bne     $2,.Loop0
+
+.L0:   beq     $19,.Lend       # nothing left for the 4-limb loop
+
+       .align  3
+.Loop: subq    $19,4,$19       # four limbs per iteration
+
+       ldq     $5,8($17)       # limb 0: add $3/$4, preload next pair into $5/$6
+       addq    $4,$0,$4
+       ldq     $6,8($18)
+       cmpult  $4,$0,$1
+       addq    $3,$4,$4
+       cmpult  $4,$3,$0
+       stq     $4,0($16)
+       or      $0,$1,$0
+
+       ldq     $3,16($17)      # limb 1: add $5/$6, preload next pair into $3/$4
+       addq    $6,$0,$6
+       ldq     $4,16($18)
+       cmpult  $6,$0,$1
+       addq    $5,$6,$6
+       cmpult  $6,$5,$0
+       stq     $6,8($16)
+       or      $0,$1,$0
+
+       ldq     $5,24($17)      # limb 2: add $3/$4, preload next pair into $5/$6
+       addq    $4,$0,$4
+       ldq     $6,24($18)
+       cmpult  $4,$0,$1
+       addq    $3,$4,$4
+       cmpult  $4,$3,$0
+       stq     $4,16($16)
+       or      $0,$1,$0
+
+       ldq     $3,32($17)      # limb 3: add $5/$6, preload next pair into $3/$4
+       addq    $6,$0,$6
+       ldq     $4,32($18)
+       cmpult  $6,$0,$1
+       addq    $5,$6,$6
+       cmpult  $6,$5,$0
+       stq     $6,24($16)
+       or      $0,$1,$0
+
+       addq    $17,32,$17      # advance the three pointers by four limbs
+       addq    $18,32,$18
+       addq    $16,32,$16
+       bne     $19,.Loop
+
+.Lend: addq    $4,$0,$4        # final limb: already in $3/$4, no preload
+       cmpult  $4,$0,$1
+       addq    $3,$4,$4
+       cmpult  $4,$3,$0
+       stq     $4,0($16)
+       or      $0,$1,$0        # carry out of the whole addition is left in $0
+       ret     $31,($26),1
+
+       .end    __mpn_add_n
diff --git a/sysdeps/alpha/addmul_1.s b/sysdeps/alpha/addmul_1.s

new file mode 100644 (file)

index 0000000..46d277d
--- /dev/null
+++ b/sysdeps/alpha/addmul_1.s
@@ -0,0 +1,100 @@
+ # Alpha 21064 __mpn_addmul_1 -- Multiply a limb vector with a limb and add
+ # the result to a second limb vector.
+
+ # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     r16
+ # s1_ptr      r17
+ # size                r18
+ # s2_limb     r19
+
+ # This code runs at 42 cycles/limb on the 21064.
+
+ # To improve performance for long multiplications, we would use
+ # 'fetch' for S1 and 'fetch_m' for RES.  It's not obvious how to use
+ # these instructions without slowing down the general code: 1. We can
+ # only have two prefetches in operation at any time in the Alpha
+ # architecture.  2. There will seldom be any special alignment
+ # between RES_PTR and S1_PTR.  Maybe we can simply divide the current
+ # loop into an inner and outer loop, having the inner loop handle
+ # exactly one prefetch block?
+
+       .set    noreorder
+       .set    noat
+.text
+       .align  3
+       .globl  __mpn_addmul_1
+       .ent    __mpn_addmul_1 2
+__mpn_addmul_1:
+       .frame  $30,0,$26
+
+       ldq     $2,0($17)       # $2 = s1_limb
+       addq    $17,8,$17       # s1_ptr++
+       subq    $18,1,$18       # size--
+       mulq    $2,$19,$3       # $3 = prod_low
+       ldq     $5,0($16)       # $5 = *res_ptr
+       umulh   $2,$19,$0       # $0 = prod_high
+       beq     $18,Lend1       # jump if size was == 1
+       ldq     $2,0($17)       # $2 = s1_limb
+       addq    $17,8,$17       # s1_ptr++
+       subq    $18,1,$18       # size--
+       addq    $5,$3,$3        # $3 = *res_ptr + prod_low
+       cmpult  $3,$5,$4        # $4 = carry out of that addition
+       stq     $3,0($16)       # store first result limb
+       addq    $16,8,$16       # res_ptr++
+       beq     $18,Lend2       # jump if size was == 2
+
+       .align  3
+Loop:  mulq    $2,$19,$3       # $3 = prod_low
+       ldq     $5,0($16)       # $5 = *res_ptr
+       addq    $4,$0,$0        # cy_limb = cy_limb + 'cy'
+       subq    $18,1,$18       # size--
+       umulh   $2,$19,$4       # $4 = cy_limb
+       ldq     $2,0($17)       # $2 = s1_limb
+       addq    $17,8,$17       # s1_ptr++
+       addq    $3,$0,$3        # $3 = cy_limb + prod_low
+       cmpult  $3,$0,$0        # $0 = carry from (cy_limb + prod_low)
+       addq    $5,$3,$3        # add in the old *res_ptr limb
+       cmpult  $3,$5,$5        # $5 = carry from adding *res_ptr
+       stq     $3,0($16)       # store result limb
+       addq    $16,8,$16       # res_ptr++
+       addq    $5,$0,$0        # combine carries
+       bne     $18,Loop        # loop until size reaches 0
+
+Lend2: mulq    $2,$19,$3       # $3 = prod_low
+       ldq     $5,0($16)       # $5 = *res_ptr
+       addq    $4,$0,$0        # cy_limb = cy_limb + 'cy'
+       umulh   $2,$19,$4       # $4 = cy_limb
+       addq    $3,$0,$3        # $3 = cy_limb + prod_low
+       cmpult  $3,$0,$0        # $0 = carry from (cy_limb + prod_low)
+       addq    $5,$3,$3        # add in the old *res_ptr limb
+       cmpult  $3,$5,$5        # $5 = carry from adding *res_ptr
+       stq     $3,0($16)       # store last result limb
+       addq    $5,$0,$0        # combine carries
+       addq    $4,$0,$0        # cy_limb = prod_high + cy
+       ret     $31,($26),1     # $0 holds the final cy_limb
+Lend1: addq    $5,$3,$3        # size == 1: $3 = *res_ptr + prod_low
+       cmpult  $3,$5,$5        # $5 = carry out of that addition
+       stq     $3,0($16)       # store the only result limb
+       addq    $0,$5,$0        # $0 = prod_high + carry
+       ret     $31,($26),1     # $0 holds the final cy_limb
+
+       .end    __mpn_addmul_1
diff --git a/sysdeps/alpha/alphaev5/add_n.s b/sysdeps/alpha/alphaev5/add_n.s

new file mode 100644 (file)

index 0000000..2aaf041
--- /dev/null
+++ b/sysdeps/alpha/alphaev5/add_n.s
@@ -0,0 +1,118 @@
+ # Alpha __mpn_add_n -- Add two limb vectors of the same length > 0 and
+ # store sum in a third limb vector.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     $16
+ # s1_ptr      $17
+ # s2_ptr      $18
+ # size                $19
+
+       .set    noreorder
+       .set    noat
+.text
+       .align  3
+       .globl  __mpn_add_n
+       .ent    __mpn_add_n
+__mpn_add_n:
+       .frame  $30,0,$26,0
+
+       ldq     $3,0($17)       # $3 = first s1 limb
+       ldq     $4,0($18)       # $4 = first s2 limb
+
+       subq    $19,1,$19       # size-1: one limb pair is always finished at .Lend
+       and     $19,4-1,$2      # number of limbs in first loop
+       bis     $31,$31,$0      # $0 = 0 (no carry in)
+       beq     $2,.L0          # if multiple of 4 limbs, skip first loop
+
+       subq    $19,$2,$19      # remaining count is now a multiple of 4
+
+.Loop0:        subq    $2,1,$2
+       ldq     $5,8($17)       # preload next s1 limb
+       addq    $4,$0,$4        # s2 limb + carry in
+       ldq     $6,8($18)       # preload next s2 limb
+       cmpult  $4,$0,$1        # $1 = carry from adding the carry in
+       addq    $3,$4,$4        # $4 = sum limb
+       cmpult  $4,$3,$0        # $0 = carry from the limb addition
+       stq     $4,0($16)       # store sum limb
+       or      $0,$1,$0        # $0 = combined carry out
+
+       addq    $17,8,$17       # s1_ptr++
+       addq    $18,8,$18       # s2_ptr++
+       bis     $5,$5,$3        # move preloaded s1 limb into $3
+       bis     $6,$6,$4        # move preloaded s2 limb into $4
+       addq    $16,8,$16       # res_ptr++
+       bne     $2,.Loop0
+
+.L0:   beq     $19,.Lend       # nothing left for the 4-way loop
+
+       .align  4
+.Loop: subq    $19,4,$19       # four limbs per iteration
+       unop
+
+       ldq     $6,8($18)
+       addq    $4,$0,$0        # s2 limb + carry in
+       ldq     $5,8($17)
+       cmpult  $0,$4,$1        # $1 = carry from adding the carry in
+       ldq     $4,16($18)
+       addq    $3,$0,$20       # $20 = first sum limb
+       cmpult  $20,$3,$0       # $0 = carry from the limb addition
+       ldq     $3,16($17)
+       or      $0,$1,$0        # combined carry into next limb
+       addq    $6,$0,$0
+       cmpult  $0,$6,$1
+       ldq     $6,24($18)
+       addq    $5,$0,$21       # $21 = second sum limb
+       cmpult  $21,$5,$0
+       ldq     $5,24($17)
+       or      $0,$1,$0
+       addq    $4,$0,$0
+       cmpult  $0,$4,$1
+       ldq     $4,32($18)      # s2 limb for the next iteration / .Lend
+       addq    $3,$0,$22       # $22 = third sum limb
+       cmpult  $22,$3,$0
+       ldq     $3,32($17)      # s1 limb for the next iteration / .Lend
+       or      $0,$1,$0
+       addq    $6,$0,$0
+       cmpult  $0,$6,$1
+       addq    $5,$0,$23       # $23 = fourth sum limb
+       cmpult  $23,$5,$0
+       or      $0,$1,$0        # carry out of this group of four
+
+       stq     $20,0($16)      # store the four sum limbs together
+       stq     $21,8($16)
+       stq     $22,16($16)
+       stq     $23,24($16)
+
+       addq    $17,32,$17      # advance all three pointers by four limbs
+       addq    $18,32,$18
+       addq    $16,32,$16
+       bne     $19,.Loop
+
+.Lend: addq    $4,$0,$4        # last limb pair: s2 limb + carry in
+       cmpult  $4,$0,$1        # $1 = carry from adding the carry in
+       addq    $3,$4,$4        # $4 = last sum limb
+       cmpult  $4,$3,$0        # $0 = carry from the limb addition
+       stq     $4,0($16)       # store last sum limb
+       or      $0,$1,$0        # $0 = final carry out
+       ret     $31,($26),1
+
+       .end    __mpn_add_n
diff --git a/sysdeps/alpha/alphaev5/lshift.s b/sysdeps/alpha/alphaev5/lshift.s

new file mode 100644 (file)

index 0000000..fdb0895
--- /dev/null
+++ b/sysdeps/alpha/alphaev5/lshift.s
@@ -0,0 +1,175 @@
+ # Alpha EV5 __mpn_lshift --
+
+ # Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     r16
+ # s1_ptr      r17
+ # size                r18
+ # cnt         r19
+
+ # This code runs at 4.25 cycles/limb on the EV5.
+
+       .set    noreorder
+       .set    noat
+.text
+       .align  3
+       .globl  __mpn_lshift
+       .ent    __mpn_lshift
+__mpn_lshift:
+       .frame  $30,0,$26,0
+
+       s8addq  $18,$17,$17     # make r17 point at end of s1
+       ldq     $4,-8($17)      # load first limb
+       subq    $31,$19,$20     # $20 = -cnt; shifts use the low 6 bits, i.e. 64-cnt
+       s8addq  $18,$16,$16     # make r16 point at end of RES
+       subq    $18,1,$18       # size-1 limbs remain after the one just loaded
+       and     $18,4-1,$28     # number of limbs in first loop
+       srl     $4,$20,$0       # compute function result
+
+       beq     $28,L0          # (size-1) is a multiple of 4: skip first loop
+       subq    $18,$28,$18     # remaining count is now a multiple of 4
+
+       .align  3
+Loop0: ldq     $3,-16($17)     # next lower source limb
+       subq    $16,8,$16       # res_ptr--
+       sll     $4,$19,$5       # high part from the current limb
+       subq    $17,8,$17       # s1_ptr--
+       subq    $28,1,$28
+       srl     $3,$20,$6       # low part from the next lower limb
+       or      $3,$3,$4        # next limb becomes the current limb
+       or      $5,$6,$8        # merge both parts
+       stq     $8,0($16)       # store result limb
+       bne     $28,Loop0
+
+L0:    sll     $4,$19,$24      # $24 = pending high part of the current limb
+       beq     $18,Lend        # no groups of four left
+ # warm up phase 1
+       ldq     $1,-16($17)     # load the next four source limbs
+       subq    $18,4,$18
+       ldq     $2,-24($17)
+       ldq     $3,-32($17)
+       ldq     $4,-40($17)
+       beq     $18,Lcool1      # only one group: nothing shifted yet
+ # warm up phase 2
+       srl     $1,$20,$7
+       sll     $1,$19,$21
+       srl     $2,$20,$8
+       ldq     $1,-48($17)     # start loading the second group
+       sll     $2,$19,$22
+       ldq     $2,-56($17)
+       srl     $3,$20,$5
+       or      $7,$24,$7       # $7 = first finished result limb
+       sll     $3,$19,$23
+       or      $8,$21,$8       # $8 = second finished result limb
+       srl     $4,$20,$6
+       ldq     $3,-64($17)
+       sll     $4,$19,$24
+       ldq     $4,-72($17)
+       subq    $18,4,$18
+       beq     $18,Lcool2      # two groups in flight: drain via phase-2 cool down
+       .align  4
+ # main loop
+Loop:  stq     $7,-8($16)
+       or      $5,$22,$5
+       stq     $8,-16($16)
+       or      $6,$23,$6
+
+       srl     $1,$20,$7
+       subq    $18,4,$18
+       sll     $1,$19,$21
+       unop    # ldq   $31,-96($17)
+
+       srl     $2,$20,$8
+       ldq     $1,-80($17)
+       sll     $2,$19,$22
+       ldq     $2,-88($17)
+
+       stq     $5,-24($16)
+       or      $7,$24,$7
+       stq     $6,-32($16)
+       or      $8,$21,$8
+
+       srl     $3,$20,$5
+       unop    # ldq   $31,-96($17)
+       sll     $3,$19,$23
+       subq    $16,32,$16      # res_ptr -= 4 limbs
+
+       srl     $4,$20,$6
+       ldq     $3,-96($17)
+       sll     $4,$19,$24
+       ldq     $4,-104($17)
+
+       subq    $17,32,$17      # s1_ptr -= 4 limbs
+       bne     $18,Loop
+       unop
+       unop
+ # cool down phase 2/1
+Lcool2:        stq     $7,-8($16)      # entry: $7,$8 finished; $5,$6,$22,$23,$24 pending; $1-$4 loaded
+       or      $5,$22,$5
+       stq     $8,-16($16)
+       or      $6,$23,$6
+       srl     $1,$20,$7
+       sll     $1,$19,$21
+       srl     $2,$20,$8
+       sll     $2,$19,$22
+       stq     $5,-24($16)
+       or      $7,$24,$7
+       stq     $6,-32($16)
+       or      $8,$21,$8
+       srl     $3,$20,$5
+       sll     $3,$19,$23
+       srl     $4,$20,$6
+       sll     $4,$19,$24
+ # cool down phase 2/2
+       stq     $7,-40($16)
+       or      $5,$22,$5
+       stq     $8,-48($16)
+       or      $6,$23,$6
+       stq     $5,-56($16)
+       stq     $6,-64($16)
+ # cool down phase 2/3
+       stq     $24,-72($16)    # lowest result limb: no lower source limb to merge
+       ret     $31,($26),1
+
+ # cool down phase 1/1
+Lcool1:        srl     $1,$20,$7       # entry: $1-$4 loaded, only $24 pending
+       sll     $1,$19,$21
+       srl     $2,$20,$8
+       sll     $2,$19,$22
+       srl     $3,$20,$5
+       or      $7,$24,$7
+       sll     $3,$19,$23
+       or      $8,$21,$8
+       srl     $4,$20,$6
+       sll     $4,$19,$24
+ # cool down phase 1/2
+       stq     $7,-8($16)
+       or      $5,$22,$5
+       stq     $8,-16($16)
+       or      $6,$23,$6
+       stq     $5,-24($16)
+       stq     $6,-32($16)
+       stq     $24,-40($16)    # lowest result limb: no lower source limb to merge
+       ret     $31,($26),1
+
+Lend:  stq     $24,-8($16)     # store the lowest result limb
+       ret     $31,($26),1
+       .end    __mpn_lshift
diff --git a/sysdeps/alpha/alphaev5/rshift.s b/sysdeps/alpha/alphaev5/rshift.s

new file mode 100644 (file)

index 0000000..1da9960
--- /dev/null
+++ b/sysdeps/alpha/alphaev5/rshift.s
@@ -0,0 +1,173 @@
+ # Alpha EV5 __mpn_rshift --
+
+ # Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     r16
+ # s1_ptr      r17
+ # size                r18
+ # cnt         r19
+
+ # This code runs at 4.25 cycles/limb on the EV5.
+
+       .set    noreorder
+       .set    noat
+.text
+       .align  3
+       .globl  __mpn_rshift
+       .ent    __mpn_rshift
+__mpn_rshift:
+       .frame  $30,0,$26,0
+
+       ldq     $4,0($17)       # load first limb
+       subq    $31,$19,$20     # $20 = -cnt; shifts use the low 6 bits, i.e. 64-cnt
+       subq    $18,1,$18       # size-1 limbs remain after the one just loaded
+       and     $18,4-1,$28     # number of limbs in first loop
+       sll     $4,$20,$0       # compute function result
+
+       beq     $28,L0          # (size-1) is a multiple of 4: skip first loop
+       subq    $18,$28,$18     # remaining count is now a multiple of 4
+
+       .align  3
+Loop0: ldq     $3,8($17)       # next higher source limb
+       addq    $16,8,$16       # res_ptr++
+       srl     $4,$19,$5       # low part from the current limb
+       addq    $17,8,$17       # s1_ptr++
+       subq    $28,1,$28
+       sll     $3,$20,$6       # high part from the next higher limb
+       or      $3,$3,$4        # next limb becomes the current limb
+       or      $5,$6,$8        # merge both parts
+       stq     $8,-8($16)      # store result limb (res_ptr already advanced)
+       bne     $28,Loop0
+
+L0:    srl     $4,$19,$24      # $24 = pending low part of the current limb
+       beq     $18,Lend        # no groups of four left
+ # warm up phase 1
+       ldq     $1,8($17)       # load the next four source limbs
+       subq    $18,4,$18
+       ldq     $2,16($17)
+       ldq     $3,24($17)
+       ldq     $4,32($17)
+       beq     $18,Lcool1      # only one group: nothing shifted yet
+ # warm up phase 2
+       sll     $1,$20,$7
+       srl     $1,$19,$21
+       sll     $2,$20,$8
+       ldq     $1,40($17)      # start loading the second group
+       srl     $2,$19,$22
+       ldq     $2,48($17)
+       sll     $3,$20,$5
+       or      $7,$24,$7       # $7 = first finished result limb
+       srl     $3,$19,$23
+       or      $8,$21,$8       # $8 = second finished result limb
+       sll     $4,$20,$6
+       ldq     $3,56($17)
+       srl     $4,$19,$24
+       ldq     $4,64($17)
+       subq    $18,4,$18
+       beq     $18,Lcool2      # two groups in flight: drain via phase-2 cool down
+       .align  4
+ # main loop
+Loop:  stq     $7,0($16)
+       or      $5,$22,$5
+       stq     $8,8($16)
+       or      $6,$23,$6
+
+       sll     $1,$20,$7
+       subq    $18,4,$18
+       srl     $1,$19,$21
+       unop    # ldq   $31,-96($17)
+
+       sll     $2,$20,$8
+       ldq     $1,72($17)
+       srl     $2,$19,$22
+       ldq     $2,80($17)
+
+       stq     $5,16($16)
+       or      $7,$24,$7
+       stq     $6,24($16)
+       or      $8,$21,$8
+
+       sll     $3,$20,$5
+       unop    # ldq   $31,-96($17)
+       srl     $3,$19,$23
+       addq    $16,32,$16      # res_ptr += 4 limbs
+
+       sll     $4,$20,$6
+       ldq     $3,88($17)
+       srl     $4,$19,$24
+       ldq     $4,96($17)
+
+       addq    $17,32,$17      # s1_ptr += 4 limbs
+       bne     $18,Loop
+       unop
+       unop
+ # cool down phase 2/1
+Lcool2:        stq     $7,0($16)       # entry: $7,$8 finished; $5,$6,$22,$23,$24 pending; $1-$4 loaded
+       or      $5,$22,$5
+       stq     $8,8($16)
+       or      $6,$23,$6
+       sll     $1,$20,$7
+       srl     $1,$19,$21
+       sll     $2,$20,$8
+       srl     $2,$19,$22
+       stq     $5,16($16)
+       or      $7,$24,$7
+       stq     $6,24($16)
+       or      $8,$21,$8
+       sll     $3,$20,$5
+       srl     $3,$19,$23
+       sll     $4,$20,$6
+       srl     $4,$19,$24
+ # cool down phase 2/2
+       stq     $7,32($16)
+       or      $5,$22,$5
+       stq     $8,40($16)
+       or      $6,$23,$6
+       stq     $5,48($16)
+       stq     $6,56($16)
+ # cool down phase 2/3
+       stq     $24,64($16)     # highest result limb: no higher source limb to merge
+       ret     $31,($26),1
+
+ # cool down phase 1/1
+Lcool1:        sll     $1,$20,$7       # entry: $1-$4 loaded, only $24 pending
+       srl     $1,$19,$21
+       sll     $2,$20,$8
+       srl     $2,$19,$22
+       sll     $3,$20,$5
+       or      $7,$24,$7
+       srl     $3,$19,$23
+       or      $8,$21,$8
+       sll     $4,$20,$6
+       srl     $4,$19,$24
+ # cool down phase 1/2
+       stq     $7,0($16)
+       or      $5,$22,$5
+       stq     $8,8($16)
+       or      $6,$23,$6
+       stq     $5,16($16)
+       stq     $6,24($16)
+       stq     $24,32($16)     # highest result limb: no higher source limb to merge
+       ret     $31,($26),1
+
+Lend:  stq     $24,0($16)      # store the highest result limb
+       ret     $31,($26),1
+       .end    __mpn_rshift
diff --git a/sysdeps/alpha/lshift.s b/sysdeps/alpha/lshift.s

new file mode 100644 (file)

index 0000000..c284349
--- /dev/null
+++ b/sysdeps/alpha/lshift.s
@@ -0,0 +1,108 @@
+ # Alpha 21064 __mpn_lshift --
+
+ # Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     r16
+ # s1_ptr      r17
+ # size                r18
+ # cnt         r19
+
+ # This code runs at 4.8 cycles/limb on the 21064.  With infinite unrolling,
+ # it would take 4 cycles/limb.  It should be possible to get down to 3
+ # cycles/limb since both ldq and stq can be paired with the other used
+ # instructions.  But there are many restrictions in the 21064 pipeline that
+ # make it hard, if not impossible, to get down to 3 cycles/limb:
+
+ # 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay.
+ # 2. Only aligned instruction pairs can be paired.
+ # 3. The store buffer or silo might not be able to deal with the bandwidth.
+
+       .set    noreorder
+       .set    noat
+.text
+       .align  3
+       .globl  __mpn_lshift
+       .ent    __mpn_lshift
+__mpn_lshift:
+       .frame  $30,0,$26,0
+
+       s8addq  $18,$17,$17     # make r17 point at end of s1
+       ldq     $4,-8($17)      # load first limb
+       subq    $17,8,$17       # step s1_ptr past the limb just loaded
+       subq    $31,$19,$7      # $7 = -cnt; shifts use the low 6 bits, i.e. 64-cnt
+       s8addq  $18,$16,$16     # make r16 point at end of RES
+       subq    $18,1,$18       # size-1 limbs remain after the one just loaded
+       and     $18,4-1,$20     # number of limbs in first loop
+       srl     $4,$7,$0        # compute function result
+
+       beq     $20,L0          # (size-1) is a multiple of 4: skip first loop
+       subq    $18,$20,$18     # remaining count is now a multiple of 4
+
+       .align  3
+Loop0:
+       ldq     $3,-8($17)      # next lower source limb
+       subq    $16,8,$16       # res_ptr--
+       subq    $17,8,$17       # s1_ptr--
+       subq    $20,1,$20
+       sll     $4,$19,$5       # high part from the current limb
+       srl     $3,$7,$6        # low part from the next lower limb
+       bis     $3,$3,$4        # next limb becomes the current limb
+       bis     $5,$6,$8        # merge both parts
+       stq     $8,0($16)       # store result limb
+       bne     $20,Loop0
+
+L0:    beq     $18,Lend        # no groups of four left
+
+       .align  3
+Loop:  ldq     $3,-8($17)      # four limbs per iteration, $3/$4 alternate as current limb
+       subq    $16,32,$16      # res_ptr -= 4 limbs
+       subq    $18,4,$18
+       sll     $4,$19,$5
+       srl     $3,$7,$6
+
+       ldq     $4,-16($17)
+       sll     $3,$19,$1
+       bis     $5,$6,$8
+       stq     $8,24($16)      # first result limb of the group
+       srl     $4,$7,$2
+
+       ldq     $3,-24($17)
+       sll     $4,$19,$5
+       bis     $1,$2,$8
+       stq     $8,16($16)      # second result limb
+       srl     $3,$7,$6
+
+       ldq     $4,-32($17)
+       sll     $3,$19,$1
+       bis     $5,$6,$8
+       stq     $8,8($16)       # third result limb
+       srl     $4,$7,$2
+
+       subq    $17,32,$17      # s1_ptr -= 4 limbs
+       bis     $1,$2,$8
+       stq     $8,0($16)       # fourth result limb
+
+       bgt     $18,Loop
+
+Lend:  sll     $4,$19,$8       # lowest result limb: no lower source limb to merge
+       stq     $8,-8($16)
+       ret     $31,($26),1
+       .end    __mpn_lshift
diff --git a/sysdeps/alpha/mul_1.s b/sysdeps/alpha/mul_1.s

new file mode 100644 (file)

index 0000000..3ef194d
--- /dev/null
+++ b/sysdeps/alpha/mul_1.s
@@ -0,0 +1,84 @@
+ # Alpha 21064 __mpn_mul_1 -- Multiply a limb vector with a limb and store
+ # the result in a second limb vector.
+
+ # Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     r16
+ # s1_ptr      r17
+ # size                r18
+ # s2_limb     r19
+
+ # This code runs at 42 cycles/limb on the EV4 and 18 cycles/limb on the EV5.
+
+ # To improve performance for long multiplications, we would use
+ # 'fetch' for S1 and 'fetch_m' for RES.  It's not obvious how to use
+ # these instructions without slowing down the general code: 1. We can
+ # only have two prefetches in operation at any time in the Alpha
+ # architecture.  2. There will seldom be any special alignment
+ # between RES_PTR and S1_PTR.  Maybe we can simply divide the current
+ # loop into an inner and outer loop, having the inner loop handle
+ # exactly one prefetch block?
+
+       .set    noreorder
+       .set    noat
+.text
+       .align  3
+       .globl  __mpn_mul_1
+       .ent    __mpn_mul_1 2
+__mpn_mul_1:
+       .frame  $30,0,$26
+
+       ldq     $2,0($17)       # $2 = s1_limb
+       subq    $18,1,$18       # size--
+       mulq    $2,$19,$3       # $3 = prod_low
+       bic     $31,$31,$4      # clear cy_limb
+       umulh   $2,$19,$0       # $0 = prod_high
+       beq     $18,Lend1       # jump if size was == 1
+       ldq     $2,8($17)       # $2 = s1_limb
+       subq    $18,1,$18       # size--
+       stq     $3,0($16)       # store first result limb
+       beq     $18,Lend2       # jump if size was == 2
+
+       .align  3
+Loop:  mulq    $2,$19,$3       # $3 = prod_low
+       addq    $4,$0,$0        # cy_limb = cy_limb + 'cy'
+       subq    $18,1,$18       # size--
+       umulh   $2,$19,$4       # $4 = cy_limb
+       ldq     $2,16($17)      # $2 = s1_limb
+       addq    $17,8,$17       # s1_ptr++
+       addq    $3,$0,$3        # $3 = cy_limb + prod_low
+       stq     $3,8($16)       # store result limb (res_ptr is bumped below)
+       cmpult  $3,$0,$0        # $0 = carry from (cy_limb + prod_low)
+       addq    $16,8,$16       # res_ptr++
+       bne     $18,Loop        # loop until size reaches 0
+
+Lend2: mulq    $2,$19,$3       # $3 = prod_low
+       addq    $4,$0,$0        # cy_limb = cy_limb + 'cy'
+       umulh   $2,$19,$4       # $4 = cy_limb
+       addq    $3,$0,$3        # $3 = cy_limb + prod_low
+       cmpult  $3,$0,$0        # $0 = carry from (cy_limb + prod_low)
+       stq     $3,8($16)       # store last result limb
+       addq    $4,$0,$0        # cy_limb = prod_high + cy
+       ret     $31,($26),1     # $0 holds the final cy_limb
+Lend1: stq     $3,0($16)       # size == 1: store prod_low; $0 already holds prod_high
+       ret     $31,($26),1
+
+       .end    __mpn_mul_1
diff --git a/sysdeps/alpha/rshift.s b/sysdeps/alpha/rshift.s

new file mode 100644 (file)

index 0000000..74eab04
--- /dev/null
+++ b/sysdeps/alpha/rshift.s
@@ -0,0 +1,106 @@
+ # Alpha 21064 __mpn_rshift --
+
+ # Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     r16
+ # s1_ptr      r17
+ # size                r18
+ # cnt         r19
+
+ # This code runs at 4.8 cycles/limb on the 21064.  With infinite unrolling,
+ # it would take 4 cycles/limb.  It should be possible to get down to 3
+ # cycles/limb since both ldq and stq can be paired with the other used
+ # instructions.  But there are many restrictions in the 21064 pipeline that
+ # make it hard, if not impossible, to get down to 3 cycles/limb:
+
+ # 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay.
+ # 2. Only aligned instruction pairs can be paired.
+ # 3. The store buffer or silo might not be able to deal with the bandwidth.
+      
+       .set    noreorder
+       .set    noat
+.text
+       .align  3
+       .globl  __mpn_rshift
+       .ent    __mpn_rshift
+__mpn_rshift:
+       .frame  $30,0,$26,0
+
+       ldq     $4,0($17)       # load first limb
+       addq    $17,8,$17       # step s1_ptr past the limb just loaded
+       subq    $31,$19,$7      # $7 = -cnt; shifts use the low 6 bits, i.e. 64-cnt
+       subq    $18,1,$18       # size-1 limbs remain after the one just loaded
+       and     $18,4-1,$20     # number of limbs in first loop
+       sll     $4,$7,$0        # compute function result
+
+       beq     $20,L0          # (size-1) is a multiple of 4: skip first loop
+       subq    $18,$20,$18     # remaining count is now a multiple of 4
+
+       .align  3
+Loop0:
+       ldq     $3,0($17)       # next higher source limb
+       addq    $16,8,$16       # res_ptr++
+       addq    $17,8,$17       # s1_ptr++
+       subq    $20,1,$20
+       srl     $4,$19,$5       # low part from the current limb
+       sll     $3,$7,$6        # high part from the next higher limb
+       bis     $3,$3,$4        # next limb becomes the current limb
+       bis     $5,$6,$8        # merge both parts
+       stq     $8,-8($16)      # store result limb (res_ptr already advanced)
+       bne     $20,Loop0
+
+L0:    beq     $18,Lend        # no groups of four left
+
+       .align  3
+Loop:  ldq     $3,0($17)       # four limbs per iteration, $3/$4 alternate as current limb
+       addq    $16,32,$16      # res_ptr += 4 limbs
+       subq    $18,4,$18
+       srl     $4,$19,$5
+       sll     $3,$7,$6
+
+       ldq     $4,8($17)
+       srl     $3,$19,$1
+       bis     $5,$6,$8
+       stq     $8,-32($16)     # first result limb of the group
+       sll     $4,$7,$2
+
+       ldq     $3,16($17)
+       srl     $4,$19,$5
+       bis     $1,$2,$8
+       stq     $8,-24($16)     # second result limb
+       sll     $3,$7,$6
+
+       ldq     $4,24($17)
+       srl     $3,$19,$1
+       bis     $5,$6,$8
+       stq     $8,-16($16)     # third result limb
+       sll     $4,$7,$2
+
+       addq    $17,32,$17      # s1_ptr += 4 limbs
+       bis     $1,$2,$8
+       stq     $8,-8($16)      # fourth result limb
+
+       bgt     $18,Loop
+
+Lend:  srl     $4,$19,$8       # highest result limb: no higher source limb to merge
+       stq     $8,0($16)
+       ret     $31,($26),1
+       .end    __mpn_rshift
diff --git a/sysdeps/alpha/sub_n.s b/sysdeps/alpha/sub_n.s

new file mode 100644 (file)

index 0000000..5200025
--- /dev/null
+++ b/sysdeps/alpha/sub_n.s
@@ -0,0 +1,119 @@
+ # Alpha __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+ # store difference in a third limb vector.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     $16
+ # s1_ptr      $17
+ # s2_ptr      $18
+ # size                $19
+
+       .set    noreorder
+       .set    noat
+.text
+       .align  3
+       .globl  __mpn_sub_n
+       .ent    __mpn_sub_n
+__mpn_sub_n:
+       .frame  $30,0,$26,0
+
+       ldq     $3,0($17)       # $3 = first s1 limb
+       ldq     $4,0($18)       # $4 = first s2 limb
+
+       subq    $19,1,$19       # size-1: one limb pair is always finished at .Lend
+       and     $19,4-1,$2      # number of limbs in first loop
+       bis     $31,$31,$0      # $0 = 0 (no borrow in)
+       beq     $2,.L0          # if multiple of 4 limbs, skip first loop
+
+       subq    $19,$2,$19      # remaining count is now a multiple of 4
+
+.Loop0:        subq    $2,1,$2
+       ldq     $5,8($17)       # preload next s1 limb
+       addq    $4,$0,$4        # s2 limb + borrow in
+       ldq     $6,8($18)       # preload next s2 limb
+       cmpult  $4,$0,$1        # $1 = carry from adding the borrow in
+       subq    $3,$4,$4        # $4 = difference limb
+       cmpult  $3,$4,$0        # $0 = borrow from the limb subtraction
+       stq     $4,0($16)       # store difference limb
+       or      $0,$1,$0        # $0 = combined borrow out
+
+       addq    $17,8,$17       # s1_ptr++
+       addq    $18,8,$18       # s2_ptr++
+       bis     $5,$5,$3        # move preloaded s1 limb into $3
+       bis     $6,$6,$4        # move preloaded s2 limb into $4
+       addq    $16,8,$16       # res_ptr++
+       bne     $2,.Loop0
+
+.L0:   beq     $19,.Lend       # nothing left for the 4-way loop
+
+       .align  3
+.Loop: subq    $19,4,$19       # four limbs per iteration, ($3,$4) and ($5,$6) alternate
+
+       ldq     $5,8($17)
+       addq    $4,$0,$4        # s2 limb + borrow in
+       ldq     $6,8($18)
+       cmpult  $4,$0,$1        # $1 = carry from adding the borrow in
+       subq    $3,$4,$4        # $4 = difference limb
+       cmpult  $3,$4,$0        # $0 = borrow from the limb subtraction
+       stq     $4,0($16)       # first difference limb of the group
+       or      $0,$1,$0        # combined borrow into next limb
+
+       ldq     $3,16($17)
+       addq    $6,$0,$6
+       ldq     $4,16($18)
+       cmpult  $6,$0,$1
+       subq    $5,$6,$6
+       cmpult  $5,$6,$0
+       stq     $6,8($16)       # second difference limb
+       or      $0,$1,$0
+
+       ldq     $5,24($17)
+       addq    $4,$0,$4
+       ldq     $6,24($18)
+       cmpult  $4,$0,$1
+       subq    $3,$4,$4
+       cmpult  $3,$4,$0
+       stq     $4,16($16)      # third difference limb
+       or      $0,$1,$0
+
+       ldq     $3,32($17)      # s1 limb for the next iteration / .Lend
+       addq    $6,$0,$6
+       ldq     $4,32($18)      # s2 limb for the next iteration / .Lend
+       cmpult  $6,$0,$1
+       subq    $5,$6,$6
+       cmpult  $5,$6,$0
+       stq     $6,24($16)      # fourth difference limb
+       or      $0,$1,$0
+
+       addq    $17,32,$17      # advance all three pointers by four limbs
+       addq    $18,32,$18
+       addq    $16,32,$16
+       bne     $19,.Loop
+
+.Lend: addq    $4,$0,$4        # last limb pair: s2 limb + borrow in
+       cmpult  $4,$0,$1        # $1 = carry from adding the borrow in
+       subq    $3,$4,$4        # $4 = last difference limb
+       cmpult  $3,$4,$0        # $0 = borrow from the limb subtraction
+       stq     $4,0($16)       # store last difference limb
+       or      $0,$1,$0        # $0 = final borrow out
+       ret     $31,($26),1
+
+       .end    __mpn_sub_n
diff --git a/sysdeps/alpha/submul_1.s b/sysdeps/alpha/submul_1.s

new file mode 100644 (file)

index 0000000..acaa11c
--- /dev/null
+++ b/sysdeps/alpha/submul_1.s
@@ -0,0 +1,100 @@
+ # Alpha 21064 __mpn_submul_1 -- Multiply a limb vector with a limb and
+ # subtract the result from a second limb vector.
+
+ # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     r16
+ # s1_ptr      r17
+ # size                r18
+ # s2_limb     r19
+
+ # This code runs at 42 cycles/limb on the 21064.
+
+ # To improve performance for long multiplications, we would use
+ # 'fetch' for S1 and 'fetch_m' for RES.  It's not obvious how to use
+ # these instructions without slowing down the general code: 1. We can
+ # only have two prefetches in operation at any time in the Alpha
+ # architecture.  2. There will seldom be any special alignment
+ # between RES_PTR and S1_PTR.  Maybe we can simply divide the current
+ # loop into an inner and outer loop, having the inner loop handle
+ # exactly one prefetch block?
+
+       .set    noreorder
+       .set    noat
+.text
+       .align  3
+       .globl  __mpn_submul_1
+       .ent    __mpn_submul_1 2
+__mpn_submul_1:
+       .frame  $30,0,$26
+
+       ldq     $2,0($17)       # $2 = s1_limb
+       addq    $17,8,$17       # s1_ptr++
+       subq    $18,1,$18       # size--
+       mulq    $2,$19,$3       # $3 = prod_low
+       ldq     $5,0($16)       # $5 = *res_ptr
+       umulh   $2,$19,$0       # $0 = prod_high
+       beq     $18,Lend1       # jump if size was == 1
+       ldq     $2,0($17)       # $2 = s1_limb
+       addq    $17,8,$17       # s1_ptr++
+       subq    $18,1,$18       # size--
+       subq    $5,$3,$3        # $3 = *res_ptr - prod_low
+       cmpult  $5,$3,$4        # $4 = borrow: old *res_ptr < difference
+       stq     $3,0($16)       # store the difference limb
+       addq    $16,8,$16       # res_ptr++
+       beq     $18,Lend2       # jump if size was == 2
+
+       .align  3
+Loop:  mulq    $2,$19,$3       # $3 = prod_low
+       ldq     $5,0($16)       # $5 = *res_ptr
+       addq    $4,$0,$0        # cy_limb = cy_limb + 'cy'
+       subq    $18,1,$18       # size--
+       umulh   $2,$19,$4       # $4 = cy_limb
+       ldq     $2,0($17)       # $2 = s1_limb
+       addq    $17,8,$17       # s1_ptr++
+       addq    $3,$0,$3        # $3 = cy_limb + prod_low
+       cmpult  $3,$0,$0        # $0 = carry from (cy_limb + prod_low)
+       subq    $5,$3,$3        # $3 = *res_ptr - (cy_limb + prod_low)
+       cmpult  $5,$3,$5        # $5 = borrow from that subtraction
+       stq     $3,0($16)       # store the difference limb
+       addq    $16,8,$16       # res_ptr++
+       addq    $5,$0,$0        # combine carries
+       bne     $18,Loop        # repeat until size reaches 0
+
+Lend2: mulq    $2,$19,$3       # $3 = prod_low
+       ldq     $5,0($16)       # $5 = *res_ptr
+       addq    $4,$0,$0        # cy_limb = cy_limb + 'cy'
+       umulh   $2,$19,$4       # $4 = cy_limb
+       addq    $3,$0,$3        # $3 = cy_limb + prod_low
+       cmpult  $3,$0,$0        # $0 = carry from (cy_limb + prod_low)
+       subq    $5,$3,$3        # $3 = *res_ptr - (cy_limb + prod_low)
+       cmpult  $5,$3,$5        # $5 = borrow from that subtraction
+       stq     $3,0($16)       # store the last difference limb
+       addq    $5,$0,$0        # combine carries
+       addq    $4,$0,$0        # cy_limb = prod_high + cy
+       ret     $31,($26),1     # return through $26 with the result in $0
+Lend1: subq    $5,$3,$3        # size == 1: $3 = *res_ptr - prod_low
+       cmpult  $5,$3,$5        # $5 = borrow from that subtraction
+       stq     $3,0($16)       # store the only difference limb
+       addq    $0,$5,$0        # $0 = prod_high + borrow
+       ret     $31,($26),1     # return through $26 with the result in $0
+
+       .end    __mpn_submul_1
diff --git a/sysdeps/alpha/udiv_qrnnd.S b/sysdeps/alpha/udiv_qrnnd.S

index 942d7a884bee4ba7b3b274957300258e27e017c3..bafafd672e3e587d9806ec5220fbcb044730b2a1 100644 (file)
--- a/sysdeps/alpha/udiv_qrnnd.S
+++ b/sysdeps/alpha/udiv_qrnnd.S
@@ -134,7 +134,7 @@ Loop2:      cmplt   n0,0,tmp
         ret     $31,($26),1
  
  Odd:
-       /* q' in n0.  r' in n1.  */
+       /* q' in n0. r' in n1 */
         addq    n1,n0,n1
         cmpult  n1,n0,tmp       # tmp := carry from addq
         beq     tmp,LLp6
diff --git a/sysdeps/generic/divmod_1.c b/sysdeps/generic/divmod_1.c

index d156eeb00d9fcd702e24573326f2d7f4924d3449..2989d36708c910c2d7840c5ca9f0176bb8655d7f 100644 (file)
--- a/sysdeps/generic/divmod_1.c
+++ b/sysdeps/generic/divmod_1.c
@@ -83,14 +83,12 @@ __mpn_divmod_1 (quot_ptr, dividend_ptr, dividend_size, divisor_limb)
              result is a (N+1)-bit approximation to 1/DIVISOR_LIMB, with the
              most significant bit (with weight 2**N) implicit.  */
  
-#if 0 /* This can't happen when normalization_steps != 0 */
           /* Special case for DIVISOR_LIMB == 100...000.  */
           if (divisor_limb << 1 == 0)
             divisor_limb_inverted = ~(mp_limb) 0;
           else
-#endif
-         udiv_qrnnd (divisor_limb_inverted, dummy,
-                     -divisor_limb, 0, divisor_limb);
+           udiv_qrnnd (divisor_limb_inverted, dummy,
+                       -divisor_limb, 0, divisor_limb);
  
           n1 = dividend_ptr[dividend_size - 1];
           r = n1 >> (BITS_PER_MP_LIMB - normalization_steps);
diff --git a/sysdeps/generic/mod_1.c b/sysdeps/generic/mod_1.c

index ae4ed0914fad4174220acc20553c0d06fe42cca6..8a49fb4be000596cb57a68fda2778503f273ee98 100644 (file)
--- a/sysdeps/generic/mod_1.c
+++ b/sysdeps/generic/mod_1.c
@@ -3,8 +3,6 @@
     Return the single-limb remainder.
     There are no constraints on the value of the divisor.
  
-   QUOT_PTR and DIVIDEND_PTR might point to the same limb.
-
  Copyright (C) 1991, 1993, 1994, Free Software Foundation, Inc.
  
  This file is part of the GNU MP Library.
diff --git a/sysdeps/hppa/add_n.s b/sysdeps/hppa/add_n.s

new file mode 100644 (file)

index 0000000..7f3e323
--- /dev/null
+++ b/sysdeps/hppa/add_n.s
@@ -0,0 +1,57 @@
+; HP-PA  __mpn_add_n -- Add two limb vectors of the same length > 0 and store
+; sum in a third limb vector.
+
+; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr      gr26
+; s1_ptr       gr25
+; s2_ptr       gr24
+; size         gr23
+
+; One might want to unroll this as for other processors, but it turns
+; out that the data cache contention after a store makes such
+; unrolling useless.  We can't come under 5 cycles/limb anyway.
+
+       .code
+       .export         __mpn_add_n
+__mpn_add_n
+       .proc
+       .callinfo       frame=0,no_calls
+       .entry
+
+       ldws,ma         4(0,%r25),%r20  ; r20 = *s1_ptr, then s1_ptr += 4
+       ldws,ma         4(0,%r24),%r19  ; r19 = *s2_ptr, then s2_ptr += 4
+
+       addib,=         -1,%r23,L$end   ; check for (SIZE == 1)
+        add            %r20,%r19,%r28  ; add first limbs ignoring cy
+
+L$loop ldws,ma         4(0,%r25),%r20  ; fetch next s1 limb
+       ldws,ma         4(0,%r24),%r19  ; fetch next s2 limb
+       stws,ma         %r28,4(0,%r26)  ; store previous sum limb, res_ptr += 4
+       addib,<>        -1,%r23,L$loop  ; size--, loop until it reaches 0
+        addc           %r20,%r19,%r28  ; (delay slot) add limbs plus carry in
+
+L$end  stws            %r28,0(0,%r26)  ; store the last sum limb
+       bv              0(%r2)          ; return to the address in r2
+        addc           %r0,%r0,%r28    ; (delay slot) r28 = carry out (0 or 1)
+
+       .exit
+       .procend
diff --git a/sysdeps/hppa/hppa1.1/addmul_1.s b/sysdeps/hppa/hppa1.1/addmul_1.s

new file mode 100644 (file)

index 0000000..a9dfdd1
--- /dev/null
+++ b/sysdeps/hppa/hppa1.1/addmul_1.s
@@ -0,0 +1,101 @@
+; HP-PA-1.1 __mpn_addmul_1 -- Multiply a limb vector with a limb and
+; add the result to a second limb vector.
+
+; Copyright (C) 1992, 1993, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr      r26
+; s1_ptr       r25
+; size         r24
+; s2_limb      r23
+
+; This runs at 11 cycles/limb on a PA7000.  With the used instructions, it
+; can not become faster due to data cache contention after a store.  On the
+; PA7100 it runs at 10 cycles/limb, and that can not be improved either,
+; since only the xmpyu does not need the integer pipeline, so the only
+; dual-issue we will get are addc+xmpyu.  Unrolling could gain a cycle/limb
+; on the PA7100.
+
+; Some of the ideas described in mul_1.s apply to this code too.
+
+       .code
+       .export         __mpn_addmul_1
+__mpn_addmul_1
+       .proc
+       .callinfo       frame=64,no_calls
+       .entry
+
+       ldo             64(%r30),%r30           ; r30 += 64: scratch area below r30
+       fldws,ma        4(%r25),%fr5            ; fr5 = first s1 limb, s1_ptr += 4
+       stw             %r23,-16(%r30)          ; move s2_limb ...
+       addib,=         -1,%r24,L$just_one_limb ; size--, branch if size was 1
+        fldws          -16(%r30),%fr4          ; ... into fr4
+       add             %r0,%r0,%r0             ; clear carry
+       xmpyu           %fr4,%fr5,%fr6          ; fr6 = s2_limb * first limb (64 bits)
+       fldws,ma        4(%r25),%fr7            ; fr7 = second s1 limb, s1_ptr += 4
+       fstds           %fr6,-16(%r30)          ; spill product for the integer loads
+       xmpyu           %fr4,%fr7,%fr8          ; fr8 = s2_limb * second limb
+       ldw             -12(%r30),%r19          ; least significant limb in product
+       ldw             -16(%r30),%r28          ; most significant limb in product
+
+       fstds           %fr8,-16(%r30)          ; spill the second product
+       addib,=         -1,%r24,L$end           ; size--, branch if size was 2
+        ldw            -12(%r30),%r1           ; (delay slot) low limb of second product
+
+; Main loop
+L$loop ldws            0(%r26),%r29            ; r29 = *res_ptr
+       fldws,ma        4(%r25),%fr5            ; fr5 = next s1 limb, s1_ptr += 4
+       add             %r29,%r19,%r19          ; add pending product limb to *res_ptr
+       stws,ma         %r19,4(%r26)            ; store result limb, res_ptr += 4
+       addc            %r28,%r1,%r19           ; r19 = high limb + next low limb + carry
+       xmpyu           %fr4,%fr5,%fr6          ; start the next product
+       ldw             -16(%r30),%r28          ; high limb of the spilled product
+       fstds           %fr6,-16(%r30)          ; spill the new product over it
+       addc            %r0,%r28,%r28           ; fold carry into the high limb
+       addib,<>        -1,%r24,L$loop          ; size--, loop until it reaches 0
+        ldw            -12(%r30),%r1           ; (delay slot) low limb of new product
+
+L$end  ldw             0(%r26),%r29            ; r29 = *res_ptr (second-to-last limb)
+       add             %r29,%r19,%r19          ; add pending product limb to it
+       stws,ma         %r19,4(%r26)            ; store result limb, res_ptr += 4
+       addc            %r28,%r1,%r19           ; r19 = high limb + last low limb + carry
+       ldw             -16(%r30),%r28          ; high limb of the last product
+       ldws            0(%r26),%r29            ; r29 = *res_ptr (last limb)
+       addc            %r0,%r28,%r28           ; fold carry into the high limb
+       add             %r29,%r19,%r19          ; add pending limb to the last *res_ptr
+       stws,ma         %r19,4(%r26)            ; store the last result limb
+       addc            %r0,%r28,%r28           ; r28 = high limb + final carry
+       bv              0(%r2)                  ; return to the address in r2
+        ldo            -64(%r30),%r30          ; (delay slot) release the scratch area
+
+L$just_one_limb
+       xmpyu           %fr4,%fr5,%fr6          ; fr6 = s2_limb * the only s1 limb
+       ldw             0(%r26),%r29            ; r29 = *res_ptr
+       fstds           %fr6,-16(%r30)          ; spill product for the integer loads
+       ldw             -12(%r30),%r1           ; low limb of product
+       ldw             -16(%r30),%r28          ; high limb of product
+       add             %r29,%r1,%r19           ; r19 = *res_ptr + low limb
+       stw             %r19,0(%r26)            ; store the result limb
+       addc            %r0,%r28,%r28           ; r28 = high limb + carry
+       bv              0(%r2)                  ; return to the address in r2
+        ldo            -64(%r30),%r30          ; (delay slot) release the scratch area
+
+       .exit
+       .procend
diff --git a/sysdeps/hppa/hppa1.1/mul_1.s b/sysdeps/hppa/hppa1.1/mul_1.s

new file mode 100644 (file)

index 0000000..ebf0778
--- /dev/null
+++ b/sysdeps/hppa/hppa1.1/mul_1.s
@@ -0,0 +1,97 @@
+; HP-PA-1.1 __mpn_mul_1 -- Multiply a limb vector with a limb and store
+; the result in a second limb vector.
+
+; Copyright (C) 1992, 1993, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr      r26
+; s1_ptr       r25
+; size         r24
+; s2_limb      r23
+
+; This runs at 9 cycles/limb on a PA7000.  With the used instructions, it can
+; not become faster due to data cache contention after a store.  On the
+; PA7100 it runs at 7 cycles/limb, and that can not be improved either, since
+; only the xmpyu does not need the integer pipeline, so the only dual-issue
+; we will get are addc+xmpyu.  Unrolling would not help either CPU.
+
+; We could use fldds to read two limbs at a time from the S1 array, and that
+; could bring down the times to 8.5 and 6.5 cycles/limb for the PA7000 and
+; PA7100, respectively.  We don't do that since it does not seem worth the
+; (alignment) troubles...
+
+; At least the PA7100 is rumored to be able to deal with cache-misses
+; without stalling instruction issue.  If this is true, and the cache is
+; actually also lockup-free, we should use a deeper software pipeline, and
+; load from S1 very early!  (The loads and stores to -12(sp) will surely be
+; in the cache.)
+
+       .code
+       .export         __mpn_mul_1
+__mpn_mul_1
+       .proc
+       .callinfo       frame=64,no_calls
+       .entry
+
+       ldo             64(%r30),%r30           ; r30 += 64: scratch area below r30
+       fldws,ma        4(%r25),%fr5            ; fr5 = first s1 limb, s1_ptr += 4
+       stw             %r23,-16(%r30)          ; move s2_limb ...
+       addib,=         -1,%r24,L$just_one_limb ; size--, branch if size was 1
+        fldws          -16(%r30),%fr4          ; ... into fr4
+       add             %r0,%r0,%r0             ; clear carry
+       xmpyu           %fr4,%fr5,%fr6          ; fr6 = s2_limb * first limb (64 bits)
+       fldws,ma        4(%r25),%fr7            ; fr7 = second s1 limb, s1_ptr += 4
+       fstds           %fr6,-16(%r30)          ; spill product for the integer loads
+       xmpyu           %fr4,%fr7,%fr8          ; fr8 = s2_limb * second limb
+       ldw             -12(%r30),%r19          ; least significant limb in product
+       ldw             -16(%r30),%r28          ; most significant limb in product
+
+       fstds           %fr8,-16(%r30)          ; spill the second product
+       addib,=         -1,%r24,L$end           ; size--, branch if size was 2
+        ldw            -12(%r30),%r1           ; (delay slot) low limb of second product
+
+; Main loop
+L$loop fldws,ma        4(%r25),%fr5            ; fr5 = next s1 limb, s1_ptr += 4
+       stws,ma         %r19,4(%r26)            ; store pending result limb, res_ptr += 4
+       addc            %r28,%r1,%r19           ; r19 = high limb + next low limb + carry
+       xmpyu           %fr4,%fr5,%fr6          ; start the next product
+       ldw             -16(%r30),%r28          ; high limb of the spilled product
+       fstds           %fr6,-16(%r30)          ; spill the new product over it
+       addib,<>        -1,%r24,L$loop          ; size--, loop until it reaches 0
+        ldw            -12(%r30),%r1           ; (delay slot) low limb of new product
+
+L$end  stws,ma         %r19,4(%r26)            ; store pending result limb, res_ptr += 4
+       addc            %r28,%r1,%r19           ; r19 = high limb + last low limb + carry
+       ldw             -16(%r30),%r28          ; high limb of the last product
+       stws,ma         %r19,4(%r26)            ; store the last result limb
+       addc            %r0,%r28,%r28           ; r28 = high limb + final carry
+       bv              0(%r2)                  ; return to the address in r2
+        ldo            -64(%r30),%r30          ; (delay slot) release the scratch area
+
+L$just_one_limb
+       xmpyu           %fr4,%fr5,%fr6          ; fr6 = s2_limb * the only s1 limb
+       fstds           %fr6,-16(%r30)          ; spill product so the high limb can be loaded
+       ldw             -16(%r30),%r28          ; r28 = high limb of product
+       ldo             -64(%r30),%r30          ; release the scratch area
+       bv              0(%r2)                  ; return to the address in r2
+        fstws          %fr6R,0(%r26)           ; (delay slot) *res_ptr = right half of fr6
+
+       .exit
+       .procend
diff --git a/sysdeps/hppa/hppa1.1/submul_1.s b/sysdeps/hppa/hppa1.1/submul_1.s

new file mode 100644 (file)

index 0000000..44cabf4
--- /dev/null
+++ b/sysdeps/hppa/hppa1.1/submul_1.s
@@ -0,0 +1,110 @@
+; HP-PA-1.1 __mpn_submul_1 -- Multiply a limb vector with a limb and
+; subtract the result from a second limb vector.
+
+; Copyright (C) 1992, 1993, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr      r26
+; s1_ptr       r25
+; size         r24
+; s2_limb      r23
+
+; This runs at 12 cycles/limb on a PA7000.  With the used instructions, it
+; can not become faster due to data cache contention after a store.  On the
+; PA7100 it runs at 11 cycles/limb, and that can not be improved either,
+; since only the xmpyu does not need the integer pipeline, so the only
+; dual-issue we will get are addc+xmpyu.  Unrolling could gain a cycle/limb
+; on the PA7100.
+
+; Some of the ideas described in mul_1.s apply to this code too.
+
+; It seems possible to make this run as fast as __mpn_addmul_1, if we use
+;      sub,>>= %r29,%r19,%r22
+;      addi    1,%r28,%r28
+; but that requires reworking the hairy software pipeline...
+
+       .code
+       .export         __mpn_submul_1
+__mpn_submul_1
+       .proc
+       .callinfo       frame=64,no_calls
+       .entry
+
+       ldo             64(%r30),%r30           ; r30 += 64: scratch area below r30
+       fldws,ma        4(%r25),%fr5            ; fr5 = first s1 limb, s1_ptr += 4
+       stw             %r23,-16(%r30)          ; move s2_limb ...
+       addib,=         -1,%r24,L$just_one_limb ; size--, branch if size was 1
+        fldws          -16(%r30),%fr4          ; ... into fr4
+       add             %r0,%r0,%r0             ; clear carry
+       xmpyu           %fr4,%fr5,%fr6          ; fr6 = s2_limb * first limb (64 bits)
+       fldws,ma        4(%r25),%fr7            ; fr7 = second s1 limb, s1_ptr += 4
+       fstds           %fr6,-16(%r30)          ; spill product for the integer loads
+       xmpyu           %fr4,%fr7,%fr8          ; fr8 = s2_limb * second limb
+       ldw             -12(%r30),%r19          ; least significant limb in product
+       ldw             -16(%r30),%r28          ; most significant limb in product
+
+       fstds           %fr8,-16(%r30)          ; spill the second product
+       addib,=         -1,%r24,L$end           ; size--, branch if size was 2
+        ldw            -12(%r30),%r1           ; (delay slot) low limb of second product
+
+; Main loop
+L$loop ldws            0(%r26),%r29            ; r29 = *res_ptr
+       fldws,ma        4(%r25),%fr5            ; fr5 = next s1 limb, s1_ptr += 4
+       sub             %r29,%r19,%r22          ; r22 = *res_ptr - pending product limb
+       add             %r22,%r19,%r0           ; result discarded; carry set iff sub borrowed
+       stws,ma         %r22,4(%r26)            ; store difference limb, res_ptr += 4
+       addc            %r28,%r1,%r19           ; r19 = high limb + next low limb + borrow
+       xmpyu           %fr4,%fr5,%fr6          ; start the next product
+       ldw             -16(%r30),%r28          ; high limb of the spilled product
+       fstds           %fr6,-16(%r30)          ; spill the new product over it
+       addc            %r0,%r28,%r28           ; fold carry into the high limb
+       addib,<>        -1,%r24,L$loop          ; size--, loop until it reaches 0
+        ldw            -12(%r30),%r1           ; (delay slot) low limb of new product
+
+L$end  ldw             0(%r26),%r29            ; r29 = *res_ptr (second-to-last limb)
+       sub             %r29,%r19,%r22          ; r22 = *res_ptr - pending product limb
+       add             %r22,%r19,%r0           ; result discarded; carry set iff sub borrowed
+       stws,ma         %r22,4(%r26)            ; store difference limb, res_ptr += 4
+       addc            %r28,%r1,%r19           ; r19 = high limb + last low limb + borrow
+       ldw             -16(%r30),%r28          ; high limb of the last product
+       ldws            0(%r26),%r29            ; r29 = *res_ptr (last limb)
+       addc            %r0,%r28,%r28           ; fold carry into the high limb
+       sub             %r29,%r19,%r22          ; r22 = last *res_ptr - pending limb
+       add             %r22,%r19,%r0           ; result discarded; carry set iff sub borrowed
+       stws,ma         %r22,4(%r26)            ; store the last difference limb
+       addc            %r0,%r28,%r28           ; r28 = high limb + final borrow
+       bv              0(%r2)                  ; return to the address in r2
+        ldo            -64(%r30),%r30          ; (delay slot) release the scratch area
+
+L$just_one_limb
+       xmpyu           %fr4,%fr5,%fr6          ; fr6 = s2_limb * the only s1 limb
+       ldw             0(%r26),%r29            ; r29 = *res_ptr
+       fstds           %fr6,-16(%r30)          ; spill product for the integer loads
+       ldw             -12(%r30),%r1           ; low limb of product
+       ldw             -16(%r30),%r28          ; high limb of product
+       sub             %r29,%r1,%r22           ; r22 = *res_ptr - low limb
+       add             %r22,%r1,%r0            ; result discarded; carry set iff sub borrowed
+       stw             %r22,0(%r26)            ; store the difference limb
+       addc            %r0,%r28,%r28           ; r28 = high limb + borrow
+       bv              0(%r2)                  ; return to the address in r2
+        ldo            -64(%r30),%r30          ; (delay slot) release the scratch area
+
+       .exit
+       .procend
diff --git a/sysdeps/hppa/hppa1.1/udiv_qrnnd.s b/sysdeps/hppa/hppa1.1/udiv_qrnnd.s

new file mode 100644 (file)

index 0000000..4ffef3a
--- /dev/null
+++ b/sysdeps/hppa/hppa1.1/udiv_qrnnd.s
@@ -0,0 +1,74 @@
+; HP-PA  __udiv_qrnnd division support, used from longlong.h.
+; This version runs fast on PA 7000 and later.
+
+; Copyright (C) 1993, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; rem_ptr      gr26
+; n1           gr25
+; n0           gr24
+; d            gr23
+
+       .code
+L$0000 .word           0x43f00000              ; with the next word: IEEE double 2**64
+       .word           0x0
+       .export         __udiv_qrnnd
+__udiv_qrnnd
+       .proc
+       .callinfo       frame=64,no_calls
+       .entry
+       ldo             64(%r30),%r30           ; r30 += 64: scratch area below r30
+
+       stws            %r25,-16(0,%r30)        ; n_hi
+       stws            %r24,-12(0,%r30)        ; n_lo
+       ldil            L'L$0000,%r19           ; r19 = address of the 2**64 constant ...
+       ldo             R'L$0000(%r19),%r19     ; ... (left part, then right part)
+       fldds           -16(0,%r30),%fr5        ; fr5 = n_hi:n_lo as a 64-bit integer
+       stws            %r23,-12(0,%r30)        ; d
+       comib,<=        0,%r25,L$1              ; skip the fixup if n_hi >= 0 as signed
+       fcnvxf,dbl,dbl  %fr5,%fr5               ; (delay slot) fr5 = (double) signed n
+       fldds           0(0,%r19),%fr4          ; fr4 = 2**64
+       fadd,dbl        %fr4,%fr5,%fr5          ; n had its top bit set: add 2**64 back
+L$1
+       fcpy,sgl        %fr0,%fr6L              ; left half of fr6 = fr0 (presumably zero -- TODO confirm)
+       fldws           -12(0,%r30),%fr6R       ; right half of fr6 = d
+       fcnvxf,dbl,dbl  %fr6,%fr4               ; fr4 = (double) d
+
+       fdiv,dbl        %fr5,%fr4,%fr5          ; fr5 = n / d in floating point
+
+       fcnvfx,dbl,dbl  %fr5,%fr4               ; fr4 = quotient converted back to integer
+       fstws           %fr4R,-16(%r30)         ; spill the 32-bit quotient q
+       xmpyu           %fr4R,%fr6R,%fr6        ; fr6 = q * d (64 bits)
+       ldws            -16(%r30),%r28          ; r28 = q
+       fstds           %fr6,-16(0,%r30)        ; spill q * d
+       ldws            -12(0,%r30),%r21        ; r21 = low word of q * d
+       ldws            -16(0,%r30),%r20        ; r20 = high word of q * d
+       sub             %r24,%r21,%r22          ; r22 = n_lo - low word (remainder candidate)
+       subb            %r25,%r20,%r19          ; r19 = n_hi - high word - borrow
+       comib,=         0,%r19,L$2              ; high word zero: q and r22 stand as they are
+       ldo             -64(%r30),%r30          ; (delay slot) release the scratch area
+
+       add             %r22,%r23,%r22          ; otherwise r += d ...
+       ldo             -1(%r28),%r28           ; ... and q -= 1 (presumably q was one too large)
+L$2    bv              0(%r2)                  ; return to the address in r2, q in r28
+       stws            %r22,0(0,%r26)          ; (delay slot) *rem_ptr = r22
+
+       .exit
+       .procend
diff --git a/sysdeps/hppa/lshift.s b/sysdeps/hppa/lshift.s

new file mode 100644 (file)

index 0000000..0479f4a
--- /dev/null
+++ b/sysdeps/hppa/lshift.s
@@ -0,0 +1,65 @@
+; HP-PA  __mpn_lshift -- Shift a limb vector left by CNT bits and store the result.
+
+; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr      gr26
+; s_ptr                gr25
+; size         gr24
+; cnt          gr23
+
+       .code
+       .export         __mpn_lshift
+__mpn_lshift
+       .proc
+       .callinfo       frame=64,no_calls
+       .entry
+
+       sh2add          %r24,%r25,%r25          ; s_ptr += 4 * size (one past the last limb)
+       sh2add          %r24,%r26,%r26          ; res_ptr += 4 * size
+       ldws,mb         -4(0,%r25),%r22         ; r22 = *--s_ptr (limb at the highest address)
+       subi            32,%r23,%r1             ; r1 = 32 - cnt
+       mtsar           %r1                     ; shift amount register = 32 - cnt
+       addib,=         -1,%r24,L$0004          ; size--, branch if size was 1
+       vshd            %r0,%r22,%r28           ; compute carry out limb
+       ldws,mb         -4(0,%r25),%r29         ; r29 = *--s_ptr
+       addib,=         -1,%r24,L$0002          ; size--, branch if size was 2
+       vshd            %r22,%r29,%r20          ; (delay slot) r20 = result limb from r22:r29
+
+L$loop ldws,mb         -4(0,%r25),%r22         ; r22 = *--s_ptr
+       stws,mb         %r20,-4(0,%r26)         ; *--res_ptr = pending result limb
+       addib,=         -1,%r24,L$0003          ; size--, leave the loop when it reaches 0
+       vshd            %r29,%r22,%r20          ; (delay slot) r20 = result limb from r29:r22
+       ldws,mb         -4(0,%r25),%r29         ; r29 = *--s_ptr
+       stws,mb         %r20,-4(0,%r26)         ; *--res_ptr = pending result limb
+       addib,<>        -1,%r24,L$loop          ; size--, loop until it reaches 0
+       vshd            %r22,%r29,%r20          ; (delay slot) r20 = result limb from r22:r29
+
+L$0002 stws,mb         %r20,-4(0,%r26)         ; *--res_ptr = pending result limb
+       vshd            %r29,%r0,%r20           ; lowest result limb: r29 with zeros shifted in
+       bv              0(%r2)                  ; return to the address in r2
+       stw             %r20,-4(0,%r26)         ; (delay slot) store the lowest result limb
+L$0003 stws,mb         %r20,-4(0,%r26)         ; *--res_ptr = pending result limb
+L$0004 vshd            %r22,%r0,%r20           ; lowest result limb: r22 with zeros shifted in
+       bv              0(%r2)                  ; return to the address in r2
+       stw             %r20,-4(0,%r26)         ; (delay slot) store the lowest result limb
+
+       .exit
+       .procend
diff --git a/sysdeps/hppa/rshift.s b/sysdeps/hppa/rshift.s

new file mode 100644 (file)

index 0000000..18d33f2
--- /dev/null
+++ b/sysdeps/hppa/rshift.s
@@ -0,0 +1,62 @@
+; HP-PA  __mpn_rshift -- Shift a limb vector right by CNT bits and store the result.
+
+; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr      gr26
+; s_ptr                gr25
+; size         gr24
+; cnt          gr23
+
+       .code
+       .export         __mpn_rshift
+__mpn_rshift
+       .proc
+       .callinfo       frame=64,no_calls
+       .entry
+
+       ldws,ma         4(0,%r25),%r22          ; r22 = *s_ptr, then s_ptr += 4
+       mtsar           %r23                    ; shift amount register = cnt
+       addib,=         -1,%r24,L$0004          ; size--, branch if size was 1
+       vshd            %r22,%r0,%r28           ; compute carry out limb
+       ldws,ma         4(0,%r25),%r29          ; r29 = *s_ptr, then s_ptr += 4
+       addib,=         -1,%r24,L$0002          ; size--, branch if size was 2
+       vshd            %r29,%r22,%r20          ; (delay slot) r20 = result limb from r29:r22
+
+L$loop ldws,ma         4(0,%r25),%r22          ; r22 = *s_ptr, then s_ptr += 4
+       stws,ma         %r20,4(0,%r26)          ; store pending result limb, res_ptr += 4
+       addib,=         -1,%r24,L$0003          ; size--, leave the loop when it reaches 0
+       vshd            %r22,%r29,%r20          ; (delay slot) r20 = result limb from r22:r29
+       ldws,ma         4(0,%r25),%r29          ; r29 = *s_ptr, then s_ptr += 4
+       stws,ma         %r20,4(0,%r26)          ; store pending result limb, res_ptr += 4
+       addib,<>        -1,%r24,L$loop          ; size--, loop until it reaches 0
+       vshd            %r29,%r22,%r20          ; (delay slot) r20 = result limb from r29:r22
+
+L$0002 stws,ma         %r20,4(0,%r26)          ; store pending result limb, res_ptr += 4
+       vshd            %r0,%r29,%r20           ; highest result limb: r29 with zeros shifted in
+       bv              0(%r2)                  ; return to the address in r2
+       stw             %r20,0(0,%r26)          ; (delay slot) store the highest result limb
+L$0003 stws,ma         %r20,4(0,%r26)          ; store pending result limb, res_ptr += 4
+L$0004 vshd            %r0,%r22,%r20           ; highest result limb: r22 with zeros shifted in
+       bv              0(%r2)                  ; return to the address in r2
+       stw             %r20,0(0,%r26)          ; (delay slot) store the highest result limb
+
+       .exit
+       .procend
diff --git a/sysdeps/hppa/sub_n.s b/sysdeps/hppa/sub_n.s

new file mode 100644 (file)

index 0000000..daae46e
--- /dev/null
+++ b/sysdeps/hppa/sub_n.s
@@ -0,0 +1,58 @@
+; HP-PA  __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+; store difference in a third limb vector.
+
+; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr      gr26
+; s1_ptr       gr25
+; s2_ptr       gr24
+; size         gr23
+
+; One might want to unroll this as for other processors, but it turns
+; out that the data cache contention after a store makes such
+; unrolling useless.  We can't come under 5 cycles/limb anyway.
+
+       .code
+       .export         __mpn_sub_n
+__mpn_sub_n
+       .proc
+       .callinfo       frame=0,no_calls
+       .entry
+
+       ldws,ma         4(0,%r25),%r20  ; r20 = *s1_ptr, then s1_ptr += 4
+       ldws,ma         4(0,%r24),%r19  ; r19 = *s2_ptr, then s2_ptr += 4
+
+       addib,=         -1,%r23,L$end   ; check for (SIZE == 1)
+        sub            %r20,%r19,%r28  ; subtract first limbs ignoring cy
+
+L$loop ldws,ma         4(0,%r25),%r20  ; fetch next s1 limb
+       ldws,ma         4(0,%r24),%r19  ; fetch next s2 limb
+       stws,ma         %r28,4(0,%r26)  ; store previous difference limb, res_ptr += 4
+       addib,<>        -1,%r23,L$loop  ; size--, loop until it reaches 0
+        subb           %r20,%r19,%r28  ; (delay slot) subtract limbs with borrow in
+
+L$end  stws            %r28,0(0,%r26)  ; store the last difference limb
+       addc            %r0,%r0,%r28    ; r28 = carry bit left by the last subtract
+       bv              0(%r2)          ; return to the address in r2
+        subi           1,%r28,%r28     ; (delay slot) r28 = 1 - carry bit
+
+       .exit
+       .procend
diff --git a/sysdeps/hppa/udiv_qrnnd.s b/sysdeps/hppa/udiv_qrnnd.s

new file mode 100644 (file)

index 0000000..0b069bf
--- /dev/null
+++ b/sysdeps/hppa/udiv_qrnnd.s
@@ -0,0 +1,285 @@
+; HP-PA  __udiv_qrnnd division support, used from longlong.h.
+; This version runs fast on pre-PA7000 CPUs.
+
+; Copyright (C) 1993, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; rem_ptr      gr26
+; n1           gr25
+; n0           gr24
+; d            gr23
+
+; The code size is a bit excessive.  We could merge the last two ds;addc
+; sequences by simply moving the "bb,< Odd" instruction down.  The only
+; trouble is the FFFFFFFF code that would need some hacking.
+
+       .code
+       .export         __udiv_qrnnd
+__udiv_qrnnd
+       .proc
+       .callinfo       frame=0,no_calls
+       .entry
+
+       comb,<          %r23,0,L$largedivisor   ; d negative as signed (top bit set)?
+        sub            %r0,%r23,%r1            ; r1 = -d; clear cy as side-effect
+       ds              %r0,%r1,%r0             ; initial divide step on (0, -d), result discarded
+       addc            %r24,%r24,%r24          ; r24 = 2*r24 + carry (shift in the next bit)
+       ds              %r25,%r23,%r25          ; one divide step of r25 against d
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r28          ; same shift, but the result now goes to r28
+       ds              %r25,%r23,%r25
+       comclr,>=       %r25,%r0,%r0            ; skip the next insn if r25 >= 0 (signed)
+       addl            %r25,%r23,%r25          ; otherwise add d back to r25
+       stws            %r25,0(0,%r26)          ; store r25 through rem_ptr
+       bv              0(%r2)                  ; return through r2
+        addc           %r28,%r28,%r28          ; (delay slot) r28 = 2*r28 + carry
+
+L$largedivisor
+       extru           %r24,31,1,%r19          ; r19 = n0 & 1
+       bb,<            %r23,31,L$odd           ; branch if the lowest bit of d is set
+        extru          %r23,30,31,%r22         ; r22 = d >> 1
+       shd             %r25,%r24,1,%r24        ; r24 = new n0
+       extru           %r25,30,31,%r25         ; r25 = new n1
+       sub             %r0,%r22,%r21           ; r21 = -(d >> 1)
+       ds              %r0,%r21,%r0            ; initial divide step, result discarded
+       addc            %r24,%r24,%r24          ; r24 = 2*r24 + carry (shift in the next bit)
+       ds              %r25,%r22,%r25          ; one divide step of r25 against d >> 1
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       comclr,>=       %r25,%r0,%r0            ; skip the next insn if r25 >= 0 (signed)
+       addl            %r25,%r22,%r25          ; otherwise add d >> 1 back to r25
+       sh1addl         %r25,%r19,%r25          ; r25 = 2*r25 + (n0 & 1)
+       stws            %r25,0(0,%r26)          ; store r25 through rem_ptr
+       bv              0(%r2)                  ; return through r2
+        addc           %r24,%r24,%r28          ; (delay slot) r28 = 2*r24 + carry
+
+L$odd  addib,sv,n      1,%r22,L$FF..           ; r22 = (d / 2 + 1)
+       shd             %r25,%r24,1,%r24        ; r24 = new n0
+       extru           %r25,30,31,%r25         ; r25 = new n1
+       sub             %r0,%r22,%r21           ; r21 = -(d / 2 + 1)
+       ds              %r0,%r21,%r0            ; initial divide step, result discarded
+       addc            %r24,%r24,%r24          ; r24 = 2*r24 + carry (shift in the next bit)
+       ds              %r25,%r22,%r25          ; one divide step of r25 against r22
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r28          ; last shift; the result now goes to r28
+       comclr,>=       %r25,%r0,%r0            ; skip the next insn if r25 >= 0 (signed)
+       addl            %r25,%r22,%r25          ; otherwise add r22 back to r25
+       sh1addl         %r25,%r19,%r25          ; r25 = 2*r25 + (n0 & 1)
+; We have computed (n1,,n0) / (d + 1), q' = r28, r' = r25
+       add,nuv         %r28,%r25,%r25          ; r25 += q'; skip the next insn unless that carried out
+       addl            %r25,%r1,%r25           ; r25 -= d (r1 = -d from the entry code)
+       addc            %r0,%r28,%r28           ; r28 += carry
+       sub,<<          %r25,%r23,%r0           ; compare r25 with d; skip the next insn if r25 < d (unsigned)
+       addl            %r25,%r1,%r25           ; r25 -= d
+       stws            %r25,0(0,%r26)          ; store r25 through rem_ptr
+       bv              0(%r2)                  ; return through r2
+        addc           %r0,%r28,%r28           ; (delay slot) r28 += carry
+
+; This is just a special case of the code above.
+; We come here when d == 0xFFFFFFFF
+L$FF.. add,uv          %r25,%r24,%r24          ; r24 = n1 + n0; skip the next insn if that carried out
+       sub,<<          %r24,%r23,%r0           ; compare r24 with d; skip the next insn if r24 < d (unsigned)
+       ldo             1(%r24),%r24            ; r24 += 1
+       stws            %r24,0(0,%r26)          ; store r24 through rem_ptr
+       bv              0(%r2)                  ; return through r2
+        addc           %r0,%r25,%r28           ; (delay slot) r28 = n1 + carry
+
+       .exit
+       .procend
diff --git a/sysdeps/i386/add_n.S b/sysdeps/i386/add_n.S

index c4e71ea8c72bc62518241d9565c1085edfe1504d..c3b3c3e4e17c690cad4dcd65f36ebcd3db480975 100644 (file)
--- a/sysdeps/i386/add_n.S
+++ b/sysdeps/i386/add_n.S
@@ -1,7 +1,7 @@
  /* i80386 __mpn_add_n -- Add two limb vectors of the same length > 0 and store
  sum in a third limb vector.
  
-Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
  
  This file is part of the GNU MP Library.
  
@@ -54,14 +54,18 @@ C_SYMBOL_NAME(__mpn_add_n:)
         subl    %eax,%edx               /* ... enter the loop */
         shrl    $2,%eax                 /* restore previous value */
  #ifdef PIC
-       call    here
-here:  leal    (Loop - 3 - here)(%eax,%eax,8),%eax
-       addl    %eax,(%esp)
-       ret
+/* Calculate start address in loop for PIC.  Due to limitations in some
+   assemblers, Loop-L0-3 cannot be put into the leal */
+       call    L0
+L0:    leal    (%eax,%eax,8),%eax
+       addl    (%esp),%eax
+       addl    $(Loop-L0-3),%eax 
+       addl    $4,%esp
  #else
-       leal    (Loop - 3)(%eax,%eax,8),%eax    /* calc start addr in loop */
-       jmp     *%eax                   /* jump into loop */
+/* Calculate start address in loop for non-PIC.  */
+       leal    (Loop - 3)(%eax,%eax,8),%eax
  #endif
+       jmp     *%eax                   /* jump into loop */
         ALIGN (3)
  Loop:  movl    (%esi),%eax
         adcl    (%edx),%eax
diff --git a/sysdeps/i386/gmp-mparam.h b/sysdeps/i386/gmp-mparam.h

new file mode 100644 (file)

index 0000000..687f12a
--- /dev/null
+++ b/sysdeps/i386/gmp-mparam.h
@@ -0,0 +1,28 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Library General Public License as published by
+the Free Software Foundation; either version 2 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+License for more details.
+
+You should have received a copy of the GNU Library General Public License
+along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
+
+#define BITS_PER_MP_LIMB 32	/* bits in one multi-precision limb */
+#define BYTES_PER_MP_LIMB 4	/* the same limb size in bytes (32 / 8) */
+#define BITS_PER_LONGINT 32	/* presumably the width of `long' -- confirm */
+#define BITS_PER_INT 32	/* presumably the width of `int' -- confirm */
+#define BITS_PER_SHORTINT 16	/* presumably the width of `short' -- confirm */
+#define BITS_PER_CHAR 8	/* presumably the width of `char' -- confirm */
+
+#define IEEE_DOUBLE_BIG_ENDIAN 0	/* 0: presumably selects the little-endian double layout -- confirm at the users */
diff --git a/sysdeps/i386/i486/strcat.S b/sysdeps/i386/i486/strcat.S

new file mode 100644 (file)

index 0000000..e3d2181
--- /dev/null
+++ b/sysdeps/i386/i486/strcat.S
@@ -0,0 +1,260 @@
+/* strcat(dest, src) -- Append SRC on the end of DEST.
+For Intel 80x86, x>=4.
+Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+Contributed by Ulrich Drepper <drepper@ipd.info.uni-karlsruhe.de>.
+Optimised a little by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+This file is part of the GNU C Library.
+
+The GNU C Library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+The GNU C Library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with the GNU C Library; see the file COPYING.LIB.  If
+not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+/*
+   INPUT PARAMETERS:
+   dest                (sp + 4)
+   src         (sp + 8)
+*/
+
+       .text
+ENTRY (strcat)
+       pushl %edi              /* Save callee-saved register.  */
+
+       movl 12(%esp), %ecx     /* load source pointer */
+       movl 8(%esp), %edx      /* load destination pointer */
+
+       testb $0xff, (%ecx)     /* Is source string empty? */
+       jz L8                   /* yes => return */
+
+       /* Test the first bytes separately until destination is aligned.  */
+       testl $3, %edx          /* destination pointer aligned? */
+       jz L1                   /* yes => begin scan loop */
+       testb $0xff, (%edx)     /* is end of string? */
+       jz L2                   /* yes => start appending */
+       incl %edx               /* increment destination pointer */
+
+       testl $3, %edx          /* destination pointer aligned? */
+       jz L1                   /* yes => begin scan loop */
+       testb $0xff, (%edx)     /* is end of string? */
+       jz L2                   /* yes => start appending */
+       incl %edx               /* increment destination pointer */
+
+       testl $3, %edx          /* destination pointer aligned? */
+       jz L1                   /* yes => begin scan loop */
+       testb $0xff, (%edx)     /* is end of string? */
+       jz L2                   /* yes => start appending */
+       incl %edx               /* increment destination pointer */
+
+       /* Now we are aligned.  Begin scan loop.  */
+       jmp L1
+
+       ALIGN(4)
+
+L4:    addl $16,%edx           /* increment destination pointer for round */
+
+L1:    movl (%edx), %eax       /* get word (= 4 bytes) in question */
+       movl $0xfefefeff, %edi  /* magic value */
+
+       /* If you compare this with the algorithm in memchr.S you will
+          notice that here is an `xorl' statement missing.  But you must
+          not forget that we are looking for C == 0 and `xorl $0, %eax'
+          is a no-op.  */
+
+       addl %eax, %edi         /* add the magic value to the word.  We get
+                                  carry bits reported for each byte which
+                                  is *not* 0 */
+
+       /* According to the algorithm we had to reverse the effect of the
+          XOR first and then test the overflow bits.  But because the
+          following XOR would destroy the carry flag and it would (in a
+          representation with more than 32 bits) not alter the last
+          overflow, we can now test this condition.  If no carry is signaled
+          no overflow must have occurred in the last byte => it was 0.  */
+       jnc L3
+
+       /* We are only interested in carry bits that change due to the
+          previous add, so remove original bits */
+       xorl %eax, %edi         /* ((word^charmask)+magic)^(word^charmask) */
+
+       /* Now test for the other three overflow bits.  */
+       orl $0xfefefeff, %edi   /* set all non-carry bits */
+       incl %edi               /* add 1: if one carry bit was *not* set
+                                  the addition will not result in 0.  */
+
+       /* If at least one byte of the word is NUL we don't get 0 in %edi.  */
+       jnz L3
+
+       movl 4(%edx), %eax      /* get next word of destination */
+       movl $0xfefefeff, %edi  /* magic value */
+       addl %eax, %edi         /* add the magic value to the word.  We get
+                                  carry bits reported for each byte which
+                                  is *not* 0 */
+       jnc L5                  /* highest byte is NUL => end found */
+       xorl %eax, %edi         /* ((word^charmask)+magic)^(word^charmask) */
+       orl $0xfefefeff, %edi   /* set all non-carry bits */
+       incl %edi               /* add 1: if one carry bit was *not* set
+                                  the addition will not result in 0.  */
+       jnz L5                  /* one byte is NUL => end found */
+
+       movl 8(%edx), %eax      /* get next word of destination */
+       movl $0xfefefeff, %edi  /* magic value */
+       addl %eax, %edi         /* add the magic value to the word.  We get
+                                  carry bits reported for each byte which
+                                  is *not* 0 */
+       jnc L6                  /* highest byte is NUL => end found */
+       xorl %eax, %edi         /* ((word^charmask)+magic)^(word^charmask) */
+       orl $0xfefefeff, %edi   /* set all non-carry bits */
+       incl %edi               /* add 1: if one carry bit was *not* set
+                                  the addition will not result in 0.  */
+       jnz L6                  /* one byte is NUL => end found */
+
+       movl 12(%edx), %eax     /* get next word of destination */
+       movl $0xfefefeff, %edi  /* magic value */
+       addl %eax, %edi         /* add the magic value to the word.  We get
+                                  carry bits reported for each byte which
+                                  is *not* 0 */
+       jnc L7                  /* highest byte is NUL => end found */
+       xorl %eax, %edi         /* ((word^charmask)+magic)^(word^charmask) */
+       orl $0xfefefeff, %edi   /* set all non-carry bits */
+       incl %edi               /* add 1: if one carry bit was *not* set
+                                  the addition will not result in 0.  */
+       jz L4                   /* no byte is NUL => carry on scanning */
+
+L7:    addl $4, %edx           /* adjust destination pointer */
+L6:    addl $4, %edx
+L5:    addl $4, %edx
+
+L3:    testb %al, %al          /* is first byte NUL? */
+       jz L2                   /* yes => start copying */
+       incl %edx               /* increment destination pointer */
+
+       testb %ah, %ah          /* is second byte NUL? */
+       jz L2                   /* yes => start copying */
+       incl %edx               /* increment destination pointer */
+
+       testl $0xff0000, %eax   /* is third byte NUL? */
+       jz L2                   /* yes => start copying */
+       incl %edx               /* increment destination pointer */
+
+L2:    subl %ecx, %edx         /* %edx = dest - src, so (%ecx,%edx) is dest */
+
+       /* Now we have to align the source pointer.  */
+       testl $3, %ecx          /* pointer correctly aligned? */
+       jz L29                  /* yes => start copy loop */
+       movb (%ecx), %al        /* get first byte */
+       movb %al, (%ecx,%edx)   /* and store it */
+       andb %al, %al           /* is byte NUL? */
+       jz L8                   /* yes => return */
+       incl %ecx               /* increment pointer */
+
+       testl $3, %ecx          /* pointer correctly aligned? */
+       jz L29                  /* yes => start copy loop */
+       movb (%ecx), %al        /* get first byte */
+       movb %al, (%ecx,%edx)   /* and store it */
+       andb %al, %al           /* is byte NUL? */
+       jz L8                   /* yes => return */
+       incl %ecx               /* increment pointer */
+
+       testl $3, %ecx          /* pointer correctly aligned? */
+       jz L29                  /* yes => start copy loop */
+       movb (%ecx), %al        /* get first byte */
+       movb %al, (%ecx,%edx)   /* and store it */
+       andb %al, %al           /* is byte NUL? */
+       jz L8                   /* yes => return */
+       incl %ecx               /* increment pointer */
+
+       /* Now we are aligned.  */
+       jmp L29                 /* start copy loop */
+
+       ALIGN(4)
+
+L28:   movl %eax, 12(%ecx,%edx)/* store word at destination */
+       addl $16, %ecx          /* adjust pointer for full round */
+
+L29:   movl (%ecx), %eax       /* get word from source */
+       movl $0xfefefeff, %edi  /* magic value */
+       addl %eax, %edi         /* add the magic value to the word.  We get
+                                  carry bits reported for each byte which
+                                  is *not* 0 */
+       jnc L9                  /* highest byte is NUL => stop copying */
+       xorl %eax, %edi         /* ((word^charmask)+magic)^(word^charmask) */
+       orl $0xfefefeff, %edi   /* set all non-carry bits */
+       incl %edi               /* add 1: if one carry bit was *not* set
+                                  the addition will not result in 0.  */
+       jnz L9                  /* one byte is NUL => stop copying */
+       movl %eax, (%ecx,%edx)  /* store word to destination */
+
+       movl 4(%ecx), %eax      /* get word from source */
+       movl $0xfefefeff, %edi  /* magic value */
+       addl %eax, %edi         /* add the magic value to the word.  We get
+                                  carry bits reported for each byte which
+                                  is *not* 0 */
+       jnc L91                 /* highest byte is NUL => stop copying */
+       xorl %eax, %edi         /* ((word^charmask)+magic)^(word^charmask) */
+       orl $0xfefefeff, %edi   /* set all non-carry bits */
+       incl %edi               /* add 1: if one carry bit was *not* set
+                                  the addition will not result in 0.  */
+       jnz L91                 /* one byte is NUL => stop copying */
+       movl %eax, 4(%ecx,%edx) /* store word to destination */
+
+       movl 8(%ecx), %eax      /* get word from source */
+       movl $0xfefefeff, %edi  /* magic value */
+       addl %eax, %edi         /* add the magic value to the word.  We get
+                                  carry bits reported for each byte which
+                                  is *not* 0 */
+       jnc L92                 /* highest byte is NUL => stop copying */
+       xorl %eax, %edi         /* ((word^charmask)+magic)^(word^charmask) */
+       orl $0xfefefeff, %edi   /* set all non-carry bits */
+       incl %edi               /* add 1: if one carry bit was *not* set
+                                  the addition will not result in 0.  */
+       jnz L92                 /* one byte is NUL => stop copying */
+       movl %eax, 8(%ecx,%edx) /* store word to destination */
+
+       movl 12(%ecx), %eax     /* get word from source */
+       movl $0xfefefeff, %edi  /* magic value */
+       addl %eax, %edi         /* add the magic value to the word.  We get
+                                  carry bits reported for each byte which
+                                  is *not* 0 */
+       jnc L93                 /* highest byte is NUL => stop copying */
+       xorl %eax, %edi         /* ((word^charmask)+magic)^(word^charmask) */
+       orl $0xfefefeff, %edi   /* set all non-carry bits */
+       incl %edi               /* add 1: if one carry bit was *not* set
+                                  the addition will not result in 0.  */
+       jz L28                  /* no byte is NUL => carry on copying */
+
+L93:   addl $4, %ecx           /* adjust pointer */
+L92:   addl $4, %ecx
+L91:   addl $4, %ecx
+
+L9:    movb %al, (%ecx,%edx)   /* store first byte of last word */
+       orb %al, %al            /* is it NUL? */
+       jz L8                   /* yes => return */
+
+       movb %ah, 1(%ecx,%edx)  /* store second byte of last word */
+       orb %ah, %ah            /* is it NUL? */
+       jz L8                   /* yes => return */
+
+       shrl $16, %eax          /* make upper bytes accessible */
+       movb %al, 2(%ecx,%edx)  /* store third byte of last word */
+       orb %al, %al            /* is it NUL? */
+       jz L8                   /* yes => return */
+
+       movb %ah, 3(%ecx,%edx)  /* store fourth byte of last word */
+
+L8:    movl 8(%esp), %eax      /* start address of destination is result */
+       popl %edi               /* restore saved register */
+
+       ret
diff --git a/sysdeps/i386/i486/strlen.S b/sysdeps/i386/i486/strlen.S

new file mode 100644 (file)

index 0000000..276563b
--- /dev/null
+++ b/sysdeps/i386/i486/strlen.S
@@ -0,0 +1,132 @@
+/* strlen(str) -- determine the length of the string STR.
+Optimized for Intel 80x86, x>=4.
+Copyright (C) 1991, 1992, 1993, 1994, 1995 Free Software Foundation, Inc.
+Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>.
+This file is part of the GNU C Library.
+
+The GNU C Library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+The GNU C Library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with the GNU C Library; see the file COPYING.LIB.  If
+not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+/*
+   INPUT PARAMETERS:
+   str         (sp + 4)
+*/
+
+       .text
+ENTRY (strlen)
+       movl 4(%esp), %ecx      /* get string pointer */
+       movl %ecx, %eax         /* duplicate it */
+
+       andl $3, %ecx           /* mask alignment bits */
+       jz L1                   /* aligned => start loop */
+       cmpb %ch, (%eax)        /* is byte NUL?  (%ch is 0: %ecx <= 3) */
+       je L2                   /* yes => return */
+       incl %eax               /* increment pointer */
+
+       xorl $3, %ecx           /* was alignment = 3? */
+       jz L1                   /* yes => now it is aligned and start loop */
+       cmpb %ch, (%eax)        /* is byte NUL? */
+       je L2                   /* yes => return */
+       addl $1, %eax           /* increment pointer */
+
+       subl $1, %ecx           /* was alignment = 2? */
+       jz L1                   /* yes => now it is aligned and start loop */
+       cmpb %ch, (%eax)        /* is byte NUL? */
+       je L2                   /* yes => return */
+
+/* Don't change the above `addl $1, %eax' and `subl $1, %ecx' into `incl %eax'
+   and `decl %ecx' resp.  The additional two bytes per instruction make the
+   label L4 aligned on a 16 byte boundary without the need for nops.
+
+   The following `subl $15, %eax' is part of this trick, too.  Together with
+   the next instruction (`addl $16, %eax') it is in fact an `incl %eax', just
+   as expected from the algorithm.  But doing so has the advantage that
+   no jump to label L1 is necessary and so the pipeline is not flushed.  */
+
+       subl $15, %eax          /* effectively +1 */
+
+
+L4:    addl $16, %eax          /* adjust pointer for full loop */
+
+L1:    movl (%eax), %ecx       /* get word (= 4 bytes) in question */
+       movl $0xfefefeff, %edx  /* magic value */
+       addl %ecx, %edx         /* add the magic value to the word.  We get
+                                  carry bits reported for each byte which
+                                  is *not* 0 */
+       jnc L3                  /* highest byte is NUL => return pointer */
+       xorl %ecx, %edx         /* (word+magic)^word */
+       orl $0xfefefeff, %edx   /* set all non-carry bits */
+       incl %edx               /* add 1: if one carry bit was *not* set
+                                  the addition will not result in 0.  */
+       jnz L3                  /* found NUL => return pointer */
+
+       movl 4(%eax), %ecx      /* get word (= 4 bytes) in question */
+       movl $0xfefefeff, %edx  /* magic value */
+       addl %ecx, %edx         /* add the magic value to the word.  We get
+                                  carry bits reported for each byte which
+                                  is *not* 0 */
+       jnc L5                  /* highest byte is NUL => return pointer */
+       xorl %ecx, %edx         /* (word+magic)^word */
+       orl $0xfefefeff, %edx   /* set all non-carry bits */
+       incl %edx               /* add 1: if one carry bit was *not* set
+                                  the addition will not result in 0.  */
+       jnz L5                  /* found NUL => return pointer */
+
+       movl 8(%eax), %ecx      /* get word (= 4 bytes) in question */
+       movl $0xfefefeff, %edx  /* magic value */
+       addl %ecx, %edx         /* add the magic value to the word.  We get
+                                  carry bits reported for each byte which
+                                  is *not* 0 */
+       jnc L6                  /* highest byte is NUL => return pointer */
+       xorl %ecx, %edx         /* (word+magic)^word */
+       orl $0xfefefeff, %edx   /* set all non-carry bits */
+       incl %edx               /* add 1: if one carry bit was *not* set
+                                  the addition will not result in 0.  */
+       jnz L6                  /* found NUL => return pointer */
+
+       movl 12(%eax), %ecx     /* get word (= 4 bytes) in question */
+       movl $0xfefefeff, %edx  /* magic value */
+       addl %ecx, %edx         /* add the magic value to the word.  We get
+                                  carry bits reported for each byte which
+                                  is *not* 0 */
+       jnc L7                  /* highest byte is NUL => return pointer */
+       xorl %ecx, %edx         /* (word+magic)^word */
+       orl $0xfefefeff, %edx   /* set all non-carry bits */
+       incl %edx               /* add 1: if one carry bit was *not* set
+                                  the addition will not result in 0.  */
+       jz L4                   /* no NUL found => continue loop */
+
+L7:    addl $4, %eax           /* adjust pointer to the word holding the NUL */
+L6:    addl $4, %eax
+L5:    addl $4, %eax
+
+L3:    testb %cl, %cl          /* is first byte NUL? */
+       jz L2                   /* yes => return */
+       incl %eax               /* increment pointer */
+
+       testb %ch, %ch          /* is second byte NUL? */
+       jz L2                   /* yes => return */
+       incl %eax               /* increment pointer */
+
+       testl $0xff0000, %ecx   /* is third byte NUL? */
+       jz L2                   /* yes => return pointer */
+       incl %eax               /* no => the fourth byte is the NUL */
+
+L2:    subl 4(%esp), %eax      /* compute difference to string start */
+
+       ret
diff --git a/sysdeps/i386/i586/Implies b/sysdeps/i386/i586/Implies

new file mode 100644 (file)

index 0000000..477cd74
--- /dev/null
+++ b/sysdeps/i386/i586/Implies
@@ -0,0 +1,2 @@
+# Code optimized for i486 is better than simple i386 code.
+i386/i486
diff --git a/sysdeps/i386/i586/add_n.S b/sysdeps/i386/i586/add_n.S

new file mode 100644 (file)

index 0000000..9be45ed
--- /dev/null
+++ b/sysdeps/i386/i586/add_n.S
@@ -0,0 +1,136 @@
+/* Pentium __mpn_add_n -- Add two limb vectors of the same length > 0 and store
+   sum in a third limb vector.
+
+Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Library General Public License as published by
+the Free Software Foundation; either version 2 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+License for more details.
+
+You should have received a copy of the GNU Library General Public License
+along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
+
+/*
+   INPUT PARAMETERS
+   res_ptr     (sp + 4)
+   s1_ptr      (sp + 8)
+   s2_ptr      (sp + 12)
+   size                (sp + 16)
+*/
+
+#define r1     %eax
+#define r2     %edx
+#define src1   %esi
+#define src2   %ebp
+#define dst    %edi
+#define x      %ebx
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+.text
+       ALIGN (3)
+       .globl C_SYMBOL_NAME(__mpn_add_n)
+C_SYMBOL_NAME(__mpn_add_n:)
+       pushl   %edi                    /* save the registers used below */
+       pushl   %esi
+       pushl   %ebx
+       pushl   %ebp
+
+       movl    20(%esp),dst            /* res_ptr */
+       movl    24(%esp),src1           /* s1_ptr */
+       movl    28(%esp),src2           /* s2_ptr */
+       movl    32(%esp),%ecx           /* size */
+
+       movl    (src2),x                /* preload first s2 limb */
+
+       decl    %ecx                    /* size-1: the last limb is done at Lend2 */
+       movl    %ecx,r2
+       shrl    $3,%ecx                 /* number of 8-limb blocks */
+       andl    $7,r2                   /* limbs left over for Loop2 */
+       testl   %ecx,%ecx               /* zero carry flag */
+       jz      Lend                    /* no full block => skip Loop */
+       pushl   r2                      /* r2 is used as scratch inside Loop */
+
+       ALIGN (3)
+Loop:  movl    28(dst),%eax            /* fetch destination cache line */
+       leal    32(dst),dst             /* stores below use negative offsets */
+
+L1:    movl    (src1),r1               /* limbs 0 and 1 of this block */
+       movl    4(src1),r2
+       adcl    x,r1
+       movl    4(src2),x
+       adcl    x,r2
+       movl    8(src2),x               /* preload s2 limb for the next pair */
+       movl    r1,-32(dst)
+       movl    r2,-28(dst)
+
+L2:    movl    8(src1),r1              /* limbs 2 and 3 */
+       movl    12(src1),r2
+       adcl    x,r1
+       movl    12(src2),x
+       adcl    x,r2
+       movl    16(src2),x
+       movl    r1,-24(dst)
+       movl    r2,-20(dst)
+
+L3:    movl    16(src1),r1             /* limbs 4 and 5 */
+       movl    20(src1),r2
+       adcl    x,r1
+       movl    20(src2),x
+       adcl    x,r2
+       movl    24(src2),x
+       movl    r1,-16(dst)
+       movl    r2,-12(dst)
+
+L4:    movl    24(src1),r1             /* limbs 6 and 7 */
+       movl    28(src1),r2
+       adcl    x,r1
+       movl    28(src2),x
+       adcl    x,r2
+       movl    32(src2),x              /* preload first s2 limb after this block */
+       movl    r1,-8(dst)
+       movl    r2,-4(dst)
+
+       leal    32(src1),src1           /* leal does not touch the carry flag */
+       leal    32(src2),src2
+       decl    %ecx                    /* decl leaves the carry flag intact */
+       jnz     Loop
+
+       popl    r2                      /* restore leftover limb count */
+Lend:
+       decl    r2                      /* test r2 w/o clobbering carry */
+       js      Lend2                   /* r2 was 0 => only the last limb is left */
+       incl    r2
+Loop2:
+       leal    4(dst),dst
+       movl    (src1),r1
+       adcl    x,r1
+       movl    4(src2),x               /* preload next s2 limb */
+       movl    r1,-4(dst)
+       leal    4(src1),src1
+       leal    4(src2),src2
+       decl    r2
+       jnz     Loop2
+Lend2:
+       movl    (src1),r1               /* add the final limb pair */
+       adcl    x,r1
+       movl    r1,(dst)
+
+       sbbl    %eax,%eax               /* %eax = 0 - carry */
+       negl    %eax                    /* return the carry-out as 0 or 1 */
+
+       popl    %ebp
+       popl    %ebx
+       popl    %esi
+       popl    %edi
+       ret
diff --git a/sysdeps/i386/i586/addmul_1.S b/sysdeps/i386/i586/addmul_1.S

new file mode 100644 (file)

index 0000000..b222840
--- /dev/null
+++ b/sysdeps/i386/i586/addmul_1.S
@@ -0,0 +1,84 @@
+/* Pentium __mpn_addmul_1 -- Multiply a limb vector with a limb and add
+   the result to a second limb vector.
+
+Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Library General Public License as published by
+the Free Software Foundation; either version 2 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+License for more details.
+
+You should have received a copy of the GNU Library General Public License
+along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
+
+/*
+   INPUT PARAMETERS
+   res_ptr     (sp + 4)
+   s1_ptr      (sp + 8)
+   size                (sp + 12)
+   s2_limb     (sp + 16)
+*/
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define res_ptr edi
+#define s1_ptr esi
+#define s2_limb ebp
+
+       TEXT
+       ALIGN (3)
+       GLOBL   C_SYMBOL_NAME(__mpn_addmul_1)
+       .type   C_SYMBOL_NAME(__mpn_addmul_1),@function
+C_SYMBOL_NAME(__mpn_addmul_1:)
+
+       INSN1(push,l    ,R(edi))        /* save the registers used below */
+       INSN1(push,l    ,R(esi))
+       INSN1(push,l    ,R(ebx))
+       INSN1(push,l    ,R(ebp))
+
+       INSN2(mov,l     ,R(res_ptr),MEM_DISP(esp,20))   /* res_ptr; NOTE(review): destination-first operand order comes from asm-syntax.h -- confirm there */
+       INSN2(mov,l     ,R(s1_ptr),MEM_DISP(esp,24))    /* s1_ptr */
+       INSN2(mov,l     ,R(ecx),MEM_DISP(esp,28))       /* size */
+       INSN2(mov,l     ,R(s2_limb),MEM_DISP(esp,32))   /* s2_limb */
+
+       INSN2(lea,l     ,R(res_ptr),MEM_INDEX(res_ptr,ecx,4))   /* presumably res_ptr + 4*size, i.e. the end of the vector */
+       INSN2(lea,l     ,R(s1_ptr),MEM_INDEX(s1_ptr,ecx,4))
+       INSN1(neg,l     ,R(ecx))        /* negative index, counted up to zero */
+       INSN2(xor,l     ,R(edx),R(edx)) /* carry limb starts at zero */
+       ALIGN (3)
+Loop:
+       INSN2(mov,l     ,R(ebx),R(edx)) /* carry limb from the previous round */
+       INSN2(mov,l     ,R(eax),MEM_INDEX(s1_ptr,ecx,4))        /* next s1 limb */
+
+       INSN1(mul,l     ,R(s2_limb))    /* edx:eax = eax * s2_limb */
+
+       INSN2(add,l     ,R(eax),R(ebx)) /* low product word + incoming carry limb */
+       INSN2(mov,l     ,R(ebx),MEM_INDEX(res_ptr,ecx,4))       /* current result limb */
+
+       INSN2(adc,l     ,R(edx),$0)     /* propagate carry into the high word */
+       INSN2(add,l     ,R(ebx),R(eax)) /* accumulate into the result limb */
+
+       INSN2(adc,l     ,R(edx),$0)     /* propagate carry into the high word */
+       INSN2(mov,l     ,MEM_INDEX(res_ptr,ecx,4),R(ebx))       /* store updated limb */
+
+       INSN1(inc,l     ,R(ecx))
+       INSN1(jnz,      ,Loop)          /* index reached zero => done */
+
+
+       INSN2(mov,l     ,R(eax),R(edx)) /* return the final carry limb */
+       INSN1(pop,l     ,R(ebp))
+       INSN1(pop,l     ,R(ebx))
+       INSN1(pop,l     ,R(esi))
+       INSN1(pop,l     ,R(edi))
+       ret
+Lfe1:
+       .size   C_SYMBOL_NAME(__mpn_addmul_1),Lfe1-C_SYMBOL_NAME(__mpn_addmul_1)
diff --git a/sysdeps/i386/i586/lshift.S b/sysdeps/i386/i586/lshift.S

new file mode 100644 (file)

index 0000000..b9f8131
--- /dev/null
+++ b/sysdeps/i386/i586/lshift.S
@@ -0,0 +1,213 @@
+/* Pentium optimized __mpn_lshift -- Shift a limb vector left by CNT bits.
+
+Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Library General Public License as published by
+the Free Software Foundation; either version 2 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+License for more details.
+
+You should have received a copy of the GNU Library General Public License
+along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
+
+/*
+  INPUT PARAMETERS
+  res_ptr      (sp + 4)
+  s_ptr                (sp + 8)
+  size         (sp + 12)
+  cnt          (sp + 16)
+*/
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+.text
+       ALIGN (3)
+       .globl C_SYMBOL_NAME(__mpn_lshift)
+C_SYMBOL_NAME(__mpn_lshift:)
+       pushl   %edi                    /* save the registers used below */
+       pushl   %esi
+       pushl   %ebx
+       pushl   %ebp
+
+       movl    20(%esp),%edi           /* res_ptr */
+       movl    24(%esp),%esi           /* s_ptr */
+       movl    28(%esp),%ebp           /* size */
+       movl    32(%esp),%ecx           /* cnt */
+
+       cmp     $1,%ecx                 /* shift by exactly one bit? */
+       jne     Lnormal                 /* no => general shldl code */
+       movl    %edi,%eax
+       subl    %esi,%eax               /* res_ptr - s_ptr, in bytes */
+       cmpl    %ebp,%eax               /* NOTE(review): byte distance vs. limb count -- confirm intended */
+       jnc     Lspecial                /* unsigned distance >= size => add-with-carry code */
+
+Lnormal:
+       leal    -4(%edi,%ebp,4),%edi    /* point at the most significant limbs */
+       leal    -4(%esi,%ebp,4),%esi
+
+       movl    (%esi),%edx             /* most significant source limb */
+       subl    $4,%esi
+       xorl    %eax,%eax
+       shldl   %cl,%edx,%eax           /* compute carry limb */
+       pushl   %eax                    /* push carry limb onto stack */
+
+       decl    %ebp
+       pushl   %ebp                    /* keep size-1 for the tail loop */
+       shrl    $3,%ebp                 /* number of 8-limb blocks */
+       jz      Lend
+
+       movl    (%edi),%eax             /* fetch destination cache line */
+
+       ALIGN   (2)
+Loop:  movl    -28(%edi),%eax          /* fetch destination cache line */
+       movl    %edx,%ebx               /* limb carried over from the previous step */
+
+       movl    (%esi),%eax
+       movl    -4(%esi),%edx
+       shldl   %cl,%eax,%ebx           /* shift in the top bits of the next lower limb */
+       shldl   %cl,%edx,%eax
+       movl    %ebx,(%edi)
+       movl    %eax,-4(%edi)
+
+       movl    -8(%esi),%ebx
+       movl    -12(%esi),%eax
+       shldl   %cl,%ebx,%edx
+       shldl   %cl,%eax,%ebx
+       movl    %edx,-8(%edi)
+       movl    %ebx,-12(%edi)
+
+       movl    -16(%esi),%edx
+       movl    -20(%esi),%ebx
+       shldl   %cl,%edx,%eax
+       shldl   %cl,%ebx,%edx
+       movl    %eax,-16(%edi)
+       movl    %edx,-20(%edi)
+
+       movl    -24(%esi),%eax
+       movl    -28(%esi),%edx
+       shldl   %cl,%eax,%ebx
+       shldl   %cl,%edx,%eax
+       movl    %ebx,-24(%edi)
+       movl    %eax,-28(%edi)
+
+       subl    $32,%esi
+       subl    $32,%edi
+       decl    %ebp
+       jnz     Loop
+
+Lend:  popl    %ebp                    /* size-1 again */
+       andl    $7,%ebp                 /* limbs left over after the 8-limb blocks */
+       jz      Lend2
+Loop2: movl    (%esi),%eax
+       shldl   %cl,%eax,%edx           /* compute result limb */
+       movl    %edx,(%edi)
+       movl    %eax,%edx
+       subl    $4,%esi
+       subl    $4,%edi
+       decl    %ebp
+       jnz     Loop2
+
+Lend2: shll    %cl,%edx                /* compute least significant limb */
+       movl    %edx,(%edi)             /* store it */
+
+       popl    %eax                    /* pop carry limb */
+
+       popl    %ebp
+       popl    %ebx
+       popl    %esi
+       popl    %edi
+       ret
+
+/* We loop from least significant end of the arrays, which is only
+   permissible if the source and destination don't overlap, since the
+   function is documented to work for overlapping source and destination.
+*/
+
+Lspecial:
+       movl    (%esi),%edx             /* least significant source limb */
+       addl    $4,%esi
+
+       decl    %ebp
+       pushl   %ebp                    /* keep size-1 for the tail loop */
+       shrl    $3,%ebp                 /* number of 8-limb blocks */
+
+       addl    %edx,%edx               /* shift left by one; carry = bit shifted out */
+       incl    %ebp                    /* incl/decl pair sets ZF from %ebp ... */
+       decl    %ebp                    /* ... without modifying the carry flag */
+       jz      LLend
+
+       movl    (%edi),%eax             /* fetch destination cache line */
+
+       ALIGN   (2)
+LLoop: movl    28(%edi),%eax           /* fetch destination cache line */
+       movl    %edx,%ebx               /* already shifted limb, stored below */
+
+       movl    (%esi),%eax
+       movl    4(%esi),%edx
+       adcl    %eax,%eax               /* limb*2 + carry from the limb below */
+       movl    %ebx,(%edi)
+       adcl    %edx,%edx
+       movl    %eax,4(%edi)
+
+       movl    8(%esi),%ebx
+       movl    12(%esi),%eax
+       adcl    %ebx,%ebx
+       movl    %edx,8(%edi)
+       adcl    %eax,%eax
+       movl    %ebx,12(%edi)
+
+       movl    16(%esi),%edx
+       movl    20(%esi),%ebx
+       adcl    %edx,%edx
+       movl    %eax,16(%edi)
+       adcl    %ebx,%ebx
+       movl    %edx,20(%edi)
+
+       movl    24(%esi),%eax
+       movl    28(%esi),%edx
+       adcl    %eax,%eax
+       movl    %ebx,24(%edi)
+       adcl    %edx,%edx
+       movl    %eax,28(%edi)
+
+       leal    32(%esi),%esi           /* use leal not to clobber carry */
+       leal    32(%edi),%edi
+       decl    %ebp
+       jnz     LLoop
+
+LLend: popl    %ebp
+       sbbl    %eax,%eax               /* save carry in %eax */
+       andl    $7,%ebp                 /* leftover limbs; this clobbers the carry flag */
+       jz      LLend2
+       addl    %eax,%eax               /* restore carry from eax */
+LLoop2:        movl    %edx,%ebx
+       movl    (%esi),%edx
+       adcl    %edx,%edx
+       movl    %ebx,(%edi)
+
+       leal    4(%esi),%esi            /* use leal not to clobber carry */
+       leal    4(%edi),%edi
+       decl    %ebp
+       jnz     LLoop2
+
+       jmp     LL1
+LLend2:        addl    %eax,%eax               /* restore carry from eax */
+LL1:   movl    %edx,(%edi)             /* store last limb */
+
+       sbbl    %eax,%eax               /* %eax = 0 - carry */
+       negl    %eax                    /* return the bit shifted out as 0 or 1 */
+
+       popl    %ebp
+       popl    %ebx
+       popl    %esi
+       popl    %edi
+       ret
diff --git a/sysdeps/i386/i586/memcopy.h b/sysdeps/i386/i586/memcopy.h

index a9bb9e7a402b0a508f327d17b819c271c1ee19e7..0a8768788e82d128d8673ee5a0bcaa7db0757866 100644 (file)
--- a/sysdeps/i386/i586/memcopy.h
+++ b/sysdeps/i386/i586/memcopy.h
@@ -1,5 +1,5 @@
  /* memcopy.h -- definitions for memory copy functions.  Pentium version.
-   Copyright (C) 1994 Free Software Foundation, Inc.
+   Copyright (C) 1994, 1995 Free Software Foundation, Inc.
     Contributed by Torbjorn Granlund (tege@sics.se).
  
  This file is part of the GNU C Library.
@@ -88,7 +88,7 @@ Cambridge, MA 02139, USA.  */
                     "subl       $32,%2\n"                               \
                     "jns        1b\n"                                   \
                     "2: addl    $32,%2" :                               \
-                   "=r" (dst_bp), "=r" (src_bp), "=r" (nbytes_left) :  \
-                   "0" (dst_bp), "1" (src_bp), "2" (nbytes) :          \
+                   "=r" (dst_ep), "=r" (src_ep), "=r" (nbytes_left) :  \
+                   "0" (dst_ep), "1" (src_ep), "2" (nbytes) :          \
                     "ax", "dx");                                        \
      } while (0)
diff --git a/sysdeps/i386/i586/mul_1.S b/sysdeps/i386/i586/mul_1.S

new file mode 100644 (file)

index 0000000..2b7258e
--- /dev/null
+++ b/sysdeps/i386/i586/mul_1.S
@@ -0,0 +1,78 @@
+/* Pentium __mpn_mul_1 -- Multiply a limb vector with a limb and store
+   the result in a second limb vector.
+
+Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Library General Public License as published by
+the Free Software Foundation; either version 2 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+License for more details.
+
+You should have received a copy of the GNU Library General Public License
+along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
+
+/*
+   INPUT PARAMETERS
+   res_ptr     (sp + 4)
+   s1_ptr      (sp + 8)
+   size                (sp + 12)
+   s2_limb     (sp + 16)
+*/
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define res_ptr edi
+#define s1_ptr esi
+#define size ecx
+#define s2_limb ebp
+
+       TEXT
+       ALIGN (3)
+       GLOBL   C_SYMBOL_NAME(__mpn_mul_1)
+C_SYMBOL_NAME(__mpn_mul_1:)
+
+       INSN1(push,l    ,R(edi))        /* save the registers used below */
+       INSN1(push,l    ,R(esi))
+       INSN1(push,l    ,R(ebx))
+       INSN1(push,l    ,R(ebp))
+
+       INSN2(mov,l     ,R(res_ptr),MEM_DISP(esp,20))   /* res_ptr; NOTE(review): destination-first operand order comes from asm-syntax.h -- confirm there */
+       INSN2(mov,l     ,R(s1_ptr),MEM_DISP(esp,24))    /* s1_ptr */
+       INSN2(mov,l     ,R(size),MEM_DISP(esp,28))      /* size */
+       INSN2(mov,l     ,R(s2_limb),MEM_DISP(esp,32))   /* s2_limb */
+
+       INSN2(lea,l     ,R(res_ptr),MEM_INDEX(res_ptr,size,4))  /* presumably res_ptr + 4*size, i.e. the end of the vector */
+       INSN2(lea,l     ,R(s1_ptr),MEM_INDEX(s1_ptr,size,4))
+       INSN1(neg,l     ,R(size))       /* negative index, counted up to zero */
+       INSN2(xor,l     ,R(edx),R(edx)) /* carry limb starts at zero */
+       ALIGN (3)
+Loop:
+       INSN2(mov,l     ,R(ebx),R(edx)) /* carry limb from the previous round */
+       INSN2(mov,l     ,R(eax),MEM_INDEX(s1_ptr,size,4))       /* next s1 limb */
+
+       INSN1(mul,l     ,R(s2_limb))    /* edx:eax = eax * s2_limb */
+
+       INSN2(add,l     ,R(eax),R(ebx)) /* low product word + incoming carry limb */
+
+       INSN2(adc,l     ,R(edx),$0)     /* propagate carry into the high word */
+       INSN2(mov,l     ,MEM_INDEX(res_ptr,size,4),R(eax))      /* store result limb */
+
+       INSN1(inc,l     ,R(size))
+       INSN1(jnz,      ,Loop)          /* index reached zero => done */
+
+
+       INSN2(mov,l     ,R(eax),R(edx)) /* return the final carry limb */
+       INSN1(pop,l     ,R(ebp))
+       INSN1(pop,l     ,R(ebx))
+       INSN1(pop,l     ,R(esi))
+       INSN1(pop,l     ,R(edi))
+       ret
diff --git a/sysdeps/i386/i586/rshift.S b/sysdeps/i386/i586/rshift.S

new file mode 100644 (file)

index 0000000..51cde8f
--- /dev/null
+++ b/sysdeps/i386/i586/rshift.S
@@ -0,0 +1,213 @@
+/* Pentium optimized __mpn_rshift -- Shift a limb vector right by CNT bits.
+
+Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Library General Public License as published by
+the Free Software Foundation; either version 2 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+License for more details.
+
+You should have received a copy of the GNU Library General Public License
+along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
+
+/*
+  INPUT PARAMETERS
+  res_ptr      (sp + 4)
+  s_ptr                (sp + 8)
+  size         (sp + 12)
+  cnt          (sp + 16)
+*/
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+.text
+       ALIGN (3)
+       .globl C_SYMBOL_NAME(__mpn_rshift)
+C_SYMBOL_NAME(__mpn_rshift:)
+       pushl   %edi                    /* save the registers used below */
+       pushl   %esi
+       pushl   %ebx
+       pushl   %ebp
+
+       movl    20(%esp),%edi           /* res_ptr */
+       movl    24(%esp),%esi           /* s_ptr */
+       movl    28(%esp),%ebp           /* size */
+       movl    32(%esp),%ecx           /* cnt */
+
+       cmp     $1,%ecx                 /* shift by exactly one bit? */
+       jne     Lnormal                 /* no => general shrdl code */
+       movl    %edi,%eax
+       subl    %esi,%eax               /* res_ptr - s_ptr, in bytes */
+       cmpl    %ebp,%eax               /* NOTE(review): byte distance vs. limb count -- confirm intended */
+       jnc     Lspecial                /* unsigned distance >= size => rotate-through-carry code */
+
+Lnormal:
+       movl    (%esi),%edx             /* least significant source limb */
+       addl    $4,%esi
+       xorl    %eax,%eax
+       shrdl   %cl,%edx,%eax           /* compute carry limb */
+       pushl   %eax                    /* push carry limb onto stack */
+
+       decl    %ebp
+       pushl   %ebp                    /* keep size-1 for the tail loop */
+       shrl    $3,%ebp                 /* number of 8-limb blocks */
+       jz      Lend
+
+       movl    (%edi),%eax             /* fetch destination cache line */
+
+       ALIGN   (2)
+Loop:  movl    28(%edi),%eax           /* fetch destination cache line */
+       movl    %edx,%ebx               /* limb carried over from the previous step */
+
+       movl    (%esi),%eax
+       movl    4(%esi),%edx
+       shrdl   %cl,%eax,%ebx           /* shift in the low bits of the next higher limb */
+       shrdl   %cl,%edx,%eax
+       movl    %ebx,(%edi)
+       movl    %eax,4(%edi)
+
+       movl    8(%esi),%ebx
+       movl    12(%esi),%eax
+       shrdl   %cl,%ebx,%edx
+       shrdl   %cl,%eax,%ebx
+       movl    %edx,8(%edi)
+       movl    %ebx,12(%edi)
+
+       movl    16(%esi),%edx
+       movl    20(%esi),%ebx
+       shrdl   %cl,%edx,%eax
+       shrdl   %cl,%ebx,%edx
+       movl    %eax,16(%edi)
+       movl    %edx,20(%edi)
+
+       movl    24(%esi),%eax
+       movl    28(%esi),%edx
+       shrdl   %cl,%eax,%ebx
+       shrdl   %cl,%edx,%eax
+       movl    %ebx,24(%edi)
+       movl    %eax,28(%edi)
+
+       addl    $32,%esi
+       addl    $32,%edi
+       decl    %ebp
+       jnz     Loop
+
+Lend:  popl    %ebp                    /* size-1 again */
+       andl    $7,%ebp                 /* limbs left over after the 8-limb blocks */
+       jz      Lend2
+Loop2: movl    (%esi),%eax
+       shrdl   %cl,%eax,%edx           /* compute result limb */
+       movl    %edx,(%edi)
+       movl    %eax,%edx
+       addl    $4,%esi
+       addl    $4,%edi
+       decl    %ebp
+       jnz     Loop2
+
+Lend2: shrl    %cl,%edx                /* compute most significant limb */
+       movl    %edx,(%edi)             /* store it */
+
+       popl    %eax                    /* pop carry limb */
+
+       popl    %ebp
+       popl    %ebx
+       popl    %esi
+       popl    %edi
+       ret
+
+/* We loop from most significant end of the arrays, which is only
+   permissible if the source and destination don't overlap, since the
+   function is documented to work for overlapping source and destination.
+*/
+
+Lspecial:
+       leal    -4(%edi,%ebp,4),%edi    /* point at the most significant limbs */
+       leal    -4(%esi,%ebp,4),%esi
+
+       movl    (%esi),%edx             /* most significant source limb */
+       subl    $4,%esi
+
+       decl    %ebp
+       pushl   %ebp                    /* keep size-1 for the tail loop */
+       shrl    $3,%ebp                 /* number of 8-limb blocks */
+
+       shrl    $1,%edx                 /* shift right by one; carry = bit shifted out */
+       incl    %ebp                    /* incl/decl pair sets ZF from %ebp ... */
+       decl    %ebp                    /* ... without modifying the carry flag */
+       jz      LLend
+
+       movl    (%edi),%eax             /* fetch destination cache line */
+
+       ALIGN   (2)
+LLoop: movl    -28(%edi),%eax          /* fetch destination cache line */
+       movl    %edx,%ebx               /* already shifted limb, stored below */
+
+       movl    (%esi),%eax
+       movl    -4(%esi),%edx
+       rcrl    $1,%eax                 /* carry from the limb above enters bit 31 */
+       movl    %ebx,(%edi)
+       rcrl    $1,%edx
+       movl    %eax,-4(%edi)
+
+       movl    -8(%esi),%ebx
+       movl    -12(%esi),%eax
+       rcrl    $1,%ebx
+       movl    %edx,-8(%edi)
+       rcrl    $1,%eax
+       movl    %ebx,-12(%edi)
+
+       movl    -16(%esi),%edx
+       movl    -20(%esi),%ebx
+       rcrl    $1,%edx
+       movl    %eax,-16(%edi)
+       rcrl    $1,%ebx
+       movl    %edx,-20(%edi)
+
+       movl    -24(%esi),%eax
+       movl    -28(%esi),%edx
+       rcrl    $1,%eax
+       movl    %ebx,-24(%edi)
+       rcrl    $1,%edx
+       movl    %eax,-28(%edi)
+
+       leal    -32(%esi),%esi          /* use leal not to clobber carry */
+       leal    -32(%edi),%edi
+       decl    %ebp
+       jnz     LLoop
+
+LLend: popl    %ebp
+       sbbl    %eax,%eax               /* save carry in %eax */
+       andl    $7,%ebp                 /* leftover limbs; this clobbers the carry flag */
+       jz      LLend2
+       addl    %eax,%eax               /* restore carry from eax */
+LLoop2:        movl    %edx,%ebx
+       movl    (%esi),%edx
+       rcrl    $1,%edx
+       movl    %ebx,(%edi)
+
+       leal    -4(%esi),%esi           /* use leal not to clobber carry */
+       leal    -4(%edi),%edi
+       decl    %ebp
+       jnz     LLoop2
+
+       jmp     LL1
+LLend2:        addl    %eax,%eax               /* restore carry from eax */
+LL1:   movl    %edx,(%edi)             /* store last limb */
+
+       movl    $0,%eax                 /* movl leaves the carry flag alone */
+       rcrl    $1,%eax                 /* return the bit shifted out in bit 31 of %eax */
+
+       popl    %ebp
+       popl    %ebx
+       popl    %esi
+       popl    %edi
+       ret
diff --git a/sysdeps/i386/i586/strchr.S b/sysdeps/i386/i586/strchr.S

new file mode 100644 (file)

index 0000000..982c80e
--- /dev/null
+++ b/sysdeps/i386/i586/strchr.S
@@ -0,0 +1,334 @@
+/* strchr -- find character CH in a NUL terminated string.
+Highly optimized version for ix86, x>=5.
+Copyright (C) 1995 Free Software Foundation, Inc.
+This file is part of the GNU C Library.
+Contributed by Ulrich Drepper, <drepper@gnu.ai.mit.edu>.
+
+The GNU C Library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+The GNU C Library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with the GNU C Library; see the file COPYING.LIB.  If
+not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
+
+#include <sysdep.h>
+
+/* This version is especially optimized for the i586 (and following?)
+   processors.  This is mainly done by using the two pipelines.  The
+   version optimized for i486 is weak in this aspect because to get
+   as much parallelism we have to execute some *more* instructions.
+
+   The code below is structured to reflect the pairing of the instructions
+   as *I think* it is.  I have no processor data book to verify this.
+   If you find something you think is incorrect let me know.  */
+
+
+/* The magic value which is used throughout in the whole code.  */
+#define magic 0xfefefeff
+
+/*
+   INPUT PARAMETERS:
+   str         (sp + 4)
+   ch          (sp + 8)
+*/
+
+       .text
+ENTRY (strchr)
+       pushl %edi              /* Save callee-saved registers.  */
+       pushl %esi
+
+       pushl %ebx
+       pushl %ebp
+
+       movl 20(%esp), %eax     /* get string pointer */
+       movl 24(%esp), %edx     /* get character we are looking for */
+
+       movl %eax, %edi         /* duplicate string pointer for later */
+       xorl %ecx, %ecx         /* clear %ecx */
+
+       /* At the moment %edx contains C.  What we need for the
+          algorithm is C in all bytes of the dword.  Avoid
+          operations on 16 bit words because these require a
+          prefix byte (and one more cycle).  */
+       movb %dl, %dh           /* now it is 0|0|c|c */
+       movb %dl, %cl           /* we construct the lower half in %ecx */
+
+       shll $16, %edx          /* now %edx is c|c|0|0 */
+       movb %cl, %ch           /* now %ecx is 0|0|c|c */
+
+       orl %ecx, %edx          /* and finally c|c|c|c */
+       andl $3, %edi           /* mask alignment bits */
+
+       jz L11                  /* alignment is 0 => start loop */
+
+       movb (%eax), %cl        /* load single byte */
+       cmpb %cl, %dl           /* is byte == C? */
+
+       je L2                   /* yes => return pointer */
+
+       cmp $0, %cl             /* is byte NUL? */
+       je L3                   /* yes => return NULL */
+
+       incl %eax               /* increment pointer */
+       cmp $3, %edi            /* was alignment == 3? */
+
+       je L11                  /* yes => start loop */
+
+       movb (%eax), %cl        /* load single byte */
+       cmpb %cl, %dl           /* is byte == C? */
+
+       je L2                   /* yes => return pointer */
+
+       cmp $0, %cl             /* is byte NUL? */
+       je L3                   /* yes => return NULL */
+
+       incl %eax               /* increment pointer */
+       cmp $2, %edi            /* was alignment == 2? */
+
+       je L11                  /* yes => start loop */
+
+       movb (%eax), %cl        /* load single byte */
+       cmpb %cl, %dl           /* is byte == C? */
+
+       je L2                   /* yes => return pointer */
+
+       cmp $0, %cl             /* is byte NUL? */
+       je L3                   /* yes => return NULL */
+
+       incl %eax               /* increment pointer */
+
+       /* The following code is the preparation for the loop.  The
+          four instructions up to `L1' will not be executed in the loop
+          because the same code is found at the end of the loop, but
+          there it is executed in parallel with other instructions.  */
+L11:   movl (%eax), %ecx
+       movl $magic, %ebp
+
+       movl $magic, %edi
+       addl %ecx, %ebp
+
+       /* The main loop: it looks complex and indeed it is.  I would
+          love to say `it was hard to write, so it should be hard to
+          read' but I will give some more hints.  To fully understand
+          this code you should first take a look at the i486 version.
+          The basic algorithm is the same, but here the code is organized
+          in a way which permits to use both pipelines all the time.
+
+          I tried to make it a bit more understandable by indenting
+          the code according to stage in the algorithm.  It goes as
+          follows:
+               check for 0 in 1st word
+                       check for C in 1st word
+                                       check for 0 in 2nd word
+                                               check for C in 2nd word
+               check for 0 in 3rd word
+                       check for C in 3rd word
+                                       check for 0 in 4th word
+                                               check for C in 4th word
+
+          Please note that doing the test for NUL before the test for
+          C allows us to overlap the test for 0 in the next word with
+          the test for C.  */
+
+L1:    xorl %ecx, %ebp                 /* (word+magic)^word */
+       addl %ecx, %edi                 /* add magic word */
+
+       leal 4(%eax), %eax              /* increment pointer */
+       jnc L4                          /* no carry from addl => word has a NUL byte */
+
+               movl %ecx, %ebx         /* duplicate original word */
+       orl $magic, %ebp                /* ((word+magic)^word)|magic */
+
+       addl $1, %ebp                   /* ((word+magic)^word)|magic==0xf..f? */
+       jne L4                          /* no => we found word with NUL */
+
+               movl $magic, %esi       /* load magic value */
+               xorl %edx, %ebx         /* clear bytes which are C */
+
+                                       movl (%eax), %ecx
+               addl %ebx, %esi         /* (word+magic) */
+
+                                       movl $magic, %edi
+               jnc L5                  /* no carry from addl => word has a C byte */
+
+                                       movl %edi, %ebp
+               xorl %ebx, %esi         /* (word+magic)^word */
+
+                                       addl %ecx, %ebp
+               orl $magic, %esi        /* ((word+magic)^word)|magic */
+
+               addl $1, %esi           /* ((word+magic)^word)|magic==0xf..f?*/
+               jne L5                  /* no => we found word with C */
+
+                                       xorl %ecx, %ebp
+                                       addl %ecx, %edi
+
+                                       leal 4(%eax), %eax
+                                       jnc L4
+
+                                               movl %ecx, %ebx
+                                       orl $magic, %ebp
+
+                                       addl $1, %ebp
+                                       jne L4
+
+                                               movl $magic, %esi
+                                               xorl %edx, %ebx
+
+       movl (%eax), %ecx
+                                               addl %ebx, %esi
+
+       movl $magic, %edi
+                                               jnc L5
+
+       movl %edi, %ebp
+                                               xorl %ebx, %esi
+
+       addl %ecx, %ebp
+                                               orl $magic, %esi
+
+                                               addl $1, %esi
+                                               jne L5
+
+       xorl %ecx, %ebp
+       addl %ecx, %edi
+
+       leal 4(%eax), %eax
+       jnc L4
+
+               movl %ecx, %ebx
+       orl $magic, %ebp
+
+       addl $1, %ebp
+       jne L4
+
+               movl $magic, %esi
+               xorl %edx, %ebx
+
+                                       movl (%eax), %ecx
+               addl %ebx, %esi
+
+                                       movl $magic, %edi
+               jnc L5
+
+                                       movl %edi, %ebp
+               xorl %ebx, %esi
+
+                                       addl %ecx, %ebp
+               orl $magic, %esi
+
+               addl $1, %esi
+               jne L5
+
+                                       xorl %ecx, %ebp
+                                       addl %ecx, %edi
+
+                                       leal 4(%eax), %eax
+                                       jnc L4
+
+                                               movl %ecx, %ebx
+                                       orl $magic, %ebp
+
+                                       addl $1, %ebp
+                                       jne L4
+
+                                               movl $magic, %esi
+                                               xorl %edx, %ebx
+
+       movl (%eax), %ecx
+                                               addl %ebx, %esi
+
+       movl $magic, %edi
+                                               jnc L5
+
+       movl %edi, %ebp
+                                               xorl %ebx, %esi
+
+       addl %ecx, %ebp
+                                               orl $magic, %esi
+
+                                               addl $1, %esi
+
+                                               je L1   /* neither NUL nor C in these four words => loop */
+
+       /* We know there is no NUL byte but a C byte in the word.
+          %ebx contains NUL in this particular byte.  */
+L5:    subl $4, %eax           /* adjust pointer */
+       testb %bl, %bl          /* first byte == C? */
+
+       jz L2                   /* yes => return pointer */
+
+       incl %eax               /* increment pointer */
+       testb %bh, %bh          /* second byte == C? */
+
+       jz L2                   /* yes => return pointer */
+
+       shrl $16, %ebx          /* make upper bytes accessible */
+       incl %eax               /* increment pointer */
+
+       cmp $0, %bl             /* third byte == C */
+       je L2                   /* yes => return pointer */
+
+       incl %eax               /* increment pointer */
+
+L2:    popl %ebp               /* restore saved registers */
+       popl %ebx
+
+       popl %esi
+       popl %edi
+
+       ret
+
+       /* We know there is a NUL byte in the word.  But we have to test
+          whether there is a C byte before it in the word.  */
+L4:    subl $4, %eax           /* adjust pointer */
+       cmpb %dl, %cl           /* first byte == C? */
+
+       je L2                   /* yes => return pointer */
+
+       cmpb $0, %cl            /* first byte == NUL? */
+       je L3                   /* yes => return NULL */
+
+       incl %eax               /* increment pointer */
+
+       cmpb %dl, %ch           /* second byte == C? */
+       je L2                   /* yes => return pointer */
+
+       cmpb $0, %ch            /* second byte == NUL? */
+       je L3                   /* yes => return NULL */
+
+       shrl $16, %ecx          /* make upper bytes accessible */
+       incl %eax               /* increment pointer */
+
+       cmpb %dl, %cl           /* third byte == C? */
+       je L2                   /* yes => return pointer */
+
+       cmpb $0, %cl            /* third byte == NUL? */
+       je L3                   /* yes => return NULL */
+
+       incl %eax               /* increment pointer */
+
+       /* The test for the fourth byte is necessary!  */
+       cmpb %dl, %ch           /* fourth byte == C? */
+       je L2                   /* yes => return pointer */
+
+L3:    xorl %eax, %eax         /* set return value = NULL */
+
+       popl %ebp               /* restore saved registers */
+       popl %ebx
+
+       popl %esi
+       popl %edi
+
+       ret
+
+#undef index
+weak_alias (strchr, index)
diff --git a/sysdeps/i386/i586/strlen.S b/sysdeps/i386/i586/strlen.S

new file mode 100644 (file)

index 0000000..b807ed4
--- /dev/null
+++ b/sysdeps/i386/i586/strlen.S
@@ -0,0 +1,185 @@
+/* strlen -- Compute length of NUL terminated string.
+Highly optimized version for ix86, x>=5.
+Copyright (C) 1995 Free Software Foundation, Inc.
+This file is part of the GNU C Library.
+Contributed by Ulrich Drepper, <drepper@gnu.ai.mit.edu>.
+
+The GNU C Library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+The GNU C Library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with the GNU C Library; see the file COPYING.LIB.  If
+not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
+
+#include <sysdep.h>
+
+/* This version is especially optimized for the i586 (and following?)
+   processors.  This is mainly done by using the two pipelines.  The
+   version optimized for i486 is weak in this aspect because to get
+   as much parallelism we have to execute some *more* instructions.
+
+   The code below is structured to reflect the pairing of the instructions
+   as *I think* it is.  I have no processor data book to verify this.
+   If you find something you think is incorrect let me know.  */
+
+
+/* The magic value which is used throughout the whole code.  */
+#define magic 0xfefefeff
+
+/*
+   INPUT PARAMETERS:
+   str         (sp + 4)
+*/
+
+       .text
+ENTRY(strlen)
+       movl 4(%esp), %eax      /* get string pointer */
+
+       movl %eax, %ecx         /* duplicate it */
+       andl $3, %ecx           /* mask alignment bits */
+
+       jz L11                  /* aligned => start loop */
+
+       cmpb %ch, (%eax)        /* is byte NUL?  (%ch == 0 as %ecx <= 3) */
+       je L2                   /* yes => return */
+
+       incl %eax               /* increment pointer */
+       cmpl $3, %ecx           /* was alignment = 3? */
+
+       je L11                  /* yes => now it is aligned and start loop */
+
+       cmpb %ch, (%eax)        /* is byte NUL? */
+       je L2                   /* yes => return */
+
+       incl %eax               /* increment pointer */
+       cmpl $2, %ecx           /* was alignment = 2? */
+
+       je L11                  /* yes => now it is aligned and start loop */
+
+       cmpb %ch, (%eax)        /* is byte NUL? */
+       je L2                   /* yes => return */
+
+       incl %eax               /* increment pointer */
+
+      /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to
+        change any of the hole bits of LONGWORD.
+
+        1) Is this safe?  Will it catch all the zero bytes?
+        Suppose there is a byte with all zeros.  Any carry bits
+        propagating from its left will fall into the hole at its
+        least significant bit and stop.  Since there will be no
+        carry from its most significant bit, the LSB of the
+        byte to the left will be unchanged, and the zero will be
+        detected.
+
+        2) Is this worthwhile?  Will it ignore everything except
+        zero bytes?  Suppose every byte of LONGWORD has a bit set
+        somewhere.  There will be a carry into bit 8.  If bit 8
+        is set, this will carry into bit 16.  If bit 8 is clear,
+        one of bits 9-15 must be set, so there will be a carry
+        into bit 16.  Similarly, there will be a carry into bit
+        24.  If one of bits 24-31 is set, there will be a carry
+        into bit 32 (=carry flag), so all of the hole bits will
+        be changed.  */
+L11:   xorl %edx, %edx         /* each round below starts with %edx == 0 */
+
+L1:
+       movl (%eax), %ecx       /* get word (= 4 bytes) in question */
+       addl $4, %eax           /* adjust pointer for *next* word */
+
+       subl %ecx, %edx         /* %edx = -word (first step of ~word) */
+       addl $magic, %ecx       /* add magic word */
+
+       decl %edx               /* %edx = ~word; decl leaves CF alone */
+       jnc L3                  /* no carry out of addl => NUL in word */
+
+       xorl %ecx, %edx         /* (word+magic)^~word */
+
+       andl $~magic, %edx      /* carry into bit 8, 16 or 24 missing? */
+
+       jne L3                  /* yes => determine byte */
+
+
+       movl (%eax), %ecx       /* get word (= 4 bytes) in question */
+       addl $4, %eax           /* adjust pointer for *next* word */
+
+       subl %ecx, %edx         /* %edx = -word (first step of ~word) */
+       addl $magic, %ecx       /* add magic word */
+
+       decl %edx               /* %edx = ~word; decl leaves CF alone */
+       jnc L3                  /* no carry out of addl => NUL in word */
+
+       xorl %ecx, %edx         /* (word+magic)^~word */
+
+       andl $~magic, %edx      /* carry into bit 8, 16 or 24 missing? */
+
+       jne L3                  /* yes => determine byte */
+
+
+       movl (%eax), %ecx       /* get word (= 4 bytes) in question */
+       addl $4, %eax           /* adjust pointer for *next* word */
+
+       subl %ecx, %edx         /* %edx = -word (first step of ~word) */
+       addl $magic, %ecx       /* add magic word */
+
+       decl %edx               /* %edx = ~word; decl leaves CF alone */
+       jnc L3                  /* no carry out of addl => NUL in word */
+
+       xorl %ecx, %edx         /* (word+magic)^~word */
+
+       andl $~magic, %edx      /* carry into bit 8, 16 or 24 missing? */
+
+       jne L3                  /* yes => determine byte */
+
+
+       movl (%eax), %ecx       /* get word (= 4 bytes) in question */
+       addl $4, %eax           /* adjust pointer for *next* word */
+
+       subl %ecx, %edx         /* %edx = -word (first step of ~word) */
+       addl $magic, %ecx       /* add magic word */
+
+       decl %edx               /* %edx = ~word; decl leaves CF alone */
+       jnc L3                  /* no carry out of addl => NUL in word */
+
+       xorl %ecx, %edx         /* (word+magic)^~word */
+
+       andl $~magic, %edx      /* carry into bit 8, 16 or 24 missing? */
+
+       je L1                   /* no => %edx is 0 again, next round */
+
+
+       /* Every exit from the loop (jnc, jne or falling through) arrives
+          here with word+magic in %ecx.  Undo the addition first so that
+          the byte tests below really look at the original word.  */
+L3:    subl $4, %eax           /* correct too early pointer increment */
+       subl $magic, %ecx       /* restore word */
+
+       testb %cl, %cl          /* lowest byte NUL? */
+       jz L2                   /* yes => return */
+
+       incl %eax               /* increment pointer */
+       testb %ch, %ch          /* second byte NUL? */
+
+       jz L2                   /* yes => return */
+
+       shrl $16, %ecx          /* make upper bytes accessible */
+       incl %eax               /* increment pointer */
+
+       cmpb $0, %cl            /* is third byte NUL? */
+       jz L2                   /* yes => return */
+
+       incl %eax               /* must be the fourth byte, no test needed */
+
+L2:    subl 4(%esp), %eax      /* now compute the length as difference
+                                  between start and terminating NUL
+                                  character */
+
+       ret
diff --git a/sysdeps/i386/i586/sub_n.S b/sysdeps/i386/i586/sub_n.S

new file mode 100644 (file)

index 0000000..1382e66
--- /dev/null
+++ b/sysdeps/i386/i586/sub_n.S
@@ -0,0 +1,136 @@
+/* Pentium __mpn_sub_n -- Subtract two limb vectors of the same length > 0
+   and store difference in a third limb vector.
+
+Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Library General Public License as published by
+the Free Software Foundation; either version 2 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+License for more details.
+
+You should have received a copy of the GNU Library General Public License
+along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
+
+/*
+   INPUT PARAMETERS
+   res_ptr     (sp + 4)
+   s1_ptr      (sp + 8)
+   s2_ptr      (sp + 12)
+   size                (sp + 16)
+*/
+
+#define r1     %eax
+#define r2     %edx
+#define src1   %esi
+#define src2   %ebp
+#define dst    %edi
+#define x      %ebx
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+.text
+       ALIGN (3)
+       .globl C_SYMBOL_NAME(__mpn_sub_n)
+C_SYMBOL_NAME(__mpn_sub_n:)
+       pushl   %edi                    /* save the registers used below as */
+       pushl   %esi                    /* dst, src1, x and src2; they are */
+       pushl   %ebx                    /* popped again in reverse order */
+       pushl   %ebp                    /* right before the final ret */
+
+       movl    20(%esp),dst            /* res_ptr (offsets include 4 pushes) */
+       movl    24(%esp),src1           /* s1_ptr */
+       movl    28(%esp),src2           /* s2_ptr */
+       movl    32(%esp),%ecx           /* size */
+
+       movl    (src2),x                /* preload s2[0]; x stays one limb ahead */
+
+       decl    %ecx                    /* size-1: last limb is done at Lend2 */
+       movl    %ecx,r2
+       shrl    $3,%ecx                 /* %ecx = number of full 8-limb rounds */
+       andl    $7,r2                   /* r2 = limbs left for Loop2 (0..7) */
+       testl   %ecx,%ecx               /* zero carry flag */
+       jz      Lend                    /* no full round => skip unrolled loop */
+       pushl   r2                      /* r2 (%edx) is scratch inside Loop */
+
+       ALIGN (3)
+Loop:  movl    28(dst),%eax            /* fetch destination cache line */
+       leal    32(dst),dst             /* dst now points past this round */
+
+L1:    movl    (src1),r1               /* limbs 0 and 1 of this round */
+       movl    4(src1),r2
+       sbbl    x,r1                    /* r1 = s1 limb - s2 limb - borrow */
+       movl    4(src2),x               /* movl does not change the carry */
+       sbbl    x,r2
+       movl    8(src2),x               /* preload s2 limb for next pair */
+       movl    r1,-32(dst)
+       movl    r2,-28(dst)
+
+L2:    movl    8(src1),r1              /* limbs 2 and 3 */
+       movl    12(src1),r2
+       sbbl    x,r1
+       movl    12(src2),x
+       sbbl    x,r2
+       movl    16(src2),x
+       movl    r1,-24(dst)
+       movl    r2,-20(dst)
+
+L3:    movl    16(src1),r1             /* limbs 4 and 5 */
+       movl    20(src1),r2
+       sbbl    x,r1
+       movl    20(src2),x
+       sbbl    x,r2
+       movl    24(src2),x
+       movl    r1,-16(dst)
+       movl    r2,-12(dst)
+
+L4:    movl    24(src1),r1             /* limbs 6 and 7 */
+       movl    28(src1),r2
+       sbbl    x,r1
+       movl    28(src2),x
+       sbbl    x,r2
+       movl    32(src2),x              /* first s2 limb after this round */
+       movl    r1,-8(dst)
+       movl    r2,-4(dst)
+
+       leal    32(src1),src1           /* leal advances w/o touching carry */
+       leal    32(src2),src2
+       decl    %ecx                    /* decl keeps the carry flag as well */
+       jnz     Loop
+
+       popl    r2                      /* get back the remaining limb count */
+Lend:
+       decl    r2                      /* test r2 w/o clobbering carry */
+       js      Lend2                   /* r2 was 0 => only last limb left */
+       incl    r2                      /* undo the decl, carry still intact */
+Loop2:
+       leal    4(dst),dst              /* one limb per iteration, r2 times */
+       movl    (src1),r1
+       sbbl    x,r1
+       movl    4(src2),x               /* preload next s2 limb */
+       movl    r1,-4(dst)
+       leal    4(src1),src1
+       leal    4(src2),src2
+       decl    r2
+       jnz     Loop2
+Lend2:
+       movl    (src1),r1               /* final limb, x was preloaded */
+       sbbl    x,r1
+       movl    r1,(dst)
+
+       sbbl    %eax,%eax               /* %eax = 0 or -1 from final borrow */
+       negl    %eax                    /* return value: borrow as 0 or 1 */
+
+       popl    %ebp
+       popl    %ebx
+       popl    %esi
+       popl    %edi
+       ret
diff --git a/sysdeps/i386/i586/submul_1.S b/sysdeps/i386/i586/submul_1.S

new file mode 100644 (file)

index 0000000..14bfe54
--- /dev/null
+++ b/sysdeps/i386/i586/submul_1.S
@@ -0,0 +1,82 @@
+/* Pentium __mpn_submul_1 -- Multiply a limb vector with a limb and subtract
+   the result from a second limb vector.
+
+Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Library General Public License as published by
+the Free Software Foundation; either version 2 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+License for more details.
+
+You should have received a copy of the GNU Library General Public License
+along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
+
+/*
+   INPUT PARAMETERS
+   res_ptr     (sp + 4)
+   s1_ptr      (sp + 8)
+   size                (sp + 12)
+   s2_limb     (sp + 16)
+*/
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define res_ptr edi
+#define s1_ptr esi
+#define size ecx
+#define s2_limb ebp
+
+       TEXT
+       ALIGN (3)
+       GLOBL   C_SYMBOL_NAME(__mpn_submul_1)
+C_SYMBOL_NAME(__mpn_submul_1:)
+
+       INSN1(push,l    ,R(edi))        /* save the registers used below; */
+       INSN1(push,l    ,R(esi))        /* restored in reverse order at end */
+       INSN1(push,l    ,R(ebx))
+       INSN1(push,l    ,R(ebp))
+
+       INSN2(mov,l     ,R(res_ptr),MEM_DISP(esp,20))   /* load the four */
+       INSN2(mov,l     ,R(s1_ptr),MEM_DISP(esp,24))    /* parameters; the */
+       INSN2(mov,l     ,R(size),MEM_DISP(esp,28))      /* offsets include */
+       INSN2(mov,l     ,R(s2_limb),MEM_DISP(esp,32))   /* the 4 pushes */
+
+       INSN2(lea,l     ,R(res_ptr),MEM_INDEX(res_ptr,size,4)) /* point past */
+       INSN2(lea,l     ,R(s1_ptr),MEM_INDEX(s1_ptr,size,4)) /* both vectors */
+       INSN1(neg,l     ,R(size))       /* index runs from -size up to 0 */
+       INSN2(xor,l     ,R(edx),R(edx)) /* carry limb starts as 0 */
+       ALIGN (3)
+Loop:
+       INSN2(mov,l     ,R(ebx),R(edx)) /* ebx = carry limb of last round */
+       INSN2(mov,l     ,R(eax),MEM_INDEX(s1_ptr,size,4)) /* next s1 limb */
+
+       INSN1(mul,l     ,R(s2_limb))    /* edx:eax = s1 limb * s2_limb */
+
+       INSN2(add,l     ,R(eax),R(ebx)) /* add old carry to low product */
+       INSN2(mov,l     ,R(ebx),MEM_INDEX(res_ptr,size,4)) /* res limb */
+
+       INSN2(adc,l     ,R(edx),$0)     /* carry of the add into high limb */
+       INSN2(sub,l     ,R(ebx),R(eax)) /* res limb - low product */
+
+       INSN2(adc,l     ,R(edx),$0)     /* borrow of the sub into high limb */
+       INSN2(mov,l     ,MEM_INDEX(res_ptr,size,4),R(ebx)) /* store result */
+
+       INSN1(inc,l     ,R(size))       /* step index; done when it hits 0 */
+       INSN1(jnz,      ,Loop)
+
+
+       INSN2(mov,l     ,R(eax),R(edx)) /* return the final carry limb */
+       INSN1(pop,l     ,R(ebp))
+       INSN1(pop,l     ,R(ebx))
+       INSN1(pop,l     ,R(esi))
+       INSN1(pop,l     ,R(edi))
+       ret
diff --git a/sysdeps/i386/memchr.S b/sysdeps/i386/memchr.S

new file mode 100644 (file)

index 0000000..9931f97
--- /dev/null
+++ b/sysdeps/i386/memchr.S
@@ -0,0 +1,315 @@
+/* memchr (str, ch, n) -- Return pointer to first occurrence of CH in STR less
+   than N.
+For Intel 80x86, x>=3.
+Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
+Optimised a little by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+This file is part of the GNU C Library.
+
+This version is developed using the same algorithm as the fast C
+version which carries the following introduction:
+
+Based on strlen implementation by Torbjorn Granlund (tege@sics.se),
+with help from Dan Sahlin (dan@sics.se) and
+commentary by Jim Blandy (jimb@ai.mit.edu);
+adaptation to memchr suggested by Dick Karpinski (dick@cca.ucsf.edu),
+and implemented by Roland McGrath (roland@ai.mit.edu).
+
+The GNU C Library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+The GNU C Library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with the GNU C Library; see the file COPYING.LIB.  If
+not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+/*
+   INPUT PARAMETERS:
+   str         (sp + 4)
+   c           (sp + 8)
+   len         (sp + 12)
+*/
+
+       .text
+ENTRY (memchr)
+       /* Save callee-saved registers used in this function.  */
+       pushl %esi
+       pushl %edi
+
+       /* Load parameters into registers.  */
+       movl 12(%esp), %eax     /* str: pointer to memory block.  */
+       movl 16(%esp), %edx     /* c: byte we are looking for.  */
+       movl 20(%esp), %esi     /* len: length of memory block.  */
+
+       /* If we must not test more than three characters test
+          them one by one.  This is especially true for 0.  */
+       cmpl $4, %esi
+       jb L3
+
+       /* At the moment %edx contains C.  What we need for the
+          algorithm is C in all bytes of the dword.  Avoid
+          operations on 16 bit words because these require a
+          prefix byte (and one more cycle).  */
+       movb %dl, %dh           /* Now it is 0|0|c|c */
+       movl %edx, %ecx
+       shll $16, %edx          /* Now c|c|0|0 */
+       movw %cx, %dx           /* And finally c|c|c|c */
+
+       /* Better performance can be achieved if the word (32
+          bit) memory access is aligned on a four-byte-boundary.
+          So process first bytes one by one until boundary is
+          reached. Don't use a loop for better performance.  */
+
+       testb $3, %al           /* correctly aligned ? */
+       je L2                   /* yes => begin loop */
+       cmpb %dl, (%eax)        /* compare byte */
+       je L9                   /* target found => return */
+       incl %eax               /* increment source pointer */
+       decl %esi               /* decrement length counter */
+       je L4                   /* len==0 => return NULL */
+
+       testb $3, %al           /* correctly aligned ? */
+       je L2                   /* yes => begin loop */
+       cmpb %dl, (%eax)        /* compare byte */
+       je L9                   /* target found => return */
+       incl %eax               /* increment source pointer */
+       decl %esi               /* decrement length counter */
+       je L4                   /* len==0 => return NULL */
+
+       testb $3, %al           /* correctly aligned ? */
+       je L2                   /* yes => begin loop */
+       cmpb %dl, (%eax)        /* compare byte */
+       je L9                   /* target found => return */
+       incl %eax               /* increment source pointer */
+       decl %esi               /* decrement length counter */
+       /* no test for len==0 here, because this is done in the
+          loop head */
+       jmp L2
+
+      /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to
+        change any of the hole bits of LONGWORD.
+
+        1) Is this safe?  Will it catch all the zero bytes?
+        Suppose there is a byte with all zeros.  Any carry bits
+        propagating from its left will fall into the hole at its
+        least significant bit and stop.  Since there will be no
+        carry from its most significant bit, the LSB of the
+        byte to the left will be unchanged, and the zero will be
+        detected.
+
+        2) Is this worthwhile?  Will it ignore everything except
+        zero bytes?  Suppose every byte of LONGWORD has a bit set
+        somewhere.  There will be a carry into bit 8.  If bit 8
+        is set, this will carry into bit 16.  If bit 8 is clear,
+        one of bits 9-15 must be set, so there will be a carry
+        into bit 16.  Similarly, there will be a carry into bit
+        24.  If one of bits 24-31 is set, there will be a carry
+        into bit 32 (=carry flag), so all of the hole bits will
+        be changed.
+
+        3) But wait!  Aren't we looking for C, not zero?
+        Good point.  So what we do is XOR LONGWORD with a longword,
+        each of whose bytes is C.  This turns each byte that is C
+        into a zero.  */
+
+
+       /* Each round the main loop processes 16 bytes.  */
+
+       ALIGN (4)
+
+L1:    movl (%eax), %ecx       /* get word (= 4 bytes) in question */
+       movl $0xfefefeff, %edi  /* magic value */
+       xorl %edx, %ecx         /* XOR with word c|c|c|c => bytes of str == c
+                                  are now 0 */
+       addl %ecx, %edi         /* add the magic value to the word.  We get
+                                  carry bits reported for each byte which
+                                  is *not* 0 */
+
+       /* According to the algorithm we had to reverse the effect of the
+          XOR first and then test the overflow bits.  But because the
+          following XOR would destroy the carry flag and it would (in a
+          representation with more than 32 bits) not alter the last
+          overflow, we can now test this condition.  If no carry is signaled
+          no overflow must have occurred in the last byte => it was 0.  */
+       jnc L8
+
+       /* We are only interested in carry bits that change due to the
+          previous add, so remove original bits */
+       xorl %ecx, %edi         /* ((word^charmask)+magic)^(word^charmask) */
+
+       /* Now test for the other three overflow bits.  */
+       orl $0xfefefeff, %edi   /* set all non-carry bits */
+       incl %edi               /* add 1: if one carry bit was *not* set
+                                  the addition will not result in 0.  */
+
+       /* If at least one byte of the word is C we don't get 0 in %edi.  */
+       jnz L8                  /* found it => return pointer */
+
+       /* This process is unfolded four times for better performance.
+          we don't increment the source pointer each time.  Instead we
+          use offsets and increment by 16 in each run of the loop.  But
+          before probing for the matching byte we need some extra code
+          (at L5, L6 and L7 below).  Even the len can be compared with
+          constants instead of decrementing each time.  */
+
+       movl 4(%eax), %ecx      /* get word (= 4 bytes) in question */
+       movl $0xfefefeff, %edi  /* magic value */
+       xorl %edx, %ecx         /* XOR with word c|c|c|c => bytes of str == c
+                                  are now 0 */
+       addl %ecx, %edi         /* add the magic value to the word.  We get
+                                  carry bits reported for each byte which
+                                  is *not* 0 */
+       jnc L7                  /* highest byte is C => return pointer */
+       xorl %ecx, %edi         /* ((word^charmask)+magic)^(word^charmask) */
+       orl $0xfefefeff, %edi   /* set all non-carry bits */
+       incl %edi               /* add 1: if one carry bit was *not* set
+                                  the addition will not result in 0.  */
+       jnz L7                  /* found it => return pointer */
+
+       movl 8(%eax), %ecx      /* get word (= 4 bytes) in question */
+       movl $0xfefefeff, %edi  /* magic value */
+       xorl %edx, %ecx         /* XOR with word c|c|c|c => bytes of str == c
+                                  are now 0 */
+       addl %ecx, %edi         /* add the magic value to the word.  We get
+                                  carry bits reported for each byte which
+                                  is *not* 0 */
+       jnc L6                  /* highest byte is C => return pointer */
+       xorl %ecx, %edi         /* ((word^charmask)+magic)^(word^charmask) */
+       orl $0xfefefeff, %edi   /* set all non-carry bits */
+       incl %edi               /* add 1: if one carry bit was *not* set
+                                  the addition will not result in 0.  */
+       jnz L6                  /* found it => return pointer */
+
+       movl 12(%eax), %ecx     /* get word (= 4 bytes) in question */
+       movl $0xfefefeff, %edi  /* magic value */
+       xorl %edx, %ecx         /* XOR with word c|c|c|c => bytes of str == c
+                                  are now 0 */
+       addl %ecx, %edi         /* add the magic value to the word.  We get
+                                  carry bits reported for each byte which
+                                  is *not* 0 */
+       jnc L5                  /* highest byte is C => return pointer */
+       xorl %ecx, %edi         /* ((word^charmask)+magic)^(word^charmask) */
+       orl $0xfefefeff, %edi   /* set all non-carry bits */
+       incl %edi               /* add 1: if one carry bit was *not* set
+                                  the addition will not result in 0.  */
+       jnz L5                  /* found it => return pointer */
+
+       /* Adjust both counters for a full round, i.e. 16 bytes.  */
+       addl $16, %eax
+L2:    subl $16, %esi
+       jae L1                  /* Still more than 16 bytes remaining */
+
+       /* Process remaining bytes separately.  */
+       cmpl $4-16, %esi        /* rest < 4 bytes? */
+       jb L3                   /* yes, then test byte by byte */
+
+       movl (%eax), %ecx       /* get word (= 4 bytes) in question */
+       movl $0xfefefeff, %edi  /* magic value */
+       xorl %edx, %ecx         /* XOR with word c|c|c|c => bytes of str == c
+                                  are now 0 */
+       addl %ecx, %edi         /* add the magic value to the word.  We get
+                                  carry bits reported for each byte which
+                                  is *not* 0 */
+       jnc L8                  /* highest byte is C => return pointer */
+       xorl %ecx, %edi         /* ((word^charmask)+magic)^(word^charmask) */
+       orl $0xfefefeff, %edi   /* set all non-carry bits */
+       incl %edi               /* add 1: if one carry bit was *not* set
+                                  the addition will not result in 0.  */
+       jne L8                  /* found it => return pointer */
+       addl $4, %eax           /* adjust source pointer */
+
+       cmpl $8-16, %esi        /* rest < 8 bytes? */
+       jb L3                   /* yes, then test byte by byte */
+
+       movl (%eax), %ecx       /* get word (= 4 bytes) in question */
+       movl $0xfefefeff, %edi  /* magic value */
+       xorl %edx, %ecx         /* XOR with word c|c|c|c => bytes of str == c
+                                  are now 0 */
+       addl %ecx, %edi         /* add the magic value to the word.  We get
+                                  carry bits reported for each byte which
+                                  is *not* 0 */
+       jnc L8                  /* highest byte is C => return pointer */
+       xorl %ecx, %edi         /* ((word^charmask)+magic)^(word^charmask) */
+       orl $0xfefefeff, %edi   /* set all non-carry bits */
+       incl %edi               /* add 1: if one carry bit was *not* set
+                                  the addition will not result in 0.  */
+       jne L8                  /* found it => return pointer */
+       addl $4, %eax           /* adjust source pointer */
+
+       cmpl $12-16, %esi       /* rest < 12 bytes? */
+       jb L3                   /* yes, then test byte by byte */
+
+       movl (%eax), %ecx       /* get word (= 4 bytes) in question */
+       movl $0xfefefeff, %edi  /* magic value */
+       xorl %edx, %ecx         /* XOR with word c|c|c|c => bytes of str == c
+                                  are now 0 */
+       addl %ecx, %edi         /* add the magic value to the word.  We get
+                                  carry bits reported for each byte which
+                                  is *not* 0 */
+       jnc L8                  /* highest byte is C => return pointer */
+       xorl %ecx, %edi         /* ((word^charmask)+magic)^(word^charmask) */
+       orl $0xfefefeff, %edi   /* set all non-carry bits */
+       incl %edi               /* add 1: if one carry bit was *not* set
+                                  the addition will not result in 0.  */
+       jne L8                  /* found it => return pointer */
+       addl $4, %eax           /* adjust source pointer */
+
+       /* Check the remaining bytes one by one.  */
+L3:    andl $3, %esi           /* mask out uninteresting bytes */
+       jz L4                   /* no remaining bytes => return NULL */
+
+       cmpb %dl, (%eax)        /* compare byte with C */
+       je L9                   /* equal, then return pointer */
+       incl %eax               /* increment source pointer */
+       decl %esi               /* decrement length */
+       jz L4                   /* no remaining bytes => return NULL */
+
+       cmpb %dl, (%eax)        /* compare byte with C */
+       je L9                   /* equal, then return pointer */
+       incl %eax               /* increment source pointer */
+       decl %esi               /* decrement length */
+       jz L4                   /* no remaining bytes => return NULL */
+
+       cmpb %dl, (%eax)        /* compare byte with C */
+       je L9                   /* equal, then return pointer */
+
+L4:    /* no byte found => return NULL */
+       xorl %eax, %eax
+       jmp L9
+
+       /* add missing source pointer increments */
+L5:    addl $4, %eax
+L6:    addl $4, %eax
+L7:    addl $4, %eax
+
+       /* Test for the matching byte in the word.  %ecx contains a NUL
+          char in the byte which originally was the byte we are looking
+          at.  */
+L8:    testb %cl, %cl          /* test first byte in dword */
+       jz L9                   /* if zero => return pointer */
+       incl %eax               /* increment source pointer */
+
+       testb %ch, %ch          /* test second byte in dword */
+       jz L9                   /* if zero => return pointer */
+       incl %eax               /* increment source pointer */
+
+       testl $0xff0000, %ecx   /* test third byte in dword */
+       jz L9                   /* if zero => return pointer */
+       incl %eax               /* increment source pointer */
+
+       /* No further test needed because we know it is one of the four bytes.  */
+
+L9:    popl %edi               /* pop saved registers */
+       popl %esi
+
+       ret
diff --git a/sysdeps/i386/memchr.c b/sysdeps/i386/memchr.c

deleted file mode 100644 (file)

index ff0f8d9..0000000
--- a/sysdeps/i386/memchr.c
+++ /dev/null
@@ -1,48 +0,0 @@
-/* memchr (str, ch, n) -- Return pointer to first occurrence of CH in STR less
-   than N.
-   For Intel 80x86, x>=3.
-   Copyright (C) 1991, 1992, 1993 Free Software Foundation, Inc.
-   Contributed by Torbjorn Granlund (tege@sics.se).
-
-The GNU C Library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Library General Public License as
-published by the Free Software Foundation; either version 2 of the
-License, or (at your option) any later version.
-
-The GNU C Library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-Library General Public License for more details.
-
-You should have received a copy of the GNU Library General Public
-License along with the GNU C Library; see the file COPYING.LIB.  If
-not, write to the Free Software Foundation, Inc., 675 Mass Ave,
-Cambridge, MA 02139, USA.  */
-
-#include <ansidecl.h>
-#include <string.h>
-
-#ifdef __GNUC__
-
-PTR
-DEFUN(memchr, (str, c, len),
-      CONST PTR str AND int c AND size_t len)
-{
-  PTR retval;
-  asm("cld\n"                  /* Search forward.  */
-      "testl %1,%1\n"          /* Clear Z flag, to handle LEN == 0.  */
-      /* Some old versions of gas need `repne' instead of `repnz'.  */
-      "repnz\n"                        /* Search for C in al.  */
-      "scasb\n"
-      "movl %2,%0\n"           /* Set %0 to 0 (without affecting Z flag).  */
-      "jnz done\n"             /* Jump if we found nothing equal to C.  */
-      "leal -1(%1),%0\n"       /* edi has been incremented.  Return edi-1.  */
-      "done:" :
-      "=a" (retval), "=D" (str), "=c" (len) :
-      "0" (c), "1" (str), "2" (len));
-  return retval;
-}
-
-#else
-#include <sysdeps/generic/memchr.c>
-#endif
diff --git a/sysdeps/i386/memcmp.S b/sysdeps/i386/memcmp.S

new file mode 100644 (file)

index 0000000..f16b44a
--- /dev/null
+++ b/sysdeps/i386/memcmp.S
@@ -0,0 +1,68 @@
+/* memcmp -- compare two memory blocks for differences in the first COUNT
+            bytes.
+Copyright (C) 1995 Free Software Foundation, Inc.
+This file is part of the GNU C Library.
+
+The GNU C Library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+The GNU C Library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with the GNU C Library; see the file COPYING.LIB.  If
+not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+/*
+   INPUT PARAMETERS:
+   block1      (sp + 4)
+   block2      (sp + 8)
+   len         (sp + 12)
+*/
+
+       .text
+ENTRY (memcmp)
+       pushl %esi              /* Save callee-saved register.  */
+       movl %edi, %edx         /* Note that %edx is not used and can
+                                  so be used to save %edi.  It's faster.  */
+
+       movl 8(%esp), %esi      /* Load address of block #1 (sp+4 at entry).  */
+       movl 12(%esp), %edi     /* Load address of block #2 (sp+8 at entry).  */
+       movl 16(%esp), %ecx     /* Load maximal length (sp+12 at entry).  */
+
+       cld                     /* Set direction of comparison.  */
+
+       xorl %eax, %eax         /* Default result; also sets ZF for len == 0.  */
+
+       repe                    /* Compare at most %ecx bytes.  */
+       cmpsb
+       jz L1                   /* If even last byte was equal we return 0.  */
+
+       /* The memory blocks are not equal.  So result of the last
+          subtraction is present in the carry flag.  It is set when
+          the byte in block #2 is bigger.  In this case we have to
+          return -1 (=0xffffffff), else 1.  */
+       sbbl %eax, %eax         /* This is tricky.  %eax == 0 and carry is set
+                                  or not depending on last subtraction.  */
+
+       /* At this point %eax == 0, if the byte of block #1 was bigger, and
+          0xffffffff if the last byte of block #2 was bigger.  The latter
+          case is already correct but the former needs a little adjustment.
+          Note that the following operation does not change 0xffffffff.  */
+       orb $1, %al             /* Change 0 to 1.  */
+
+L1:    popl %esi               /* Restore registers.  */
+       movl %edx, %edi
+
+       ret
+
+#undef bcmp
+weak_alias (memcmp, bcmp)
diff --git a/sysdeps/i386/stpcpy.S b/sysdeps/i386/stpcpy.S

new file mode 100644 (file)

index 0000000..f38a908
--- /dev/null
+++ b/sysdeps/i386/stpcpy.S
@@ -0,0 +1,87 @@
+/* stpcpy -- copy SRC to DEST returning the address of the terminating '\0'
+            in DEST.
+For Intel 80x86, x>=3.
+Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+Contributed by Ulrich Drepper (drepper@gnu.ai.mit.edu).
+This file is part of the GNU C Library.
+
+The GNU C Library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+The GNU C Library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with the GNU C Library; see the file COPYING.LIB.  If
+not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
+
+/* This function is defined neither in ANSI nor POSIX standards but is
+   also not invented here.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+/*
+   INPUT PARAMETERS:
+   dest                (sp + 4)
+   src         (sp + 8)
+*/
+
+       .text
+ENTRY (__stpcpy)
+       movl 4(%esp), %eax      /* load destination pointer */
+       movl 8(%esp), %ecx      /* load source pointer */
+
+       subl %eax, %ecx         /* magic: reduce number of loop variables
+                                  to one using addressing mode */
+
+       /* Here we would like to write
+
+       subl $4, %eax
+       ALIGN (4)
+
+       but the assembler is too smart and optimizes for the shortest
+       form where the number only needs one byte.  But if we could
+       have the long form we would not need the alignment.  */
+
+       .byte 0x81, 0xe8        /* This is `subl $0x00000004, %eax' */
+       .long 0x00000004
+
+       /* Four times unfolded loop with only one loop counter.  This
+          is achieved by the use of index+base addressing mode.  As the
+          loop counter we use the destination address because this is
+          also the result.  */
+L1:    addl $4, %eax           /* increment loop counter */
+
+       movb (%eax,%ecx), %dl   /* load current char */
+       movb %dl, (%eax)        /* and store it */
+       testb %dl, %dl          /* was it NUL? */
+       jz L2                   /* yes, then exit */
+
+       movb 1(%eax,%ecx), %dl  /* load current char */
+       movb %dl, 1(%eax)       /* and store it */
+       testb %dl, %dl          /* was it NUL? */
+       jz L3                   /* yes, then exit */
+
+       movb 2(%eax,%ecx), %dl  /* load current char */
+       movb %dl, 2(%eax)       /* and store it */
+       testb %dl, %dl          /* was it NUL? */
+       jz L4                   /* yes, then exit */
+
+       movb 3(%eax,%ecx), %dl  /* load current char */
+       movb %dl, 3(%eax)       /* and store it */
+       testb %dl, %dl          /* was it NUL? */
+       jnz L1                  /* no, then continue loop */
+
+       incl %eax               /* make %eax point at the NUL just stored */
+L4:    incl %eax
+L3:    incl %eax
+L2:
+       ret
+
+weak_alias (__stpcpy, stpcpy)
diff --git a/sysdeps/i386/stpncpy.S b/sysdeps/i386/stpncpy.S

new file mode 100644 (file)

index 0000000..59192e6
--- /dev/null
+++ b/sysdeps/i386/stpncpy.S
@@ -0,0 +1,143 @@
+/* stpncpy -- copy no more than N bytes from SRC to DEST, returning the
+             address of the terminating '\0' in DEST.
+For Intel 80x86, x>=3.
+Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
+Some bug fixes by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+  - original wrote n+1 chars in some cases.
+  - stpncpy() ought to behave like strncpy() ie. not null-terminate
+    if limited by n.  glibc-1.09 stpncpy() does this.
+This file is part of the GNU C Library.
+
+The GNU C Library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+The GNU C Library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with the GNU C Library; see the file COPYING.LIB.  If
+not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+/*
+   INPUT PARAMETERS:
+   dest                (sp + 4)
+   src         (sp + 8)
+   maxlen      (sp + 12)
+*/
+
+       .text
+ENTRY (__stpncpy)
+
+       pushl %esi
+
+       movl 8(%esp), %eax      /* load destination pointer */
+       movl 12(%esp), %esi     /* load source pointer */
+       movl 16(%esp), %ecx     /* load maximal length */
+
+       subl %eax, %esi         /* magic: reduce number of loop variables
+                                  to one using addressing mode */
+       jmp L1                  /* jump to loop "head" */
+
+       ALIGN(4)
+
+       /* Four times unfolded loop with two loop counters.  We get
+          the third value (the source address) by using the index+base
+          addressing mode.  */
+L2:    movb (%eax,%esi), %dl   /* load current char */
+       movb %dl, (%eax)        /* and store it */
+       testb %dl, %dl          /* was it NUL? */
+       jz L7                   /* yes, then exit */
+
+       movb 1(%eax,%esi), %dl  /* load current char */
+       movb %dl, 1(%eax)       /* and store it */
+       testb %dl, %dl          /* was it NUL? */
+       jz L6                   /* yes, then exit */
+
+       movb 2(%eax,%esi), %dl  /* load current char */
+       movb %dl, 2(%eax)       /* and store it */
+       testb %dl, %dl          /* was it NUL? */
+       jz L5                   /* yes, then exit */
+
+       movb 3(%eax,%esi), %dl  /* load current char */
+       movb %dl, 3(%eax)       /* and store it */
+       testb %dl, %dl          /* was it NUL? */
+       jz L4                   /* yes, then exit */
+
+       addl $4, %eax           /* increment loop counter for full round */
+
+L1:    subl $4, %ecx           /* at least 4 bytes still allowed? */
+       jae L2                  /* yes, then go to start of loop */
+
+       /* The at most 3 remaining bytes are not processed in a loop.  */
+
+       addl $4, %ecx           /* correct above subtraction */
+       jz L9                   /* maximal allowed char reached => go to end */
+
+       movb (%eax,%esi), %dl   /* load current char */
+       movb %dl, (%eax)        /* and store it */
+       testb %dl, %dl          /* was it NUL? */
+       jz L3                   /* yes, then exit */
+
+       incl %eax               /* increment pointer */
+       decl %ecx               /* decrement length counter */
+       jz L9                   /* no more allowed => exit */
+
+       movb (%eax,%esi), %dl   /* load current char */
+       movb %dl, (%eax)        /* and store it */
+       testb %dl, %dl          /* was it NUL? */
+       jz L3                   /* yes, then exit */
+
+       incl %eax               /* increment pointer */
+       decl %ecx               /* decrement length counter */
+       jz L9                   /* no more allowed => exit */
+
+       movb (%eax,%esi), %dl   /* load current char */
+       movb %dl, (%eax)        /* and store it */
+       testb %dl, %dl          /* was it NUL? */
+       jz L3                   /* yes, then exit */
+
+       incl %eax               /* increment pointer */
+       jmp L9                  /* we don't have to test for counter underflow
+                                  because we know we had at most 3 bytes
+                                  remaining => exit */
+
+       /* When coming from the main loop we have to adjust the pointer.  */
+L4:    decl %ecx               /* decrement counter */
+       incl %eax               /* increment pointer */
+
+L5:    decl %ecx               /* decrement counter */
+       incl %eax               /* increment pointer */
+
+L6:    decl %ecx               /* decrement counter */
+       incl %eax               /* increment pointer */
+L7:
+
+       addl $3, %ecx           /* correct pre-decrementation of counter
+                                  at the beginning of the loop; but why 3
+                                  and not 4?  Very simple, we have to count
+                                  the NUL char we already wrote.  */
+       jz L9                   /* counter is also 0 => exit */
+
+       /* We now have to fill the rest of the buffer with NUL.  This
+          is done in a tricky way.  Please note that the addressing mode
+          used below is not the same we used above.  Here we use the
+          %ecx register.  */
+L8:
+       movb $0, (%ecx,%eax)    /* store NUL char */
+L3:    decl %ecx               /* all bytes written? */
+       jnz L8                  /* no, then again */
+
+L9:    popl %esi               /* restore saved register content */
+
+       ret
+
+weak_alias (__stpncpy, stpncpy)
diff --git a/sysdeps/i386/strchr.S b/sysdeps/i386/strchr.S

new file mode 100644 (file)

index 0000000..de947cd
--- /dev/null
+++ b/sysdeps/i386/strchr.S
@@ -0,0 +1,278 @@
+/* strchr (str, ch) -- Return pointer to first occurrence of CH in STR.
+For Intel 80x86, x>=3.
+Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
+Some optimisations by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+This file is part of the GNU C Library.
+
+The GNU C Library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+The GNU C Library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with the GNU C Library; see the file COPYING.LIB.  If
+not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+/*
+   INPUT PARAMETERS:
+   str         (sp + 4)
+   ch          (sp + 8)
+*/
+
+       .text
+ENTRY (strchr)
+       pushl %edi              /* Save callee-saved register used here.  */
+
+       movl 8(%esp), %eax      /* get string pointer */
+       movl 12(%esp), %edx     /* get character we are looking for */
+
+       /* At the moment %edx contains C.  What we need for the
+          algorithm is C in all bytes of the dword.  Avoid
+          operations on 16 bit words because these require a
+          prefix byte (and one more cycle).  */
+       movb %dl, %dh           /* now it is 0|0|c|c */
+       movl %edx, %ecx
+       shll $16, %edx          /* now it is c|c|0|0 */
+       movw %cx, %dx           /* and finally c|c|c|c */
+
+       /* Before we start with the main loop we process single bytes
+          until the source pointer is aligned.  This has two reasons:
+          1. aligned 32-bit memory access is faster
+          and (more important)
+          2. we process in the main loop 32 bit in one step although
+             we don't know the end of the string.  But accessing at
+             4-byte alignment guarantees that we never access illegal
+             memory if this would not also be done by the trivial
+             implementation (this is because all processor inherent
+             boundaries are multiples of 4).  */
+
+       testb $3, %al           /* correctly aligned ? */
+       jz L11                  /* yes => begin loop */
+       movb (%eax), %cl        /* load byte in question (we need it twice) */
+       cmpb %cl, %dl           /* compare byte */
+       je L6                   /* target found => return */
+       testb %cl, %cl          /* is NUL? */
+       jz L2                   /* yes => return NULL */
+       incl %eax               /* increment pointer */
+
+       testb $3, %al           /* correctly aligned ? */
+       jz L11                  /* yes => begin loop */
+       movb (%eax), %cl        /* load byte in question (we need it twice) */
+       cmpb %cl, %dl           /* compare byte */
+       je L6                   /* target found => return */
+       testb %cl, %cl          /* is NUL? */
+       jz L2                   /* yes => return NULL */
+       incl %eax               /* increment pointer */
+
+       testb $3, %al           /* correctly aligned ? */
+       jz L11                  /* yes => begin loop */
+       movb (%eax), %cl        /* load byte in question (we need it twice) */
+       cmpb %cl, %dl           /* compare byte */
+       je L6                   /* target found => return */
+       testb %cl, %cl          /* is NUL? */
+       jz L2                   /* yes => return NULL */
+       incl %eax               /* increment pointer */
+
+       /* Now we have reached alignment.  */
+       jmp L11                 /* begin loop */
+
+      /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to
+        change any of the hole bits of LONGWORD.
+
+        1) Is this safe?  Will it catch all the zero bytes?
+        Suppose there is a byte with all zeros.  Any carry bits
+        propagating from its left will fall into the hole at its
+        least significant bit and stop.  Since there will be no
+        carry from its most significant bit, the LSB of the
+        byte to the left will be unchanged, and the zero will be
+        detected.
+
+        2) Is this worthwhile?  Will it ignore everything except
+        zero bytes?  Suppose every byte of LONGWORD has a bit set
+        somewhere.  There will be a carry into bit 8.  If bit 8
+        is set, this will carry into bit 16.  If bit 8 is clear,
+        one of bits 9-15 must be set, so there will be a carry
+        into bit 16.  Similarly, there will be a carry into bit
+        24.  If one of bits 24-31 is set, there will be a carry
+        into bit 32 (=carry flag), so all of the hole bits will
+        be changed.
+
+        3) But wait!  Aren't we looking for C, not zero?
+        Good point.  So what we do is XOR LONGWORD with a longword,
+        each of whose bytes is C.  This turns each byte that is C
+        into a zero.  */
+
+       /* Each round the main loop processes 16 bytes.  */
+
+       ALIGN(4)
+
+L1:    addl $16, %eax          /* adjust pointer for whole round */
+
+L11:   movl (%eax), %ecx       /* get word (= 4 bytes) in question */
+       xorl %edx, %ecx         /* XOR with word c|c|c|c => bytes of str == c
+                                  are now 0 */
+       movl $0xfefefeff, %edi  /* magic value */
+       addl %ecx, %edi         /* add the magic value to the word.  We get
+                                  carry bits reported for each byte which
+                                  is *not* C */
+
+       /* According to the algorithm we had to reverse the effect of the
+          XOR first and then test the overflow bits.  But because the
+          following XOR would destroy the carry flag and it would (in a
+          representation with more than 32 bits) not alter the last
+          overflow, we can now test this condition.  If no carry is signaled
+          no overflow must have occurred in the last byte => it was 0.  */
+       jnc L7
+
+       /* We are only interested in carry bits that change due to the
+          previous add, so remove original bits */
+       xorl %ecx, %edi         /* ((word^charmask)+magic)^(word^charmask) */
+
+       /* Now test for the other three overflow bits.  */
+       orl $0xfefefeff, %edi   /* set all non-carry bits */
+       incl %edi               /* add 1: if one carry bit was *not* set
+                                  the addition will not result in 0.  */
+
+       /* If at least one byte of the word is C we don't get 0 in %edi.  */
+       jnz L7                  /* found it => return pointer */
+
+       /* Now we made sure the dword does not contain the character we are
+          looking for.  But because we deal with strings we have to check
+          for the end of string before testing the next dword.  */
+
+       xorl %edx, %ecx         /* restore original dword without reload */
+       movl $0xfefefeff, %edi  /* magic value */
+       addl %ecx, %edi         /* add the magic value to the word.  We get
+                                  carry bits reported for each byte which
+                                  is *not* 0 */
+       jnc L2                  /* highest byte is NUL => return NULL */
+       xorl %ecx, %edi         /* (word+magic)^word */
+       orl $0xfefefeff, %edi   /* set all non-carry bits */
+       incl %edi               /* add 1: if one carry bit was *not* set
+                                  the addition will not result in 0.  */
+       jnz L2                  /* found NUL => return NULL */
+
+       movl 4(%eax), %ecx      /* get word (= 4 bytes) in question */
+       xorl %edx, %ecx         /* XOR with word c|c|c|c => bytes of str == c
+                                  are now 0 */
+       movl $0xfefefeff, %edi  /* magic value */
+       addl %ecx, %edi         /* add the magic value to the word.  We get
+                                  carry bits reported for each byte which
+                                  is *not* C */
+       jnc L71                 /* highest byte is C => return pointer */
+       xorl %ecx, %edi         /* ((word^charmask)+magic)^(word^charmask) */
+       orl $0xfefefeff, %edi   /* set all non-carry bits */
+       incl %edi               /* add 1: if one carry bit was *not* set
+                                  the addition will not result in 0.  */
+       jnz L71                 /* found it => return pointer */
+       xorl %edx, %ecx         /* restore original dword without reload */
+       movl $0xfefefeff, %edi  /* magic value */
+       addl %ecx, %edi         /* add the magic value to the word.  We get
+                                  carry bits reported for each byte which
+                                  is *not* 0 */
+       jnc L2                  /* highest byte is NUL => return NULL */
+       xorl %ecx, %edi         /* (word+magic)^word */
+       orl $0xfefefeff, %edi   /* set all non-carry bits */
+       incl %edi               /* add 1: if one carry bit was *not* set
+                                  the addition will not result in 0.  */
+       jnz L2                  /* found NUL => return NULL */
+
+       movl 8(%eax), %ecx      /* get word (= 4 bytes) in question */
+       xorl %edx, %ecx         /* XOR with word c|c|c|c => bytes of str == c
+                                  are now 0 */
+       movl $0xfefefeff, %edi  /* magic value */
+       addl %ecx, %edi         /* add the magic value to the word.  We get
+                                  carry bits reported for each byte which
+                                  is *not* C */
+       jnc L72                 /* highest byte is C => return pointer */
+       xorl %ecx, %edi         /* ((word^charmask)+magic)^(word^charmask) */
+       orl $0xfefefeff, %edi   /* set all non-carry bits */
+       incl %edi               /* add 1: if one carry bit was *not* set
+                                  the addition will not result in 0.  */
+       jnz L72                 /* found it => return pointer */
+       xorl %edx, %ecx         /* restore original dword without reload */
+       movl $0xfefefeff, %edi  /* magic value */
+       addl %ecx, %edi         /* add the magic value to the word.  We get
+                                  carry bits reported for each byte which
+                                  is *not* 0 */
+       jnc L2                  /* highest byte is NUL => return NULL */
+       xorl %ecx, %edi         /* (word+magic)^word */
+       orl $0xfefefeff, %edi   /* set all non-carry bits */
+       incl %edi               /* add 1: if one carry bit was *not* set
+                                  the addition will not result in 0.  */
+       jnz L2                  /* found NUL => return NULL */
+
+       movl 12(%eax), %ecx     /* get word (= 4 bytes) in question */
+       xorl %edx, %ecx         /* XOR with word c|c|c|c => bytes of str == c
+                                  are now 0 */
+       movl $0xfefefeff, %edi  /* magic value */
+       addl %ecx, %edi         /* add the magic value to the word.  We get
+                                  carry bits reported for each byte which
+                                  is *not* C */
+       jnc L73                 /* highest byte is C => return pointer */
+       xorl %ecx, %edi         /* ((word^charmask)+magic)^(word^charmask) */
+       orl $0xfefefeff, %edi   /* set all non-carry bits */
+       incl %edi               /* add 1: if one carry bit was *not* set
+                                  the addition will not result in 0.  */
+       jnz L73                 /* found it => return pointer */
+       xorl %edx, %ecx         /* restore original dword without reload */
+       movl $0xfefefeff, %edi  /* magic value */
+       addl %ecx, %edi         /* add the magic value to the word.  We get
+                                  carry bits reported for each byte which
+                                  is *not* 0 */
+       jnc L2                  /* highest byte is NUL => return NULL */
+       xorl %ecx, %edi         /* (word+magic)^word */
+       orl $0xfefefeff, %edi   /* set all non-carry bits */
+       incl %edi               /* add 1: if one carry bit was *not* set
+                                  the addition will not result in 0.  */
+       jz L1                   /* no NUL found => restart loop */
+
+L2:    /* Return NULL.  */
+       xorl %eax, %eax         /* load NULL in return value register */
+       popl %edi               /* restore saved register content */
+       ret
+
+L73:   addl $4, %eax           /* adjust pointer */
+L72:   addl $4, %eax
+L71:   addl $4, %eax
+
+       /* We now scan for the byte in which the character was matched.
+          But we have to take care of the case that a NUL char is
+          found before this in the dword.  */
+
+L7:    testb %cl, %cl          /* is first byte C? */
+       jz L6                   /* yes => return pointer */
+       cmpb %dl, %cl           /* is first byte NUL? */
+       je L2                   /* yes => return NULL */
+       incl %eax               /* it's not in the first byte */
+
+       testb %ch, %ch          /* is second byte C? */
+       jz L6                   /* yes => return pointer */
+       cmpb %dl, %ch           /* is second byte NUL? */
+       je L2                   /* yes => return NULL */
+       incl %eax               /* it's not in the second byte */
+
+       shrl $16, %ecx          /* make upper byte accessible */
+       testb %cl, %cl          /* is third byte C? */
+       jz L6                   /* yes => return pointer */
+       cmpb %dl, %cl           /* is third byte NUL? */
+       je L2                   /* yes => return NULL */
+
+       /* It must be in the fourth byte and it cannot be NUL.  */
+       incl %eax
+
+L6:    popl %edi               /* restore saved register content */
+
+       ret
+
+weak_alias (strchr, index)
diff --git a/sysdeps/i386/strcspn.S b/sysdeps/i386/strcspn.S

new file mode 100644 (file)

index 0000000..b0e789b
--- /dev/null
+++ b/sysdeps/i386/strcspn.S
@@ -0,0 +1,176 @@
+/* strcspn (str, ss) -- Return the length of the initial segment of STR
+                       which contains no characters from SS.
+For Intel 80x86, x>=3.
+Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
+Bug fixes by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+This file is part of the GNU C Library.
+
+The GNU C Library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+The GNU C Library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with the GNU C Library; see the file COPYING.LIB.  If
+not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+/*
+   INPUT PARAMETERS:
+   str         (sp + 4)
+   stopset     (sp + 8)
+*/
+
+       .text
+ENTRY (strcspn)
+       movl 4(%esp), %edx      /* get string pointer */
+       movl 8(%esp), %eax      /* get stopset pointer */
+
+       /* First we create a table with flags for all possible characters.
+          For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
+          supported by the C string functions we have 256 characters.
+          Before inserting marks for the stop characters we clear the whole
+          table.  The unrolled form is much faster than a loop.  */
+       xorl %ecx, %ecx         /* %ecx = 0 !!! */
+
+       pushl %ecx              /* make a 256 bytes long block filled with 0 */
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl $0                /* These immediate values make the label 2 */
+       pushl $0                /* to be aligned on a 16 byte boundary to */
+       pushl $0                /* get a better performance of the loop.  */
+       pushl $0
+       pushl $0
+       pushl $0
+
+/* For understanding the following code remember that %ecx == 0 now.
+   Although all the following instruction only modify %cl we always
+   have a correct zero-extended 32-bit value in %ecx.  */
+
+/* Don't change the "testb $0xff,%%cl" to "testb %%cl,%%cl".  We want
+   longer instructions so that the next loop aligns without adding nops.  */
+
+L2:    movb (%eax), %cl        /* get byte from stopset */
+       testb %cl, %cl          /* is NUL char? */
+       jz L1                   /* yes => start compare loop */
+       movb %cl, (%esp,%ecx)   /* set corresponding byte in stopset table */
+
+       movb 1(%eax), %cl       /* get byte from stopset */
+       testb $0xff, %cl        /* is NUL char? */
+       jz L1                   /* yes => start compare loop */
+       movb %cl, (%esp,%ecx)   /* set corresponding byte in stopset table */
+
+       movb 2(%eax), %cl       /* get byte from stopset */
+       testb $0xff, %cl        /* is NUL char? */
+       jz L1                   /* yes => start compare loop */
+       movb %cl, (%esp,%ecx)   /* set corresponding byte in stopset table */
+
+       movb 3(%eax), %cl       /* get byte from stopset */
+       addl $4, %eax           /* increment stopset pointer */
+       movb %cl, (%esp,%ecx)   /* set corresponding byte in stopset table */
+       testb $0xff, %cl        /* is NUL char? */
+       jnz L2                  /* no => process next dword from stopset */
+
+L1:    leal -4(%edx), %eax     /* prepare loop */
+
+       /* We use a neat trick for the following loop.  Normally we would
+          have to test for two termination conditions
+          1. a character in the stopset was found
+          and
+          2. the end of the string was found
+          But as a sign that the chracter is in the stopset we store its
+          value in the table.  But the value of NUL is NUL so the loop
+          terminates for NUL in every case.  */
+
+L3:    addl $4, %eax           /* adjust pointer for full loop round */
+
+       movb (%eax), %cl        /* get byte from string */
+       cmpb %cl, (%esp,%ecx)   /* is it contained in stopset? */
+       je L4                   /* yes => return */
+
+       movb 1(%eax), %cl       /* get byte from string */
+       cmpb %cl, (%esp,%ecx)   /* is it contained in stopset? */
+       je L5                   /* yes => return */
+
+       movb 2(%eax), %cl       /* get byte from string */
+       cmpb %cl, (%esp,%ecx)   /* is it contained in stopset? */
+       je L6                   /* yes => return */
+
+       movb 3(%eax), %cl       /* get byte from string */
+       cmpb %cl, (%esp,%ecx)   /* is it contained in stopset? */
+       jne L3                  /* yes => return */
+
+       incl %eax               /* adjust pointer */
+L6:    incl %eax
+L5:    incl %eax
+
+L4:    subl %edx, %eax         /* we have to return the number of valid
+                                  characters, so compute distance to first
+                                  non-valid character */
+       addl $256, %esp         /* remove stopset */
+
+       ret
diff --git a/sysdeps/i386/strpbrk.S b/sysdeps/i386/strpbrk.S

new file mode 100644 (file)

index 0000000..245bf1a
--- /dev/null
+++ b/sysdeps/i386/strpbrk.S
@@ -0,0 +1,177 @@
+/* strpbrk (str, ss) -- Return a pointer to the first character of STR
+                       which also occurs in SS, or NULL if there is none.
+For Intel 80x86, x>=3.
+Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
+Bug fixes by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+This file is part of the GNU C Library.
+
+The GNU C Library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+The GNU C Library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with the GNU C Library; see the file COPYING.LIB.  If
+not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+/*
+   INPUT PARAMETERS:
+   str         (sp + 4)
+   stopset     (sp + 8)
+*/
+
+       .text
+ENTRY (strpbrk)
+       movl 4(%esp), %edx      /* get string pointer */
+       movl 8(%esp), %eax      /* get stopset pointer */
+
+       /* First we create a table with flags for all possible characters.
+          For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
+          supported by the C string functions we have 256 characters.
+          Before inserting marks for the stop characters we clear the whole
+          table.  The unrolled form is much faster than a loop.  */
+       xorl %ecx, %ecx         /* %ecx = 0 !!! */
+
+       pushl %ecx              /* make a 256 bytes long block filled with 0 */
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl $0                /* These immediate values make the label 2 */
+       pushl $0                /* to be aligned on a 16 byte boundary to */
+       pushl $0                /* get a better performance of the loop.  */
+       pushl $0
+       pushl $0
+       pushl $0
+
+/* For understanding the following code remember that %ecx == 0 now.
+   Although all the following instruction only modify %cl we always
+   have a correct zero-extended 32-bit value in %ecx.  */
+
+/* Don't change the "testb $0xff,%%cl" to "testb %%cl,%%cl".  We want
+   longer instructions so that the next loop aligns without adding nops.  */
+
+L2:    movb (%eax), %cl        /* get byte from stopset */
+       testb %cl, %cl          /* is NUL char? */
+       jz L1                   /* yes => start compare loop */
+       movb %cl, (%esp,%ecx)   /* set corresponding byte in stopset table */
+
+       movb 1(%eax), %cl       /* get byte from stopset */
+       testb $0xff, %cl        /* is NUL char? */
+       jz L1                   /* yes => start compare loop */
+       movb %cl, (%esp,%ecx)   /* set corresponding byte in stopset table */
+
+       movb 2(%eax), %cl       /* get byte from stopset */
+       testb $0xff, %cl        /* is NUL char? */
+       jz L1                   /* yes => start compare loop */
+       movb %cl, (%esp,%ecx)   /* set corresponding byte in stopset table */
+
+       movb 3(%eax), %cl       /* get byte from stopset */
+       addl $4, %eax           /* increment stopset pointer */
+       movb %cl, (%esp,%ecx)   /* set corresponding byte in stopset table */
+       testb $0xff, %cl        /* is NUL char? */
+       jnz L2                  /* no => process next dword from stopset */
+
+L1:    leal -4(%edx), %eax     /* prepare loop */
+
+       /* We use a neat trick for the following loop.  Normally we would
+          have to test for two termination conditions
+          1. a character in the stopset was found
+          and
+          2. the end of the string was found
+          But as a sign that the chracter is in the stopset we store its
+          value in the table.  But the value of NUL is NUL so the loop
+          terminates for NUL in every case.  */
+
+L3:    addl $4, %eax           /* adjust pointer for full loop round */
+
+       movb (%eax), %cl        /* get byte from string */
+       cmpb %cl, (%esp,%ecx)   /* is it contained in stopset? */
+       je L4                   /* yes => return */
+
+       movb 1(%eax), %cl       /* get byte from string */
+       cmpb %cl, (%esp,%ecx)   /* is it contained in stopset? */
+       je L5                   /* yes => return */
+
+       movb 2(%eax), %cl       /* get byte from string */
+       cmpb %cl, (%esp,%ecx)   /* is it contained in stopset? */
+       je L6                   /* yes => return */
+
+       movb 3(%eax), %cl       /* get byte from string */
+       cmpb %cl, (%esp,%ecx)   /* is it contained in stopset? */
+       jne L3                  /* yes => return */
+
+       incl %eax               /* adjust pointer */
+L6:    incl %eax
+L5:    incl %eax
+
+L4:    addl $256, %esp         /* remove stopset */
+
+       orb %cl, %cl            /* was last character NUL? */
+       jnz L7                  /* no => return pointer */
+       xorl %eax, %eax         /* return NULL */
+
+L7:    ret
diff --git a/sysdeps/i386/strrchr.S b/sysdeps/i386/strrchr.S

new file mode 100644 (file)

index 0000000..468a940
--- /dev/null
+++ b/sysdeps/i386/strrchr.S
@@ -0,0 +1,321 @@
+/* strrchr (str, ch) -- Return pointer to last occurrence of CH in STR.
+For Intel 80x86, x>=3.
+Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
+Some optimisations by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+This file is part of the GNU C Library.
+
+The GNU C Library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+The GNU C Library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with the GNU C Library; see the file COPYING.LIB.  If
+not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+/*
+   INPUT PARAMETERS:
+   str         (sp + 4)
+   ch          (sp + 8)
+*/
+
+       .text
+ENTRY (strrchr)
+       pushl %edi              /* Save callee-saved registers used here.  */
+       pushl %esi
+
+       xorl %eax, %eax         /* result so far: NULL (no occurrence seen) */
+       movl 12(%esp), %esi     /* get string pointer */
+       movl 16(%esp), %ecx     /* get character we are looking for */
+
+       /* At the moment %ecx contains C.  What we need for the
+          algorithm is C in all bytes of the dword.  Avoid
+          operations on 16 bit words because these require a
+          prefix byte (and one more cycle).  */
+       movb %cl, %ch           /* now it is 0|0|c|c */
+       movl %ecx, %edx
+       shll $16, %ecx          /* now it is c|c|0|0 */
+       movw %dx, %cx           /* and finally c|c|c|c */
+
+       /* Before we start with the main loop we process single bytes
+          until the source pointer is aligned.  This has two reasons:
+          1. aligned 32-bit memory access is faster
+          and (more important)
+          2. we process in the main loop 32 bit in one step although
+             we don't know the end of the string.  But accessing at
+             4-byte alignment guarantees that we never access illegal
+             memory if this would not also be done by the trivial
+             implementation (this is because all processor inherent
+             boundaries are multiples of 4).  */
+
+       testl $3, %esi          /* aligned?  (testl: %esi has no 8-bit form) */
+       jz L19                  /* yes => begin loop */
+       movb (%esi), %dl        /* load byte in question (we need it twice) */
+       cmpb %dl, %cl           /* compare byte */
+       jne L11                 /* no match => don't record pointer */
+       movl %esi, %eax         /* remember pointer as possible result */
+L11:   orb %dl, %dl            /* is NUL? */
+       jz L2                   /* yes => return result so far */
+       incl %esi               /* increment pointer */
+
+       testl $3, %esi          /* correctly aligned ? */
+       jz L19                  /* yes => begin loop */
+       movb (%esi), %dl        /* load byte in question (we need it twice) */
+       cmpb %dl, %cl           /* compare byte */
+       jne L12                 /* no match => don't record pointer */
+       movl %esi, %eax         /* remember pointer as result */
+L12:   orb %dl, %dl            /* is NUL? */
+       jz L2                   /* yes => return result so far */
+       incl %esi               /* increment pointer */
+
+       testl $3, %esi          /* correctly aligned ? */
+       jz L19                  /* yes => begin loop */
+       movb (%esi), %dl        /* load byte in question (we need it twice) */
+       cmpb %dl, %cl           /* compare byte */
+       jne L13                 /* no match => don't record pointer */
+       movl %esi, %eax         /* remember pointer as result */
+L13:   orb %dl, %dl            /* is NUL?  (test the string byte, not C) */
+       jz L2                   /* yes => return result so far */
+       incl %esi               /* increment pointer */
+
+       /* Now we have reached alignment.  */
+       jmp L19                 /* begin loop */
+
+      /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to
+        change any of the hole bits of LONGWORD.
+
+        1) Is this safe?  Will it catch all the zero bytes?
+        Suppose there is a byte with all zeros.  Any carry bits
+        propagating from its left will fall into the hole at its
+        least significant bit and stop.  Since there will be no
+        carry from its most significant bit, the LSB of the
+        byte to the left will be unchanged, and the zero will be
+        detected.
+
+        2) Is this worthwhile?  Will it ignore everything except
+        zero bytes?  Suppose every byte of LONGWORD has a bit set
+        somewhere.  There will be a carry into bit 8.  If bit 8
+        is set, this will carry into bit 16.  If bit 8 is clear,
+        one of bits 9-15 must be set, so there will be a carry
+        into bit 16.  Similarly, there will be a carry into bit
+        24.  If one of bits 24-31 is set, there will be a carry
+        into bit 32 (=carry flag), so all of the hole bits will
+        be changed.
+
+        3) But wait!  Aren't we looking for C, not zero?
+        Good point.  So what we do is XOR LONGWORD with a longword,
+        each of whose bytes is C.  This turns each byte that is C
+        into a zero.  */
+
+       /* Each round the main loop processes 16 bytes.  */
+
+       /* Jump to here when the character is detected.  We chose this
+          way around because the character one is looking for is not
+          as frequent as the rest and taking a conditional jump is more
+          expensive than ignoring it.
+
+          Some more words to the code below: it might not be obvious why
+          we decrement the source pointer here.  In the loop the pointer
+          is not pre-incremented and so it still points before the word
+          we are looking at.  But you should take a look at the instruction
+          which gets executed before we get into the loop: `addl $16, %esi'.
+          This makes the following subs into adds.  */
+
+       /* These fill bytes pad the code so the main loop is aligned; .align
+          cannot be used because it is not the next instruction that must be
+          aligned.  NOTE(review): re-verify the count after code-size changes.  */
+       .byte 0, 0, 0, 0, 0, 0, 0, 0
+
+L4:    subl $4, %esi           /* adjust pointer */
+L41:   subl $4, %esi
+L42:   subl $4, %esi
+L43:   testl $0xff000000, %edx /* is highest byte == C? */
+       jnz L33                 /* no => try other bytes */
+       leal 15(%esi), %eax     /* store address as result */
+       jmp L1                  /* and start loop again */
+
+L3:    subl $4, %esi           /* adjust pointer */
+L31:   subl $4, %esi
+L32:   subl $4, %esi
+L33:   testl $0xff0000, %edx   /* is C in third byte? */
+       jnz L51                 /* no => try other bytes */
+       leal 14(%esi), %eax     /* store address as result */
+       jmp L1                  /* and start loop again */
+
+L51:
+       /* At this point we know that the byte is in one of the lower bytes.
+          We make a guess and correct it if necessary.  This reduces the
+          number of necessary jumps.  */
+       leal 12(%esi), %eax     /* guess address of lowest byte as result */
+       testb %dh, %dh          /* is guess correct? */
+       jnz L1                  /* yes => start loop */
+       leal 13(%esi), %eax     /* correct guess to second byte */
+
+L1:    addl $16, %esi          /* increment pointer for full round */
+
+L19:   movl (%esi), %edx       /* get word (= 4 bytes) in question */
+       movl $0xfefefeff, %edi  /* magic value */
+       addl %edx, %edi         /* add the magic value to the word.  We get
+                                  carry bits reported for each byte which
+                                  is *not* 0 */
+
+       /* According to the algorithm we had to reverse the effect of the
+          XOR first and then test the overflow bits.  But because the
+          following XOR would destroy the carry flag and it would (in a
+          representation with more than 32 bits) not alter the last
+          overflow, we can now test this condition.  If no carry is signaled
+          no overflow must have occurred in the last byte => it was 0.  */
+
+       jnc L20                 /* found NUL => check last word */
+
+       /* We are only interested in carry bits that change due to the
+          previous add, so remove original bits */
+       xorl %edx, %edi         /* (word+magic)^word */
+
+       /* Now test for the other three overflow bits.  */
+       orl $0xfefefeff, %edi   /* set all non-carry bits */
+       incl %edi               /* add 1: if one carry bit was *not* set
+                                  the addition will not result in 0.  */
+
+       /* If at least one byte of the word is NUL we don't get 0 in %edi.  */
+       jnz L20                 /* found NUL => check last word */
+
+       /* Now we made sure the dword does not contain a NUL byte, so the
+          string continues past it.  What remains is to check whether the
+          dword contains the character we are looking for.  */
+
+       xorl %ecx, %edx         /* XOR with word c|c|c|c => bytes of str == c
+                                  are now 0 */
+       movl $0xfefefeff, %edi  /* magic value */
+       addl %edx, %edi         /* add the magic value to the word.  We get
+                                  carry bits reported for each byte which
+                                  is *not* 0 */
+       jnc L4                  /* highest byte is C => examine dword */
+       xorl %edx, %edi         /* ((word^charmask)+magic)^(word^charmask) */
+       orl $0xfefefeff, %edi   /* set all non-carry bits */
+       incl %edi               /* add 1: if one carry bit was *not* set
+                                  the addition will not result in 0.  */
+       jnz L3                  /* C is detected in the word => examine it */
+
+       movl 4(%esi), %edx      /* get word (= 4 bytes) in question */
+       movl $0xfefefeff, %edi  /* magic value */
+       addl %edx, %edi         /* add the magic value to the word.  We get
+                                  carry bits reported for each byte which
+                                  is *not* 0 */
+       jnc L21                 /* found NUL => check last word */
+       xorl %edx, %edi         /* (word+magic)^word */
+       orl $0xfefefeff, %edi   /* set all non-carry bits */
+       incl %edi               /* add 1: if one carry bit was *not* set
+                                  the addition will not result in 0.  */
+       jnz L21                 /* found NUL => check last word */
+       xorl %ecx, %edx         /* XOR with word c|c|c|c => bytes of str == c
+                                  are now 0 */
+       movl $0xfefefeff, %edi  /* magic value */
+       addl %edx, %edi         /* add the magic value to the word.  We get
+                                  carry bits reported for each byte which
+                                  is *not* 0 */
+       jnc L41                 /* highest byte is C => examine dword */
+       xorl %edx, %edi         /* ((word^charmask)+magic)^(word^charmask) */
+       orl $0xfefefeff, %edi   /* set all non-carry bits */
+       incl %edi               /* add 1: if one carry bit was *not* set
+                                  the addition will not result in 0.  */
+       jnz L31                 /* C is detected in the word => examine it */
+
+       movl 8(%esi), %edx      /* get word (= 4 bytes) in question */
+       movl $0xfefefeff, %edi  /* magic value */
+       addl %edx, %edi         /* add the magic value to the word.  We get
+                                  carry bits reported for each byte which
+                                  is *not* 0 */
+       jnc L22                 /* found NUL => check last word */
+       xorl %edx, %edi         /* (word+magic)^word */
+       orl $0xfefefeff, %edi   /* set all non-carry bits */
+       incl %edi               /* add 1: if one carry bit was *not* set
+                                  the addition will not result in 0.  */
+       jnz L22                 /* found NUL => check last word */
+       xorl %ecx, %edx         /* XOR with word c|c|c|c => bytes of str == c
+                                  are now 0 */
+       movl $0xfefefeff, %edi  /* magic value */
+       addl %edx, %edi         /* add the magic value to the word.  We get
+                                  carry bits reported for each byte which
+                                  is *not* 0 */
+       jnc L42                 /* highest byte is C => examine dword */
+       xorl %edx, %edi         /* ((word^charmask)+magic)^(word^charmask) */
+       orl $0xfefefeff, %edi   /* set all non-carry bits */
+       incl %edi               /* add 1: if one carry bit was *not* set
+                                  the addition will not result in 0.  */
+       jnz L32                 /* C is detected in the word => examine it */
+
+       movl 12(%esi), %edx     /* get word (= 4 bytes) in question */
+       movl $0xfefefeff, %edi  /* magic value */
+       addl %edx, %edi         /* add the magic value to the word.  We get
+                                  carry bits reported for each byte which
+                                  is *not* 0 */
+       jnc L23                 /* found NUL => check last word */
+       xorl %edx, %edi         /* (word+magic)^word */
+       orl $0xfefefeff, %edi   /* set all non-carry bits */
+       incl %edi               /* add 1: if one carry bit was *not* set
+                                  the addition will not result in 0.  */
+       jnz L23                 /* found NUL => check last word */
+       xorl %ecx, %edx         /* XOR with word c|c|c|c => bytes of str == c
+                                  are now 0 */
+       movl $0xfefefeff, %edi  /* magic value */
+       addl %edx, %edi         /* add the magic value to the word.  We get
+                                  carry bits reported for each byte which
+                                  is *not* 0 */
+       jnc L43                 /* highest byte is C => examine dword */
+       xorl %edx, %edi         /* ((word^charmask)+magic)^(word^charmask) */
+       orl $0xfefefeff, %edi   /* set all non-carry bits */
+       incl %edi               /* add 1: if one carry bit was *not* set
+                                  the addition will not result in 0.  */
+       jz L1                   /* C is not detected => restart loop */
+       jmp L33                 /* examine word */
+
+L23:   addl $4, %esi           /* adjust pointer */
+L22:   addl $4, %esi
+L21:   addl $4, %esi
+
+       /* What remains to do is to test which byte the NUL char is and
+          whether the searched character appears in one of the bytes
+          before.  A special case is that the searched byte may be NUL.
+          In this case a pointer to the terminating NUL char has to be
+          returned.  */
+
+L20:   cmpb %cl, %dl           /* is first byte == C? */
+       jne L24                 /* no => skip */
+       movl %esi, %eax         /* store address as result */
+L24:   testb %dl, %dl          /* is first byte == NUL? */
+       jz L2                   /* yes => return */
+
+       cmpb %cl, %dh           /* is second byte == C? */
+       jne L25                 /* no => skip */
+       leal 1(%esi), %eax      /* store address as result */
+L25:   testb %dh, %dh          /* is second byte == NUL? */
+       jz L2                   /* yes => return */
+
+       shrl $16,%edx           /* make upper bytes accessible */
+       cmpb %cl, %dl           /* is third byte == C */
+       jne L26                 /* no => skip */
+       leal 2(%esi), %eax      /* store address as result */
+L26:   testb %dl, %dl          /* is third byte == NUL */
+       jz L2                   /* yes => return */
+
+       cmpb %cl, %dh           /* is fourth byte == C */
+       jne L2                  /* no => skip */
+       leal 3(%esi), %eax      /* store address as result */
+
+L2:    popl %esi               /* restore saved register content */
+       popl %edi
+
+       ret
+
+weak_alias (strrchr, rindex)
diff --git a/sysdeps/i386/strspn.S b/sysdeps/i386/strspn.S

new file mode 100644 (file)

index 0000000..1a02026
--- /dev/null
+++ b/sysdeps/i386/strspn.S
@@ -0,0 +1,176 @@
+/* strspn (str, ss) -- Return the length of the initial segment of STR
+                      which contains only characters from SS.
+For Intel 80x86, x>=3.
+Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
+Bug fixes by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+This file is part of the GNU C Library.
+
+The GNU C Library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+The GNU C Library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with the GNU C Library; see the file COPYING.LIB.  If
+not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+/*
+   INPUT PARAMETERS:
+   str         (sp + 4)
+   skipset     (sp + 8)
+*/
+
+       .text
+ENTRY (strspn)
+       movl 4(%esp), %edx      /* get string pointer */
+       movl 8(%esp), %eax      /* get skipset pointer */
+
+       /* First we create a table with flags for all possible characters.
+          For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
+          supported by the C string functions we have 256 characters.
+          Before inserting marks for the stop characters we clear the whole
+          table.  The unrolled form is much faster than a loop.  */
+       xorl %ecx, %ecx         /* %ecx = 0 !!! */
+
+       pushl %ecx              /* make a 256 bytes long block filled with 0 */
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl %ecx
+       pushl $0                /* These immediate values make the label 2 */
+       pushl $0                /* to be aligned on a 16 byte boundary to */
+       pushl $0                /* get a better performance of the loop.  */
+       pushl $0
+       pushl $0
+       pushl $0
+
+/* For understanding the following code remember that %ecx == 0 now.
+   Although all the following instruction only modify %cl we always
+   have a correct zero-extended 32-bit value in %ecx.  */
+
+/* Don't change the "testb $0xff,%%cl" to "testb %%cl,%%cl".  We want
+   longer instructions so that the next loop aligns without adding nops.  */
+
+L2:    movb (%eax), %cl        /* get byte from stopset */
+       testb %cl, %cl          /* is NUL char? */
+       jz L1                   /* yes => start compare loop */
+       movb %cl, (%esp,%ecx)   /* set corresponding byte in stopset table */
+
+       movb 1(%eax), %cl       /* get byte from stopset */
+       testb $0xff, %cl        /* is NUL char? */
+       jz L1                   /* yes => start compare loop */
+       movb %cl, (%esp,%ecx)   /* set corresponding byte in stopset table */
+
+       movb 2(%eax), %cl       /* get byte from stopset */
+       testb $0xff, %cl        /* is NUL char? */
+       jz L1                   /* yes => start compare loop */
+       movb %cl, (%esp,%ecx)   /* set corresponding byte in stopset table */
+
+       movb 3(%eax), %cl       /* get byte from stopset */
+       addl $4, %eax           /* increment stopset pointer */
+       movb %cl, (%esp,%ecx)   /* set corresponding byte in stopset table */
+       testb $0xff, %cl        /* is NUL char? */
+       jnz L2                  /* no => process next dword from stopset */
+
+L1:    leal -4(%edx), %eax     /* prepare loop */
+
+       /* We use a neat trick for the following loop.  Normally we would
+          have to test for two termination conditions
+          1. a character in the stopset was found
+          and
+          2. the end of the string was found
+          But as a sign that the chracter is in the stopset we store its
+          value in the table.  But the value of NUL is NUL so the loop
+          terminates for NUL in every case.  */
+
+L3:    addl $4, %eax           /* adjust pointer for full loop round */
+
+       movb (%eax), %cl        /* get byte from string */
+       testb %cl, (%esp,%ecx)  /* is it contained in skipset? */
+       jz L4                   /* no => return */
+
+       movb 1(%eax), %cl       /* get byte from string */
+       testb %cl, (%esp,%ecx)  /* is it contained in skipset? */
+       jz L5                   /* no => return */
+
+       movb 2(%eax), %cl       /* get byte from string */
+       testb %cl, (%esp,%ecx)  /* is it contained in skipset? */
+       jz L6                   /* no => return */
+
+       movb 3(%eax), %cl       /* get byte from string */
+       testb %cl, (%esp,%ecx)  /* is it contained in skipset? */
+       jnz L3                  /* yes => start loop again */
+
+       incl %eax               /* adjust pointer */
+L6:    incl %eax
+L5:    incl %eax
+
+L4:    subl %edx, %eax         /* we have to return the number of valid
+                                  characters, so compute distance to first
+                                  non-valid character */
+       addl $256, %esp         /* remove stopset */
+
+       ret
diff --git a/sysdeps/i386/sub_n.S b/sysdeps/i386/sub_n.S

index 64d2c2529301819feca20f8d9211f867266ed36f..e18a70885b0660c9565234960e70ec0e0d803b56 100644 (file)
--- a/sysdeps/i386/sub_n.S
+++ b/sysdeps/i386/sub_n.S
@@ -1,7 +1,7 @@
  /* i80386 __mpn_sub_n -- Add two limb vectors of the same length > 0 and store
     sum in a third limb vector.
  
-Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
  
  This file is part of the GNU MP Library.
  
@@ -37,10 +37,10 @@ C_SYMBOL_NAME(__mpn_sub_n:)
         pushl %edi
         pushl %esi
  
-       movl 12(%esp),%edi      /* res_ptr */
-       movl 16(%esp),%esi      /* s1_ptr */
-       movl 20(%esp),%edx      /* s2_ptr */
-       movl 24(%esp),%ecx      /* size */
+       movl 12(%esp),%edi              /* res_ptr */
+       movl 16(%esp),%esi              /* s1_ptr */
+       movl 20(%esp),%edx              /* s2_ptr */
+       movl 24(%esp),%ecx              /* size */
  
         movl    %ecx,%eax
         shrl    $3,%ecx                 /* compute count for unrolled loop */
@@ -54,14 +54,18 @@ C_SYMBOL_NAME(__mpn_sub_n:)
         subl    %eax,%edx               /* ... enter the loop */
         shrl    $2,%eax                 /* restore previous value */
  #ifdef PIC
-       call    here
-here:  leal    (Loop - 3 - here)(%eax,%eax,8),%eax
-       addl    %eax,(%esp)
-       ret
+/* Calculate start address in loop for PIC.  Due to limitations in some
+   assemblers, Loop-L0-3 cannot be put into the leal */
+       call    L0
+L0:    leal    (%eax,%eax,8),%eax
+       addl    (%esp),%eax
+       addl    $(Loop-L0-3),%eax 
+       addl    $4,%esp
  #else
-       leal    (Loop - 3)(%eax,%eax,8),%eax    /* calc start addr in loop */
-       jmp     *%eax                   /* jump into loop */
+/* Calculate start address in loop for non-PIC.  */
+       leal    (Loop - 3)(%eax,%eax,8),%eax
  #endif
+       jmp     *%eax                   /* jump into loop */
         ALIGN (3)
  Loop:  movl    (%esi),%eax
         sbbl    (%edx),%eax
diff --git a/sysdeps/i960/add_n.s b/sysdeps/i960/add_n.s

new file mode 100644 (file)

index 0000000..6031f6d
--- /dev/null
+++ b/sysdeps/i960/add_n.s
@@ -0,0 +1,21 @@
+.text
+       .align 4
+       .globl ___mpn_add_n
+___mpn_add_n:                  # add the limb vectors at g1 and g2 (g3 limbs), storing through g0
+       mov     0,g6            # clear carry-save register
+       cmpo    1,0             # clear cy
+
+Loop:  subo    1,g3,g3         # update loop counter
+       ld      (g1),g5         # load from s1_ptr
+       addo    4,g1,g1         # s1_ptr++
+       ld      (g2),g4         # load from s2_ptr
+       addo    4,g2,g2         # s2_ptr++
+       cmpo    g6,1            # restore cy from g6, relies on cy being 0
+       addc    g4,g5,g4        # main add
+       subc    0,0,g6          # save cy in g6
+       st      g4,(g0)         # store result to res_ptr
+       addo    4,g0,g0         # res_ptr++
+       cmpobne 0,g3,Loop       # when branch is taken, clears C bit
+
+       mov     g6,g0           # hand the carry-save register back in g0 (presumably the return value)
+       ret
diff --git a/sysdeps/i960/addmul_1.s b/sysdeps/i960/addmul_1.s

new file mode 100644 (file)

index 0000000..1a3de95
--- /dev/null
+++ b/sysdeps/i960/addmul_1.s
@@ -0,0 +1,26 @@
+.text
+       .align  4
+       .globl  ___mpn_addmul_1 # was ___mpn_mul_1, which collided with sysdeps/i960/mul_1.s
+___mpn_addmul_1:               # res[i] += s1[i] * g3 for every limb; carry limb ends up in g0
+       subo    g2,0,g2         # g2 = -g2: negated limb count, used as an index counting up to 0
+       shlo    2,g2,g4         # g4 = g2 * 4 (negated length in bytes)
+       subo    g4,g1,g1        # g1 -= g4: point just past the end of the source vector
+       subo    g4,g0,g13       # g13 = g0 - g4: point just past the end of the result vector
+       mov     0,g0            # g0 = running carry limb, starts at 0
+
+       cmpo    1,0             # clear C bit on AC.cc
+
+Loop:  ld      (g1)[g2*4],g5   # load source limb
+       emul    g3,g5,g6        # g7:g6 = g3 * g5 (64-bit product, g6 is the low word)
+       ld      (g13)[g2*4],g5  # load the result limb to accumulate into
+
+       addc    g0,g6,g6        # relies on that C bit is clear
+       addc    0,g7,g7         # fold the carry into the high product word
+       addc    g5,g6,g6        # relies on that C bit is clear
+       st      g6,(g13)[g2*4]  # store the updated result limb
+       addc    0,g7,g0         # new carry limb = high product word + carry
+
+       addo    g2,1,g2         # step the negative index towards 0
+       cmpobne 0,g2,Loop       # when branch is taken, clears C bit
+
+       ret
diff --git a/sysdeps/i960/mul_1.s b/sysdeps/i960/mul_1.s

new file mode 100644 (file)

index 0000000..e75ea42
--- /dev/null
+++ b/sysdeps/i960/mul_1.s
@@ -0,0 +1,23 @@
+.text
+       .align  4
+       .globl  ___mpn_mul_1
+___mpn_mul_1:                  # res[i] = s1[i] * g3 for every limb; carry limb ends up in g0
+       subo    g2,0,g2         # g2 = -g2: negated limb count, used as an index counting up to 0
+       shlo    2,g2,g4         # g4 = g2 * 4 (negated length in bytes)
+       subo    g4,g1,g1        # g1 -= g4: point just past the end of the source vector
+       subo    g4,g0,g13       # g13 = g0 - g4: point just past the end of the result vector
+       mov     0,g0            # g0 = running carry limb, starts at 0
+
+       cmpo    1,0             # clear C bit on AC.cc
+
+Loop:  ld      (g1)[g2*4],g5   # load source limb
+       emul    g3,g5,g6        # g7:g6 = g3 * g5 (64-bit product, g6 is the low word)
+
+       addc    g0,g6,g6        # relies on that C bit is clear
+       st      g6,(g13)[g2*4]  # store the low word plus old carry limb
+       addc    0,g7,g0         # new carry limb = high product word + carry
+
+       addo    g2,1,g2         # step the negative index towards 0
+       cmpobne 0,g2,Loop       # when branch is taken, clears C bit
+
+       ret
diff --git a/sysdeps/i960/sub_n.s b/sysdeps/i960/sub_n.s

new file mode 100644 (file)

index 0000000..13ebbfa
--- /dev/null
+++ b/sysdeps/i960/sub_n.s
@@ -0,0 +1,21 @@
+.text
+       .align 4
+       .globl ___mpn_sub_n
+___mpn_sub_n:                  # subtract limb vectors read via g1 and g2 (g3 limbs), storing through g0
+       mov     1,g6            # set carry-save register
+       cmpo    1,0             # clear cy
+
+Loop:  subo    1,g3,g3         # update loop counter
+       ld      (g1),g5         # load from s1_ptr
+       addo    4,g1,g1         # s1_ptr++
+       ld      (g2),g4         # load from s2_ptr
+       addo    4,g2,g2         # s2_ptr++
+       cmpo    g6,1            # restore cy from g6, relies on cy being 0
+       subc    g4,g5,g4        # main subtract
+       subc    0,0,g6          # save cy in g6
+       st      g4,(g0)         # store result to res_ptr
+       addo    4,g0,g0         # res_ptr++
+       cmpobne 0,g3,Loop       # when branch is taken, cy will be 0
+
+       mov     g6,g0           # hand the carry-save register back in g0 (presumably the return value)
+       ret
diff --git a/sysdeps/m88k/m88100/add_n.s b/sysdeps/m88k/m88100/add_n.s

new file mode 100644 (file)

index 0000000..7e4cccc
--- /dev/null
+++ b/sysdeps/m88k/m88100/add_n.s
@@ -0,0 +1,103 @@
+; mc88100 __mpn_add_n -- Add two limb vectors of the same length > 0 and store
+; sum in a third limb vector.
+
+; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr      r2
+; s1_ptr       r3
+; s2_ptr       r4
+; size         r5
+
+; This code has been optimized to run one instruction per clock, avoiding
+; load stalls and writeback contention.  As a result, the instruction
+; order is not always natural.
+
+; The speed is about 4.6 clocks/limb + 18 clocks/limb-vector on an 88100,
+; but on the 88110, it seems to run much slower, 6.6 clocks/limb.
+
+       text
+       align    16
+       global   ___mpn_add_n
+___mpn_add_n:
+       ld      r6,r3,0                 ; read first limb from s1_ptr
+       extu    r10,r5,3                ; r10 = size >> 3, iteration count for the 8-limb unrolled loop
+       ld      r7,r4,0                 ; read first limb from s2_ptr
+
+       subu.co r5,r0,r5                ; r5 = -size (clear carry as side effect)
+       mak     r5,r5,3<4>              ; r5 = (r5 & 7) << 4: byte offset of the entry point (16 bytes per limb step)
+       bcnd    eq0,r5,Lzero            ; size is a multiple of 8: enter at Lzero
+
+       or      r12,r0,lo16(Lbase)      ; build the address of Lbase in r12: low half ...
+       or.u    r12,r12,hi16(Lbase)     ; ... then high half
+       addu    r12,r12,r5              ; r12 is address for entering in loop
+
+       extu    r5,r5,2                 ; divide by 4
+       subu    r2,r2,r5                ; adjust res_ptr
+       subu    r3,r3,r5                ; adjust s1_ptr
+       subu    r4,r4,r5                ; adjust s2_ptr
+
+       or      r8,r6,r0                ; copy first s1 limb to r8: the entry point may use r6/r7 or r8/r9
+
+       jmp.n   r12                     ; jump into the unrolled loop; next insn runs in the delay slot
+        or     r9,r7,r0                ; copy first s2 limb to r9
+
+Loop:  addu    r3,r3,32                ; advance s1_ptr by 8 limbs
+       st      r8,r2,28                ; store the sum produced in the branch delay slot
+       addu    r4,r4,32                ; advance s2_ptr by 8 limbs
+       ld      r6,r3,0
+       addu    r2,r2,32                ; advance res_ptr by 8 limbs
+       ld      r7,r4,0
+Lzero: subu    r10,r10,1               ; add 0 + 8r limbs (adj loop cnt)
+Lbase: ld      r8,r3,4
+       addu.cio r6,r6,r7
+       ld      r9,r4,4
+       st      r6,r2,0
+       ld      r6,r3,8                 ; add 7 + 8r limbs
+       addu.cio r8,r8,r9
+       ld      r7,r4,8
+       st      r8,r2,4
+       ld      r8,r3,12                ; add 6 + 8r limbs
+       addu.cio r6,r6,r7
+       ld      r9,r4,12
+       st      r6,r2,8
+       ld      r6,r3,16                ; add 5 + 8r limbs
+       addu.cio r8,r8,r9
+       ld      r7,r4,16
+       st      r8,r2,12
+       ld      r8,r3,20                ; add 4 + 8r limbs
+       addu.cio r6,r6,r7
+       ld      r9,r4,20
+       st      r6,r2,16
+       ld      r6,r3,24                ; add 3 + 8r limbs
+       addu.cio r8,r8,r9
+       ld      r7,r4,24
+       st      r8,r2,20
+       ld      r8,r3,28                ; add 2 + 8r limbs
+       addu.cio r6,r6,r7
+       ld      r9,r4,28
+       st      r6,r2,24
+       bcnd.n  ne0,r10,Loop            ; add 1 + 8r limbs
+        addu.cio r8,r8,r9
+
+       st      r8,r2,28                ; store most significant limb
+
+       jmp.n    r1
+        addu.ci r2,r0,r0               ; return carry-out from most sign. limb
diff --git a/sysdeps/m88k/m88100/mul_1.s b/sysdeps/m88k/m88100/mul_1.s

new file mode 100644 (file)

index 0000000..35c238d
--- /dev/null
+++ b/sysdeps/m88k/m88100/mul_1.s
@@ -0,0 +1,128 @@
+; mc88100 __mpn_mul_1 -- Multiply a limb vector with a single limb and
+; store the product in a second limb vector.
+
+; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr      r2
+; s1_ptr       r3
+; size         r4
+; s2_limb      r5
+
+; Common overhead is about 11 cycles/invocation.
+
+; The speed for S2_LIMB >= 0x10000 is approximately 21 cycles/limb.  (The
+; pipeline stalls 2 cycles due to WB contention.)
+
+; The speed for S2_LIMB < 0x10000 is approximately 16 cycles/limb.  (The
+; pipeline stalls 2 cycles due to WB contention and 1 cycle due to latency.)
+
+; To enhance speed:
+; 1. Unroll main loop 4-8 times.
+; 2. Schedule code to avoid WB contention.  It might be tempting to move the
+;    ld instruction in the loops down to save 2 cycles (less WB contention),
+;    but that loses because the ultimate value will be read from outside
+;    the allocated space.  But if we handle the ultimate multiplication in
+;    the tail, we can do this.
+; 3. Make the multiplication with fewer instructions.  I think the code for
+;    (S2_LIMB >= 0x10000) is not minimal.
+; With these techniques the (S2_LIMB >= 0x10000) case would run in 17 or
+; less cycles/limb; the (S2_LIMB < 0x10000) case would run in 11
+; cycles/limb.  (Assuming infinite unrolling.)
+
+       text
+       align    16
+       global   ___mpn_mul_1
+___mpn_mul_1:
+
+       ; Make S1_PTR and RES_PTR point at the end of their blocks
+       ; and negate SIZE.
+       lda      r3,r3[r4]
+       lda      r6,r2[r4]              ; RES_PTR in r6 since r2 is retval
+       subu     r4,r0,r4
+
+       addu.co  r2,r0,r0               ; r2 = cy = 0
+       ld       r9,r3[r4]              ; first s1 limb (r4 is now a negative index)
+       mask     r7,r5,0xffff           ; r7 = lo(S2_LIMB)
+       extu     r8,r5,16               ; r8 = hi(S2_LIMB)
+       bcnd.n   eq0,r8,Lsmall          ; jump if (hi(S2_LIMB) == 0)
+        subu    r6,r6,4                ; (delay slot) bias RES_PTR by one limb: stores use the already-bumped index
+
+; General code for any value of S2_LIMB.
+
+       ; Make a stack frame and save r25 and r26
+       subu     r31,r31,16
+       st.d     r25,r31,8
+
+       ; Enter the loop in the middle
+       br.n    L1
+       addu     r4,r4,1                ; (delay slot of br.n) bump the index
+
+Loop:
+       ld       r9,r3[r4]              ; next s1 limb
+       st       r26,r6[r4]             ; store the result limb finished in the previous pass
+; bcnd ne0,r0,0                        ; bubble
+       addu     r4,r4,1
+L1:    mul      r26,r9,r5              ; low word of product   mul_1   WB ld
+       mask     r12,r9,0xffff          ; r12 = lo(s1_limb)     mask_1
+       mul      r11,r12,r7             ; r11 =  prod_0         mul_2   WB mask_1
+       mul      r10,r12,r8             ; r10 = prod_1a         mul_3
+       extu     r13,r9,16              ; r13 = hi(s1_limb)     extu_1  WB mul_1
+       mul      r12,r13,r7             ; r12 = prod_1b         mul_4   WB extu_1
+       mul      r25,r13,r8             ; r25  = prod_2         mul_5   WB mul_2
+       extu     r11,r11,16             ; r11 = hi(prod_0)      extu_2  WB mul_3
+       addu     r10,r10,r11            ;                       addu_1  WB extu_2
+; bcnd ne0,r0,0                        ; bubble                        WB addu_1
+       addu.co  r10,r10,r12            ;                               WB mul_4
+       mask.u   r10,r10,0xffff         ; move the 16 most significant bits...
+       addu.ci  r10,r10,r0             ; ...to the low half of the word...
+       rot      r10,r10,16             ; ...and put carry in pos 16.
+       addu.co  r26,r26,r2             ; add old carry limb
+       bcnd.n   ne0,r4,Loop
+        addu.ci r2,r25,r10             ; compute new carry limb
+
+       st       r26,r6[r4]             ; store the last result limb
+       ld.d     r25,r31,8              ; restore r25 and r26
+       jmp.n    r1
+        addu    r31,r31,16             ; (delay slot) pop the stack frame
+
+; Fast code for S2_LIMB < 0x10000
+Lsmall:
+       ; Enter the loop in the middle
+       br.n    SL1
+       addu     r4,r4,1                ; (delay slot of br.n) bump the index
+
+SLoop:
+       ld       r9,r3[r4]              ;
+       st       r8,r6[r4]              ;
+       addu     r4,r4,1                ;
+SL1:   mul      r8,r9,r5               ; low word of product
+       mask     r12,r9,0xffff          ; r12 = lo(s1_limb)
+       extu     r13,r9,16              ; r13 = hi(s1_limb)
+       mul      r11,r12,r7             ; r11 =  prod_0
+       mul      r12,r13,r7             ; r12 = prod_1b
+       addu.cio r8,r8,r2               ; add old carry limb
+       extu     r10,r11,16             ; r10 = hi(prod_0)
+       addu     r10,r10,r12            ;
+       bcnd.n   ne0,r4,SLoop
+       extu     r2,r10,16              ; r2 = new carry limb
+
+       jmp.n    r1
+       st       r8,r6[r4]              ; (delay slot) store the last result limb
diff --git a/sysdeps/m88k/m88100/sub_n.s b/sysdeps/m88k/m88100/sub_n.s

new file mode 100644 (file)

index 0000000..3963cd5
--- /dev/null
+++ b/sysdeps/m88k/m88100/sub_n.s
@@ -0,0 +1,104 @@
+; mc88100 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+; store difference in a third limb vector.
+
+; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr      r2
+; s1_ptr       r3
+; s2_ptr       r4
+; size         r5
+
+; This code has been optimized to run one instruction per clock, avoiding
+; load stalls and writeback contention.  As a result, the instruction
+; order is not always natural.
+
+; The speed is about 4.6 clocks/limb + 18 clocks/limb-vector on an 88100,
+; but on the 88110, it seems to run much slower, 6.6 clocks/limb.
+
+       text
+       align    16
+       global   ___mpn_sub_n
+___mpn_sub_n:
+       ld      r6,r3,0                 ; read first limb from s1_ptr
+       extu    r10,r5,3                ; r10 = size >> 3, iteration count for the 8-limb unrolled loop
+       ld      r7,r4,0                 ; read first limb from s2_ptr
+
+       subu.co r5,r0,r5                ; r5 = -size (clear carry as side effect)
+       mak     r5,r5,3<4>              ; r5 = (r5 & 7) << 4: byte offset of the entry point (16 bytes per limb step)
+       bcnd    eq0,r5,Lzero            ; size is a multiple of 8: enter at Lzero
+
+       or      r12,r0,lo16(Lbase)      ; build the address of Lbase in r12: low half ...
+       or.u    r12,r12,hi16(Lbase)     ; ... then high half
+       addu    r12,r12,r5              ; r12 is address for entering in loop
+
+       extu    r5,r5,2                 ; divide by 4
+       subu    r2,r2,r5                ; adjust res_ptr
+       subu    r3,r3,r5                ; adjust s1_ptr
+       subu    r4,r4,r5                ; adjust s2_ptr
+
+       or      r8,r6,r0                ; copy first s1 limb to r8: the entry point may use r6/r7 or r8/r9
+
+       jmp.n   r12                     ; jump into the unrolled loop; next insn runs in the delay slot
+        or     r9,r7,r0                ; copy first s2 limb to r9
+
+Loop:  addu    r3,r3,32                ; advance s1_ptr by 8 limbs
+       st      r8,r2,28                ; store the difference produced in the branch delay slot
+       addu    r4,r4,32                ; advance s2_ptr by 8 limbs
+       ld      r6,r3,0
+       addu    r2,r2,32                ; advance res_ptr by 8 limbs
+       ld      r7,r4,0
+Lzero: subu    r10,r10,1               ; subtract 0 + 8r limbs (adj loop cnt)
+Lbase: ld      r8,r3,4
+       subu.cio r6,r6,r7
+       ld      r9,r4,4
+       st      r6,r2,0
+       ld      r6,r3,8                 ; subtract 7 + 8r limbs
+       subu.cio r8,r8,r9
+       ld      r7,r4,8
+       st      r8,r2,4
+       ld      r8,r3,12                ; subtract 6 + 8r limbs
+       subu.cio r6,r6,r7
+       ld      r9,r4,12
+       st      r6,r2,8
+       ld      r6,r3,16                ; subtract 5 + 8r limbs
+       subu.cio r8,r8,r9
+       ld      r7,r4,16
+       st      r8,r2,12
+       ld      r8,r3,20                ; subtract 4 + 8r limbs
+       subu.cio r6,r6,r7
+       ld      r9,r4,20
+       st      r6,r2,16
+       ld      r6,r3,24                ; subtract 3 + 8r limbs
+       subu.cio r8,r8,r9
+       ld      r7,r4,24
+       st      r8,r2,20
+       ld      r8,r3,28                ; subtract 2 + 8r limbs
+       subu.cio r6,r6,r7
+       ld      r9,r4,28
+       st      r6,r2,24
+       bcnd.n  ne0,r10,Loop            ; subtract 1 + 8r limbs
+        subu.cio r8,r8,r9
+
+       st      r8,r2,28                ; store most significant limb
+
+       addu.ci r2,r0,r0                ; return carry-out from most sign. limb
+       jmp.n    r1
+        xor    r2,r2,1                 ; (delay slot) flip the bit: the result is the inverse of the carry (presumably the borrow)
diff --git a/sysdeps/m88k/m88110/mul_1.s b/sysdeps/m88k/m88110/mul_1.s

new file mode 100644 (file)

index 0000000..08c3ca0
--- /dev/null
+++ b/sysdeps/m88k/m88110/mul_1.s
@@ -0,0 +1,84 @@
+; mc88110 __mpn_mul_1 -- Multiply a limb vector with a single limb and
+; store the product in a second limb vector.
+
+; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr      r2
+; s1_ptr       r3
+; size         r4
+; s2_limb      r5
+
+       text
+       align   16
+       global  ___mpn_mul_1
+___mpn_mul_1:
+       ; Make S1_PTR and RES_PTR point at the end of their blocks
+       ; and negate SIZE.
+       lda      r3,r3[r4]
+       lda      r8,r2[r4]              ; RES_PTR in r8 since r2 is retval
+       subu     r4,r0,r4
+
+       addu.co  r2,r0,r0               ; r2 = cy = 0
+
+       ld       r6,r3[r4]              ; first s1 limb (r4 is now a negative index)
+       addu     r4,r4,1
+       mulu.d   r10,r6,r5              ; r10:r11 = r6 * r5 (r10 high word, r11 low word)
+       bcnd.n   eq0,r4,Lend            ; only one limb: skip the loop
+        subu    r8,r8,8                ; (delay slot) bias RES_PTR by two limbs: stores use an index bumped twice
+
+Loop:  ld       r6,r3[r4]              ; next s1 limb
+       addu.cio r9,r11,r2              ; low product word + old carry limb
+       or       r2,r10,r0              ; could be avoided if unrolled
+       addu     r4,r4,1
+       mulu.d   r10,r6,r5              ; start the next product
+       bcnd.n   ne0,r4,Loop
+        st      r9,r8[r4]              ; (delay slot) store the finished result limb
+
+Lend:  addu.cio r9,r11,r2              ; last low product word + old carry limb
+       st       r9,r8,4                ; store the last result limb
+       jmp.n    r1
+        addu.ci r2,r10,r0              ; (delay slot) return high product word + carry
+
+; This is the Right Way to do this on '110.  4 cycles / 64-bit limb.
+;      ld.d    r10,
+;      mulu.d
+;      addu.cio
+;      addu.cio
+;      st.d
+;      mulu.d  ,r11,r5
+;      ld.d    r12,
+;      mulu.d  ,r10,r5
+;      addu.cio
+;      addu.cio
+;      st.d
+;      mulu.d
+;      ld.d    r10,
+;      mulu.d
+;      addu.cio
+;      addu.cio
+;      st.d
+;      mulu.d
+;      ld.d    r10,
+;      mulu.d
+;      addu.cio
+;      addu.cio
+;      st.d
+;      mulu.d
diff --git a/sysdeps/mips/add_n.s b/sysdeps/mips/add_n.s

new file mode 100644 (file)

index 0000000..c829108
--- /dev/null
+++ b/sysdeps/mips/add_n.s
@@ -0,0 +1,119 @@
+ # MIPS2 __mpn_add_n -- Add two limb vectors of the same length > 0 and
+ # store sum in a third limb vector.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     $4
+ # s1_ptr      $5
+ # s2_ptr      $6
+ # size                $7
+
+       .text
+       .align  2
+       .globl  __mpn_add_n
+       .ent    __mpn_add_n
+__mpn_add_n:
+       .set    noreorder
+       .set    nomacro
+
+       lw      $10,0($5)       # first s1 limb
+       lw      $11,0($6)       # first s2 limb
+
+       addiu   $7,$7,-1        # size-1: the final limb is finished at .Lend
+       and     $9,$7,4-1       # number of limbs in first loop
+       beq     $9,$0,.L0       # if multiple of 4 limbs, skip first loop
+        move   $2,$0           # (delay slot) carry = 0
+
+       subu    $7,$7,$9        # limbs left for the 4-way unrolled loop
+
+.Loop0:        addiu   $9,$9,-1
+       lw      $12,4($5)       # prefetch next s1 limb
+       addu    $11,$11,$2      # s2 limb + carry-in
+       lw      $13,4($6)       # prefetch next s2 limb
+       sltu    $8,$11,$2       # $8 = carry out of adding the carry-in
+       addu    $11,$10,$11     # + s1 limb
+       sltu    $2,$11,$10      # $2 = carry out of the main add
+       sw      $11,0($4)
+       or      $2,$2,$8        # combined carry-out
+
+       addiu   $5,$5,4
+       addiu   $6,$6,4
+       move    $10,$12
+       move    $11,$13
+       bne     $9,$0,.Loop0
+        addiu  $4,$4,4
+
+.L0:   beq     $7,$0,.Lend     # nothing left for the unrolled loop
+        nop
+
+.Loop: addiu   $7,$7,-4
+
+       lw      $12,4($5)
+       addu    $11,$11,$2
+       lw      $13,4($6)
+       sltu    $8,$11,$2
+       addu    $11,$10,$11
+       sltu    $2,$11,$10
+       sw      $11,0($4)
+       or      $2,$2,$8
+
+       lw      $10,8($5)
+       addu    $13,$13,$2
+       lw      $11,8($6)
+       sltu    $8,$13,$2
+       addu    $13,$12,$13
+       sltu    $2,$13,$12
+       sw      $13,4($4)
+       or      $2,$2,$8
+
+       lw      $12,12($5)
+       addu    $11,$11,$2
+       lw      $13,12($6)
+       sltu    $8,$11,$2
+       addu    $11,$10,$11
+       sltu    $2,$11,$10
+       sw      $11,8($4)
+       or      $2,$2,$8
+
+       lw      $10,16($5)
+       addu    $13,$13,$2
+       lw      $11,16($6)
+       sltu    $8,$13,$2
+       addu    $13,$12,$13
+       sltu    $2,$13,$12
+       sw      $13,12($4)
+       or      $2,$2,$8
+
+       addiu   $5,$5,16
+       addiu   $6,$6,16
+
+       bne     $7,$0,.Loop
+        addiu  $4,$4,16
+
+.Lend: addu    $11,$11,$2      # final limb: s2 limb + carry-in
+       sltu    $8,$11,$2
+       addu    $11,$10,$11
+       sltu    $2,$11,$10
+       sw      $11,0($4)
+       j       $31
+       or      $2,$2,$8        # (delay slot) carry-out is left in $2
+
+       .end    __mpn_add_n
diff --git a/sysdeps/mips/addmul_1.s b/sysdeps/mips/addmul_1.s

new file mode 100644 (file)

index 0000000..abc2fb8
--- /dev/null
+++ b/sysdeps/mips/addmul_1.s
@@ -0,0 +1,96 @@
+ # MIPS __mpn_addmul_1 -- Multiply a limb vector with a single limb and
+ # add the product to a second limb vector.
+
+ # Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     $4
+ # s1_ptr      $5
+ # size                $6
+ # s2_limb     $7
+
+       .text
+       .align   4
+       .globl   __mpn_addmul_1
+       .ent    __mpn_addmul_1
+__mpn_addmul_1:
+       .set    noreorder
+       .set    nomacro
+
+ # warm up phase 0
+       lw      $8,0($5)        # first s1 limb
+
+ # warm up phase 1
+       addiu   $5,$5,4
+       multu   $8,$7           # start s1 limb * s2_limb; the product lands in HI/LO
+
+       addiu   $6,$6,-1
+       beq     $6,$0,$LC0      # single limb: only the final cool-down phase is needed
+        move   $2,$0           # zero cy2
+
+       addiu   $6,$6,-1
+       beq     $6,$0,$LC1      # two limbs: skip the main loop
+       lw      $8,0($5)        # load new s1 limb as early as possible
+
+Loop:  lw      $10,0($4)       # result limb to accumulate into
+       mflo    $3              # low product limb
+       mfhi    $9              # high product limb
+       addiu   $5,$5,4
+       addu    $3,$3,$2        # add old carry limb to low product limb
+       multu   $8,$7           # start the next product
+       lw      $8,0($5)        # load new s1 limb as early as possible
+       addiu   $6,$6,-1        # decrement loop counter
+       sltu    $2,$3,$2        # carry from previous addition -> $2
+       addu    $3,$10,$3       # add the result limb
+       sltu    $10,$3,$10      # carry from that addition -> $10
+       addu    $2,$2,$10       # sum of both carries
+       sw      $3,0($4)
+       addiu   $4,$4,4
+       bne     $6,$0,Loop      # should be "bnel"
+        addu   $2,$9,$2        # add high product limb and carry from addition
+
+ # cool down phase 1
+$LC1:  lw      $10,0($4)
+       mflo    $3
+       mfhi    $9
+       addu    $3,$3,$2
+       sltu    $2,$3,$2
+       multu   $8,$7           # product for the last limb
+       addu    $3,$10,$3
+       sltu    $10,$3,$10
+       addu    $2,$2,$10
+       sw      $3,0($4)
+       addiu   $4,$4,4
+       addu    $2,$9,$2        # add high product limb and carry from addition
+
+ # cool down phase 0
+$LC0:  lw      $10,0($4)
+       mflo    $3
+       mfhi    $9
+       addu    $3,$3,$2
+       sltu    $2,$3,$2
+       addu    $3,$10,$3
+       sltu    $10,$3,$10
+       addu    $2,$2,$10
+       sw      $3,0($4)
+       j       $31
+       addu    $2,$9,$2        # add high product limb and carry from addition
+
+       .end    __mpn_addmul_1
diff --git a/sysdeps/mips/lshift.s b/sysdeps/mips/lshift.s

new file mode 100644 (file)

index 0000000..ce33e7c
--- /dev/null
+++ b/sysdeps/mips/lshift.s
@@ -0,0 +1,94 @@
+ # MIPS2 __mpn_lshift -- Shift a limb vector left by cnt bits.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     $4
+ # src_ptr     $5
+ # size                $6
+ # cnt         $7
+
+       .text
+       .align  2
+       .globl  __mpn_lshift
+       .ent    __mpn_lshift
+__mpn_lshift:
+       .set    noreorder
+       .set    nomacro
+
+       sll     $2,$6,2         # $2 = size * 4 (length in bytes)
+       addu    $5,$5,$2        # make r5 point at end of src
+       lw      $10,-4($5)      # load first limb
+       subu    $13,$0,$7       # $13 = -cnt, used as the complementary right-shift count
+       addu    $4,$4,$2        # make r4 point at end of res
+       addiu   $6,$6,-1        # size-1: the lowest limb is finished at .Lend
+       and     $9,$6,4-1       # number of limbs in first loop
+       beq     $9,$0,.L0       # if multiple of 4 limbs, skip first loop
+        srl    $2,$10,$13      # compute function result
+
+       subu    $6,$6,$9        # limbs left for the 4-way unrolled loop
+
+.Loop0:        lw      $3,-8($5)       # next lower source limb
+       addiu   $4,$4,-4
+       addiu   $5,$5,-4
+       addiu   $9,$9,-1
+       sll     $11,$10,$7      # current limb shifted left by cnt
+       srl     $12,$3,$13      # bits carried in from the next lower limb
+       move    $10,$3
+       or      $8,$11,$12      # combine into the result limb
+       bne     $9,$0,.Loop0
+        sw     $8,0($4)
+
+.L0:   beq     $6,$0,.Lend     # nothing left for the unrolled loop
+        nop
+
+.Loop: lw      $3,-8($5)
+       addiu   $4,$4,-16
+       addiu   $6,$6,-4
+       sll     $11,$10,$7
+       srl     $12,$3,$13
+
+       lw      $10,-12($5)
+       sll     $14,$3,$7
+       or      $8,$11,$12
+       sw      $8,12($4)
+       srl     $9,$10,$13
+
+       lw      $3,-16($5)
+       sll     $11,$10,$7
+       or      $8,$14,$9
+       sw      $8,8($4)
+       srl     $12,$3,$13
+
+       lw      $10,-20($5)
+       sll     $14,$3,$7
+       or      $8,$11,$12
+       sw      $8,4($4)
+       srl     $9,$10,$13
+
+       addiu   $5,$5,-16
+       or      $8,$14,$9
+       bgtz    $6,.Loop
+        sw     $8,0($4)
+
+.Lend: sll     $8,$10,$7       # lowest limb: nothing is shifted in from below
+       j       $31
+       sw      $8,-4($4)       # (delay slot) store the lowest result limb
+       .end    __mpn_lshift
diff --git a/sysdeps/mips/mips3/add_n.s b/sysdeps/mips/mips3/add_n.s

new file mode 100644 (file)

index 0000000..b525780
--- /dev/null
+++ b/sysdeps/mips/mips3/add_n.s
@@ -0,0 +1,119 @@
+ # MIPS3 __mpn_add_n -- Add two limb vectors of the same length > 0 and
+ # store sum in a third limb vector.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     $4
+ # s1_ptr      $5
+ # s2_ptr      $6
+ # size                $7
+
+       .text
+       .align  2
+       .globl  __mpn_add_n
+       .ent    __mpn_add_n
+__mpn_add_n:
+       .set    noreorder
+       .set    nomacro
+
+       ld      $10,0($5)       # first s1 limb (64-bit)
+       ld      $11,0($6)       # first s2 limb (64-bit)
+
+       daddiu  $7,$7,-1        # size-1: the final limb is finished at .Lend
+       and     $9,$7,4-1       # number of limbs in first loop
+       beq     $9,$0,.L0       # if multiple of 4 limbs, skip first loop
+        move   $2,$0           # (delay slot) carry = 0
+
+       dsubu   $7,$7,$9        # limbs left for the 4-way unrolled loop
+
+.Loop0:        daddiu  $9,$9,-1
+       ld      $12,8($5)       # prefetch next s1 limb
+       daddu   $11,$11,$2      # s2 limb + carry-in
+       ld      $13,8($6)       # prefetch next s2 limb
+       sltu    $8,$11,$2       # $8 = carry out of adding the carry-in
+       daddu   $11,$10,$11     # + s1 limb
+       sltu    $2,$11,$10      # $2 = carry out of the main add
+       sd      $11,0($4)
+       or      $2,$2,$8        # combined carry-out
+
+       daddiu  $5,$5,8
+       daddiu  $6,$6,8
+       move    $10,$12
+       move    $11,$13
+       bne     $9,$0,.Loop0
+        daddiu $4,$4,8
+
+.L0:   beq     $7,$0,.Lend     # nothing left for the unrolled loop
+        nop
+
+.Loop: daddiu  $7,$7,-4
+
+       ld      $12,8($5)
+       daddu   $11,$11,$2
+       ld      $13,8($6)
+       sltu    $8,$11,$2
+       daddu   $11,$10,$11
+       sltu    $2,$11,$10
+       sd      $11,0($4)
+       or      $2,$2,$8
+
+       ld      $10,16($5)
+       daddu   $13,$13,$2
+       ld      $11,16($6)
+       sltu    $8,$13,$2
+       daddu   $13,$12,$13
+       sltu    $2,$13,$12
+       sd      $13,8($4)
+       or      $2,$2,$8
+
+       ld      $12,24($5)
+       daddu   $11,$11,$2
+       ld      $13,24($6)
+       sltu    $8,$11,$2
+       daddu   $11,$10,$11
+       sltu    $2,$11,$10
+       sd      $11,16($4)
+       or      $2,$2,$8
+
+       ld      $10,32($5)
+       daddu   $13,$13,$2
+       ld      $11,32($6)
+       sltu    $8,$13,$2
+       daddu   $13,$12,$13
+       sltu    $2,$13,$12
+       sd      $13,24($4)
+       or      $2,$2,$8
+
+       daddiu  $5,$5,32
+       daddiu  $6,$6,32
+
+       bne     $7,$0,.Loop
+        daddiu $4,$4,32
+
+.Lend: daddu   $11,$11,$2      # final limb: s2 limb + carry-in
+       sltu    $8,$11,$2
+       daddu   $11,$10,$11
+       sltu    $2,$11,$10
+       sd      $11,0($4)
+       j       $31
+       or      $2,$2,$8        # (delay slot) carry-out is left in $2
+
+       .end    __mpn_add_n
diff --git a/sysdeps/mips/mips3/addmul_1.s b/sysdeps/mips/mips3/addmul_1.s

new file mode 100644 (file)

index 0000000..7af0172
--- /dev/null
+++ b/sysdeps/mips/mips3/addmul_1.s
@@ -0,0 +1,96 @@
+ # MIPS3 __mpn_addmul_1 -- Multiply a limb vector with a single limb and
+ # add the product to a second limb vector.
+
+ # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     $4
+ # s1_ptr      $5
+ # size                $6
+ # s2_limb     $7
+
+       .text
+       .align  4
+       .globl  __mpn_addmul_1
+       .ent    __mpn_addmul_1
+__mpn_addmul_1:                # res[i] += s1[i] * s2_limb for size limbs; final carry limb is left in $2
+       .set    noreorder       # the instruction after each branch/jump is a hand-filled delay slot
+       .set    nomacro
+
+ # warm up phase 0
+       ld      $8,0($5)        # fetch first s1 limb
+
+ # warm up phase 1
+       daddiu  $5,$5,8         # advance s1_ptr
+       dmultu  $8,$7           # start first product; it is read back later with mflo/mfhi
+
+       daddiu  $6,$6,-1
+       beq     $6,$0,$LC0      # size was 1: only cool down phase 0 is needed
+        move   $2,$0           # zero cy2
+
+       daddiu  $6,$6,-1
+       beq     $6,$0,$LC1      # size was 2: skip the loop
+       ld      $8,0($5)        # load new s1 limb as early as possible
+
+Loop:  ld      $10,0($4)       # fetch res limb
+       mflo    $3              # low limb of the pending product
+       mfhi    $9              # high limb of the pending product
+       daddiu  $5,$5,8         # advance s1_ptr
+       daddu   $3,$3,$2        # add old carry limb to low product limb
+       dmultu  $8,$7           # start the next product
+       ld      $8,0($5)        # load new s1 limb as early as possible
+       daddiu  $6,$6,-1        # decrement loop counter
+       sltu    $2,$3,$2        # carry from previous addition -> $2
+       daddu   $3,$10,$3       # add the res limb
+       sltu    $10,$3,$10      # carry from adding the res limb
+       daddu   $2,$2,$10       # sum of both carries
+       sd      $3,0($4)        # store result limb
+       daddiu  $4,$4,8         # advance res_ptr
+       bne     $6,$0,Loop      # should be "bnel"
+        daddu  $2,$9,$2        # add high product limb and carry from addition
+
+ # cool down phase 1
+$LC1:  ld      $10,0($4)       # fetch res limb
+       mflo    $3              # low limb of the pending product
+       mfhi    $9              # high limb of the pending product
+       daddu   $3,$3,$2        # add old carry limb
+       sltu    $2,$3,$2        # carry from that addition
+       dmultu  $8,$7           # start the last product
+       daddu   $3,$10,$3       # add the res limb
+       sltu    $10,$3,$10      # carry from adding the res limb
+       daddu   $2,$2,$10       # sum of both carries
+       sd      $3,0($4)        # store result limb
+       daddiu  $4,$4,8         # advance res_ptr
+       daddu   $2,$9,$2        # add high product limb and carry from addition
+
+ # cool down phase 0
+$LC0:  ld      $10,0($4)       # fetch last res limb
+       mflo    $3              # low limb of the last product
+       mfhi    $9              # high limb of the last product
+       daddu   $3,$3,$2        # add old carry limb
+       sltu    $2,$3,$2        # carry from that addition
+       daddu   $3,$10,$3       # add the res limb
+       sltu    $10,$3,$10      # carry from adding the res limb
+       daddu   $2,$2,$10       # sum of both carries
+       sd      $3,0($4)        # store last result limb
+       j       $31             # jump to the address in $31; next instruction is its delay slot
+       daddu   $2,$9,$2        # add high product limb and carry from addition
+
+       .end    __mpn_addmul_1
diff --git a/sysdeps/mips/mips3/gmp-mparam.h b/sysdeps/mips/mips3/gmp-mparam.h

new file mode 100644 (file)

index 0000000..a801b35
--- /dev/null
+++ b/sysdeps/mips/mips3/gmp-mparam.h
@@ -0,0 +1,26 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Library General Public License as published by
+the Free Software Foundation; either version 2 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+License for more details.
+
+You should have received a copy of the GNU Library General Public License
+along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
+
+#define BITS_PER_MP_LIMB 64 /* limb width in bits; equals 8 * BYTES_PER_MP_LIMB */
+#define BYTES_PER_MP_LIMB 8 /* limb width in bytes */
+#define BITS_PER_LONGINT 32 /* NOTE(review): narrower than a limb; presumably limbs are not `long' here -- confirm */
+#define BITS_PER_INT 32 /* presumably the width of `int' in bits */
+#define BITS_PER_SHORTINT 16 /* presumably the width of `short' in bits */
+#define BITS_PER_CHAR 8 /* presumably the width of `char' in bits */
diff --git a/sysdeps/mips/mips3/lshift.s b/sysdeps/mips/mips3/lshift.s

new file mode 100644 (file)

index 0000000..c05dcaf
--- /dev/null
+++ b/sysdeps/mips/mips3/lshift.s
@@ -0,0 +1,94 @@
+ # MIPS3 __mpn_lshift -- Shift a limb vector left by cnt bits and store the result in a second limb vector.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     $4
+ # src_ptr     $5
+ # size                $6
+ # cnt         $7
+
+       .text
+       .align  2
+       .globl  __mpn_lshift
+       .ent    __mpn_lshift
+__mpn_lshift:                  # res = src << cnt, processed from the highest limb down; shifted-out bits are left in $2
+       .set    noreorder       # the instruction after each branch/jump is a hand-filled delay slot
+       .set    nomacro
+
+       dsll    $2,$6,3         # size * 8 = byte length of the vectors
+       daddu   $5,$5,$2        # make r5 point at end of src
+       ld      $10,-8($5)      # load first limb
+       dsubu   $13,$0,$7       # $13 = -cnt, the count used for the opposite (right) shifts
+       daddu   $4,$4,$2        # make r4 point at end of res
+       daddiu  $6,$6,-1        # size - 1 limbs remain after the first
+       and     $9,$6,4-1       # number of limbs in first loop
+       beq     $9,$0,.L0       # if multiple of 4 limbs, skip first loop
+        dsrl   $2,$10,$13      # compute function result
+
+       dsubu   $6,$6,$9        # what is left is a multiple of 4, for the unrolled loop
+
+.Loop0:        ld      $3,-16($5)      # next lower limb
+       daddiu  $4,$4,-8        # step res_ptr down one limb
+       daddiu  $5,$5,-8        # step src_ptr down one limb
+       daddiu  $9,$9,-1        # decrement first-loop counter
+       dsll    $11,$10,$7      # upper part, from the current limb
+       dsrl    $12,$3,$13      # lower part, from the next lower limb
+       move    $10,$3          # next lower limb becomes the current limb
+       or      $8,$11,$12      # combine both parts
+       bne     $9,$0,.Loop0
+        sd     $8,0($4)        # (delay slot) store result limb
+
+.L0:   beq     $6,$0,.Lend     # nothing left for the unrolled loop
+        nop
+
+.Loop: ld      $3,-16($5)      # limb 1 of this group of four
+       daddiu  $4,$4,-32       # step res_ptr down four limbs up front
+       daddiu  $6,$6,-4        # four limbs handled per iteration
+       dsll    $11,$10,$7      # upper part of result 0
+       dsrl    $12,$3,$13      # lower part of result 0
+
+       ld      $10,-24($5)     # limb 2
+       dsll    $14,$3,$7       # upper part of result 1
+       or      $8,$11,$12      # result 0
+       sd      $8,24($4)       # store result 0
+       dsrl    $9,$10,$13      # lower part of result 1
+
+       ld      $3,-32($5)      # limb 3
+       dsll    $11,$10,$7      # upper part of result 2
+       or      $8,$14,$9       # result 1
+       sd      $8,16($4)       # store result 1
+       dsrl    $12,$3,$13      # lower part of result 2
+
+       ld      $10,-40($5)     # limb 4, carried into the next iteration or .Lend
+       dsll    $14,$3,$7       # upper part of result 3
+       or      $8,$11,$12      # result 2
+       sd      $8,8($4)        # store result 2
+       dsrl    $9,$10,$13      # lower part of result 3
+
+       daddiu  $5,$5,-32       # step src_ptr down four limbs
+       or      $8,$14,$9       # result 3
+       bgtz    $6,.Loop
+        sd     $8,0($4)        # (delay slot) store result 3
+
+.Lend: dsll    $8,$10,$7       # lowest limb has no lower neighbour to merge with
+       j       $31             # jump to the address in $31; next instruction is its delay slot
+       sd      $8,-8($4)       # store the lowest result limb
+       .end    __mpn_lshift
diff --git a/sysdeps/mips/mips3/mul_1.s b/sysdeps/mips/mips3/mul_1.s

new file mode 100644 (file)

index 0000000..87954e5
--- /dev/null
+++ b/sysdeps/mips/mips3/mul_1.s
@@ -0,0 +1,84 @@
+ # MIPS3 __mpn_mul_1 -- Multiply a limb vector with a single limb and
+ # store the product in a second limb vector.
+
+ # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     $4
+ # s1_ptr      $5
+ # size                $6
+ # s2_limb     $7
+
+       .text
+       .align  4
+       .globl  __mpn_mul_1
+       .ent    __mpn_mul_1
+__mpn_mul_1:                   # res[i] = s1[i] * s2_limb for size limbs; final carry limb is left in $2
+       .set    noreorder       # the instruction after each branch/jump is a hand-filled delay slot
+       .set    nomacro
+
+ # warm up phase 0
+       ld      $8,0($5)        # fetch first s1 limb
+
+ # warm up phase 1
+       daddiu  $5,$5,8         # advance s1_ptr
+       dmultu  $8,$7           # start first product; it is read back later with mflo/mfhi
+
+       daddiu  $6,$6,-1
+       beq     $6,$0,$LC0      # size was 1: only cool down phase 0 is needed
+        move   $2,$0           # zero cy2
+
+       daddiu  $6,$6,-1
+       beq     $6,$0,$LC1      # size was 2: skip the loop
+       ld      $8,0($5)        # load new s1 limb as early as possible
+
+Loop:  mflo    $10             # low limb of the pending product
+       mfhi    $9              # high limb of the pending product
+       daddiu  $5,$5,8         # advance s1_ptr
+       daddu   $10,$10,$2      # add old carry limb to low product limb
+       dmultu  $8,$7           # start the next product
+       ld      $8,0($5)        # load new s1 limb as early as possible
+       daddiu  $6,$6,-1        # decrement loop counter
+       sltu    $2,$10,$2       # carry from previous addition -> $2
+       sd      $10,0($4)       # store result limb
+       daddiu  $4,$4,8         # advance res_ptr
+       bne     $6,$0,Loop      # should be "bnel"
+        daddu  $2,$9,$2        # add high product limb and carry from addition
+
+ # cool down phase 1
+$LC1:  mflo    $10             # low limb of the pending product
+       mfhi    $9              # high limb of the pending product
+       daddu   $10,$10,$2      # add old carry limb
+       sltu    $2,$10,$2       # carry from that addition
+       dmultu  $8,$7           # start the last product
+       sd      $10,0($4)       # store result limb
+       daddiu  $4,$4,8         # advance res_ptr
+       daddu   $2,$9,$2        # add high product limb and carry from addition
+
+ # cool down phase 0
+$LC0:  mflo    $10             # low limb of the last product
+       mfhi    $9              # high limb of the last product
+       daddu   $10,$10,$2      # add old carry limb
+       sltu    $2,$10,$2       # carry from that addition
+       sd      $10,0($4)       # store last result limb
+       j       $31             # jump to the address in $31; next instruction is its delay slot
+       daddu   $2,$9,$2        # add high product limb and carry from addition
+
+       .end    __mpn_mul_1
diff --git a/sysdeps/mips/mips3/rshift.s b/sysdeps/mips/mips3/rshift.s

new file mode 100644 (file)

index 0000000..e0e2ca2
--- /dev/null
+++ b/sysdeps/mips/mips3/rshift.s
@@ -0,0 +1,91 @@
+ # MIPS3 __mpn_rshift -- Shift a limb vector right by cnt bits and store the result in a second limb vector.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     $4
+ # src_ptr     $5
+ # size                $6
+ # cnt         $7
+
+       .text
+       .align  2
+       .globl  __mpn_rshift
+       .ent    __mpn_rshift
+__mpn_rshift:                  # res = src >> cnt, processed from the lowest limb up; shifted-out bits are left in $2
+       .set    noreorder       # the instruction after each branch/jump is a hand-filled delay slot
+       .set    nomacro
+
+       ld      $10,0($5)       # load first limb
+       dsubu   $13,$0,$7       # $13 = -cnt, the count used for the opposite (left) shifts
+       daddiu  $6,$6,-1        # size - 1 limbs remain after the first
+       and     $9,$6,4-1       # number of limbs in first loop
+       beq     $9,$0,.L0       # if multiple of 4 limbs, skip first loop
+        dsll   $2,$10,$13      # compute function result
+
+       dsubu   $6,$6,$9        # what is left is a multiple of 4, for the unrolled loop
+
+.Loop0:        ld      $3,8($5)        # next higher limb
+       daddiu  $4,$4,8         # step res_ptr up one limb (store below uses offset -8)
+       daddiu  $5,$5,8         # step src_ptr up one limb
+       daddiu  $9,$9,-1        # decrement first-loop counter
+       dsrl    $11,$10,$7      # lower part, from the current limb
+       dsll    $12,$3,$13      # upper part, from the next higher limb
+       move    $10,$3          # next higher limb becomes the current limb
+       or      $8,$11,$12      # combine both parts
+       bne     $9,$0,.Loop0
+        sd     $8,-8($4)       # (delay slot) store result limb
+
+.L0:   beq     $6,$0,.Lend     # nothing left for the unrolled loop
+        nop
+
+.Loop: ld      $3,8($5)        # limb 1 of this group of four
+       daddiu  $4,$4,32        # step res_ptr up four limbs up front
+       daddiu  $6,$6,-4        # four limbs handled per iteration
+       dsrl    $11,$10,$7      # lower part of result 0
+       dsll    $12,$3,$13      # upper part of result 0
+
+       ld      $10,16($5)      # limb 2
+       dsrl    $14,$3,$7       # lower part of result 1
+       or      $8,$11,$12      # result 0
+       sd      $8,-32($4)      # store result 0
+       dsll    $9,$10,$13      # upper part of result 1
+
+       ld      $3,24($5)       # limb 3
+       dsrl    $11,$10,$7      # lower part of result 2
+       or      $8,$14,$9       # result 1
+       sd      $8,-24($4)      # store result 1
+       dsll    $12,$3,$13      # upper part of result 2
+
+       ld      $10,32($5)      # limb 4, carried into the next iteration or .Lend
+       dsrl    $14,$3,$7       # lower part of result 3
+       or      $8,$11,$12      # result 2
+       sd      $8,-16($4)      # store result 2
+       dsll    $9,$10,$13      # upper part of result 3
+
+       daddiu  $5,$5,32        # step src_ptr up four limbs
+       or      $8,$14,$9       # result 3
+       bgtz    $6,.Loop
+        sd     $8,-8($4)       # (delay slot) store result 3
+
+.Lend: dsrl    $8,$10,$7       # highest limb has no higher neighbour to merge with
+       j       $31             # jump to the address in $31; next instruction is its delay slot
+       sd      $8,0($4)        # store the highest result limb
+       .end    __mpn_rshift
diff --git a/sysdeps/mips/mips3/sub_n.s b/sysdeps/mips/mips3/sub_n.s

new file mode 100644 (file)

index 0000000..9a45ffd
--- /dev/null
+++ b/sysdeps/mips/mips3/sub_n.s
@@ -0,0 +1,119 @@
+ # MIPS3 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+ # store difference in a third limb vector.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     $4
+ # s1_ptr      $5
+ # s2_ptr      $6
+ # size                $7
+
+       .text
+       .align  2
+       .globl  __mpn_sub_n
+       .ent    __mpn_sub_n
+__mpn_sub_n:                   # res[i] = s1[i] - s2[i] - borrow for size limbs; final borrow is left in $2
+       .set    noreorder       # the instruction after each branch/jump is a hand-filled delay slot
+       .set    nomacro
+
+       ld      $10,0($5)       # first s1 limb
+       ld      $11,0($6)       # first s2 limb
+
+       daddiu  $7,$7,-1        # size - 1: the last limb is handled at .Lend
+       and     $9,$7,4-1       # number of limbs in first loop
+       beq     $9,$0,.L0       # if multiple of 4 limbs, skip first loop
+        move   $2,$0           # (delay slot) clear the incoming borrow
+
+       dsubu   $7,$7,$9        # what is left is a multiple of 4, for the unrolled loop
+
+.Loop0:        daddiu  $9,$9,-1        # decrement first-loop counter
+       ld      $12,8($5)       # prefetch next s1 limb
+       daddu   $11,$11,$2      # s2 limb + incoming borrow
+       ld      $13,8($6)       # prefetch next s2 limb
+       sltu    $8,$11,$2       # that addition wrapped around
+       dsubu   $11,$10,$11     # s1 limb - (s2 limb + borrow)
+       sltu    $2,$10,$11      # borrow: difference came out larger than the s1 limb
+       sd      $11,0($4)       # store result limb
+       or      $2,$2,$8        # outgoing borrow
+
+       daddiu  $5,$5,8         # advance s1_ptr
+       daddiu  $6,$6,8         # advance s2_ptr
+       move    $10,$12         # prefetched s1 limb becomes current
+       move    $11,$13         # prefetched s2 limb becomes current
+       bne     $9,$0,.Loop0
+        daddiu $4,$4,8         # (delay slot) advance res_ptr
+
+.L0:   beq     $7,$0,.Lend     # nothing left for the unrolled loop
+        nop
+
+.Loop: daddiu  $7,$7,-4        # four limbs handled per iteration
+
+       ld      $12,8($5)       # limb 0: current pair is $10/$11, prefetch into $12/$13
+       daddu   $11,$11,$2      # s2 limb + incoming borrow
+       ld      $13,8($6)
+       sltu    $8,$11,$2       # that addition wrapped around
+       dsubu   $11,$10,$11     # s1 limb - (s2 limb + borrow)
+       sltu    $2,$10,$11      # borrow: difference came out larger than the s1 limb
+       sd      $11,0($4)       # store result limb 0
+       or      $2,$2,$8        # outgoing borrow
+
+       ld      $10,16($5)      # limb 1: same step with the register pairs swapped
+       daddu   $13,$13,$2
+       ld      $11,16($6)
+       sltu    $8,$13,$2
+       dsubu   $13,$12,$13
+       sltu    $2,$12,$13
+       sd      $13,8($4)       # store result limb 1
+       or      $2,$2,$8
+
+       ld      $12,24($5)      # limb 2: as limb 0
+       daddu   $11,$11,$2
+       ld      $13,24($6)
+       sltu    $8,$11,$2
+       dsubu   $11,$10,$11
+       sltu    $2,$10,$11
+       sd      $11,16($4)      # store result limb 2
+       or      $2,$2,$8
+
+       ld      $10,32($5)      # limb 3: as limb 1; $10/$11 are carried into the next iteration or .Lend
+       daddu   $13,$13,$2
+       ld      $11,32($6)
+       sltu    $8,$13,$2
+       dsubu   $13,$12,$13
+       sltu    $2,$12,$13
+       sd      $13,24($4)      # store result limb 3
+       or      $2,$2,$8
+
+       daddiu  $5,$5,32        # advance s1_ptr by four limbs
+       daddiu  $6,$6,32        # advance s2_ptr by four limbs
+
+       bne     $7,$0,.Loop
+        daddiu $4,$4,32        # (delay slot) advance res_ptr by four limbs
+
+.Lend: daddu   $11,$11,$2      # last limb: s2 limb + incoming borrow
+       sltu    $8,$11,$2       # that addition wrapped around
+       dsubu   $11,$10,$11     # s1 limb - (s2 limb + borrow)
+       sltu    $2,$10,$11      # borrow: difference came out larger than the s1 limb
+       sd      $11,0($4)       # store last result limb
+       j       $31             # jump to the address in $31; next instruction is its delay slot
+       or      $2,$2,$8        # final borrow
+
+       .end    __mpn_sub_n
diff --git a/sysdeps/mips/mips3/submul_1.s b/sysdeps/mips/mips3/submul_1.s

new file mode 100644 (file)

index 0000000..f28c6a5
--- /dev/null
+++ b/sysdeps/mips/mips3/submul_1.s
@@ -0,0 +1,96 @@
+ # MIPS3 __mpn_submul_1 -- Multiply a limb vector with a single limb and
+ # subtract the product from a second limb vector.
+
+ # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     $4
+ # s1_ptr      $5
+ # size                $6
+ # s2_limb     $7
+
+       .text
+       .align  4
+       .globl  __mpn_submul_1
+       .ent    __mpn_submul_1
+__mpn_submul_1:                # res[i] -= s1[i] * s2_limb for size limbs; final borrow limb is left in $2
+       .set    noreorder       # the instruction after each branch/jump is a hand-filled delay slot
+       .set    nomacro
+
+ # warm up phase 0
+       ld      $8,0($5)        # fetch first s1 limb
+
+ # warm up phase 1
+       daddiu  $5,$5,8         # advance s1_ptr
+       dmultu  $8,$7           # start first product; it is read back later with mflo/mfhi
+
+       daddiu  $6,$6,-1
+       beq     $6,$0,$LC0      # size was 1: only cool down phase 0 is needed
+        move   $2,$0           # zero cy2
+
+       daddiu  $6,$6,-1
+       beq     $6,$0,$LC1      # size was 2: skip the loop
+       ld      $8,0($5)        # load new s1 limb as early as possible
+
+Loop:  ld      $10,0($4)       # fetch res limb
+       mflo    $3              # low limb of the pending product
+       mfhi    $9              # high limb of the pending product
+       daddiu  $5,$5,8         # advance s1_ptr
+       daddu   $3,$3,$2        # add old carry limb to low product limb
+       dmultu  $8,$7           # start the next product
+       ld      $8,0($5)        # load new s1 limb as early as possible
+       daddiu  $6,$6,-1        # decrement loop counter
+       sltu    $2,$3,$2        # carry from previous addition -> $2
+       dsubu   $3,$10,$3       # res limb - (low product limb + carry)
+       sgtu    $10,$3,$10      # borrow: difference came out larger than the res limb
+       daddu   $2,$2,$10       # carry plus borrow
+       sd      $3,0($4)        # store result limb
+       daddiu  $4,$4,8         # advance res_ptr
+       bne     $6,$0,Loop      # should be "bnel"
+        daddu  $2,$9,$2        # add high product limb and carry from addition
+
+ # cool down phase 1
+$LC1:  ld      $10,0($4)       # fetch res limb
+       mflo    $3              # low limb of the pending product
+       mfhi    $9              # high limb of the pending product
+       daddu   $3,$3,$2        # add old carry limb
+       sltu    $2,$3,$2        # carry from that addition
+       dmultu  $8,$7           # start the last product
+       dsubu   $3,$10,$3       # res limb - (low product limb + carry)
+       sgtu    $10,$3,$10      # borrow: difference came out larger than the res limb
+       daddu   $2,$2,$10       # carry plus borrow
+       sd      $3,0($4)        # store result limb
+       daddiu  $4,$4,8         # advance res_ptr
+       daddu   $2,$9,$2        # add high product limb and carry from addition
+
+ # cool down phase 0
+$LC0:  ld      $10,0($4)       # fetch last res limb
+       mflo    $3              # low limb of the last product
+       mfhi    $9              # high limb of the last product
+       daddu   $3,$3,$2        # add old carry limb
+       sltu    $2,$3,$2        # carry from that addition
+       dsubu   $3,$10,$3       # res limb - (low product limb + carry)
+       sgtu    $10,$3,$10      # borrow: difference came out larger than the res limb
+       daddu   $2,$2,$10       # carry plus borrow
+       sd      $3,0($4)        # store last result limb
+       j       $31             # jump to the address in $31; next instruction is its delay slot
+       daddu   $2,$9,$2        # add high product limb and carry from addition
+
+       .end    __mpn_submul_1
diff --git a/sysdeps/mips/mul_1.s b/sysdeps/mips/mul_1.s

new file mode 100644 (file)

index 0000000..01327e2
--- /dev/null
+++ b/sysdeps/mips/mul_1.s
@@ -0,0 +1,84 @@
+ # MIPS __mpn_mul_1 -- Multiply a limb vector with a single limb and
+ # store the product in a second limb vector.
+
+ # Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     $4
+ # s1_ptr      $5
+ # size                $6
+ # s2_limb     $7
+
+       .text
+       .align   4
+       .globl   __mpn_mul_1
+       .ent    __mpn_mul_1
+__mpn_mul_1:                   # res[i] = s1[i] * s2_limb for size 4-byte limbs; final carry limb is left in $2
+       .set    noreorder       # the instruction after each branch/jump is a hand-filled delay slot
+       .set    nomacro
+
+ # warm up phase 0
+       lw      $8,0($5)        # fetch first s1 limb
+
+ # warm up phase 1
+       addiu   $5,$5,4         # advance s1_ptr
+       multu   $8,$7           # start first product; it is read back later with mflo/mfhi
+
+       addiu   $6,$6,-1
+       beq     $6,$0,$LC0      # size was 1: only cool down phase 0 is needed
+        move   $2,$0           # zero cy2
+
+       addiu   $6,$6,-1
+       beq     $6,$0,$LC1      # size was 2: skip the loop
+       lw      $8,0($5)        # load new s1 limb as early as possible
+
+Loop:  mflo    $10             # low limb of the pending product
+       mfhi    $9              # high limb of the pending product
+       addiu   $5,$5,4         # advance s1_ptr
+       addu    $10,$10,$2      # add old carry limb to low product limb
+       multu   $8,$7           # start the next product
+       lw      $8,0($5)        # load new s1 limb as early as possible
+       addiu   $6,$6,-1        # decrement loop counter
+       sltu    $2,$10,$2       # carry from previous addition -> $2
+       sw      $10,0($4)       # store result limb
+       addiu   $4,$4,4         # advance res_ptr
+       bne     $6,$0,Loop      # should be "bnel"
+        addu   $2,$9,$2        # add high product limb and carry from addition
+
+ # cool down phase 1
+$LC1:  mflo    $10             # low limb of the pending product
+       mfhi    $9              # high limb of the pending product
+       addu    $10,$10,$2      # add old carry limb
+       sltu    $2,$10,$2       # carry from that addition
+       multu   $8,$7           # start the last product
+       sw      $10,0($4)       # store result limb
+       addiu   $4,$4,4         # advance res_ptr
+       addu    $2,$9,$2        # add high product limb and carry from addition
+
+ # cool down phase 0
+$LC0:  mflo    $10             # low limb of the last product
+       mfhi    $9              # high limb of the last product
+       addu    $10,$10,$2      # add old carry limb
+       sltu    $2,$10,$2       # carry from that addition
+       sw      $10,0($4)       # store last result limb
+       j       $31             # jump to the address in $31; next instruction is its delay slot
+       addu    $2,$9,$2        # add high product limb and carry from addition
+
+       .end    __mpn_mul_1
diff --git a/sysdeps/mips/rshift.s b/sysdeps/mips/rshift.s

new file mode 100644 (file)

index 0000000..6941691
--- /dev/null
+++ b/sysdeps/mips/rshift.s
@@ -0,0 +1,91 @@
+ # MIPS2 __mpn_rshift -- Shift a limb vector right by cnt bits and store the result in a second limb vector.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     $4
+ # src_ptr     $5
+ # size                $6
+ # cnt         $7
+
+       .text
+       .align  2
+       .globl  __mpn_rshift
+       .ent    __mpn_rshift
+__mpn_rshift:                  # res = src >> cnt on 4-byte limbs, lowest limb first; shifted-out bits are left in $2
+       .set    noreorder       # the instruction after each branch/jump is a hand-filled delay slot
+       .set    nomacro
+
+       lw      $10,0($5)       # load first limb
+       subu    $13,$0,$7       # $13 = -cnt, the count used for the opposite (left) shifts
+       addiu   $6,$6,-1        # size - 1 limbs remain after the first
+       and     $9,$6,4-1       # number of limbs in first loop
+       beq     $9,$0,.L0       # if multiple of 4 limbs, skip first loop
+        sll    $2,$10,$13      # compute function result
+
+       subu    $6,$6,$9        # what is left is a multiple of 4, for the unrolled loop
+
+.Loop0:        lw      $3,4($5)        # next higher limb
+       addiu   $4,$4,4         # step res_ptr up one limb (store below uses offset -4)
+       addiu   $5,$5,4         # step src_ptr up one limb
+       addiu   $9,$9,-1        # decrement first-loop counter
+       srl     $11,$10,$7      # lower part, from the current limb
+       sll     $12,$3,$13      # upper part, from the next higher limb
+       move    $10,$3          # next higher limb becomes the current limb
+       or      $8,$11,$12      # combine both parts
+       bne     $9,$0,.Loop0
+        sw     $8,-4($4)       # (delay slot) store result limb
+
+.L0:   beq     $6,$0,.Lend     # nothing left for the unrolled loop
+        nop
+
+.Loop: lw      $3,4($5)        # limb 1 of this group of four
+       addiu   $4,$4,16        # step res_ptr up four limbs up front
+       addiu   $6,$6,-4        # four limbs handled per iteration
+       srl     $11,$10,$7      # lower part of result 0
+       sll     $12,$3,$13      # upper part of result 0
+
+       lw      $10,8($5)       # limb 2
+       srl     $14,$3,$7       # lower part of result 1
+       or      $8,$11,$12      # result 0
+       sw      $8,-16($4)      # store result 0
+       sll     $9,$10,$13      # upper part of result 1
+
+       lw      $3,12($5)       # limb 3
+       srl     $11,$10,$7      # lower part of result 2
+       or      $8,$14,$9       # result 1
+       sw      $8,-12($4)      # store result 1
+       sll     $12,$3,$13      # upper part of result 2
+
+       lw      $10,16($5)      # limb 4, carried into the next iteration or .Lend
+       srl     $14,$3,$7       # lower part of result 3
+       or      $8,$11,$12      # result 2
+       sw      $8,-8($4)       # store result 2
+       sll     $9,$10,$13      # upper part of result 3
+
+       addiu   $5,$5,16        # step src_ptr up four limbs
+       or      $8,$14,$9       # result 3
+       bgtz    $6,.Loop
+        sw     $8,-4($4)       # (delay slot) store result 3
+
+.Lend: srl     $8,$10,$7       # highest limb has no higher neighbour to merge with
+       j       $31             # jump to the address in $31; next instruction is its delay slot
+       sw      $8,0($4)        # store the highest result limb
+       .end    __mpn_rshift
diff --git a/sysdeps/mips/sub_n.s b/sysdeps/mips/sub_n.s

new file mode 100644 (file)

index 0000000..63f3b55
--- /dev/null
+++ b/sysdeps/mips/sub_n.s
@@ -0,0 +1,119 @@
+ # MIPS2 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+ # store difference in a third limb vector.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     $4
+ # s1_ptr      $5
+ # s2_ptr      $6
+ # size                $7
+
+       .text
+       .align  2
+       .globl  __mpn_sub_n
+       .ent    __mpn_sub_n
+__mpn_sub_n:                   # res[i] = s1[i] - s2[i] - borrow for size 4-byte limbs; final borrow is left in $2
+       .set    noreorder       # the instruction after each branch/jump is a hand-filled delay slot
+       .set    nomacro
+
+       lw      $10,0($5)       # first s1 limb
+       lw      $11,0($6)       # first s2 limb
+
+       addiu   $7,$7,-1        # size - 1: the last limb is handled at .Lend
+       and     $9,$7,4-1       # number of limbs in first loop
+       beq     $9,$0,.L0       # if multiple of 4 limbs, skip first loop
+        move   $2,$0           # (delay slot) clear the incoming borrow
+
+       subu    $7,$7,$9        # what is left is a multiple of 4, for the unrolled loop
+
+.Loop0:        addiu   $9,$9,-1        # decrement first-loop counter
+       lw      $12,4($5)       # prefetch next s1 limb
+       addu    $11,$11,$2      # s2 limb + incoming borrow
+       lw      $13,4($6)       # prefetch next s2 limb
+       sltu    $8,$11,$2       # that addition wrapped around
+       subu    $11,$10,$11     # s1 limb - (s2 limb + borrow)
+       sltu    $2,$10,$11      # borrow: difference came out larger than the s1 limb
+       sw      $11,0($4)       # store result limb
+       or      $2,$2,$8        # outgoing borrow
+
+       addiu   $5,$5,4         # advance s1_ptr
+       addiu   $6,$6,4         # advance s2_ptr
+       move    $10,$12         # prefetched s1 limb becomes current
+       move    $11,$13         # prefetched s2 limb becomes current
+       bne     $9,$0,.Loop0
+        addiu  $4,$4,4         # (delay slot) advance res_ptr
+
+.L0:   beq     $7,$0,.Lend     # nothing left for the unrolled loop
+        nop
+
+.Loop: addiu   $7,$7,-4        # four limbs handled per iteration
+
+       lw      $12,4($5)       # limb 0: current pair is $10/$11, prefetch into $12/$13
+       addu    $11,$11,$2      # s2 limb + incoming borrow
+       lw      $13,4($6)
+       sltu    $8,$11,$2       # that addition wrapped around
+       subu    $11,$10,$11     # s1 limb - (s2 limb + borrow)
+       sltu    $2,$10,$11      # borrow: difference came out larger than the s1 limb
+       sw      $11,0($4)       # store result limb 0
+       or      $2,$2,$8        # outgoing borrow
+
+       lw      $10,8($5)       # limb 1: same step with the register pairs swapped
+       addu    $13,$13,$2
+       lw      $11,8($6)
+       sltu    $8,$13,$2
+       subu    $13,$12,$13
+       sltu    $2,$12,$13
+       sw      $13,4($4)       # store result limb 1
+       or      $2,$2,$8
+
+       lw      $12,12($5)      # limb 2: as limb 0
+       addu    $11,$11,$2
+       lw      $13,12($6)
+       sltu    $8,$11,$2
+       subu    $11,$10,$11
+       sltu    $2,$10,$11
+       sw      $11,8($4)       # store result limb 2
+       or      $2,$2,$8
+
+       lw      $10,16($5)      # limb 3: as limb 1; $10/$11 are carried into the next iteration or .Lend
+       addu    $13,$13,$2
+       lw      $11,16($6)
+       sltu    $8,$13,$2
+       subu    $13,$12,$13
+       sltu    $2,$12,$13
+       sw      $13,12($4)      # store result limb 3
+       or      $2,$2,$8
+
+       addiu   $5,$5,16        # advance s1_ptr by four limbs
+       addiu   $6,$6,16        # advance s2_ptr by four limbs
+
+       bne     $7,$0,.Loop
+        addiu  $4,$4,16        # (delay slot) advance res_ptr by four limbs
+
+.Lend: addu    $11,$11,$2      # last limb: s2 limb + incoming borrow
+       sltu    $8,$11,$2       # that addition wrapped around
+       subu    $11,$10,$11     # s1 limb - (s2 limb + borrow)
+       sltu    $2,$10,$11      # borrow: difference came out larger than the s1 limb
+       sw      $11,0($4)       # store last result limb
+       j       $31             # jump to the address in $31; next instruction is its delay slot
+       or      $2,$2,$8        # final borrow
+
+       .end    __mpn_sub_n
diff --git a/sysdeps/mips/submul_1.s b/sysdeps/mips/submul_1.s

new file mode 100644 (file)

index 0000000..616dd1b
--- /dev/null
+++ b/sysdeps/mips/submul_1.s
@@ -0,0 +1,96 @@
+ # MIPS __mpn_submul_1 -- Multiply a limb vector with a single limb and
+ # subtract the product from a second limb vector.
+
+ # Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     $4
+ # s1_ptr      $5
+ # size                $6
+ # s2_limb     $7
+
+       .text
+       .align   4
+       .globl   __mpn_submul_1
+       .ent    __mpn_submul_1
+__mpn_submul_1:                # res[i] -= s1[i] * s2_limb for size 4-byte limbs; final borrow limb is left in $2
+       .set    noreorder       # the instruction after each branch/jump is a hand-filled delay slot
+       .set    nomacro
+
+ # warm up phase 0
+       lw      $8,0($5)        # fetch first s1 limb
+
+ # warm up phase 1
+       addiu   $5,$5,4         # advance s1_ptr
+       multu   $8,$7           # start first product; it is read back later with mflo/mfhi
+
+       addiu   $6,$6,-1
+       beq     $6,$0,$LC0      # size was 1: only cool down phase 0 is needed
+        move   $2,$0           # zero cy2
+
+       addiu   $6,$6,-1
+       beq     $6,$0,$LC1      # size was 2: skip the loop
+       lw      $8,0($5)        # load new s1 limb as early as possible
+
+Loop:  lw      $10,0($4)       # fetch res limb
+       mflo    $3              # low limb of the pending product
+       mfhi    $9              # high limb of the pending product
+       addiu   $5,$5,4         # advance s1_ptr
+       addu    $3,$3,$2        # add old carry limb to low product limb
+       multu   $8,$7           # start the next product
+       lw      $8,0($5)        # load new s1 limb as early as possible
+       addiu   $6,$6,-1        # decrement loop counter
+       sltu    $2,$3,$2        # carry from previous addition -> $2
+       subu    $3,$10,$3       # res limb - (low product limb + carry)
+       sgtu    $10,$3,$10      # borrow: difference came out larger than the res limb
+       addu    $2,$2,$10       # carry plus borrow
+       sw      $3,0($4)        # store result limb
+       addiu   $4,$4,4         # advance res_ptr
+       bne     $6,$0,Loop      # should be "bnel"
+        addu   $2,$9,$2        # add high product limb and carry from addition
+
+ # cool down phase 1
+$LC1:  lw      $10,0($4)       # fetch res limb
+       mflo    $3              # low limb of the pending product
+       mfhi    $9              # high limb of the pending product
+       addu    $3,$3,$2        # add old carry limb
+       sltu    $2,$3,$2        # carry from that addition
+       multu   $8,$7           # start the last product
+       subu    $3,$10,$3       # res limb - (low product limb + carry)
+       sgtu    $10,$3,$10      # borrow: difference came out larger than the res limb
+       addu    $2,$2,$10       # carry plus borrow
+       sw      $3,0($4)        # store result limb
+       addiu   $4,$4,4         # advance res_ptr
+       addu    $2,$9,$2        # add high product limb and carry from addition
+
+ # cool down phase 0
+$LC0:  lw      $10,0($4)       # fetch last res limb
+       mflo    $3              # low limb of the last product
+       mfhi    $9              # high limb of the last product
+       addu    $3,$3,$2        # add old carry limb
+       sltu    $2,$3,$2        # carry from that addition
+       subu    $3,$10,$3       # res limb - (low product limb + carry)
+       sgtu    $10,$3,$10      # borrow: difference came out larger than the res limb
+       addu    $2,$2,$10       # carry plus borrow
+       sw      $3,0($4)        # store last result limb
+       j       $31             # jump to the address in $31; next instruction is its delay slot
+       addu    $2,$9,$2        # add high product limb and carry from addition
+
+       .end    __mpn_submul_1
diff --git a/sysdeps/rs6000/add_n.s b/sysdeps/rs6000/add_n.s

new file mode 100644 (file)

index 0000000..34ad9e1
--- /dev/null
+++ b/sysdeps/rs6000/add_n.s
@@ -0,0 +1,54 @@
+# IBM POWER __mpn_add_n -- Add two limb vectors of equal, non-zero length.
+
+# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+# License for more details.
+
+# You should have received a copy of the GNU Library General Public License
+# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr      r3
+# s1_ptr       r4
+# s2_ptr       r5
+# size         r6
+
+       .toc
+       .extern __mpn_add_n[DS]
+       .extern .__mpn_add_n
+.csect [PR]
+       .align 2
+       .globl __mpn_add_n
+       .globl .__mpn_add_n
+       .csect __mpn_add_n[DS]
+__mpn_add_n:
+       .long .__mpn_add_n, TOC[tc0], 0
+       .csect [PR]
+.__mpn_add_n:
+       mtctr   6               # copy size into CTR
+       l       8,0(4)          # load least significant s1 limb
+       l       0,0(5)          # load least significant s2 limb
+       cal     3,-4(3)         # offset res_ptr, it's updated before used
+       a       7,0,8           # add least significant limbs, set cy
+       bdz     Lend            # If done, skip loop
+Loop:  lu      8,4(4)          # load s1 limb and update s1_ptr
+       lu      0,4(5)          # load s2 limb and update s2_ptr
+       stu     7,4(3)          # store previous limb in load latency slot
+       ae      7,0,8           # add new limbs with cy, set cy
+       bdn     Loop            # decrement CTR and loop back
+Lend:  st      7,4(3)          # store ultimate result limb
+       lil     3,0             # load cy into ...
+       aze     3,3             # ... return value register
+       br                      # return
diff --git a/sysdeps/rs6000/addmul_1.s b/sysdeps/rs6000/addmul_1.s

new file mode 100644 (file)

index 0000000..862b613
--- /dev/null
+++ b/sysdeps/rs6000/addmul_1.s
@@ -0,0 +1,122 @@
+# IBM POWER __mpn_addmul_1 -- Multiply a limb vector with a limb and add
+# the result to a second limb vector.
+
+# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+# License for more details.
+
+# You should have received a copy of the GNU Library General Public License
+# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr      r3
+# s1_ptr       r4
+# size         r5
+# s2_limb      r6
+
+# The RS/6000 has no unsigned 32x32->64 bit multiplication instruction.  To
+# obtain that operation, we have to use the 32x32->64 signed multiplication
+# instruction, and add the appropriate compensation to the high limb of the
+# result.  We add the multiplicand if the multiplier has its most significant
+# bit set, and we add the multiplier if the multiplicand has its most
+# significant bit set.  We need to preserve the carry flag between each
+# iteration, so we have to compute the compensation carefully (the natural,
+# srai+and doesn't work).  Since the POWER architecture has a branch unit
+# we can branch in zero cycles, so that's how we perform the additions.
+
+       .toc
+       .csect .__mpn_addmul_1[PR]
+       .align 2
+       .globl __mpn_addmul_1
+       .globl .__mpn_addmul_1
+       .csect __mpn_addmul_1[DS]
+__mpn_addmul_1:
+       .long .__mpn_addmul_1[PR], TOC[tc0], 0
+       .csect .__mpn_addmul_1[PR]
+.__mpn_addmul_1:
+
+       cal     3,-4(3)         # offset res_ptr, it's updated before used
+       l       0,0(4)          # load least significant s1 limb
+       cmpi    0,6,0           # test sign of s2_limb for the blt below
+       mtctr   5               # copy size into CTR
+       mul     9,0,6           # signed multiply; low limb is fetched from MQ
+       srai    7,0,31          # r7 = all ones if s1 limb is negative, else 0
+       and     7,7,6           # r7 = s2_limb if s1 limb is negative, else 0
+       mfmq    8               # low product limb
+       cax     9,9,7           # adjust high limb for negative limb from s1
+       l       7,4(3)          # load res_limb
+       a       8,8,7           # add res_limb
+       blt     Lneg            # s2_limb is negative
+Lpos:  bdz     Lend            # if just one limb, skip loop
+
+Lploop:        lu      0,4(4)          # load s1 limb and update s1_ptr
+       stu     8,4(3)          # store previous result limb, update res_ptr
+       cmpi    0,0,0           # test sign of new s1 limb
+       mul     10,0,6
+       mfmq    0               # low product limb
+       ae      8,0,9           # low limb + old_cy_limb + old cy
+       l       7,4(3)          # load res_limb
+       aze     10,10           # propagate cy to new cy_limb
+       a       8,8,7           # add res_limb
+       bge     Lp0
+       cax     10,10,6         # adjust high limb for negative limb from s1
+Lp0:   bdz     Lend0
+       lu      0,4(4)          # load s1 limb and update s1_ptr
+       stu     8,4(3)          # store previous result limb, update res_ptr
+       cmpi    0,0,0           # test sign of new s1 limb
+       mul     9,0,6
+       mfmq    0               # low product limb
+       ae      8,0,10          # low limb + old_cy_limb + old cy
+       l       7,4(3)          # load res_limb
+       aze     9,9             # propagate cy to new cy_limb
+       a       8,8,7           # add res_limb
+       bge     Lp1
+       cax     9,9,6           # adjust high limb for negative limb from s1
+Lp1:   bdn     Lploop
+
+       b       Lend
+
+Lneg:  cax     9,9,0           # adjust high limb for negative s2_limb
+       bdz     Lend            # if just one limb, skip loop
+Lnloop:        lu      0,4(4)          # load s1 limb and update s1_ptr
+       stu     8,4(3)          # store previous result limb, update res_ptr
+       cmpi    0,0,0           # test sign of new s1 limb
+       mul     10,0,6
+       mfmq    7               # low product limb (r0 still holds the s1 limb)
+       ae      8,7,9           # low limb + old_cy_limb + old cy
+       l       7,4(3)          # load res_limb
+       ae      10,10,0         # propagate cy to new cy_limb, adjust for negative s2_limb
+       a       8,8,7           # add res_limb
+       bge     Ln0
+       cax     10,10,6         # adjust high limb for negative limb from s1
+Ln0:   bdz     Lend0
+       lu      0,4(4)          # load s1 limb and update s1_ptr
+       stu     8,4(3)          # store previous result limb, update res_ptr
+       cmpi    0,0,0           # test sign of new s1 limb
+       mul     9,0,6
+       mfmq    7               # low product limb (r0 still holds the s1 limb)
+       ae      8,7,10          # low limb + old_cy_limb + old cy
+       l       7,4(3)          # load res_limb
+       ae      9,9,0           # propagate cy to new cy_limb, adjust for negative s2_limb
+       a       8,8,7           # add res_limb
+       bge     Ln1
+       cax     9,9,6           # adjust high limb for negative limb from s1
+Ln1:   bdn     Lnloop
+       b       Lend
+
+Lend0: cal     9,0(10)         # move final cy_limb from r10 to r9
+Lend:  st      8,4(3)          # store ultimate result limb
+       aze     3,9             # return cy_limb + cy
+       br                      # return
diff --git a/sysdeps/rs6000/lshift.s b/sysdeps/rs6000/lshift.s

new file mode 100644 (file)

index 0000000..69c7502
--- /dev/null
+++ b/sysdeps/rs6000/lshift.s
@@ -0,0 +1,58 @@
+# IBM POWER __mpn_lshift -- Shift a limb vector left by cnt bits.
+
+# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+# License for more details.
+
+# You should have received a copy of the GNU Library General Public License
+# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr      r3
+# s_ptr                r4
+# size         r5
+# cnt          r6
+
+       .toc
+       .extern __mpn_lshift[DS]
+       .extern .__mpn_lshift
+.csect [PR]
+       .align 2
+       .globl __mpn_lshift
+       .globl .__mpn_lshift
+       .csect __mpn_lshift[DS]
+__mpn_lshift:
+       .long .__mpn_lshift, TOC[tc0], 0
+       .csect [PR]
+.__mpn_lshift:
+       sli     0,5,2           # r0 = size * 4 (byte length of the vectors)
+       cax     9,3,0           # r9 = res_ptr + byte length
+       cax     4,4,0           # s_ptr += byte length
+       sfi     8,6,32          # r8 = 32 - cnt
+       mtctr   5               # put limb count in CTR loop register
+       lu      0,-4(4)         # read most significant limb
+       sre     3,0,8           # compute carry out limb, and init MQ register
+       bdz     Lend2           # if just one limb, skip loop
+       lu      0,-4(4)         # read 2:nd most significant limb
+       sreq    7,0,8           # compute most significant limb of result
+       bdz     Lend            # if just two limbs, skip loop
+Loop:  lu      0,-4(4)         # load next lower limb
+       stu     7,-4(9)         # store previous result during read latency
+       sreq    7,0,8           # compute result limb
+       bdn     Loop            # loop back until CTR is zero
+Lend:  stu     7,-4(9)         # store 2:nd least significant limb
+Lend2: sle     7,0,6           # compute least significant limb
+       st      7,-4(9)         # store it
+       br                      # return
diff --git a/sysdeps/rs6000/mul_1.s b/sysdeps/rs6000/mul_1.s

new file mode 100644 (file)

index 0000000..f4fa894
--- /dev/null
+++ b/sysdeps/rs6000/mul_1.s
@@ -0,0 +1,109 @@
+# IBM POWER __mpn_mul_1 -- Multiply a limb vector with a limb and store
+# the result in a second limb vector.
+
+# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+# License for more details.
+
+# You should have received a copy of the GNU Library General Public License
+# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr      r3
+# s1_ptr       r4
+# size         r5
+# s2_limb      r6
+
+# The RS/6000 has no unsigned 32x32->64 bit multiplication instruction.  To
+# obtain that operation, we have to use the 32x32->64 signed multiplication
+# instruction, and add the appropriate compensation to the high limb of the
+# result.  We add the multiplicand if the multiplier has its most significant
+# bit set, and we add the multiplier if the multiplicand has its most
+# significant bit set.  We need to preserve the carry flag between each
+# iteration, so we have to compute the compensation carefully (the natural,
+# srai+and doesn't work).  Since the POWER architecture has a branch unit
+# we can branch in zero cycles, so that's how we perform the additions.
+
+       .toc
+       .csect .__mpn_mul_1[PR]
+       .align 2
+       .globl __mpn_mul_1
+       .globl .__mpn_mul_1
+       .csect __mpn_mul_1[DS]
+__mpn_mul_1:
+       .long .__mpn_mul_1[PR], TOC[tc0], 0
+       .csect .__mpn_mul_1[PR]
+.__mpn_mul_1:
+
+       cal     3,-4(3)         # offset res_ptr, it's updated before used
+       l       0,0(4)          # load least significant s1 limb
+       cmpi    0,6,0           # test sign of s2_limb for the blt below
+       mtctr   5               # copy size into CTR
+       mul     9,0,6           # signed multiply; low limb is fetched from MQ
+       srai    7,0,31          # r7 = all ones if s1 limb is negative, else 0
+       and     7,7,6           # r7 = s2_limb if s1 limb is negative, else 0
+       mfmq    8               # low product limb
+       ai      0,0,0           # reset carry
+       cax     9,9,7           # adjust high limb for negative limb from s1
+       blt     Lneg            # s2_limb is negative
+Lpos:  bdz     Lend            # if just one limb, skip loop
+Lploop:        lu      0,4(4)          # load s1 limb and update s1_ptr
+       stu     8,4(3)          # store previous result limb, update res_ptr
+       cmpi    0,0,0           # test sign of new s1 limb
+       mul     10,0,6
+       mfmq    0               # low product limb
+       ae      8,0,9           # low limb + old_cy_limb + old cy
+       bge     Lp0
+       cax     10,10,6         # adjust high limb for negative limb from s1
+Lp0:   bdz     Lend0
+       lu      0,4(4)          # load s1 limb and update s1_ptr
+       stu     8,4(3)          # store previous result limb, update res_ptr
+       cmpi    0,0,0           # test sign of new s1 limb
+       mul     9,0,6
+       mfmq    0               # low product limb
+       ae      8,0,10          # low limb + old_cy_limb + old cy
+       bge     Lp1
+       cax     9,9,6           # adjust high limb for negative limb from s1
+Lp1:   bdn     Lploop
+       b       Lend
+
+Lneg:  cax     9,9,0           # adjust high limb for negative s2_limb
+       bdz     Lend            # if just one limb, skip loop
+Lnloop:        lu      0,4(4)          # load s1 limb and update s1_ptr
+       stu     8,4(3)          # store previous result limb, update res_ptr
+       cmpi    0,0,0           # test sign of new s1 limb
+       mul     10,0,6
+       cax     10,10,0         # adjust high limb for negative s2_limb
+       mfmq    0               # low product limb
+       ae      8,0,9           # low limb + old_cy_limb + old cy
+       bge     Ln0
+       cax     10,10,6         # adjust high limb for negative limb from s1
+Ln0:   bdz     Lend0
+       lu      0,4(4)          # load s1 limb and update s1_ptr
+       stu     8,4(3)          # store previous result limb, update res_ptr
+       cmpi    0,0,0           # test sign of new s1 limb
+       mul     9,0,6
+       cax     9,9,0           # adjust high limb for negative s2_limb
+       mfmq    0               # low product limb
+       ae      8,0,10          # low limb + old_cy_limb + old cy
+       bge     Ln1
+       cax     9,9,6           # adjust high limb for negative limb from s1
+Ln1:   bdn     Lnloop
+       b       Lend
+
+Lend0: cal     9,0(10)         # move final cy_limb from r10 to r9
+Lend:  st      8,4(3)          # store ultimate result limb
+       aze     3,9             # return cy_limb + cy
+       br                      # return
diff --git a/sysdeps/rs6000/rshift.s b/sysdeps/rs6000/rshift.s

new file mode 100644 (file)

index 0000000..6056acc
--- /dev/null
+++ b/sysdeps/rs6000/rshift.s
@@ -0,0 +1,56 @@
+# IBM POWER __mpn_rshift -- Shift a limb vector right by cnt bits.
+
+# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+# License for more details.
+
+# You should have received a copy of the GNU Library General Public License
+# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr      r3
+# s_ptr                r4
+# size         r5
+# cnt          r6
+
+       .toc
+       .extern __mpn_rshift[DS]
+       .extern .__mpn_rshift
+.csect [PR]
+       .align 2
+       .globl __mpn_rshift
+       .globl .__mpn_rshift
+       .csect __mpn_rshift[DS]
+__mpn_rshift:
+       .long .__mpn_rshift, TOC[tc0], 0
+       .csect [PR]
+.__mpn_rshift:
+       sfi     8,6,32          # r8 = 32 - cnt
+       mtctr   5               # put limb count in CTR loop register
+       l       0,0(4)          # read least significant limb
+       ai      9,3,-4          # adjust res_ptr since it's offset in the stu:s
+       sle     3,0,8           # compute carry limb, and init MQ register
+       bdz     Lend2           # if just one limb, skip loop
+       lu      0,4(4)          # read 2:nd least significant limb
+       sleq    7,0,8           # compute least significant limb of result
+       bdz     Lend            # if just two limbs, skip loop
+Loop:  lu      0,4(4)          # load next higher limb
+       stu     7,4(9)          # store previous result during read latency
+       sleq    7,0,8           # compute result limb
+       bdn     Loop            # loop back until CTR is zero
+Lend:  stu     7,4(9)          # store 2:nd most significant limb
+Lend2: sre     7,0,6           # compute most significant limb
+       st      7,4(9)          # store it
+       br                      # return
diff --git a/sysdeps/rs6000/sub_n.s b/sysdeps/rs6000/sub_n.s

new file mode 100644 (file)

index 0000000..402fdce
--- /dev/null
+++ b/sysdeps/rs6000/sub_n.s
@@ -0,0 +1,55 @@
+# IBM POWER __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+# store difference in a third limb vector.
+
+# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+# License for more details.
+
+# You should have received a copy of the GNU Library General Public License
+# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr      r3
+# s1_ptr       r4
+# s2_ptr       r5
+# size         r6
+
+       .toc
+       .extern __mpn_sub_n[DS]
+       .extern .__mpn_sub_n
+.csect [PR]
+       .align 2
+       .globl __mpn_sub_n
+       .globl .__mpn_sub_n
+       .csect __mpn_sub_n[DS]
+__mpn_sub_n:
+       .long .__mpn_sub_n, TOC[tc0], 0
+       .csect [PR]
+.__mpn_sub_n:
+       mtctr   6               # copy size into CTR
+       l       8,0(4)          # load least significant s1 limb
+       l       0,0(5)          # load least significant s2 limb
+       cal     3,-4(3)         # offset res_ptr, it's updated before used
+       sf      7,0,8           # subtract least significant limbs (s1 - s2), set cy
+       bdz     Lend            # If done, skip loop
+Loop:  lu      8,4(4)          # load s1 limb and update s1_ptr
+       lu      0,4(5)          # load s2 limb and update s2_ptr
+       stu     7,4(3)          # store previous limb in load latency slot
+       sfe     7,0,8           # subtract new limbs with cy, set cy
+       bdn     Loop            # decrement CTR and loop back
+Lend:  st      7,4(3)          # store ultimate result limb
+       sfe     3,0,0           # load !cy into ...
+       sfi     3,3,0           # ... return value register
+       br                      # return
diff --git a/sysdeps/rs6000/submul_1.s b/sysdeps/rs6000/submul_1.s

new file mode 100644 (file)

index 0000000..2526332
--- /dev/null
+++ b/sysdeps/rs6000/submul_1.s
@@ -0,0 +1,127 @@
+# IBM POWER __mpn_submul_1 -- Multiply a limb vector with a limb and subtract
+# the result from a second limb vector.
+
+# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+# License for more details.
+
+# You should have received a copy of the GNU Library General Public License
+# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr      r3
+# s1_ptr       r4
+# size         r5
+# s2_limb      r6
+
+# The RS/6000 has no unsigned 32x32->64 bit multiplication instruction.  To
+# obtain that operation, we have to use the 32x32->64 signed multiplication
+# instruction, and add the appropriate compensation to the high limb of the
+# result.  We add the multiplicand if the multiplier has its most significant
+# bit set, and we add the multiplier if the multiplicand has its most
+# significant bit set.  We need to preserve the carry flag between each
+# iteration, so we have to compute the compensation carefully (the natural,
+# srai+and doesn't work).  Since the POWER architecture has a branch unit
+# we can branch in zero cycles, so that's how we perform the additions.
+
+       .toc
+       .csect .__mpn_submul_1[PR]
+       .align 2
+       .globl __mpn_submul_1
+       .globl .__mpn_submul_1
+       .csect __mpn_submul_1[DS]
+__mpn_submul_1:
+       .long .__mpn_submul_1[PR], TOC[tc0], 0
+       .csect .__mpn_submul_1[PR]
+.__mpn_submul_1:
+
+       cal     3,-4(3)         # offset res_ptr, it's updated before used
+       l       0,0(4)          # load least significant s1 limb
+       cmpi    0,6,0           # test sign of s2_limb for the blt below
+       mtctr   5               # copy size into CTR
+       mul     9,0,6           # signed multiply; low limb is fetched from MQ
+       srai    7,0,31          # r7 = all ones if s1 limb is negative, else 0
+       and     7,7,6           # r7 = s2_limb if s1 limb is negative, else 0
+       mfmq    11              # low product limb
+       cax     9,9,7           # adjust high limb for negative limb from s1
+       l       7,4(3)          # load res_limb
+       sf      8,11,7          # subtract low limb from res_limb
+       a       11,8,11         # invert cy (r11 is junk)
+       blt     Lneg            # s2_limb is negative
+Lpos:  bdz     Lend            # if just one limb, skip loop
+
+Lploop:        lu      0,4(4)          # load s1 limb and update s1_ptr
+       stu     8,4(3)          # store previous result limb, update res_ptr
+       cmpi    0,0,0           # test sign of new s1 limb
+       mul     10,0,6
+       mfmq    0               # low product limb
+       ae      11,0,9          # low limb + old_cy_limb + old cy
+       l       7,4(3)          # load res_limb
+       aze     10,10           # propagate cy to new cy_limb
+       sf      8,11,7          # subtract from res_limb
+       a       11,8,11         # invert cy (r11 is junk)
+       bge     Lp0
+       cax     10,10,6         # adjust high limb for negative limb from s1
+Lp0:   bdz     Lend0
+       lu      0,4(4)          # load s1 limb and update s1_ptr
+       stu     8,4(3)          # store previous result limb, update res_ptr
+       cmpi    0,0,0           # test sign of new s1 limb
+       mul     9,0,6
+       mfmq    0               # low product limb
+       ae      11,0,10         # low limb + old_cy_limb + old cy
+       l       7,4(3)          # load res_limb
+       aze     9,9             # propagate cy to new cy_limb
+       sf      8,11,7          # subtract from res_limb
+       a       11,8,11         # invert cy (r11 is junk)
+       bge     Lp1
+       cax     9,9,6           # adjust high limb for negative limb from s1
+Lp1:   bdn     Lploop
+
+       b       Lend
+
+Lneg:  cax     9,9,0           # adjust high limb for negative s2_limb
+       bdz     Lend            # if just one limb, skip loop
+Lnloop:        lu      0,4(4)          # load s1 limb and update s1_ptr
+       stu     8,4(3)          # store previous result limb, update res_ptr
+       cmpi    0,0,0           # test sign of new s1 limb
+       mul     10,0,6
+       mfmq    7               # low product limb (r0 still holds the s1 limb)
+       ae      11,7,9          # low limb + old_cy_limb + old cy
+       l       7,4(3)          # load res_limb
+       ae      10,10,0         # propagate cy to new cy_limb, adjust for negative s2_limb
+       sf      8,11,7          # subtract from res_limb
+       a       11,8,11         # invert cy (r11 is junk)
+       bge     Ln0
+       cax     10,10,6         # adjust high limb for negative limb from s1
+Ln0:   bdz     Lend0
+       lu      0,4(4)          # load s1 limb and update s1_ptr
+       stu     8,4(3)          # store previous result limb, update res_ptr
+       cmpi    0,0,0           # test sign of new s1 limb
+       mul     9,0,6
+       mfmq    7               # low product limb (r0 still holds the s1 limb)
+       ae      11,7,10         # low limb + old_cy_limb + old cy
+       l       7,4(3)          # load res_limb
+       ae      9,9,0           # propagate cy to new cy_limb, adjust for negative s2_limb
+       sf      8,11,7          # subtract from res_limb
+       a       11,8,11         # invert cy (r11 is junk)
+       bge     Ln1
+       cax     9,9,6           # adjust high limb for negative limb from s1
+Ln1:   bdn     Lnloop
+       b       Lend
+
+Lend0: cal     9,0(10)         # move final cy_limb from r10 to r9
+Lend:  st      8,4(3)          # store ultimate result limb
+       aze     3,9             # return cy_limb + cy
+       br                      # return
diff --git a/sysdeps/sparc/add_n.S b/sysdeps/sparc/add_n.S

index 3be3e39b862d180c34c03e5f12617e054c3fa016..13704d32d2347ff4537214f9f3a13b18c2c4ba74 100644 (file)
--- a/sysdeps/sparc/add_n.S
+++ b/sysdeps/sparc/add_n.S
@@ -1,7 +1,7 @@
  ! sparc __mpn_add_n -- Add two limb vectors of the same length > 0 and store
  ! sum in a third limb vector.
  
-! Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+! Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
  
  ! This file is part of the GNU MP Library.
  
@@ -39,20 +39,25 @@ C_SYMBOL_NAME(__mpn_add_n):
         sub     %g0,%o3,%o3
         andcc   %o3,(16-1),%o3
         be      Lzero
-        nop
+        mov    %o4,%g2                 ! put first s1_limb in g2 too
  
         sll     %o3,2,%o3               ! multiply by 4
         sub     %o0,%o3,%o0             ! adjust res_ptr
         sub     %o1,%o3,%o1             ! adjust s1_ptr
         sub     %o2,%o3,%o2             ! adjust s2_ptr
  
-       mov     %o4,%g2
-
+#if PIC
+       mov     %o7,%g4                 ! Save return address register
+0:     call    1f                      ! %o7 = address of this call (0b)
+       add     %o7,Lbase-0b,%g3        ! g3 = Lbase, same as the non-PIC path
+1:     mov     %g4,%o7                 ! Restore return address register
+#else
         sethi   %hi(Lbase),%g3
         or      %g3,%lo(Lbase),%g3
+#endif
         sll     %o3,2,%o3               ! multiply by 4
         jmp     %g3+%o3
-        mov    %o5,%g3
+        mov    %o5,%g3                 ! put first s2_limb in g3 too
  
  Loop:  addxcc  %g2,%g3,%o3
         add     %o1,64,%o1
diff --git a/sysdeps/sparc/sparc8/addmul_1.S b/sysdeps/sparc/sparc8/addmul_1.S

index fbaacfda4ff5f4a084b1221002df6ba4f4cffcd6..d1de0c36495fc09ff2503b728b3e3652c04c9577 100644 (file)
--- a/sysdeps/sparc/sparc8/addmul_1.S
+++ b/sysdeps/sparc/sparc8/addmul_1.S
@@ -37,8 +37,15 @@ C_SYMBOL_NAME(__mpn_addmul_1):
  
         sll     %o2,4,%g1
         and     %g1,(4-1)<<4,%g1
+#if PIC
+       mov     %o7,%g4                 ! Save return address register
+0:     call    1f                      ! %o7 = address of this call (0b)
+       add     %o7,LL-0b,%g3           ! g3 = LL, same as the non-PIC path
+1:     mov     %g4,%o7                 ! Restore return address register
+#else
         sethi   %hi(LL),%g3
         or      %g3,%lo(LL),%g3
+#endif
         jmp     %g3+%g1
         nop
  LL:
diff --git a/sysdeps/sparc/sparc8/mul_1.S b/sysdeps/sparc/sparc8/mul_1.S

index 9c21768eb1fb1e97d847f9fcd4f90cb5244fb838..42717be33b3c57dce1bcc4c5959ff36821a6ed38 100644 (file)
--- a/sysdeps/sparc/sparc8/mul_1.S
+++ b/sysdeps/sparc/sparc8/mul_1.S
@@ -34,8 +34,15 @@
  C_SYMBOL_NAME(__mpn_mul_1):
         sll     %o2,4,%g1
         and     %g1,(4-1)<<4,%g1
+#if PIC
+       mov     %o7,%g4                 ! Save return address register
+0:     call    1f                      ! %o7 = address of this call (0b)
+       add     %o7,LL-0b,%g3           ! g3 = LL, same as the non-PIC path
+1:     mov     %g4,%o7                 ! Restore return address register
+#else
         sethi   %hi(LL),%g3
         or      %g3,%lo(LL),%g3
+#endif
         jmp     %g3+%g1
         ld      [%o1+0],%o4     ! 1
  LL:
diff --git a/sysdeps/sparc/sub_n.S b/sysdeps/sparc/sub_n.S

index 7a167b2ac12f4a3b19ed637b5b0a9289e12b1efc..6264344009138bc9082deaa733f32502515198be 100644 (file)
--- a/sysdeps/sparc/sub_n.S
+++ b/sysdeps/sparc/sub_n.S
@@ -1,7 +1,7 @@
  ! sparc __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
  ! store difference in a third limb vector.
  
-! Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+! Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
  
  ! This file is part of the GNU MP Library.
  
@@ -39,20 +39,25 @@ C_SYMBOL_NAME(__mpn_sub_n):
         sub     %g0,%o3,%o3
         andcc   %o3,(16-1),%o3
         be      Lzero
-        nop
+        mov    %o4,%g2                 ! put first s1_limb in g2 too
  
         sll     %o3,2,%o3               ! multiply by 4
         sub     %o0,%o3,%o0             ! adjust res_ptr
         sub     %o1,%o3,%o1             ! adjust s1_ptr
         sub     %o2,%o3,%o2             ! adjust s2_ptr
  
-       mov     %o4,%g2
-
+#if PIC
+       mov     %o7,%g4                 ! Save return address register
+0:     call    1f                      ! %o7 = address of this call (0b)
+       add     %o7,Lbase-0b,%g3        ! g3 = Lbase, same as the non-PIC path
+1:     mov     %g4,%o7                 ! Restore return address register
+#else
         sethi   %hi(Lbase),%g3
         or      %g3,%lo(Lbase),%g3
+#endif
         sll     %o3,2,%o3               ! multiply by 4
         jmp     %g3+%o3
-        mov    %o5,%g3
+        mov    %o5,%g3                 ! put first s2_limb in g3 too
  
  Loop:  subxcc  %g2,%g3,%o3
         add     %o1,64,%o1
diff --git a/sysdeps/unix/sysv/linux/Dist b/sysdeps/unix/sysv/linux/Dist

index db5ff9596afa53890fc2e02b40a7dc146110e9ff..d6124bd2ba005bfc3943f2644096b8bd02a4dff4 100644 (file)
--- a/sysdeps/unix/sysv/linux/Dist
+++ b/sysdeps/unix/sysv/linux/Dist
@@ -1,2 +1,3 @@
  sys/socketcall.h
  sys/timex.h
+nfs/nfs.h
diff --git a/sysdeps/unix/sysv/linux/Makefile b/sysdeps/unix/sysv/linux/Makefile

index 6e1dd8ccb2dc1d8f62a8ae5f1150e349f2cbc1e2..fcacc539935f3ac76120a2d6157eb7c3efbb6c2c 100644 (file)
--- a/sysdeps/unix/sysv/linux/Makefile
+++ b/sysdeps/unix/sysv/linux/Makefile
@@ -20,7 +20,11 @@ sysdep_routines := $(sysdep_routines) ipc
  endif
  
  ifeq ($(subdir), socket)
-headers += sys/socketcall.h 
+headers += sys/socketcall.h
+endif
+
+ifeq ($(subdir), sunrpc)
+headers += nfs/nfs.h
  endif
  
  config-LDFLAGS = -Wl,-dynamic-linker=/lib/ld-gnu.so.1
diff --git a/sysdeps/unix/sysv/linux/i386/sysdep.h b/sysdeps/unix/sysv/linux/i386/sysdep.h

index 7fe4d414e33d30ea56c8e419fbb677ad9936df6f..a40ca86e40fac3f4265f6ab205aa0f2d932494ad 100644 (file)
--- a/sysdeps/unix/sysv/linux/i386/sysdep.h
+++ b/sysdeps/unix/sysv/linux/i386/sysdep.h
@@ -93,43 +93,61 @@ Cambridge, MA 02139, USA.  */
     (2 * movl is less expensive than pushl + popl).
  
     Second unlike for the other registers we don't save the content of
-   %ecx and %edx when we have than 1 and 2 registers resp.  */
+   %ecx and %edx when we have more than 1 and 2 registers resp.
+
+   The code below might look a bit long but we have to take care of
+   the pipelined processors (i586 and up).  Here the `pushl' and `popl'
+   instructions are marked as NP (not pairable) but the exception is
+   two consecutive instructions of this kind.  This gives no penalty on
+   i386 and i486 processors though.  */
  
  #undef DO_CALL
  #define DO_CALL(args)                                                        \
+    PUSHARGS_##args                                                          \
      DOARGS_##args                                                            \
-    int $0x80;                                                               \
-    UNDOARGS_##args
+    int $0x80                                                                \
+    POPARGS_##args
  
+#define PUSHARGS_0     /* No arguments to push.  */
  #define        DOARGS_0        /* No arguments to frob.  */
-#define        UNDOARGS_0      /* No arguments to unfrob.  */
-#define        _DOARGS_0(n)    /* No arguments to frob.  */
-#define        _UNDOARGS_0     /* No arguments to unfrob.  */
-
-#define        DOARGS_1        movl %ebx, %edx; movl 4(%esp), %ebx; DOARGS_0
-#define        UNDOARGS_1      UNDOARGS_0; movl %edx, %ebx
-#define        _DOARGS_1(n)    pushl %ebx; movl n+4(%esp), %ebx; _DOARGS_0 (n)
-#define        _UNDOARGS_1     _UNDOARGS_0; popl %ebx
-
-#define        DOARGS_2        movl 8(%esp), %ecx; DOARGS_1
-#define        UNDOARGS_2      UNDOARGS_1
+#define        POPARGS_0       /* No arguments to pop.  */
+#define        _PUSHARGS_0     /* No arguments to push.  */
+#define _DOARGS_0(n)   /* No arguments to frob.  */
+#define        _POPARGS_0      /* No arguments to pop.  */
+
+#define PUSHARGS_1     movl %ebx, %edx; PUSHARGS_0
+#define        DOARGS_1        _DOARGS_1 (4)
+#define        POPARGS_1       POPARGS_0; movl %edx, %ebx
+#define        _PUSHARGS_1     pushl %ebx; _PUSHARGS_0
+#define _DOARGS_1(n)   movl n(%esp), %ebx; _DOARGS_0(n-4)
+#define        _POPARGS_1      _POPARGS_0; popl %ebx
+
+#define PUSHARGS_2     PUSHARGS_1
+#define        DOARGS_2        _DOARGS_2 (8)
+#define        POPARGS_2       POPARGS_1
+#define _PUSHARGS_2    _PUSHARGS_1
  #define        _DOARGS_2(n)    movl n(%esp), %ecx; _DOARGS_1 (n-4)
-#define        _UNDOARGS_2     _UNDOARGS_1
+#define        _POPARGS_2      _POPARGS_1
  
-#define DOARGS_3       _DOARGS_3 (12)
-#define UNDOARGS_3     _UNDOARGS_3
+#define PUSHARGS_3     _PUSHARGS_3
+#define DOARGS_3       _DOARGS_3 (16)
+#define POPARGS_3      _POPARGS_3
+#define _PUSHARGS_3    _PUSHARGS_2
  #define _DOARGS_3(n)   movl n(%esp), %edx; _DOARGS_2 (n-4)
-#define _UNDOARGS_3    _UNDOARGS_2
-
-#define DOARGS_4       _DOARGS_4 (16)
-#define UNDOARGS_4     _UNDOARGS_4
-#define _DOARGS_4(n)   pushl %esi; movl n+4(%esp), %esi; _DOARGS_3 (n)
-#define _UNDOARGS_4    _UNDOARGS_3; popl %esi
-
-#define DOARGS_5       _DOARGS_5 (20)
-#define UNDOARGS_5     _UNDOARGS_5
-#define _DOARGS_5(n)   pushl %edi; movl n+4(%esp), %edi; _DOARGS_4 (n)
-#define _UNDOARGS_5    _UNDOARGS_4; popl %edi
-
+#define _POPARGS_3     _POPARGS_2
+
+#define PUSHARGS_4     _PUSHARGS_4
+#define DOARGS_4       _DOARGS_4 (24)
+#define POPARGS_4      _POPARGS_4
+#define _PUSHARGS_4    pushl %esi; _PUSHARGS_3
+#define _DOARGS_4(n)   movl n(%esp), %esi; _DOARGS_3 (n-4)
+#define _POPARGS_4     _POPARGS_3; popl %esi
+
+#define PUSHARGS_5     _PUSHARGS_5
+#define DOARGS_5       _DOARGS_5 (32)
+#define POPARGS_5      _POPARGS_5
+#define _PUSHARGS_5    pushl %edi; _PUSHARGS_4
+#define _DOARGS_5(n)   movl n(%esp), %edi; _DOARGS_4 (n-4)
+#define _POPARGS_5     _POPARGS_4; popl %edi
  
  #endif /* ASSEMBLER */
diff --git a/sysdeps/unix/sysv/linux/local_lim.h b/sysdeps/unix/sysv/linux/local_lim.h

index bfc65bd6fd4f9cd0ae33e10c34f69254fec12054..a1c81d87d995356094b09320cf5b89dcccf24ca7 100644 (file)
--- a/sysdeps/unix/sysv/linux/local_lim.h
+++ b/sysdeps/unix/sysv/linux/local_lim.h
@@ -1,6 +1,6 @@
-/* Minimum guaranteed maximum values for system limits.  Hurd version.
+/* Minimum guaranteed maximum values for system limits.  Linux version.
  
-Copyright (C) 1993, 1994 Free Software Foundation, Inc.
+Copyright (C) 1993, 1994, 1995 Free Software Foundation, Inc.
  This file is part of the GNU C Library.
  
  The GNU C Library is free software; you can redistribute it and/or
@@ -18,14 +18,5 @@ License along with the GNU C Library; see the file COPYING.LIB.  If
  not, write to the Free Software Foundation, Inc., 675 Mass Ave,
  Cambridge, MA 02139, USA.  */
  
-/* Linux has a fixed limit of supplementary groups allocated with a
-   process.  This value is determined by the size of the `groups'
-   member of the `task_struct' structure in <linux/sched.h>.  */
-   
-#define NGROUPS_MAX    32
-
-
-/* Maximum size of file names.  Not all file system types support
-   this size but it is only a maximum value.  */
-
-#define NAME_MAX       255
+/* The kernel sources contain a file with all the needed information.  */
+#include <linux/limits.h>
diff --git a/sysdeps/unix/sysv/linux/nfs/nfs.h b/sysdeps/unix/sysv/linux/nfs/nfs.h

new file mode 100644 (file)

index 0000000..61e4b65
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/nfs/nfs.h
@@ -0,0 +1 @@
+#include <linux/nfs.h>
diff --git a/sysdeps/unix/sysv/linux/sys/param.h b/sysdeps/unix/sysv/linux/sys/param.h

index 652605e92a1e205746f8f575b0e77fd26dc1e168..a2d4984166f3bde59820840f054ee158af5e844e 100644 (file)
--- a/sysdeps/unix/sysv/linux/sys/param.h
+++ b/sysdeps/unix/sysv/linux/sys/param.h
@@ -1,3 +1,21 @@
+/* Copyright (C) 1995 Free Software Foundation, Inc.
+This file is part of the GNU C Library.
+
+The GNU C Library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+The GNU C Library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with the GNU C Library; see the file COPYING.LIB.  If
+not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
+
  #ifndef _SYS_PARAM_H
  #define _SYS_PARAM_H
  
@@ -7,26 +25,21 @@
  
  #include <sys/types.h>
  
-/* Don't change it. H.J. */
-#ifdef OLD_LINUX
-#undef MAXHOSTNAMELEN
-#define MAXHOSTNAMELEN         8       /* max length of hostname */
-#endif
  
  #ifndef howmany
-#define howmany(x, y)  (((x)+((y)-1))/(y))
+# define howmany(x, y) (((x)+((y)-1))/(y))
  #endif
  
  #ifndef roundup
-#define roundup(x, y)  ((((x)+((y)-1))/(y))*(y))
+# define roundup(x, y) ((((x)+((y)-1))/(y))*(y))
  #endif
  
  #define MAXPATHLEN      PATH_MAX
  #define NOFILE          OPEN_MAX
  
  /*  Following the information of some of the kernel people I here assume
- *  that block size (i.e. the value of stat.st_blocks) for all filesystem
- *  is 512 bytes.  If not tell me or HJ.  -- Uli */
+    that the block size (i.e. the unit of stat.st_blocks) for all filesystems
+    is 512 bytes.  If not, tell HJ, Roland, or me.  -- drepper */
  #define DEV_BSIZE       512
  
  #endif
diff --git a/sysdeps/vax/add_n.s b/sysdeps/vax/add_n.s

new file mode 100644 (file)

index 0000000..c89b226
--- /dev/null
+++ b/sysdeps/vax/add_n.s
@@ -0,0 +1,47 @@
+# VAX __mpn_add_n -- Add two limb vectors of the same length > 0 and store
+# sum in a third limb vector.
+
+# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+# License for more details.
+
+# You should have received a copy of the GNU Library General Public License
+# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr      (ap + 4)
+# s1_ptr       (ap + 8)
+# s2_ptr       (ap + 12)
+# size         (ap + 16)
+
+.text
+       .align 1
+.globl ___mpn_add_n
+___mpn_add_n:
+       .word   0x0
+       movl    16(ap),r0
+       movl    12(ap),r1
+       movl    8(ap),r2
+       movl    4(ap),r3
+       subl2   r4,r4
+
+Loop:
+       movl    (r2)+,r4
+       adwc    (r1)+,r4
+       movl    r4,(r3)+
+       jsobgtr r0,Loop
+
+       adwc    r0,r0
+       ret
diff --git a/sysdeps/vax/addmul_1.s b/sysdeps/vax/addmul_1.s

new file mode 100644 (file)

index 0000000..8e83204
--- /dev/null
+++ b/sysdeps/vax/addmul_1.s
@@ -0,0 +1,125 @@
+# VAX __mpn_addmul_1 -- Multiply a limb vector with a limb and add
+# the result to a second limb vector.
+
+# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+# License for more details.
+
+# You should have received a copy of the GNU Library General Public License
+# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr      (ap + 4)
+# s1_ptr       (ap + 8)
+# size         (ap + 12)
+# s2_limb      (ap + 16)
+
+.text
+       .align 1
+.globl ___mpn_addmul_1
+___mpn_addmul_1:
+       .word   0xfc0
+       movl    12(ap),r4
+       movl    8(ap),r8
+       movl    4(ap),r9
+       movl    16(ap),r6
+       jlss    s2_big
+
+       clrl    r3
+       incl    r4
+       ashl    $-1,r4,r7
+       jlbc    r4,L1
+       clrl    r11
+
+# Loop for S2_LIMB < 0x80000000
+Loop1: movl    (r8)+,r1
+       jlss    L1n0
+       emul    r1,r6,$0,r2
+       addl2   r11,r2
+       adwc    $0,r3
+       addl2   r2,(r9)+
+       adwc    $0,r3
+L1:    movl    (r8)+,r1
+       jlss    L1n1
+L1p1:  emul    r1,r6,$0,r10
+       addl2   r3,r10
+       adwc    $0,r11
+       addl2   r10,(r9)+
+       adwc    $0,r11
+
+       jsobgtr r7,Loop1
+       movl    r11,r0
+       ret
+
+L1n0:  emul    r1,r6,$0,r2
+       addl2   r11,r2
+       adwc    r6,r3
+       addl2   r2,(r9)+
+       adwc    $0,r3
+       movl    (r8)+,r1
+       jgeq    L1p1
+L1n1:  emul    r1,r6,$0,r10
+       addl2   r3,r10
+       adwc    r6,r11
+       addl2   r10,(r9)+
+       adwc    $0,r11
+
+       jsobgtr r7,Loop1
+       movl    r11,r0
+       ret
+
+
+s2_big:        clrl    r3
+       incl    r4
+       ashl    $-1,r4,r7
+       jlbc    r4,L2
+       clrl    r11
+
+# Loop for S2_LIMB >= 0x80000000
+Loop2: movl    (r8)+,r1
+       jlss    L2n0
+       emul    r1,r6,$0,r2
+       addl2   r11,r2
+       adwc    r1,r3
+       addl2   r2,(r9)+
+       adwc    $0,r3
+L2:    movl    (r8)+,r1
+       jlss    L2n1
+L2p1:  emul    r1,r6,$0,r10
+       addl2   r3,r10
+       adwc    r1,r11
+       addl2   r10,(r9)+
+       adwc    $0,r11
+
+       jsobgtr r7,Loop2
+       movl    r11,r0
+       ret
+
+L2n0:  emul    r1,r6,$0,r2
+       addl2   r11,r2
+       adwc    r6,r3
+       addl2   r2,(r9)+
+       adwc    r1,r3
+       movl    (r8)+,r1
+       jgeq    L2p1
+L2n1:  emul    r1,r6,$0,r10
+       addl2   r3,r10
+       adwc    r6,r11
+       addl2   r10,(r9)+
+       adwc    r1,r11
+
+       jsobgtr r7,Loop2
+       movl    r11,r0
+       ret
diff --git a/sysdeps/vax/gmp-mparam.h b/sysdeps/vax/gmp-mparam.h

new file mode 100644 (file)

index 0000000..687f12a
--- /dev/null
+++ b/sysdeps/vax/gmp-mparam.h
@@ -0,0 +1,28 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Library General Public License as published by
+the Free Software Foundation; either version 2 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+License for more details.
+
+You should have received a copy of the GNU Library General Public License
+along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
+
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
+#define BITS_PER_LONGINT 32
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+#define IEEE_DOUBLE_BIG_ENDIAN 0
diff --git a/sysdeps/vax/mul_1.s b/sysdeps/vax/mul_1.s

new file mode 100644 (file)

index 0000000..3fe375b
--- /dev/null
+++ b/sysdeps/vax/mul_1.s
@@ -0,0 +1,122 @@
+# VAX __mpn_mul_1 -- Multiply a limb vector with a limb and store
+# the result in a second limb vector.
+
+# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+# License for more details.
+
+# You should have received a copy of the GNU Library General Public License
+# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr      (ap + 4)
+# s1_ptr       (ap + 8)
+# size         (ap + 12)
+# s2_limb      (ap + 16)
+
+.text
+       .align 1
+.globl ___mpn_mul_1
+___mpn_mul_1:
+       .word   0xfc0
+       movl    12(ap),r4
+       movl    8(ap),r8
+       movl    4(ap),r9
+       movl    16(ap),r6
+       jlss    s2_big
+
+# One might want to combine the addl2 and the store below, but that
+# is actually just slower according to my timing tests.  (VAX 3600)
+
+       clrl    r3
+       incl    r4
+       ashl    $-1,r4,r7
+       jlbc    r4,L1
+       clrl    r11
+
+# Loop for S2_LIMB < 0x80000000
+Loop1: movl    (r8)+,r1
+       jlss    L1n0
+       emul    r1,r6,$0,r2
+       addl2   r11,r2
+       adwc    $0,r3
+       movl    r2,(r9)+
+L1:    movl    (r8)+,r1
+       jlss    L1n1
+L1p1:  emul    r1,r6,$0,r10
+       addl2   r3,r10
+       adwc    $0,r11
+       movl    r10,(r9)+
+
+       jsobgtr r7,Loop1
+       movl    r11,r0
+       ret
+
+L1n0:  emul    r1,r6,$0,r2
+       addl2   r11,r2
+       adwc    r6,r3
+       movl    r2,(r9)+
+       movl    (r8)+,r1
+       jgeq    L1p1
+L1n1:  emul    r1,r6,$0,r10
+       addl2   r3,r10
+       adwc    r6,r11
+       movl    r10,(r9)+
+
+       jsobgtr r7,Loop1
+       movl    r11,r0
+       ret
+
+
+s2_big:        clrl    r3
+       incl    r4
+       ashl    $-1,r4,r7
+       jlbc    r4,L2
+       clrl    r11
+
+# Loop for S2_LIMB >= 0x80000000
+Loop2: movl    (r8)+,r1
+       jlss    L2n0
+       emul    r1,r6,$0,r2
+       addl2   r11,r2
+       adwc    r1,r3
+       movl    r2,(r9)+
+L2:    movl    (r8)+,r1
+       jlss    L2n1
+L2p1:  emul    r1,r6,$0,r10
+       addl2   r3,r10
+       adwc    r1,r11
+       movl    r10,(r9)+
+
+       jsobgtr r7,Loop2
+       movl    r11,r0
+       ret
+
+L2n0:  emul    r1,r6,$0,r2
+       addl2   r1,r3
+       addl2   r11,r2
+       adwc    r6,r3
+       movl    r2,(r9)+
+       movl    (r8)+,r1
+       jgeq    L2p1
+L2n1:  emul    r1,r6,$0,r10
+       addl2   r1,r11
+       addl2   r3,r10
+       adwc    r6,r11
+       movl    r10,(r9)+
+
+       jsobgtr r7,Loop2
+       movl    r11,r0
+       ret
diff --git a/sysdeps/vax/sub_n.s b/sysdeps/vax/sub_n.s

new file mode 100644 (file)

index 0000000..300b4de
--- /dev/null
+++ b/sysdeps/vax/sub_n.s
@@ -0,0 +1,47 @@
+# VAX __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and store
+# difference in a third limb vector.
+
+# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+# License for more details.
+
+# You should have received a copy of the GNU Library General Public License
+# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr      (ap + 4)
+# s1_ptr       (ap + 8)
+# s2_ptr       (ap + 12)
+# size         (ap + 16)
+
+.text
+       .align 1
+.globl ___mpn_sub_n
+___mpn_sub_n:
+       .word   0x0
+       movl    16(ap),r0
+       movl    12(ap),r1
+       movl    8(ap),r2
+       movl    4(ap),r3
+       subl2   r4,r4
+
+Loop:
+       movl    (r2)+,r4
+       sbwc    (r1)+,r4
+       movl    r4,(r3)+
+       jsobgtr r0,Loop
+
+       adwc    r0,r0
+       ret
diff --git a/sysdeps/vax/submul_1.s b/sysdeps/vax/submul_1.s

new file mode 100644 (file)

index 0000000..875cbfd
--- /dev/null
+++ b/sysdeps/vax/submul_1.s
@@ -0,0 +1,125 @@
+# VAX __mpn_submul_1 -- Multiply a limb vector with a limb and subtract
+# the result from a second limb vector.
+
+# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+# License for more details.
+
+# You should have received a copy of the GNU Library General Public License
+# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr      (ap + 4)
+# s1_ptr       (ap + 8)
+# size         (ap + 12)
+# s2_limb      (ap + 16)
+
+.text
+       .align 1
+.globl ___mpn_submul_1
+___mpn_submul_1:
+       .word   0xfc0
+       movl    12(ap),r4
+       movl    8(ap),r8
+       movl    4(ap),r9
+       movl    16(ap),r6
+       jlss    s2_big
+
+       clrl    r3
+       incl    r4
+       ashl    $-1,r4,r7
+       jlbc    r4,L1
+       clrl    r11
+
+# Loop for S2_LIMB < 0x80000000
+Loop1: movl    (r8)+,r1
+       jlss    L1n0
+       emul    r1,r6,$0,r2
+       addl2   r11,r2
+       adwc    $0,r3
+       subl2   r2,(r9)+
+       adwc    $0,r3
+L1:    movl    (r8)+,r1
+       jlss    L1n1
+L1p1:  emul    r1,r6,$0,r10
+       addl2   r3,r10
+       adwc    $0,r11
+       subl2   r10,(r9)+
+       adwc    $0,r11
+
+       jsobgtr r7,Loop1
+       movl    r11,r0
+       ret
+
+L1n0:  emul    r1,r6,$0,r2
+       addl2   r11,r2
+       adwc    r6,r3
+       subl2   r2,(r9)+
+       adwc    $0,r3
+       movl    (r8)+,r1
+       jgeq    L1p1
+L1n1:  emul    r1,r6,$0,r10
+       addl2   r3,r10
+       adwc    r6,r11
+       subl2   r10,(r9)+
+       adwc    $0,r11
+
+       jsobgtr r7,Loop1
+       movl    r11,r0
+       ret
+
+
+s2_big:        clrl    r3
+       incl    r4
+       ashl    $-1,r4,r7
+       jlbc    r4,L2
+       clrl    r11
+
+# Loop for S2_LIMB >= 0x80000000
+Loop2: movl    (r8)+,r1
+       jlss    L2n0
+       emul    r1,r6,$0,r2
+       addl2   r11,r2
+       adwc    r1,r3
+       subl2   r2,(r9)+
+       adwc    $0,r3
+L2:    movl    (r8)+,r1
+       jlss    L2n1
+L2p1:  emul    r1,r6,$0,r10
+       addl2   r3,r10
+       adwc    r1,r11
+       subl2   r10,(r9)+
+       adwc    $0,r11
+
+       jsobgtr r7,Loop2
+       movl    r11,r0
+       ret
+
+L2n0:  emul    r1,r6,$0,r2
+       addl2   r11,r2
+       adwc    r6,r3
+       subl2   r2,(r9)+
+       adwc    r1,r3
+       movl    (r8)+,r1
+       jgeq    L2p1
+L2n1:  emul    r1,r6,$0,r10
+       addl2   r3,r10
+       adwc    r6,r11
+       subl2   r10,(r9)+
+       adwc    r1,r11
+
+       jsobgtr r7,Loop2
+       movl    r11,r0
+       ret
diff --git a/sysdeps/z8000/add_n.s b/sysdeps/z8000/add_n.s

new file mode 100644 (file)

index 0000000..21efaf5
--- /dev/null
+++ b/sysdeps/z8000/add_n.s
@@ -0,0 +1,52 @@
+! Z8000 __mpn_add_n -- Add two limb vectors of equal, non-zero length.
+
+! Copyright (C) 1993, 1994 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Library General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+! License for more details.
+
+! You should have received a copy of the GNU Library General Public License
+! along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+! the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr      r7
+! s1_ptr       r6
+! s2_ptr       r5
+! size         r4
+
+! If we are really crazy, we can use push to write a few result words
+! backwards, using push just because it is faster than reg+disp.  We'd
+! then add 2x the number of words written to r7...
+
+       unseg
+       .text
+       even
+       global ___mpn_add_n
+___mpn_add_n:
+       pop     r0,@r6
+       pop     r1,@r5
+       add     r0,r1
+       ld      @r7,r0
+       dec     r4
+       jr      eq,Lend
+Loop:  pop     r0,@r6
+       pop     r1,@r5
+       adc     r0,r1
+       inc     r7,#2
+       ld      @r7,r0
+       dec     r4
+       jr      ne,Loop
+Lend:  ld      r2,r4           ! use 0 already in r4
+       adc     r2,r2
+       ret     t
diff --git a/sysdeps/z8000/mul_1.s b/sysdeps/z8000/mul_1.s

new file mode 100644 (file)

index 0000000..2075225
--- /dev/null
+++ b/sysdeps/z8000/mul_1.s
@@ -0,0 +1,67 @@
+! Z8000 __mpn_mul_1 -- Multiply a limb vector with a limb and store
+! the result in a second limb vector.
+
+! Copyright (C) 1993, 1994 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Library General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+! License for more details.
+
+! You should have received a copy of the GNU Library General Public License
+! along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+! the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr      r7
+! s1_ptr       r6
+! size         r5
+! s2_limb      r4
+
+       unseg
+       .text
+       even
+       global ___mpn_mul_1
+___mpn_mul_1:
+       sub     r2,r2           ! zero carry limb
+       and     r4,r4
+       jr      mi,Lneg
+
+Lpos:  pop     r1,@r6
+       ld      r9,r1
+       mult    rr8,r4
+       and     r1,r1           ! shift msb of loaded limb into cy
+       jr      mi,Lp           ! branch if loaded limb's msb is set
+       add     r8,r4           ! hi_limb += sign_comp2
+Lp:    add     r9,r2           ! lo_limb += cy_limb
+       xor     r2,r2
+       adc     r2,r8
+       ld      @r7,r9
+       inc     r7,#2
+       dec     r5
+       jr      ne,Lpos
+       ret t
+
+Lneg:  pop     r1,@r6
+       ld      r9,r1
+       mult    rr8,r4
+       add     r8,r1           ! hi_limb += sign_comp1
+       and     r1,r1
+       jr      mi,Ln
+       add     r8,r4           ! hi_limb += sign_comp2
+Ln:    add     r9,r2           ! lo_limb += cy_limb
+       xor     r2,r2
+       adc     r2,r8
+       ld      @r7,r9
+       inc     r7,#2
+       dec     r5
+       jr      ne,Lneg
+       ret t
diff --git a/sysdeps/z8000/sub_n.s b/sysdeps/z8000/sub_n.s

new file mode 100644 (file)

index 0000000..f75ef22
--- /dev/null
+++ b/sysdeps/z8000/sub_n.s
@@ -0,0 +1,53 @@
+! Z8000 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+! store difference in a third limb vector.
+
+! Copyright (C) 1993, 1994 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Library General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+! License for more details.
+
+! You should have received a copy of the GNU Library General Public License
+! along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+! the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr      r7
+! s1_ptr       r6
+! s2_ptr       r5
+! size         r4
+
+! If we are really crazy, we can use push to write a few result words
+! backwards, using push just because it is faster than reg+disp.  We'd
+! then add 2x the number of words written to r7...
+
+       unseg
+       .text
+       even
+       global ___mpn_sub_n
+___mpn_sub_n:
+       pop     r0,@r6
+       pop     r1,@r5
+       sub     r0,r1
+       ld      @r7,r0
+       dec     r4
+       jr      eq,Lend
+Loop:  pop     r0,@r6
+       pop     r1,@r5
+       sbc     r0,r1
+       inc     r7,#2
+       ld      @r7,r0
+       dec     r4
+       jr      ne,Loop
+Lend:  ld      r2,r4           ! use 0 already in r4
+       adc     r2,r2
+       ret     t
author	Roland McGrath <roland@gnu.org>
	Mon, 16 Oct 1995 01:37:51 +0000 (01:37 +0000)
committer	Roland McGrath <roland@gnu.org>
	Mon, 16 Oct 1995 01:37:51 +0000 (01:37 +0000)
.cvsignore		patch \| blob \| history
ChangeLog		patch \| blob \| history
configure.in		patch \| blob \| history
hurd/Makefile		patch \| blob \| history
hurd/hurd.h		patch \| blob \| history
hurd/hurdinit.c		patch \| blob \| history
stdio/_itoa.c		patch \| blob \| history
stdio/_itoa.h		patch \| blob \| history
stdio/test_rdwr.c		patch \| blob \| history
stdlib/gmp-impl.h		patch \| blob \| history
stdlib/gmp.h		patch \| blob \| history
stdlib/longlong.h		patch \| blob \| history
sysdeps/alpha/add_n.s	[new file with mode: 0644]	patch \| blob
sysdeps/alpha/addmul_1.s	[new file with mode: 0644]	patch \| blob
sysdeps/alpha/alphaev5/add_n.s	[new file with mode: 0644]	patch \| blob
sysdeps/alpha/alphaev5/lshift.s	[new file with mode: 0644]	patch \| blob
sysdeps/alpha/alphaev5/rshift.s	[new file with mode: 0644]	patch \| blob
sysdeps/alpha/lshift.s	[new file with mode: 0644]	patch \| blob
sysdeps/alpha/mul_1.s	[new file with mode: 0644]	patch \| blob
sysdeps/alpha/rshift.s	[new file with mode: 0644]	patch \| blob
sysdeps/alpha/sub_n.s	[new file with mode: 0644]	patch \| blob
sysdeps/alpha/submul_1.s	[new file with mode: 0644]	patch \| blob
sysdeps/alpha/udiv_qrnnd.S		patch \| blob \| history
sysdeps/generic/divmod_1.c		patch \| blob \| history
sysdeps/generic/mod_1.c		patch \| blob \| history
sysdeps/hppa/add_n.s	[new file with mode: 0644]	patch \| blob
sysdeps/hppa/hppa1.1/addmul_1.s	[new file with mode: 0644]	patch \| blob
sysdeps/hppa/hppa1.1/mul_1.s	[new file with mode: 0644]	patch \| blob
sysdeps/hppa/hppa1.1/submul_1.s	[new file with mode: 0644]	patch \| blob
sysdeps/hppa/hppa1.1/udiv_qrnnd.s	[new file with mode: 0644]	patch \| blob
sysdeps/hppa/lshift.s	[new file with mode: 0644]	patch \| blob
sysdeps/hppa/rshift.s	[new file with mode: 0644]	patch \| blob
sysdeps/hppa/sub_n.s	[new file with mode: 0644]	patch \| blob
sysdeps/hppa/udiv_qrnnd.s	[new file with mode: 0644]	patch \| blob
sysdeps/i386/add_n.S		patch \| blob \| history
sysdeps/i386/gmp-mparam.h	[new file with mode: 0644]	patch \| blob
sysdeps/i386/i486/strcat.S	[new file with mode: 0644]	patch \| blob
sysdeps/i386/i486/strlen.S	[new file with mode: 0644]	patch \| blob
sysdeps/i386/i586/Implies	[new file with mode: 0644]	patch \| blob
sysdeps/i386/i586/add_n.S	[new file with mode: 0644]	patch \| blob
sysdeps/i386/i586/addmul_1.S	[new file with mode: 0644]	patch \| blob
sysdeps/i386/i586/lshift.S	[new file with mode: 0644]	patch \| blob
sysdeps/i386/i586/memcopy.h		patch \| blob \| history
sysdeps/i386/i586/mul_1.S	[new file with mode: 0644]	patch \| blob
sysdeps/i386/i586/rshift.S	[new file with mode: 0644]	patch \| blob
sysdeps/i386/i586/strchr.S	[new file with mode: 0644]	patch \| blob
sysdeps/i386/i586/strlen.S	[new file with mode: 0644]	patch \| blob
sysdeps/i386/i586/sub_n.S	[new file with mode: 0644]	patch \| blob
sysdeps/i386/i586/submul_1.S	[new file with mode: 0644]	patch \| blob
sysdeps/i386/memchr.S	[new file with mode: 0644]	patch \| blob
sysdeps/i386/memchr.c	[deleted file]	patch \| blob \| history
sysdeps/i386/memcmp.S	[new file with mode: 0644]	patch \| blob
sysdeps/i386/stpcpy.S	[new file with mode: 0644]	patch \| blob
sysdeps/i386/stpncpy.S	[new file with mode: 0644]	patch \| blob
sysdeps/i386/strchr.S	[new file with mode: 0644]	patch \| blob
sysdeps/i386/strcspn.S	[new file with mode: 0644]	patch \| blob
sysdeps/i386/strpbrk.S	[new file with mode: 0644]	patch \| blob
sysdeps/i386/strrchr.S	[new file with mode: 0644]	patch \| blob
sysdeps/i386/strspn.S	[new file with mode: 0644]	patch \| blob
sysdeps/i386/sub_n.S		patch \| blob \| history
sysdeps/i960/add_n.s	[new file with mode: 0644]	patch \| blob
sysdeps/i960/addmul_1.s	[new file with mode: 0644]	patch \| blob
sysdeps/i960/mul_1.s	[new file with mode: 0644]	patch \| blob
sysdeps/i960/sub_n.s	[new file with mode: 0644]	patch \| blob
sysdeps/m88k/m88100/add_n.s	[new file with mode: 0644]	patch \| blob
sysdeps/m88k/m88100/mul_1.s	[new file with mode: 0644]	patch \| blob
sysdeps/m88k/m88100/sub_n.s	[new file with mode: 0644]	patch \| blob
sysdeps/m88k/m88110/mul_1.s	[new file with mode: 0644]	patch \| blob
sysdeps/mips/add_n.s	[new file with mode: 0644]	patch \| blob
sysdeps/mips/addmul_1.s	[new file with mode: 0644]	patch \| blob
sysdeps/mips/lshift.s	[new file with mode: 0644]	patch \| blob
sysdeps/mips/mips3/add_n.s	[new file with mode: 0644]	patch \| blob
sysdeps/mips/mips3/addmul_1.s	[new file with mode: 0644]	patch \| blob
sysdeps/mips/mips3/gmp-mparam.h	[new file with mode: 0644]	patch \| blob
sysdeps/mips/mips3/lshift.s	[new file with mode: 0644]	patch \| blob
sysdeps/mips/mips3/mul_1.s	[new file with mode: 0644]	patch \| blob
sysdeps/mips/mips3/rshift.s	[new file with mode: 0644]	patch \| blob
sysdeps/mips/mips3/sub_n.s	[new file with mode: 0644]	patch \| blob
sysdeps/mips/mips3/submul_1.s	[new file with mode: 0644]	patch \| blob
sysdeps/mips/mul_1.s	[new file with mode: 0644]	patch \| blob
sysdeps/mips/rshift.s	[new file with mode: 0644]	patch \| blob
sysdeps/mips/sub_n.s	[new file with mode: 0644]	patch \| blob
sysdeps/mips/submul_1.s	[new file with mode: 0644]	patch \| blob
sysdeps/rs6000/add_n.s	[new file with mode: 0644]	patch \| blob
sysdeps/rs6000/addmul_1.s	[new file with mode: 0644]	patch \| blob
sysdeps/rs6000/lshift.s	[new file with mode: 0644]	patch \| blob
sysdeps/rs6000/mul_1.s	[new file with mode: 0644]	patch \| blob
sysdeps/rs6000/rshift.s	[new file with mode: 0644]	patch \| blob
sysdeps/rs6000/sub_n.s	[new file with mode: 0644]	patch \| blob
sysdeps/rs6000/submul_1.s	[new file with mode: 0644]	patch \| blob
sysdeps/sparc/add_n.S		patch \| blob \| history
sysdeps/sparc/sparc8/addmul_1.S		patch \| blob \| history
sysdeps/sparc/sparc8/mul_1.S		patch \| blob \| history
sysdeps/sparc/sub_n.S		patch \| blob \| history
sysdeps/unix/sysv/linux/Dist		patch \| blob \| history
sysdeps/unix/sysv/linux/Makefile		patch \| blob \| history
sysdeps/unix/sysv/linux/i386/sysdep.h		patch \| blob \| history
sysdeps/unix/sysv/linux/local_lim.h		patch \| blob \| history
sysdeps/unix/sysv/linux/nfs/nfs.h	[new file with mode: 0644]	patch \| blob
sysdeps/unix/sysv/linux/sys/param.h		patch \| blob \| history
sysdeps/vax/add_n.s	[new file with mode: 0644]	patch \| blob
sysdeps/vax/addmul_1.s	[new file with mode: 0644]	patch \| blob
sysdeps/vax/gmp-mparam.h	[new file with mode: 0644]	patch \| blob
sysdeps/vax/mul_1.s	[new file with mode: 0644]	patch \| blob
sysdeps/vax/sub_n.s	[new file with mode: 0644]	patch \| blob
sysdeps/vax/submul_1.s	[new file with mode: 0644]	patch \| blob
sysdeps/z8000/add_n.s	[new file with mode: 0644]	patch \| blob
sysdeps/z8000/mul_1.s	[new file with mode: 0644]	patch \| blob
sysdeps/z8000/sub_n.s	[new file with mode: 0644]	patch \| blob