nptl/allocatestack.c

   1 /* Copyright (C) 2002-2013 Free Software Foundation, Inc.
   2    This file is part of the GNU C Library.
   3    Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <http://www.gnu.org/licenses/>.  */
  18
  19 #include <assert.h>
  20 #include <errno.h>
  21 #include <signal.h>
  22 #include <stdint.h>
  23 #include <string.h>
  24 #include <unistd.h>
  25 #include <sys/mman.h>
  26 #include <sys/param.h>
  27 #include <dl-sysdep.h>
  28 #include <dl-tls.h>
  29 #include <tls.h>
  30 #include <list.h>
  31 #include <lowlevellock.h>
  32 #include <kernel-features.h>
  33
  34
   35 #ifndef NEED_SEPARATE_REGISTER_STACK
   36
   37 /* Most architectures have exactly one stack pointer.  Some have more.
         In this variant only the stack address is tracked.  */
   38 # define STACK_VARIABLES void *stackaddr = NULL
   39
   40 /* How to pass the values to the 'create_thread' function.  */
   41 # define STACK_VARIABLES_ARGS stackaddr
   42
   43 /* How to declare a function which gets these parameters.  */
   44 # define STACK_VARIABLES_PARMS void *stackaddr
   45
   46 /* How to declare allocate_stack.  */
   47 # define ALLOCATE_STACK_PARMS void **stack
   48
   49 /* This is how the function is called.  We do it this way to allow
   50    other variants of the function to have more parameters.  The
         expansion refers to the local variable declared by
         STACK_VARIABLES.  */
   51 # define ALLOCATE_STACK(attr, pd) allocate_stack (attr, pd, &stackaddr)
   52
   53 #else
   54
   55 /* We need two stacks.  The kernel will place them but we have to tell
   56    the kernel about the size of the reserved address space.  In this
         variant both the stack address and its size are tracked.  */
   57 # define STACK_VARIABLES void *stackaddr = NULL; size_t stacksize = 0
   58
   59 /* How to pass the values to the 'create_thread' function.  */
   60 # define STACK_VARIABLES_ARGS stackaddr, stacksize
   61
   62 /* How to declare a function which gets these parameters.  */
   63 # define STACK_VARIABLES_PARMS void *stackaddr, size_t stacksize
   64
   65 /* How to declare allocate_stack.  */
   66 # define ALLOCATE_STACK_PARMS void **stack, size_t *stacksize
   67
   68 /* This is how the function is called.  We do it this way to allow
   69    other variants of the function to have more parameters.  The
         expansion refers to the local variables declared by
         STACK_VARIABLES.  */
   70 # define ALLOCATE_STACK(attr, pd) \
   71   allocate_stack (attr, pd, &stackaddr, &stacksize)
   72
   73 #endif
  74
  75
   76 /* Default alignment of stack.  Used unless the including configuration
         already defined STACK_ALIGN.  */
   77 #ifndef STACK_ALIGN
   78 # define STACK_ALIGN __alignof__ (long double)
   79 #endif
   80
   81 /* Default value for minimal stack size after allocating thread
   82    descriptor and guard.  allocate_stack rejects requests that do not
         leave at least this many bytes.  */
   83 #ifndef MINIMAL_REST_STACK
   84 # define MINIMAL_REST_STACK     4096
   85 #endif
   86
   87
   88 /* Newer kernels have the MAP_STACK flag to indicate a mapping is used for
   89    a stack.  Use it when possible.  When the headers do not provide it
         the flag is defined as 0 so that OR-ing it into the mmap flags has
         no effect.  */
   90 #ifndef MAP_STACK
   91 # define MAP_STACK 0
   92 #endif
   93
   94 /* This yields the pointer that TLS support code calls the thread pointer.
         With TLS_TCB_AT_TP this is the thread descriptor itself; with
         TLS_DTV_AT_TP it is the descriptor address advanced by
         TLS_PRE_TCB_SIZE bytes.  */
   95 #if TLS_TCB_AT_TP
   96 # define TLS_TPADJ(pd) (pd)
   97 #elif TLS_DTV_AT_TP
   98 # define TLS_TPADJ(pd) ((struct pthread *)((char *) (pd) + TLS_PRE_TCB_SIZE))
   99 #endif
 100
  101 /* Cache handling for not-yet free stacks.  */
  102
  103 /* Maximum size of the cache in bytes.  */
  104 static size_t stack_cache_maxsize = 40 * 1024 * 1024; /* 40 MiB by default.  */
      /* Number of bytes currently accounted to the cache: the sum of the
         stackblock_size values added by queue_stack minus those removed
         again by get_cached_stack and __free_stacks.  */
  105 static size_t stack_cache_actsize;
  106
  107 /* Lock taken around modifications of the stack lists and of
         stack_cache_actsize.  */
  108 static int stack_cache_lock = LLL_LOCK_INITIALIZER;
  109
  110 /* List of queued stack frames.  */
  111 static LIST_HEAD (stack_cache);
  112
  113 /* List of the stacks in use.  */
  114 static LIST_HEAD (stack_used);
  115
  116 /* We need to record what list operations we are going to do so that,
  117    in case of an asynchronous interruption due to a fork() call, we
  118    can correct for the work.  The value is the address of the list
         element being worked on; the lowest bit is set while the element
         is being added and clear while it is being deleted.  Zero means
         that no list operation is in progress.  */
  119 static uintptr_t in_flight_stack;
  120
  121 /* List of the threads with user provided stacks in use.  No need to
  122    initialize this, since it's done in __pthread_initialize_minimal.  */
  123 list_t __stack_user __attribute__ ((nocommon));
  124 hidden_data_def (__stack_user)
  125
  126 #if COLORING_INCREMENT != 0
  127 /* Number of threads created.  Only used to derive the coloring offset
         of newly mapped stacks.  */
  128 static unsigned int nptl_ncreated;
  129 #endif
  130
  131
  132 /* Check whether the stack is still used or not.  A descriptor counts
         as free once its 'tid' field is zero or negative.  */
  133 #define FREE_P(descr) ((descr)->tid <= 0)
 134
 135
  136 static void
  137 stack_list_del (list_t *elem)
  138 {
        /* Unlink ELEM from the list it is on while publishing the pending
           operation in in_flight_stack.  The low bit of the recorded value
           is left clear, which is how __reclaim_stacks tells a delete from
           an add.  */

        /* Announce the operation before touching the list.  */
  139   in_flight_stack = (uintptr_t) elem;
  140
        /* The announcement must be visible before the list changes.  */
  141   atomic_write_barrier ();
  142
  143   list_del (elem);
  144
        /* The list changes must be visible before the announcement is
           withdrawn.  */
  145   atomic_write_barrier ();
  146
  147   in_flight_stack = 0;
  148 }
 149
 150
  151 static void
  152 stack_list_add (list_t *elem, list_t *list)
  153 {
        /* Insert ELEM into LIST via list_add while publishing the pending
           operation in in_flight_stack.  The low bit of the recorded value
           is set, which is how __reclaim_stacks tells an add from a
           delete.  */

        /* Announce the operation before touching the list.  */
  154   in_flight_stack = (uintptr_t) elem | 1;
  155
        /* The announcement must be visible before the list changes.  */
  156   atomic_write_barrier ();
  157
  158   list_add (elem, list);
  159
        /* The list changes must be visible before the announcement is
           withdrawn.  */
  160   atomic_write_barrier ();
  161
  162   in_flight_stack = 0;
  163 }
 164
 165
 166 /* We create a double linked list of all cache entries.  Double linked
 167    because this allows removing entries from the end.  */
 168
 169
  170 /* Get a stack frame from the cache.  We have to match by size since
  171    some blocks might be too small or far too large.
         On entry *SIZEP holds the requested size.  On success the thread
         descriptor of the chosen block is returned, *SIZEP and *MEMP are
         set to the size and start address of that block, and the block has
         been moved from the cache list to the list of stacks in use.  NULL
         is returned, with *SIZEP and *MEMP untouched, if no free block is
         large enough or if the best fit is more than four times the
         requested size.  */
  172 static struct pthread *
  173 get_cached_stack (size_t *sizep, void **memp)
  174 {
  175   size_t size = *sizep;
  176   struct pthread *result = NULL;
  177   list_t *entry;
  178
  179   lll_lock (stack_cache_lock, LLL_PRIVATE);
  180
  181   /* Search the cache for a matching entry.  We search for the
  182      smallest stack which has at least the required size.  Note that
  183      in normal situations the size of all allocated stacks is the
  184      same.  At the very least there are only a few different sizes.
  185      Therefore this loop will exit early most of the time with an
  186      exact match.  */
  187   list_for_each (entry, &stack_cache)
  188     {
  189       struct pthread *curr;
  190
  191       curr = list_entry (entry, struct pthread, list);
            /* Blocks whose 'tid' is still positive are skipped.  */
  192       if (FREE_P (curr) && curr->stackblock_size >= size)
  193         {
  194           if (curr->stackblock_size == size)
  195             {
  196               result = curr;
  197               break;
  198             }
  199
                /* Remember the smallest block seen so far that is big
                   enough.  */
  200           if (result == NULL
  201               || result->stackblock_size > curr->stackblock_size)
  202             result = curr;
  203         }
  204     }
  205
  206   if (__builtin_expect (result == NULL, 0)
  207       /* Make sure the size difference is not too excessive.  In that
  208          case we do not use the block.  */
  209       || __builtin_expect (result->stackblock_size > 4 * size, 0))
  210     {
  211       /* Release the lock.  */
  212       lll_unlock (stack_cache_lock, LLL_PRIVATE);
  213
  214       return NULL;
  215     }
  216
  217   /* Don't allow setxid until cloned.  */
  218   result->setxid_futex = -1;
  219
  220   /* Dequeue the entry.  */
  221   stack_list_del (&result->list);
  222
  223   /* And add to the list of stacks in use.  */
  224   stack_list_add (&result->list, &stack_used);
  225
  226   /* And decrease the cache size.  */
  227   stack_cache_actsize -= result->stackblock_size;
  228
  229   /* Release the lock early.  The remaining work only touches the
           descriptor that was just taken out of the cache.  */
  230   lll_unlock (stack_cache_lock, LLL_PRIVATE);
  231
  232   /* Report size and location of the stack to the caller.  */
  233   *sizep = result->stackblock_size;
  234   *memp = result->stackblock;
  235
  236   /* Cancellation handling is back to the default.  */
  237   result->cancelhandling = 0;
  238   result->cleanup = NULL;
  239
  240   /* No pending event.  */
  241   result->nextevent = NULL;
  242
  243   /* Clear the DTV.  */
  244   dtv_t *dtv = GET_DTV (TLS_TPADJ (result));
  245   _dl_clear_dtv (dtv);
  246
  247   /* Re-initialize the TLS.  */
  248   _dl_allocate_tls_init (TLS_TPADJ (result));
  249
  250   return result;
  251 }
 252
 253
  254 /* Free stacks until cache size is lower than LIMIT.
         LIMIT is compared against stack_cache_actsize, i.e. it is a byte
         count.  Only blocks for which FREE_P holds are released; others
         stay queued.
         NOTE(review): the limit is only tested after a block has been
         released, so one free block is unmapped even if the cache was
         already within LIMIT on entry.
         NOTE(review): no lock is taken here; queue_stack, the caller in
         this file, is documented to run with the cache lock held.  */
  255 void
  256 __free_stacks (size_t limit)
  257 {
  258   /* We reduce the size of the cache.  Remove the last entries until
  259      the size is below the limit.  */
  260   list_t *entry;
  261   list_t *prev;
  262
  263   /* Search from the end of the list.  The "safe" iterator is needed
           because the memory holding ENTRY is unmapped inside the loop.  */
  264   list_for_each_prev_safe (entry, prev, &stack_cache)
  265     {
  266       struct pthread *curr;
  267
  268       curr = list_entry (entry, struct pthread, list);
  269       if (FREE_P (curr))
  270         {
  271           /* Unlink the block.  */
  272           stack_list_del (entry);
  273
  274           /* Account for the freed memory.  */
  275           stack_cache_actsize -= curr->stackblock_size;
  276
  277           /* Free the memory associated with the ELF TLS.  */
  278           _dl_deallocate_tls (TLS_TPADJ (curr), false);
  279
  280           /* Remove this block.  This should never fail.  If it does
  281              something is really wrong.  */
  282           if (munmap (curr->stackblock, curr->stackblock_size) != 0)
  283             abort ();
  284
  285           /* Maybe we have freed enough.  */
  286           if (stack_cache_actsize <= limit)
  287             break;
  288         }
  289     }
  290 }
 291
 292
  293 /* Add a stack frame which is not used anymore to the stack cache.
  294    Must be called with the cache lock held.  If the cache grows beyond
         stack_cache_maxsize as a result, __free_stacks is asked to shrink
         it back to that size.  */
  295 static inline void
  296 __attribute ((always_inline))
  297 queue_stack (struct pthread *stack)
  298 {
  299   /* We unconditionally add the stack to the list.  The memory may
  300      still be in use but it will not be reused until the kernel marks
  301      the stack as not used anymore.  */
  302   stack_list_add (&stack->list, &stack_cache);
  303
        /* Account for the block and trim the cache if it is now too big.  */
  304   stack_cache_actsize += stack->stackblock_size;
  305   if (__builtin_expect (stack_cache_actsize > stack_cache_maxsize, 0))
  306     __free_stacks (stack_cache_maxsize);
  307 }
 308
 309
  310 static int
  311 internal_function
  312 change_stack_perm (struct pthread *pd
  313 #ifdef NEED_SEPARATE_REGISTER_STACK
  314                    , size_t pagemask
  315 #endif
  316                    )
  317 {
        /* Give the usable part of PD's stack block read, write and execute
           permission.  Returns 0 on success, otherwise the errno value left
           by the failing mprotect call.  PAGEMASK only exists for
           configurations with a separate register stack; the callers in
           this file pass the complement of (page size - 1).  */
  318 #ifdef NEED_SEPARATE_REGISTER_STACK
        /* Start at the page-aligned offset derived from half of the
           non-guard size plus the guard size; cover up to the block end.  */
  319   void *stack = (pd->stackblock
  320                  + (((((pd->stackblock_size - pd->guardsize) / 2)
  321                       & pagemask) + pd->guardsize) & pagemask));
  322   size_t len = pd->stackblock + pd->stackblock_size - stack;
  323 #elif _STACK_GROWS_DOWN
        /* Skip GUARDSIZE bytes at the start of the block; cover the rest.  */
  324   void *stack = pd->stackblock + pd->guardsize;
  325   size_t len = pd->stackblock_size - pd->guardsize;
  326 #elif _STACK_GROWS_UP
        /* Cover from the block start up to GUARDSIZE bytes below PD.  */
  327   void *stack = pd->stackblock;
  328   size_t len = (uintptr_t) pd - pd->guardsize - (uintptr_t) pd->stackblock;
  329 #else
  330 # error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
  331 #endif
  332   if (mprotect (stack, len, PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
  333     return errno;
  334
  335   return 0;
  336 }
 337
 338
 339 /* Returns a usable stack for a new thread either by allocating a
 340    new stack or reusing a cached stack of sufficient size.
 341    ATTR must be non-NULL and point to a valid pthread_attr.
 342    PDP must be non-NULL.  */
 343 static int
 344 allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
 345                 ALLOCATE_STACK_PARMS)
 346 {
 347   struct pthread *pd;
 348   size_t size;
 349   size_t pagesize_m1 = __getpagesize () - 1;
 350   void *stacktop;
 351
 352   assert (powerof2 (pagesize_m1 + 1));
 353   assert (TCB_ALIGNMENT >= STACK_ALIGN);
 354
 355   /* Get the stack size from the attribute if it is set.  Otherwise we
 356      use the default we determined at start time.  */
 357   if (attr->stacksize != 0)
 358     size = attr->stacksize;
 359   else
 360     {
 361       lll_lock (__default_pthread_attr_lock, LLL_PRIVATE);
 362       size = __default_pthread_attr.stacksize;
 363       lll_unlock (__default_pthread_attr_lock, LLL_PRIVATE);
 364     }
 365
 366   /* Get memory for the stack.  */
 367   if (__builtin_expect (attr->flags & ATTR_FLAG_STACKADDR, 0))
 368     {
 369       uintptr_t adj;
 370
 371       /* If the user also specified the size of the stack make sure it
 372          is large enough.  */
 373       if (attr->stacksize != 0
 374           && attr->stacksize < (__static_tls_size + MINIMAL_REST_STACK))
 375         return EINVAL;
 376
 377       /* Adjust stack size for alignment of the TLS block.  */
 378 #if TLS_TCB_AT_TP
 379       adj = ((uintptr_t) attr->stackaddr - TLS_TCB_SIZE)
 380             & __static_tls_align_m1;
 381       assert (size > adj + TLS_TCB_SIZE);
 382 #elif TLS_DTV_AT_TP
 383       adj = ((uintptr_t) attr->stackaddr - __static_tls_size)
 384             & __static_tls_align_m1;
 385       assert (size > adj);
 386 #endif
 387
 388       /* The user provided some memory.  Let's hope it matches the
 389          size...  We do not allocate guard pages if the user provided
 390          the stack.  It is the user's responsibility to do this if it
 391          is wanted.  */
 392 #if TLS_TCB_AT_TP
 393       pd = (struct pthread *) ((uintptr_t) attr->stackaddr
 394                                - TLS_TCB_SIZE - adj);
 395 #elif TLS_DTV_AT_TP
 396       pd = (struct pthread *) (((uintptr_t) attr->stackaddr
 397                                 - __static_tls_size - adj)
 398                                - TLS_PRE_TCB_SIZE);
 399 #endif
 400
 401       /* The user provided stack memory needs to be cleared.  */
 402       memset (pd, '\0', sizeof (struct pthread));
 403
 404       /* The first TSD block is included in the TCB.  */
 405       pd->specific[0] = pd->specific_1stblock;
 406
 407       /* Remember the stack-related values.  */
 408       pd->stackblock = (char *) attr->stackaddr - size;
 409       pd->stackblock_size = size;
 410
 411       /* This is a user-provided stack.  It will not be queued in the
 412          stack cache nor will the memory (except the TLS memory) be freed.  */
 413       pd->user_stack = true;
 414
 415       /* This is at least the second thread.  */
 416       pd->header.multiple_threads = 1;
 417 #ifndef TLS_MULTIPLE_THREADS_IN_TCB
 418       __pthread_multiple_threads = *__libc_multiple_threads_ptr = 1;
 419 #endif
 420
 421 #ifndef __ASSUME_PRIVATE_FUTEX
 422       /* The thread must know when private futexes are supported.  */
 423       pd->header.private_futex = THREAD_GETMEM (THREAD_SELF,
 424                                                 header.private_futex);
 425 #endif
 426
 427 #ifdef NEED_DL_SYSINFO
 428       /* Copy the sysinfo value from the parent.  */
 429       THREAD_SYSINFO(pd) = THREAD_SELF_SYSINFO;
 430 #endif
 431
 432       /* The process ID is also the same as that of the caller.  */
 433       pd->pid = THREAD_GETMEM (THREAD_SELF, pid);
 434
 435       /* Don't allow setxid until cloned.  */
 436       pd->setxid_futex = -1;
 437
 438       /* Allocate the DTV for this thread.  */
 439       if (_dl_allocate_tls (TLS_TPADJ (pd)) == NULL)
 440         {
 441           /* Something went wrong.  */
 442           assert (errno == ENOMEM);
 443           return errno;
 444         }
 445
 446
 447       /* Prepare to modify global data.  */
 448       lll_lock (stack_cache_lock, LLL_PRIVATE);
 449
 450       /* And add to the list of stacks in use.  */
 451       list_add (&pd->list, &__stack_user);
 452
 453       lll_unlock (stack_cache_lock, LLL_PRIVATE);
 454     }
 455   else
 456     {
 457       /* Allocate some anonymous memory.  If possible use the cache.  */
 458       size_t guardsize;
 459       size_t reqsize;
 460       void *mem;
 461       const int prot = (PROT_READ | PROT_WRITE
 462                         | ((GL(dl_stack_flags) & PF_X) ? PROT_EXEC : 0));
 463
 464 #if COLORING_INCREMENT != 0
 465       /* Add one more page for stack coloring.  Don't do it for stacks
 466          with 16 times pagesize or larger.  This might just cause
 467          unnecessary misalignment.  */
 468       if (size <= 16 * pagesize_m1)
 469         size += pagesize_m1 + 1;
 470 #endif
 471
 472       /* Adjust the stack size for alignment.  */
 473       size &= ~__static_tls_align_m1;
 474       assert (size != 0);
 475
 476       /* Make sure the size of the stack is enough for the guard and
 477          eventually the thread descriptor.  */
 478       guardsize = (attr->guardsize + pagesize_m1) & ~pagesize_m1;
 479       if (__builtin_expect (size < ((guardsize + __static_tls_size
 480                                      + MINIMAL_REST_STACK + pagesize_m1)
 481                                     & ~pagesize_m1),
 482                             0))
 483         /* The stack is too small (or the guard too large).  */
 484         return EINVAL;
 485
 486       /* Try to get a stack from the cache.  */
 487       reqsize = size;
 488       pd = get_cached_stack (&size, &mem);
 489       if (pd == NULL)
 490         {
 491           /* To avoid aliasing effects on a larger scale than pages we
 492              adjust the allocated stack size if necessary.  This way
 493              allocations directly following each other will not have
 494              aliasing problems.  */
 495 #if MULTI_PAGE_ALIASING != 0
 496           if ((size % MULTI_PAGE_ALIASING) == 0)
 497             size += pagesize_m1 + 1;
 498 #endif
 499
 500           mem = mmap (NULL, size, prot,
 501                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
 502
 503           if (__builtin_expect (mem == MAP_FAILED, 0))
 504             return errno;
 505
 506           /* SIZE is guaranteed to be greater than zero.
 507              So we can never get a null pointer back from mmap.  */
 508           assert (mem != NULL);
 509
 510 #if COLORING_INCREMENT != 0
 511           /* Atomically increment NCREATED.  */
 512           unsigned int ncreated = atomic_increment_val (&nptl_ncreated);
 513
 514           /* We chose the offset for coloring by incrementing it for
 515              every new thread by a fixed amount.  The offset used
 516              module the page size.  Even if coloring would be better
 517              relative to higher alignment values it makes no sense to
 518              do it since the mmap() interface does not allow us to
 519              specify any alignment for the returned memory block.  */
 520           size_t coloring = (ncreated * COLORING_INCREMENT) & pagesize_m1;
 521
 522           /* Make sure the coloring offsets does not disturb the alignment
 523              of the TCB and static TLS block.  */
 524           if (__builtin_expect ((coloring & __static_tls_align_m1) != 0, 0))
 525             coloring = (((coloring + __static_tls_align_m1)
 526                          & ~(__static_tls_align_m1))
 527                         & ~pagesize_m1);
 528 #else
 529           /* Unless specified we do not make any adjustments.  */
 530 # define coloring 0
 531 #endif
 532
 533           /* Place the thread descriptor at the end of the stack.  */
 534 #if TLS_TCB_AT_TP
 535           pd = (struct pthread *) ((char *) mem + size - coloring) - 1;
 536 #elif TLS_DTV_AT_TP
 537           pd = (struct pthread *) ((((uintptr_t) mem + size - coloring
 538                                     - __static_tls_size)
 539                                     & ~__static_tls_align_m1)
 540                                    - TLS_PRE_TCB_SIZE);
 541 #endif
 542
 543           /* Remember the stack-related values.  */
 544           pd->stackblock = mem;
 545           pd->stackblock_size = size;
 546
 547           /* We allocated the first block thread-specific data array.
 548              This address will not change for the lifetime of this
 549              descriptor.  */
 550           pd->specific[0] = pd->specific_1stblock;
 551
 552           /* This is at least the second thread.  */
 553           pd->header.multiple_threads = 1;
 554 #ifndef TLS_MULTIPLE_THREADS_IN_TCB
 555           __pthread_multiple_threads = *__libc_multiple_threads_ptr = 1;
 556 #endif
 557
 558 #ifndef __ASSUME_PRIVATE_FUTEX
 559           /* The thread must know when private futexes are supported.  */
 560           pd->header.private_futex = THREAD_GETMEM (THREAD_SELF,
 561                                                     header.private_futex);
 562 #endif
 563
 564 #ifdef NEED_DL_SYSINFO
 565           /* Copy the sysinfo value from the parent.  */
 566           THREAD_SYSINFO(pd) = THREAD_SELF_SYSINFO;
 567 #endif
 568
 569           /* Don't allow setxid until cloned.  */
 570           pd->setxid_futex = -1;
 571
 572           /* The process ID is also the same as that of the caller.  */
 573           pd->pid = THREAD_GETMEM (THREAD_SELF, pid);
 574
 575           /* Allocate the DTV for this thread.  */
 576           if (_dl_allocate_tls (TLS_TPADJ (pd)) == NULL)
 577             {
 578               /* Something went wrong.  */
 579               assert (errno == ENOMEM);
 580
 581               /* Free the stack memory we just allocated.  */
 582               (void) munmap (mem, size);
 583
 584               return errno;
 585             }
 586
 587
 588           /* Prepare to modify global data.  */
 589           lll_lock (stack_cache_lock, LLL_PRIVATE);
 590
 591           /* And add to the list of stacks in use.  */
 592           stack_list_add (&pd->list, &stack_used);
 593
 594           lll_unlock (stack_cache_lock, LLL_PRIVATE);
 595
 596
 597           /* There might have been a race.  Another thread might have
 598              caused the stacks to get exec permission while this new
 599              stack was prepared.  Detect if this was possible and
 600              change the permission if necessary.  */
 601           if (__builtin_expect ((GL(dl_stack_flags) & PF_X) != 0
 602                                 && (prot & PROT_EXEC) == 0, 0))
 603             {
 604               int err = change_stack_perm (pd
 605 #ifdef NEED_SEPARATE_REGISTER_STACK
 606                                            , ~pagesize_m1
 607 #endif
 608                                            );
 609               if (err != 0)
 610                 {
 611                   /* Free the stack memory we just allocated.  */
 612                   (void) munmap (mem, size);
 613
 614                   return err;
 615                 }
 616             }
 617
 618
 619           /* Note that all of the stack and the thread descriptor is
 620              zeroed.  This means we do not have to initialize fields
 621              with initial value zero.  This is specifically true for
 622              the 'tid' field which is always set back to zero once the
 623              stack is not used anymore and for the 'guardsize' field
 624              which will be read next.  */
 625         }
 626
 627       /* Create or resize the guard area if necessary.  */
 628       if (__builtin_expect (guardsize > pd->guardsize, 0))
 629         {
 630 #ifdef NEED_SEPARATE_REGISTER_STACK
 631           char *guard = mem + (((size - guardsize) / 2) & ~pagesize_m1);
 632 #elif _STACK_GROWS_DOWN
 633           char *guard = mem;
 634 # elif _STACK_GROWS_UP
 635           char *guard = (char *) (((uintptr_t) pd - guardsize) & ~pagesize_m1);
 636 #endif
 637           if (mprotect (guard, guardsize, PROT_NONE) != 0)
 638             {
 639             mprot_error:
 640               lll_lock (stack_cache_lock, LLL_PRIVATE);
 641
 642               /* Remove the thread from the list.  */
 643               stack_list_del (&pd->list);
 644
 645               lll_unlock (stack_cache_lock, LLL_PRIVATE);
 646
 647               /* Get rid of the TLS block we allocated.  */
 648               _dl_deallocate_tls (TLS_TPADJ (pd), false);
 649
 650               /* Free the stack memory regardless of whether the size
 651                  of the cache is over the limit or not.  If this piece
 652                  of memory caused problems we better do not use it
 653                  anymore.  Uh, and we ignore possible errors.  There
 654                  is nothing we could do.  */
 655               (void) munmap (mem, size);
 656
 657               return errno;
 658             }
 659
 660           pd->guardsize = guardsize;
 661         }
 662       else if (__builtin_expect (pd->guardsize - guardsize > size - reqsize,
 663                                  0))
 664         {
 665           /* The old guard area is too large.  */
 666
 667 #ifdef NEED_SEPARATE_REGISTER_STACK
 668           char *guard = mem + (((size - guardsize) / 2) & ~pagesize_m1);
 669           char *oldguard = mem + (((size - pd->guardsize) / 2) & ~pagesize_m1);
 670
 671           if (oldguard < guard
 672               && mprotect (oldguard, guard - oldguard, prot) != 0)
 673             goto mprot_error;
 674
 675           if (mprotect (guard + guardsize,
 676                         oldguard + pd->guardsize - guard - guardsize,
 677                         prot) != 0)
 678             goto mprot_error;
 679 #elif _STACK_GROWS_DOWN
 680           if (mprotect ((char *) mem + guardsize, pd->guardsize - guardsize,
 681                         prot) != 0)
 682             goto mprot_error;
 683 #elif _STACK_GROWS_UP
 684           if (mprotect ((char *) pd - pd->guardsize,
 685                         pd->guardsize - guardsize, prot) != 0)
 686             goto mprot_error;
 687 #endif
 688
 689           pd->guardsize = guardsize;
 690         }
 691       /* The pthread_getattr_np() calls need to get passed the size
 692          requested in the attribute, regardless of how large the
 693          actually used guardsize is.  */
 694       pd->reported_guardsize = guardsize;
 695     }
 696
 697   /* Initialize the lock.  We have to do this unconditionally since the
 698      stillborn thread could be canceled while the lock is taken.  */
 699   pd->lock = LLL_LOCK_INITIALIZER;
 700
 701   /* The robust mutex lists also need to be initialized
 702      unconditionally because the cleanup for the previous stack owner
 703      might have happened in the kernel.  */
 704   pd->robust_head.futex_offset = (offsetof (pthread_mutex_t, __data.__lock)
 705                                   - offsetof (pthread_mutex_t,
 706                                               __data.__list.__next));
 707   pd->robust_head.list_op_pending = NULL;
 708 #ifdef __PTHREAD_MUTEX_HAVE_PREV
 709   pd->robust_prev = &pd->robust_head;
 710 #endif
 711   pd->robust_head.list = &pd->robust_head;
 712
 713   /* We place the thread descriptor at the end of the stack.  */
 714   *pdp = pd;
 715
 716 #if TLS_TCB_AT_TP
 717   /* The stack begins before the TCB and the static TLS block.  */
 718   stacktop = ((char *) (pd + 1) - __static_tls_size);
 719 #elif TLS_DTV_AT_TP
 720   stacktop = (char *) (pd - 1);
 721 #endif
 722
 723 #ifdef NEED_SEPARATE_REGISTER_STACK
 724   *stack = pd->stackblock;
 725   *stacksize = stacktop - *stack;
 726 #elif _STACK_GROWS_DOWN
 727   *stack = stacktop;
 728 #elif _STACK_GROWS_UP
 729   *stack = pd->stackblock;
 730   assert (*stack > 0);
 731 #endif
 732
 733   return 0;
 734 }
 735
 736
  737 void
  738 internal_function
  739 __deallocate_stack (struct pthread *pd)
  740 {
        /* Release the stack described by PD.  A stack allocated by this
           file is put into the stack cache; for a user-provided stack only
           the TLS memory is freed.  Everything happens under
           stack_cache_lock.  */
  741   lll_lock (stack_cache_lock, LLL_PRIVATE);
  742
  743   /* Remove the thread from the list it is currently on.  allocate_stack
  744      puts it either on the list of used stacks or, for user defined
           stacks, on __stack_user.  */
  745   stack_list_del (&pd->list);
  746
  747   /* Not much to do.  Just queue the mmap()ed memory for reuse.  Note
  748      that we do not reset the 'used' flag in the 'tid' field.  This is
  749      done by the kernel.  If no thread has been created yet this field
  750      is still zero.  */
  751   if (__builtin_expect (! pd->user_stack, 1))
  752     (void) queue_stack (pd);
  753   else
  754     /* Free the memory associated with the ELF TLS.  */
  755     _dl_deallocate_tls (TLS_TPADJ (pd), false);
  756
  757   lll_unlock (stack_cache_lock, LLL_PRIVATE);
  758 }
 759
 760
  761 int
  762 internal_function
  763 __make_stacks_executable (void **stack_endp)
  764 {
        /* Make the main thread's stack and every stack managed by this file
           executable.  STACK_ENDP is passed through to
           _dl_make_stack_executable.  Returns 0 on success or the first
           error reported by _dl_make_stack_executable or
           change_stack_perm; processing stops at the first error.
           NOTE(review): only stack_used and stack_cache are walked here;
           stacks on __stack_user are not touched.  */

  765   /* First the main thread's stack.  */
  766   int err = _dl_make_stack_executable (stack_endp);
  767   if (err != 0)
  768     return err;
  769
  770 #ifdef NEED_SEPARATE_REGISTER_STACK
  771   const size_t pagemask = ~(__getpagesize () - 1);
  772 #endif
  773
        /* The lists must not change while they are traversed.  */
  774   lll_lock (stack_cache_lock, LLL_PRIVATE);
  775
        /* Stacks of the threads currently in use.  */
  776   list_t *runp;
  777   list_for_each (runp, &stack_used)
  778     {
  779       err = change_stack_perm (list_entry (runp, struct pthread, list)
  780 #ifdef NEED_SEPARATE_REGISTER_STACK
  781                                , pagemask
  782 #endif
  783                                );
  784       if (err != 0)
  785         break;
  786     }
  787
  788   /* Also change the permission for the currently unused stacks.  This
  789      might be wasted time but better spend it here than adding a check
  790      in the fast path.  */
  791   if (err == 0)
  792     list_for_each (runp, &stack_cache)
  793       {
  794         err = change_stack_perm (list_entry (runp, struct pthread, list)
  795 #ifdef NEED_SEPARATE_REGISTER_STACK
  796                                  , pagemask
  797 #endif
  798                                  );
  799         if (err != 0)
  800           break;
  801       }
  802
  803   lll_unlock (stack_cache_lock, LLL_PRIVATE);
  804
  805   return err;
  806 }
 807
 808
 809 /* In case of a fork() call the memory allocation in the child will be
 810    the same but only one thread is running.  All stacks except that of
 811    the one running thread are not used anymore.  We have to recycle
 812    them.  */
 813 void
 814 __reclaim_stacks (void)
 815 {
 816   struct pthread *self = (struct pthread *) THREAD_SELF;
 817
 818   /* No locking necessary.  The caller is the only stack in use.  But
 819      we have to be aware that we might have interrupted a list
 820      operation.  */
 821
 822   if (in_flight_stack != 0)
 823     {
 824       bool add_p = in_flight_stack & 1;
 825       list_t *elem = (list_t *) (in_flight_stack & ~(uintptr_t) 1);
 826
 827       if (add_p)
 828         {
 829           /* We always add at the beginning of the list.  So in this
 830              case we only need to check the beginning of these lists.  */
 831           int check_list (list_t *l)
 832           {
 833             if (l->next->prev != l)
 834               {
 835                 assert (l->next->prev == elem);
 836
 837                 elem->next = l->next;
 838                 elem->prev = l;
 839                 l->next = elem;
 840
 841                 return 1;
 842               }
 843
 844             return 0;
 845           }
 846
 847           if (check_list (&stack_used) == 0)
 848             (void) check_list (&stack_cache);
 849         }
 850       else
 851         {
 852           /* We can simply always replay the delete operation.  */
 853           elem->next->prev = elem->prev;
 854           elem->prev->next = elem->next;
 855         }
 856     }
 857
 858   /* Mark all stacks except the still running one as free.  */
 859   list_t *runp;
 860   list_for_each (runp, &stack_used)
 861     {
 862       struct pthread *curp = list_entry (runp, struct pthread, list);
 863       if (curp != self)
 864         {
 865           /* This marks the stack as free.  */
 866           curp->tid = 0;
 867
 868           /* The PID field must be initialized for the new process.  */
 869           curp->pid = self->pid;
 870
 871           /* Account for the size of the stack.  */
 872           stack_cache_actsize += curp->stackblock_size;
 873
 874           if (curp->specific_used)
 875             {
 876               /* Clear the thread-specific data.  */
 877               memset (curp->specific_1stblock, '\0',
 878                       sizeof (curp->specific_1stblock));
 879
 880               curp->specific_used = false;
 881
 882               for (size_t cnt = 1; cnt < PTHREAD_KEY_1STLEVEL_SIZE; ++cnt)
 883                 if (curp->specific[cnt] != NULL)
 884                   {
 885                     memset (curp->specific[cnt], '\0',
 886                             sizeof (curp->specific_1stblock));
 887
 888                     /* We have allocated the block which we do not
 889                        free here so re-set the bit.  */
 890                     curp->specific_used = true;
 891                   }
 892             }
 893         }
 894     }
 895
 896   /* Reset the PIDs in any cached stacks.  */
 897   list_for_each (runp, &stack_cache)
 898     {
 899       struct pthread *curp = list_entry (runp, struct pthread, list);
 900       curp->pid = self->pid;
 901     }
 902
 903   /* Add the stack of all running threads to the cache.  */
 904   list_splice (&stack_used, &stack_cache);
 905
 906   /* Remove the entry for the current thread to from the cache list
 907      and add it to the list of running threads.  Which of the two
 908      lists is decided by the user_stack flag.  */
 909   stack_list_del (&self->list);
 910
 911   /* Re-initialize the lists for all the threads.  */
 912   INIT_LIST_HEAD (&stack_used);
 913   INIT_LIST_HEAD (&__stack_user);
 914
 915   if (__builtin_expect (THREAD_GETMEM (self, user_stack), 0))
 916     list_add (&self->list, &__stack_user);
 917   else
 918     list_add (&self->list, &stack_used);
 919
 920   /* There is one thread running.  */
 921   __nptl_nthreads = 1;
 922
 923   in_flight_stack = 0;
 924
 925   /* Initialize locks.  */
 926   stack_cache_lock = LLL_LOCK_INITIALIZER;
 927   __default_pthread_attr_lock = LLL_LOCK_INITIALIZER;
 928 }
 929
 930
 931 #if HP_TIMING_AVAIL
 932 # undef __find_thread_by_id
 933 /* Find a thread given the thread ID.  */
 934 attribute_hidden
 935 struct pthread *
 936 __find_thread_by_id (pid_t tid)
 937 {
 938   struct pthread *result = NULL;
 939
 940   lll_lock (stack_cache_lock, LLL_PRIVATE);
 941
 942   /* Iterate over the list with system-allocated threads first.  */
 943   list_t *runp;
 944   list_for_each (runp, &stack_used)
 945     {
 946       struct pthread *curp;
 947
 948       curp = list_entry (runp, struct pthread, list);
 949
 950       if (curp->tid == tid)
 951         {
 952           result = curp;
 953           goto out;
 954         }
 955     }
 956
 957   /* Now the list with threads using user-allocated stacks.  */
 958   list_for_each (runp, &__stack_user)
 959     {
 960       struct pthread *curp;
 961
 962       curp = list_entry (runp, struct pthread, list);
 963
 964       if (curp->tid == tid)
 965         {
 966           result = curp;
 967           goto out;
 968         }
 969     }
 970
 971  out:
 972   lll_unlock (stack_cache_lock, LLL_PRIVATE);
 973
 974   return result;
 975 }
 976 #endif
 977
 978
 979 static void
 980 internal_function
 981 setxid_mark_thread (struct xid_command *cmdp, struct pthread *t)
 982 {
 983   int ch;
 984
 985   /* Wait until this thread is cloned.  */
 986   if (t->setxid_futex == -1
 987       && ! atomic_compare_and_exchange_bool_acq (&t->setxid_futex, -2, -1))
 988     do
 989       lll_futex_wait (&t->setxid_futex, -2, LLL_PRIVATE);
 990     while (t->setxid_futex == -2);
 991
 992   /* Don't let the thread exit before the setxid handler runs.  */
 993   t->setxid_futex = 0;
 994
 995   do
 996     {
 997       ch = t->cancelhandling;
 998
 999       /* If the thread is exiting right now, ignore it.  */
1000       if ((ch & EXITING_BITMASK) != 0)
1001         {
1002           /* Release the futex if there is no other setxid in
1003              progress.  */
1004           if ((ch & SETXID_BITMASK) == 0)
1005             {
1006               t->setxid_futex = 1;
1007               lll_futex_wake (&t->setxid_futex, 1, LLL_PRIVATE);
1008             }
1009           return;
1010         }
1011     }
1012   while (atomic_compare_and_exchange_bool_acq (&t->cancelhandling,
1013                                                ch | SETXID_BITMASK, ch));
1014 }
1015
1016
1017 static void
1018 internal_function
1019 setxid_unmark_thread (struct xid_command *cmdp, struct pthread *t)
1020 {
1021   int ch;
1022
1023   do
1024     {
1025       ch = t->cancelhandling;
1026       if ((ch & SETXID_BITMASK) == 0)
1027         return;
1028     }
1029   while (atomic_compare_and_exchange_bool_acq (&t->cancelhandling,
1030                                                ch & ~SETXID_BITMASK, ch));
1031
1032   /* Release the futex just in case.  */
1033   t->setxid_futex = 1;
1034   lll_futex_wake (&t->setxid_futex, 1, LLL_PRIVATE);
1035 }
1036
1037
1038 static int
1039 internal_function
1040 setxid_signal_thread (struct xid_command *cmdp, struct pthread *t)
1041 {
1042   if ((t->cancelhandling & SETXID_BITMASK) == 0)
1043     return 0;
1044
1045   int val;
1046   INTERNAL_SYSCALL_DECL (err);
1047   val = INTERNAL_SYSCALL (tgkill, err, 3, THREAD_GETMEM (THREAD_SELF, pid),
1048                           t->tid, SIGSETXID);
1049
1050   /* If this failed, it must have had not started yet or else exited.  */
1051   if (!INTERNAL_SYSCALL_ERROR_P (val, err))
1052     {
1053       atomic_increment (&cmdp->cntr);
1054       return 1;
1055     }
1056   else
1057     return 0;
1058 }
1059
1060
1061 int
1062 attribute_hidden
1063 __nptl_setxid (struct xid_command *cmdp)
1064 {
1065   int signalled;
1066   int result;
1067   lll_lock (stack_cache_lock, LLL_PRIVATE);
1068
1069   __xidcmd = cmdp;
1070   cmdp->cntr = 0;
1071
1072   struct pthread *self = THREAD_SELF;
1073
1074   /* Iterate over the list with system-allocated threads first.  */
1075   list_t *runp;
1076   list_for_each (runp, &stack_used)
1077     {
1078       struct pthread *t = list_entry (runp, struct pthread, list);
1079       if (t == self)
1080         continue;
1081
1082       setxid_mark_thread (cmdp, t);
1083     }
1084
1085   /* Now the list with threads using user-allocated stacks.  */
1086   list_for_each (runp, &__stack_user)
1087     {
1088       struct pthread *t = list_entry (runp, struct pthread, list);
1089       if (t == self)
1090         continue;
1091
1092       setxid_mark_thread (cmdp, t);
1093     }
1094
1095   /* Iterate until we don't succeed in signalling anyone.  That means
1096      we have gotten all running threads, and their children will be
1097      automatically correct once started.  */
1098   do
1099     {
1100       signalled = 0;
1101
1102       list_for_each (runp, &stack_used)
1103         {
1104           struct pthread *t = list_entry (runp, struct pthread, list);
1105           if (t == self)
1106             continue;
1107
1108           signalled += setxid_signal_thread (cmdp, t);
1109         }
1110
1111       list_for_each (runp, &__stack_user)
1112         {
1113           struct pthread *t = list_entry (runp, struct pthread, list);
1114           if (t == self)
1115             continue;
1116
1117           signalled += setxid_signal_thread (cmdp, t);
1118         }
1119
1120       int cur = cmdp->cntr;
1121       while (cur != 0)
1122         {
1123           lll_futex_wait (&cmdp->cntr, cur, LLL_PRIVATE);
1124           cur = cmdp->cntr;
1125         }
1126     }
1127   while (signalled != 0);
1128
1129   /* Clean up flags, so that no thread blocks during exit waiting
1130      for a signal which will never come.  */
1131   list_for_each (runp, &stack_used)
1132     {
1133       struct pthread *t = list_entry (runp, struct pthread, list);
1134       if (t == self)
1135         continue;
1136
1137       setxid_unmark_thread (cmdp, t);
1138     }
1139
1140   list_for_each (runp, &__stack_user)
1141     {
1142       struct pthread *t = list_entry (runp, struct pthread, list);
1143       if (t == self)
1144         continue;
1145
1146       setxid_unmark_thread (cmdp, t);
1147     }
1148
1149   /* This must be last, otherwise the current thread might not have
1150      permissions to send SIGSETXID syscall to the other threads.  */
1151   INTERNAL_SYSCALL_DECL (err);
1152   result = INTERNAL_SYSCALL_NCS (cmdp->syscall_no, err, 3,
1153                                  cmdp->id[0], cmdp->id[1], cmdp->id[2]);
1154   if (INTERNAL_SYSCALL_ERROR_P (result, err))
1155     {
1156       __set_errno (INTERNAL_SYSCALL_ERRNO (result, err));
1157       result = -1;
1158     }
1159
1160   lll_unlock (stack_cache_lock, LLL_PRIVATE);
1161   return result;
1162 }
1163
1164 static inline void __attribute__((always_inline))
1165 init_one_static_tls (struct pthread *curp, struct link_map *map)
1166 {
1167   dtv_t *dtv = GET_DTV (TLS_TPADJ (curp));
1168 # if TLS_TCB_AT_TP
1169   void *dest = (char *) curp - map->l_tls_offset;
1170 # elif TLS_DTV_AT_TP
1171   void *dest = (char *) curp + map->l_tls_offset + TLS_PRE_TCB_SIZE;
1172 # else
1173 #  error "Either TLS_TCB_AT_TP or TLS_DTV_AT_TP must be defined"
1174 # endif
1175
1176   /* Fill in the DTV slot so that a later LD/GD access will find it.  */
1177   dtv[map->l_tls_modid].pointer.val = dest;
1178   dtv[map->l_tls_modid].pointer.is_static = true;
1179
1180   /* Initialize the memory.  */
1181   memset (__mempcpy (dest, map->l_tls_initimage, map->l_tls_initimage_size),
1182           '\0', map->l_tls_blocksize - map->l_tls_initimage_size);
1183 }
1184
1185 void
1186 attribute_hidden
1187 __pthread_init_static_tls (struct link_map *map)
1188 {
1189   lll_lock (stack_cache_lock, LLL_PRIVATE);
1190
1191   /* Iterate over the list with system-allocated threads first.  */
1192   list_t *runp;
1193   list_for_each (runp, &stack_used)
1194     init_one_static_tls (list_entry (runp, struct pthread, list), map);
1195
1196   /* Now the list with threads using user-allocated stacks.  */
1197   list_for_each (runp, &__stack_user)
1198     init_one_static_tls (list_entry (runp, struct pthread, list), map);
1199
1200   lll_unlock (stack_cache_lock, LLL_PRIVATE);
1201 }
1202
1203
1204 void
1205 attribute_hidden
1206 __wait_lookup_done (void)
1207 {
1208   lll_lock (stack_cache_lock, LLL_PRIVATE);
1209
1210   struct pthread *self = THREAD_SELF;
1211
1212   /* Iterate over the list with system-allocated threads first.  */
1213   list_t *runp;
1214   list_for_each (runp, &stack_used)
1215     {
1216       struct pthread *t = list_entry (runp, struct pthread, list);
1217       if (t == self || t->header.gscope_flag == THREAD_GSCOPE_FLAG_UNUSED)
1218         continue;
1219
1220       int *const gscope_flagp = &t->header.gscope_flag;
1221
1222       /* We have to wait until this thread is done with the global
1223          scope.  First tell the thread that we are waiting and
1224          possibly have to be woken.  */
1225       if (atomic_compare_and_exchange_bool_acq (gscope_flagp,
1226                                                 THREAD_GSCOPE_FLAG_WAIT,
1227                                                 THREAD_GSCOPE_FLAG_USED))
1228         continue;
1229
1230       do
1231         lll_futex_wait (gscope_flagp, THREAD_GSCOPE_FLAG_WAIT, LLL_PRIVATE);
1232       while (*gscope_flagp == THREAD_GSCOPE_FLAG_WAIT);
1233     }
1234
1235   /* Now the list with threads using user-allocated stacks.  */
1236   list_for_each (runp, &__stack_user)
1237     {
1238       struct pthread *t = list_entry (runp, struct pthread, list);
1239       if (t == self || t->header.gscope_flag == THREAD_GSCOPE_FLAG_UNUSED)
1240         continue;
1241
1242       int *const gscope_flagp = &t->header.gscope_flag;
1243
1244       /* We have to wait until this thread is done with the global
1245          scope.  First tell the thread that we are waiting and
1246          possibly have to be woken.  */
1247       if (atomic_compare_and_exchange_bool_acq (gscope_flagp,
1248                                                 THREAD_GSCOPE_FLAG_WAIT,
1249                                                 THREAD_GSCOPE_FLAG_USED))
1250         continue;
1251
1252       do
1253         lll_futex_wait (gscope_flagp, THREAD_GSCOPE_FLAG_WAIT, LLL_PRIVATE);
1254       while (*gscope_flagp == THREAD_GSCOPE_FLAG_WAIT);
1255     }
1256
1257   lll_unlock (stack_cache_lock, LLL_PRIVATE);
1258 }