mm/slub.c

   1 /*
   2  * SLUB: A slab allocator that limits cache line use instead of queuing
   3  * objects in per cpu and per node lists.
   4  *
   5  * The allocator synchronizes using per slab locks and only
   6  * uses a centralized lock to manage a pool of partial slabs.
   7  *
   8  * (C) 2007 SGI, Christoph Lameter <clameter@sgi.com>
   9  */
  10
  11 #include <linux/mm.h>
  12 #include <linux/module.h>
  13 #include <linux/bit_spinlock.h>
  14 #include <linux/interrupt.h>
  15 #include <linux/bitops.h>
  16 #include <linux/slab.h>
  17 #include <linux/seq_file.h>
  18 #include <linux/cpu.h>
  19 #include <linux/cpuset.h>
  20 #include <linux/mempolicy.h>
  21 #include <linux/ctype.h>
  22 #include <linux/kallsyms.h>
  23
  24 /*
  25  * Lock order:
  26  *   1. slab_lock(page)
  27  *   2. slab->list_lock
  28  *
  29  *   The slab_lock protects operations on the object of a particular
  30  *   slab and its metadata in the page struct. If the slab lock
  31  *   has been taken then no allocations nor frees can be performed
  32  *   on the objects in the slab nor can the slab be added or removed
  33  *   from the partial or full lists since this would mean modifying
  34  *   the page_struct of the slab.
  35  *
  36  *   The list_lock protects the partial and full list on each node and
  37  *   the partial slab counter. If taken then no new slabs may be added or
  38  *   removed from the lists, nor may the number of partial slabs be modified.
  39  *   (Note that the total number of slabs is an atomic value that may be
  40  *   modified without taking the list lock).
  41  *
  42  *   The list_lock is a centralized lock and thus we avoid taking it as
  43  *   much as possible. As long as SLUB does not have to handle partial
  44  *   slabs, operations can continue without any centralized lock. F.e.
  45  *   allocating a long series of objects that fill up slabs does not require
  46  *   the list lock.
  47  *
  48  *   The lock order is sometimes inverted when we are trying to get a slab
  49  *   off a list. We take the list_lock and then look for a page on the list
  50  *   to use. While we do that objects in the slabs may be freed. We can
  51  *   only operate on the slab if we have also taken the slab_lock. So we use
  52  *   a slab_trylock() on the slab. If trylock was successful then no frees
  53  *   can occur anymore and we can use the slab for allocations etc. If the
  54  *   slab_trylock() does not succeed then frees are in progress in the slab and
  55  *   we must stay away from it for a while since we may cause a bouncing
  56  *   cacheline if we try to acquire the lock. So go onto the next slab.
  57  *   If all pages are busy then we may allocate a new slab instead of reusing
  58  *   a partial slab. A new slab has no one operating on it and thus there is
  59  *   no danger of cacheline contention.
  60  *
  61  *   Interrupts are disabled during allocation and deallocation in order to
  62  *   make the slab allocator safe to use in the context of an irq. In addition
  63  *   interrupts are disabled to ensure that the processor does not change
  64  *   while handling per_cpu slabs, due to kernel preemption.
  65  *
  66  * SLUB assigns one slab for allocation to each processor.
  67  * Allocations only occur from these slabs called cpu slabs.
  68  *
  69  * Slabs with free elements are kept on a partial list.
  70  * There is no list for full slabs. If an object in a full slab is
  71  * freed then the slab will show up again on the partial lists.
  72  * Otherwise there is no need to track full slabs unless we have to
  73  * track full slabs for debugging purposes.
  74  *
  75  * Slabs are freed when they become empty. Teardown and setup is
  76  * minimal so we rely on the page allocators per cpu caches for
  77  * fast frees and allocs.
  78  *
  79  * Overloading of page flags that are otherwise used for LRU management.
  80  *
  81  * PageActive           The slab is used as a cpu cache. Allocations
  82  *                      may be performed from the slab. The slab is not
  83  *                      on any slab list and cannot be moved onto one.
  84  *
  85  * PageError            Slab requires special handling due to debug
  86  *                      options set. This moves slab handling out of
  87  *                      the fast path.
  88  */
  89
  90 /*
  91  * Issues still to be resolved:
  92  *
  93  * - The per cpu array is updated for each new slab and is a remote
  94  *   cacheline for most nodes. This could become a bouncing cacheline given
  95  *   sufficiently frequent updates. There are 16 pointers in a cacheline, so
  96  *   at most 16 cpus could compete. Likely okay.
  97  *
  98  * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
  99  *
 100  * - SLAB_DEBUG_INITIAL is not supported but I have never seen a use of
 101  *   it.
 102  *
 103  * - Variable sizing of the per node arrays
 104  */
 105
 106 /* Enable to test recovery from slab corruption on boot */
 107 #undef SLUB_RESILIENCY_TEST
 108
 109 #if PAGE_SHIFT <= 12
 110
 111 /*
 112  * Small page size. Make sure that we do not fragment memory.
      * NOTE(review): the consumers of DEFAULT_MAX_ORDER and
      * DEFAULT_MIN_OBJECTS are not visible in this part of the file;
      * presumably they bound the slab page order and the minimum number
      * of objects per slab -- confirm at the use sites.
 113  */
 114 #define DEFAULT_MAX_ORDER 1
 115 #define DEFAULT_MIN_OBJECTS 4
 116
 117 #else
 118
 119 /*
 120  * Large page machines are customarily able to handle larger
 121  * page orders.
 122  */
 123 #define DEFAULT_MAX_ORDER 2
 124 #define DEFAULT_MIN_OBJECTS 8
 125
 126 #endif
 127
 128 /*
 129  * Flags from the regular SLAB that SLUB does not support:
 130  */
 131 #define SLUB_UNIMPLEMENTED (SLAB_DEBUG_INITIAL)
 132
 133 /*
 134  * Minimum number of partial slabs. These will be left on the partial
 135  * lists even if they are empty. kmem_cache_shrink may reclaim them.
 136  */
 137 #define MIN_PARTIAL 2
 138
 139 /*
 140  * Maximum number of desirable partial slabs.
 141  * The existence of more partial slabs makes kmem_cache_shrink
 142  * sort the partial list by the number of objects in use.
 143  */
 144 #define MAX_PARTIAL 10
 145
     /*
      * The four per-object debugging flags combined into one mask.
      * NOTE(review): presumably the set that is switched on when debugging
      * is requested without an explicit flag list -- the user of this mask
      * is not visible here.
      */
 146 #define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \
 147                                 SLAB_POISON | SLAB_STORE_USER)
 148 /*
 149  * Set of flags that will prevent slab merging
 150  */
 151 #define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
 152                 SLAB_TRACE | SLAB_DESTROY_BY_RCU)
 153
     /*
      * NOTE(review): going by the name, the flags two caches must agree on
      * before they may be merged; the merge code is outside this chunk.
      */
 154 #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
 155                 SLAB_CACHE_DMA)
 156
     /*
      * Alignment fallbacks: unless the architecture supplies its own value,
      * both minimum alignments default to that of unsigned long long.
      */
 157 #ifndef ARCH_KMALLOC_MINALIGN
 158 #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
 159 #endif
 160
 161 #ifndef ARCH_SLAB_MINALIGN
 162 #define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
 163 #endif
 164
 165 /* Internal SLUB flags */
 166 #define __OBJECT_POISON 0x80000000      /* Poison object */
 167
     /* Size in bytes of struct kmem_cache, held in a variable. */
 168 static int kmem_size = sizeof(struct kmem_cache);
 169
 170 #ifdef CONFIG_SMP
     /* Notifier block that exists only on SMP builds; set up elsewhere. */
 171 static struct notifier_block slab_notifier;
 172 #endif
 173
     /*
      * Bring-up stage of the allocator. The stages are ordered:
      * slab_is_available() reports true once the state is UP or later.
      */
 174 static enum {
 175         DOWN,           /* No slab functionality available */
 176         PARTIAL,        /* kmem_cache_open() works but kmalloc does not */
 177         UP,             /* Everything works */
 178         SYSFS           /* Sysfs up */
 179 } slab_state = DOWN;
 180
 181 /* A list of all slab caches on the system */
 182 static DECLARE_RWSEM(slub_lock);
 183 LIST_HEAD(slab_caches);
 184
     /*
      * Sysfs hooks. With CONFIG_SYSFS they are only declared here; without
      * it they become empty stubs, the two int ones returning 0.
      */
 185 #ifdef CONFIG_SYSFS
 186 static int sysfs_slab_add(struct kmem_cache *);
 187 static int sysfs_slab_alias(struct kmem_cache *, const char *);
 188 static void sysfs_slab_remove(struct kmem_cache *);
 189 #else
 190 static int sysfs_slab_add(struct kmem_cache *s) { return 0; }
 191 static int sysfs_slab_alias(struct kmem_cache *s, const char *p) { return 0; }
 192 static void sysfs_slab_remove(struct kmem_cache *s) {}
 193 #endif
 194
 195 /********************************************************************
 196  *                      Core slab cache functions
 197  *******************************************************************/
 198
 199 int slab_is_available(void)
 200 {
 201         return slab_state >= UP;
 202 }
 203
 204 static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
 205 {
 206 #ifdef CONFIG_NUMA
 207         return s->node[node];
 208 #else
 209         return &s->local_node;
 210 #endif
 211 }
 212
 213 /*
 214  * Object debugging
 215  */
 216 static void print_section(char *text, u8 *addr, unsigned int length)
 217 {
 218         int i, offset;
 219         int newline = 1;
 220         char ascii[17];
 221
 222         ascii[16] = 0;
 223
 224         for (i = 0; i < length; i++) {
 225                 if (newline) {
 226                         printk(KERN_ERR "%10s 0x%p: ", text, addr + i);
 227                         newline = 0;
 228                 }
 229                 printk(" %02x", addr[i]);
 230                 offset = i % 16;
 231                 ascii[offset] = isgraph(addr[i]) ? addr[i] : '.';
 232                 if (offset == 15) {
 233                         printk(" %s\n",ascii);
 234                         newline = 1;
 235                 }
 236         }
 237         if (!newline) {
 238                 i %= 16;
 239                 while (i < 16) {
 240                         printk("   ");
 241                         ascii[i] = ' ';
 242                         i++;
 243                 }
 244                 printk(" %s\n", ascii);
 245         }
 246 }
 247
 248 /*
 249  * Slow version of get and set free pointer.
 250  *
 251  * This requires touching the cache lines of kmem_cache.
 252  * The offset can also be obtained from the page. In that
 253  * case it is in the cacheline that we already need to touch.
 254  */
 255 static void *get_freepointer(struct kmem_cache *s, void *object)
 256 {
 257         return *(void **)(object + s->offset);
 258 }
 259
 260 static void set_freepointer(struct kmem_cache *s, void *object, void *fp)
 261 {
 262         *(void **)(object + s->offset) = fp;
 263 }
 264
 265 /*
 266  * Tracking user of a slab.
      *
      * One record describes a single alloc or free operation on an object.
      * set_track() fills it in; a record whose addr is NULL is treated as
      * empty by print_track().
 267  */
 268 struct track {
 269         void *addr;             /* Called from address */
 270         int cpu;                /* Was running on cpu */
 271         int pid;                /* Pid context (-1 if there was no current) */
 272         unsigned long when;     /* When did the operation occur (jiffies) */
 273 };
 274
     /* Index of a record in the pair of struct track kept per object. */
 275 enum track_item { TRACK_ALLOC, TRACK_FREE };
 276
 277 static struct track *get_track(struct kmem_cache *s, void *object,
 278         enum track_item alloc)
 279 {
 280         struct track *p;
 281
 282         if (s->offset)
 283                 p = object + s->offset + sizeof(void *);
 284         else
 285                 p = object + s->inuse;
 286
 287         return p + alloc;
 288 }
 289
 290 static void set_track(struct kmem_cache *s, void *object,
 291                                 enum track_item alloc, void *addr)
 292 {
 293         struct track *p;
 294
 295         if (s->offset)
 296                 p = object + s->offset + sizeof(void *);
 297         else
 298                 p = object + s->inuse;
 299
 300         p += alloc;
 301         if (addr) {
 302                 p->addr = addr;
 303                 p->cpu = smp_processor_id();
 304                 p->pid = current ? current->pid : -1;
 305                 p->when = jiffies;
 306         } else
 307                 memset(p, 0, sizeof(struct track));
 308 }
 309
 310 static void init_tracking(struct kmem_cache *s, void *object)
 311 {
 312         if (s->flags & SLAB_STORE_USER) {
 313                 set_track(s, object, TRACK_FREE, NULL);
 314                 set_track(s, object, TRACK_ALLOC, NULL);
 315         }
 316 }
 317
 318 static void print_track(const char *s, struct track *t)
 319 {
 320         if (!t->addr)
 321                 return;
 322
 323         printk(KERN_ERR "%s: ", s);
 324         __print_symbol("%s", (unsigned long)t->addr);
 325         printk(" jiffies_ago=%lu cpu=%u pid=%d\n", jiffies - t->when, t->cpu, t->pid);
 326 }
 327
 328 static void print_trailer(struct kmem_cache *s, u8 *p)
 329 {
 330         unsigned int off;       /* Offset of last byte */
 331
 332         if (s->flags & SLAB_RED_ZONE)
 333                 print_section("Redzone", p + s->objsize,
 334                         s->inuse - s->objsize);
 335
 336         printk(KERN_ERR "FreePointer 0x%p -> 0x%p\n",
 337                         p + s->offset,
 338                         get_freepointer(s, p));
 339
 340         if (s->offset)
 341                 off = s->offset + sizeof(void *);
 342         else
 343                 off = s->inuse;
 344
 345         if (s->flags & SLAB_STORE_USER) {
 346                 print_track("Last alloc", get_track(s, p, TRACK_ALLOC));
 347                 print_track("Last free ", get_track(s, p, TRACK_FREE));
 348                 off += 2 * sizeof(struct track);
 349         }
 350
 351         if (off != s->size)
 352                 /* Beginning of the filler is the free pointer */
 353                 print_section("Filler", p + off, s->size - off);
 354 }
 355
 356 static void object_err(struct kmem_cache *s, struct page *page,
 357                         u8 *object, char *reason)
 358 {
 359         u8 *addr = page_address(page);
 360
 361         printk(KERN_ERR "*** SLUB %s: %s@0x%p slab 0x%p\n",
 362                         s->name, reason, object, page);
 363         printk(KERN_ERR "    offset=%tu flags=0x%04lx inuse=%u freelist=0x%p\n",
 364                 object - addr, page->flags, page->inuse, page->freelist);
 365         if (object > addr + 16)
 366                 print_section("Bytes b4", object - 16, 16);
 367         print_section("Object", object, min(s->objsize, 128));
 368         print_trailer(s, object);
 369         dump_stack();
 370 }
 371
 372 static void slab_err(struct kmem_cache *s, struct page *page, char *reason, ...)
 373 {
 374         va_list args;
 375         char buf[100];
 376
 377         va_start(args, reason);
 378         vsnprintf(buf, sizeof(buf), reason, args);
 379         va_end(args);
 380         printk(KERN_ERR "*** SLUB %s: %s in slab @0x%p\n", s->name, buf,
 381                 page);
 382         dump_stack();
 383 }
 384
 385 static void init_object(struct kmem_cache *s, void *object, int active)
 386 {
 387         u8 *p = object;
 388
 389         if (s->flags & __OBJECT_POISON) {
 390                 memset(p, POISON_FREE, s->objsize - 1);
 391                 p[s->objsize -1] = POISON_END;
 392         }
 393
 394         if (s->flags & SLAB_RED_ZONE)
 395                 memset(p + s->objsize,
 396                         active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE,
 397                         s->inuse - s->objsize);
 398 }
 399
 400 static int check_bytes(u8 *start, unsigned int value, unsigned int bytes)
 401 {
 402         while (bytes) {
 403                 if (*start != (u8)value)
 404                         return 0;
 405                 start++;
 406                 bytes--;
 407         }
 408         return 1;
 409 }
 410
 411
 412 static int check_valid_pointer(struct kmem_cache *s, struct page *page,
 413                                          void *object)
 414 {
 415         void *base;
 416
 417         if (!object)
 418                 return 1;
 419
 420         base = page_address(page);
 421         if (object < base || object >= base + s->objects * s->size ||
 422                 (object - base) % s->size) {
 423                 return 0;
 424         }
 425
 426         return 1;
 427 }
 428
 429 /*
 430  * Object layout:
 431  *
 432  * object address
 433  *      Bytes of the object to be managed.
 434  *      If the freepointer may overlay the object then the free
 435  *      pointer is the first word of the object.
 436  *      Poisoning uses 0x6b (POISON_FREE) and the last byte is
 437  *      0xa5 (POISON_END)
 438  *
 439  * object + s->objsize
 440  *      Padding to reach word boundary. This is also used for Redzoning.
 441  *      Padding is extended to word size if Redzoning is enabled
 442  *      and objsize == inuse.
 443  *      We fill with 0xbb (RED_INACTIVE) for inactive objects and with
 444  *      0xcc (RED_ACTIVE) for objects in use.
 445  *
 446  * object + s->inuse
 447  *      A. Free pointer (if we cannot overwrite object on free)
 448  *      B. Tracking data for SLAB_STORE_USER
 449  *      C. Padding to reach required alignment boundary
 450  *              Padding is done using 0x5a (POISON_INUSE)
 451  *
 452  * object + s->size
 453  *
 454  * If slabcaches are merged then the objsize and inuse boundaries are to
 455  * be ignored. And therefore no slab options that rely on these boundaries
 456  * may be used with merged slabcaches.
 457  */
 458
 459 static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
 460                                                 void *from, void *to)
 461 {
 462         printk(KERN_ERR "@@@ SLUB %s: Restoring %s (0x%x) from 0x%p-0x%p\n",
 463                 s->name, message, data, from, to - 1);
 464         memset(from, data, to - from);
 465 }
 466
 467 static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
 468 {
 469         unsigned long off = s->inuse;   /* The end of info */
 470
 471         if (s->offset)
 472                 /* Freepointer is placed after the object. */
 473                 off += sizeof(void *);
 474
 475         if (s->flags & SLAB_STORE_USER)
 476                 /* We also have user information there */
 477                 off += 2 * sizeof(struct track);
 478
 479         if (s->size == off)
 480                 return 1;
 481
 482         if (check_bytes(p + off, POISON_INUSE, s->size - off))
 483                 return 1;
 484
 485         object_err(s, page, p, "Object padding check fails");
 486
 487         /*
 488          * Restore padding
 489          */
 490         restore_bytes(s, "object padding", POISON_INUSE, p + off, p + s->size);
 491         return 0;
 492 }
 493
 494 static int slab_pad_check(struct kmem_cache *s, struct page *page)
 495 {
 496         u8 *p;
 497         int length, remainder;
 498
 499         if (!(s->flags & SLAB_POISON))
 500                 return 1;
 501
 502         p = page_address(page);
 503         length = s->objects * s->size;
 504         remainder = (PAGE_SIZE << s->order) - length;
 505         if (!remainder)
 506                 return 1;
 507
 508         if (!check_bytes(p + length, POISON_INUSE, remainder)) {
 509                 slab_err(s, page, "Padding check failed");
 510                 restore_bytes(s, "slab padding", POISON_INUSE, p + length,
 511                         p + length + remainder);
 512                 return 0;
 513         }
 514         return 1;
 515 }
 516
 517 static int check_object(struct kmem_cache *s, struct page *page,
 518                                         void *object, int active)
 519 {
 520         u8 *p = object;
 521         u8 *endobject = object + s->objsize;
 522
 523         if (s->flags & SLAB_RED_ZONE) {
 524                 unsigned int red =
 525                         active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE;
 526
 527                 if (!check_bytes(endobject, red, s->inuse - s->objsize)) {
 528                         object_err(s, page, object,
 529                         active ? "Redzone Active" : "Redzone Inactive");
 530                         restore_bytes(s, "redzone", red,
 531                                 endobject, object + s->inuse);
 532                         return 0;
 533                 }
 534         } else {
 535                 if ((s->flags & SLAB_POISON) && s->objsize < s->inuse &&
 536                         !check_bytes(endobject, POISON_INUSE,
 537                                         s->inuse - s->objsize)) {
 538                 object_err(s, page, p, "Alignment padding check fails");
 539                 /*
 540                  * Fix it so that there will not be another report.
 541                  *
 542                  * Hmmm... We may be corrupting an object that now expects
 543                  * to be longer than allowed.
 544                  */
 545                 restore_bytes(s, "alignment padding", POISON_INUSE,
 546                         endobject, object + s->inuse);
 547                 }
 548         }
 549
 550         if (s->flags & SLAB_POISON) {
 551                 if (!active && (s->flags & __OBJECT_POISON) &&
 552                         (!check_bytes(p, POISON_FREE, s->objsize - 1) ||
 553                                 p[s->objsize - 1] != POISON_END)) {
 554
 555                         object_err(s, page, p, "Poison check failed");
 556                         restore_bytes(s, "Poison", POISON_FREE,
 557                                                 p, p + s->objsize -1);
 558                         restore_bytes(s, "Poison", POISON_END,
 559                                         p + s->objsize - 1, p + s->objsize);
 560                         return 0;
 561                 }
 562                 /*
 563                  * check_pad_bytes cleans up on its own.
 564                  */
 565                 check_pad_bytes(s, page, p);
 566         }
 567
 568         if (!s->offset && active)
 569                 /*
 570                  * Object and freepointer overlap. Cannot check
 571                  * freepointer while object is allocated.
 572                  */
 573                 return 1;
 574
 575         /* Check free pointer validity */
 576         if (!check_valid_pointer(s, page, get_freepointer(s, p))) {
 577                 object_err(s, page, p, "Freepointer corrupt");
 578                 /*
 579                  * No choice but to zap it and thus loose the remainder
 580                  * of the free objects in this slab. May cause
 581                  * another error because the object count maybe
 582                  * wrong now.
 583                  */
 584                 set_freepointer(s, p, NULL);
 585                 return 0;
 586         }
 587         return 1;
 588 }
 589
 590 static int check_slab(struct kmem_cache *s, struct page *page)
 591 {
 592         VM_BUG_ON(!irqs_disabled());
 593
 594         if (!PageSlab(page)) {
 595                 slab_err(s, page, "Not a valid slab page flags=%lx "
 596                         "mapping=0x%p count=%d", page->flags, page->mapping,
 597                         page_count(page));
 598                 return 0;
 599         }
 600         if (page->offset * sizeof(void *) != s->offset) {
 601                 slab_err(s, page, "Corrupted offset %lu flags=0x%lx "
 602                         "mapping=0x%p count=%d",
 603                         (unsigned long)(page->offset * sizeof(void *)),
 604                         page->flags,
 605                         page->mapping,
 606                         page_count(page));
 607                 return 0;
 608         }
 609         if (page->inuse > s->objects) {
 610                 slab_err(s, page, "inuse %u > max %u @0x%p flags=%lx "
 611                         "mapping=0x%p count=%d",
 612                         s->name, page->inuse, s->objects, page->flags,
 613                         page->mapping, page_count(page));
 614                 return 0;
 615         }
 616         /* Slab_pad_check fixes things up after itself */
 617         slab_pad_check(s, page);
 618         return 1;
 619 }
 620
 621 /*
 622  * Determine if a certain object on a page is on the freelist and
 623  * therefore free. Must hold the slab lock for cpu slabs to
 624  * guarantee that the chains are consistent.
 625  */
 626 static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
 627 {
 628         int nr = 0;
 629         void *fp = page->freelist;
 630         void *object = NULL;
 631
 632         while (fp && nr <= s->objects) {
 633                 if (fp == search)
 634                         return 1;
 635                 if (!check_valid_pointer(s, page, fp)) {
 636                         if (object) {
 637                                 object_err(s, page, object,
 638                                         "Freechain corrupt");
 639                                 set_freepointer(s, object, NULL);
 640                                 break;
 641                         } else {
 642                                 slab_err(s, page, "Freepointer 0x%p corrupt",
 643                                                                         fp);
 644                                 page->freelist = NULL;
 645                                 page->inuse = s->objects;
 646                                 printk(KERN_ERR "@@@ SLUB %s: Freelist "
 647                                         "cleared. Slab 0x%p\n",
 648                                         s->name, page);
 649                                 return 0;
 650                         }
 651                         break;
 652                 }
 653                 object = fp;
 654                 fp = get_freepointer(s, object);
 655                 nr++;
 656         }
 657
 658         if (page->inuse != s->objects - nr) {
 659                 slab_err(s, page, "Wrong object count. Counter is %d but "
 660                         "counted were %d", s, page, page->inuse,
 661                                                         s->objects - nr);
 662                 page->inuse = s->objects - nr;
 663                 printk(KERN_ERR "@@@ SLUB %s: Object count adjusted. "
 664                         "Slab @0x%p\n", s->name, page);
 665         }
 666         return search == NULL;
 667 }
 668
 669 /*
 670  * Tracking of fully allocated slabs for debugging
 671  */
 672 static void add_full(struct kmem_cache_node *n, struct page *page)
 673 {
 674         spin_lock(&n->list_lock);
 675         list_add(&page->lru, &n->full);
 676         spin_unlock(&n->list_lock);
 677 }
 678
 679 static void remove_full(struct kmem_cache *s, struct page *page)
 680 {
 681         struct kmem_cache_node *n;
 682
 683         if (!(s->flags & SLAB_STORE_USER))
 684                 return;
 685
 686         n = get_node(s, page_to_nid(page));
 687
 688         spin_lock(&n->list_lock);
 689         list_del(&page->lru);
 690         spin_unlock(&n->list_lock);
 691 }
 692
 693 static int alloc_object_checks(struct kmem_cache *s, struct page *page,
 694                                                         void *object)
 695 {
 696         if (!check_slab(s, page))
 697                 goto bad;
 698
 699         if (object && !on_freelist(s, page, object)) {
 700                 slab_err(s, page, "Object 0x%p already allocated", object);
 701                 goto bad;
 702         }
 703
 704         if (!check_valid_pointer(s, page, object)) {
 705                 object_err(s, page, object, "Freelist Pointer check fails");
 706                 goto bad;
 707         }
 708
 709         if (!object)
 710                 return 1;
 711
 712         if (!check_object(s, page, object, 0))
 713                 goto bad;
 714
 715         return 1;
 716 bad:
 717         if (PageSlab(page)) {
 718                 /*
 719                  * If this is a slab page then lets do the best we can
 720                  * to avoid issues in the future. Marking all objects
 721                  * as used avoids touching the remainder.
 722                  */
 723                 printk(KERN_ERR "@@@ SLUB: %s slab 0x%p. Marking all objects used.\n",
 724                         s->name, page);
 725                 page->inuse = s->objects;
 726                 page->freelist = NULL;
 727                 /* Fix up fields that may be corrupted */
 728                 page->offset = s->offset / sizeof(void *);
 729         }
 730         return 0;
 731 }
 732
 733 static int free_object_checks(struct kmem_cache *s, struct page *page,
 734                                                         void *object)
 735 {
 736         if (!check_slab(s, page))
 737                 goto fail;
 738
 739         if (!check_valid_pointer(s, page, object)) {
 740                 slab_err(s, page, "Invalid object pointer 0x%p", object);
 741                 goto fail;
 742         }
 743
 744         if (on_freelist(s, page, object)) {
 745                 slab_err(s, page, "Object 0x%p already free", object);
 746                 goto fail;
 747         }
 748
 749         if (!check_object(s, page, object, 1))
 750                 return 0;
 751
 752         if (unlikely(s != page->slab)) {
 753                 if (!PageSlab(page))
 754                         slab_err(s, page, "Attempt to free object(0x%p) "
 755                                 "outside of slab", object);
 756                 else
 757                 if (!page->slab) {
 758                         printk(KERN_ERR
 759                                 "SLUB <none>: no slab for object 0x%p.\n",
 760                                                 object);
 761                         dump_stack();
 762                 }
 763                 else
 764                         slab_err(s, page, "object at 0x%p belongs "
 765                                 "to slab %s", object, page->slab->name);
 766                 goto fail;
 767         }
 768         return 1;
 769 fail:
 770         printk(KERN_ERR "@@@ SLUB: %s slab 0x%p object at 0x%p not freed.\n",
 771                 s->name, page, object);
 772         return 0;
 773 }
 774
 775 /*
 776  * Slab allocation and freeing
 777  */
 778 static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
 779 {
 780         struct page * page;
 781         int pages = 1 << s->order;
 782
 783         if (s->order)
 784                 flags |= __GFP_COMP;
 785
 786         if (s->flags & SLAB_CACHE_DMA)
 787                 flags |= SLUB_DMA;
 788
 789         if (node == -1)
 790                 page = alloc_pages(flags, s->order);
 791         else
 792                 page = alloc_pages_node(node, flags, s->order);
 793
 794         if (!page)
 795                 return NULL;
 796
 797         mod_zone_page_state(page_zone(page),
 798                 (s->flags & SLAB_RECLAIM_ACCOUNT) ?
 799                 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
 800                 pages);
 801
 802         return page;
 803 }
 804
 805 static void setup_object(struct kmem_cache *s, struct page *page,
 806                                 void *object)
 807 {
 808         if (PageError(page)) {
 809                 init_object(s, object, 0);
 810                 init_tracking(s, object);
 811         }
 812
 813         if (unlikely(s->ctor)) {
 814                 int mode = SLAB_CTOR_CONSTRUCTOR;
 815
 816                 if (!(s->flags & __GFP_WAIT))
 817                         mode |= SLAB_CTOR_ATOMIC;
 818
 819                 s->ctor(object, s, mode);
 820         }
 821 }
 822
 823 static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
 824 {
 825         struct page *page;
 826         struct kmem_cache_node *n;
 827         void *start;
 828         void *end;
 829         void *last;
 830         void *p;
 831
 832         if (flags & __GFP_NO_GROW)
 833                 return NULL;
 834
 835         BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK));
 836
 837         if (flags & __GFP_WAIT)
 838                 local_irq_enable();
 839
 840         page = allocate_slab(s, flags & GFP_LEVEL_MASK, node);
 841         if (!page)
 842                 goto out;
 843
 844         n = get_node(s, page_to_nid(page));
 845         if (n)
 846                 atomic_long_inc(&n->nr_slabs);
 847         page->offset = s->offset / sizeof(void *);
 848         page->slab = s;
 849         page->flags |= 1 << PG_slab;
 850         if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON |
 851                         SLAB_STORE_USER | SLAB_TRACE))
 852                 page->flags |= 1 << PG_error;
 853
 854         start = page_address(page);
 855         end = start + s->objects * s->size;
 856
 857         if (unlikely(s->flags & SLAB_POISON))
 858                 memset(start, POISON_INUSE, PAGE_SIZE << s->order);
 859
 860         last = start;
 861         for (p = start + s->size; p < end; p += s->size) {
 862                 setup_object(s, page, last);
 863                 set_freepointer(s, last, p);
 864                 last = p;
 865         }
 866         setup_object(s, page, last);
 867         set_freepointer(s, last, NULL);
 868
 869         page->freelist = start;
 870         page->inuse = 0;
 871 out:
 872         if (flags & __GFP_WAIT)
 873                 local_irq_disable();
 874         return page;
 875 }
 876
 877 static void __free_slab(struct kmem_cache *s, struct page *page)
 878 {
 879         int pages = 1 << s->order;
 880
 881         if (unlikely(PageError(page) || s->dtor)) {
 882                 void *start = page_address(page);
 883                 void *end = start + (pages << PAGE_SHIFT);
 884                 void *p;
 885
 886                 slab_pad_check(s, page);
 887                 for (p = start; p <= end - s->size; p += s->size) {
 888                         if (s->dtor)
 889                                 s->dtor(p, s, 0);
 890                         check_object(s, page, p, 0);
 891                 }
 892         }
 893
 894         mod_zone_page_state(page_zone(page),
 895                 (s->flags & SLAB_RECLAIM_ACCOUNT) ?
 896                 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
 897                 - pages);
 898
 899         page->mapping = NULL;
 900         __free_pages(page, s->order);
 901 }
 902
 903 static void rcu_free_slab(struct rcu_head *h)
 904 {
 905         struct page *page;
 906
 907         page = container_of((struct list_head *)h, struct page, lru);
 908         __free_slab(page->slab, page);
 909 }
 910
 911 static void free_slab(struct kmem_cache *s, struct page *page)
 912 {
 913         if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) {
 914                 /*
 915                  * RCU free overloads the RCU head over the LRU
 916                  */
 917                 struct rcu_head *head = (void *)&page->lru;
 918
 919                 call_rcu(head, rcu_free_slab);
 920         } else
 921                 __free_slab(s, page);
 922 }
 923
 924 static void discard_slab(struct kmem_cache *s, struct page *page)
 925 {
 926         struct kmem_cache_node *n = get_node(s, page_to_nid(page));
 927
 928         atomic_long_dec(&n->nr_slabs);
 929         reset_page_mapcount(page);
 930         page->flags &= ~(1 << PG_slab | 1 << PG_error);
 931         free_slab(s, page);
 932 }
 933
 934 /*
 935  * Per slab locking using the pagelock
 936  */
 937 static __always_inline void slab_lock(struct page *page)
 938 {
 939         bit_spin_lock(PG_locked, &page->flags);
 940 }
 941
 942 static __always_inline void slab_unlock(struct page *page)
 943 {
 944         bit_spin_unlock(PG_locked, &page->flags);
 945 }
 946
 947 static __always_inline int slab_trylock(struct page *page)
 948 {
 949         int rc = 1;
 950
 951         rc = bit_spin_trylock(PG_locked, &page->flags);
 952         return rc;
 953 }
 954
 955 /*
 956  * Management of partially allocated slabs
 957  */
 958 static void add_partial_tail(struct kmem_cache_node *n, struct page *page)
 959 {
 960         spin_lock(&n->list_lock);
 961         n->nr_partial++;
 962         list_add_tail(&page->lru, &n->partial);
 963         spin_unlock(&n->list_lock);
 964 }
 965
 966 static void add_partial(struct kmem_cache_node *n, struct page *page)
 967 {
 968         spin_lock(&n->list_lock);
 969         n->nr_partial++;
 970         list_add(&page->lru, &n->partial);
 971         spin_unlock(&n->list_lock);
 972 }
 973
 974 static void remove_partial(struct kmem_cache *s,
 975                                                 struct page *page)
 976 {
 977         struct kmem_cache_node *n = get_node(s, page_to_nid(page));
 978
 979         spin_lock(&n->list_lock);
 980         list_del(&page->lru);
 981         n->nr_partial--;
 982         spin_unlock(&n->list_lock);
 983 }
 984
 985 /*
 986  * Lock page and remove it from the partial list
 987  *
 988  * Must hold list_lock
 989  */
 990 static int lock_and_del_slab(struct kmem_cache_node *n, struct page *page)
 991 {
 992         if (slab_trylock(page)) {
 993                 list_del(&page->lru);
 994                 n->nr_partial--;
 995                 return 1;
 996         }
 997         return 0;
 998 }
 999
1000 /*
1001  * Try to get a partial slab from a specific node
1002  */
1003 static struct page *get_partial_node(struct kmem_cache_node *n)
1004 {
1005         struct page *page;
1006
1007         /*
1008          * Racy check. If we mistakenly see no partial slabs then we
1009          * just allocate an empty slab. If we mistakenly try to get a
1010          * partial slab then get_partials() will return NULL.
1011          */
1012         if (!n || !n->nr_partial)
1013                 return NULL;
1014
1015         spin_lock(&n->list_lock);
1016         list_for_each_entry(page, &n->partial, lru)
1017                 if (lock_and_del_slab(n, page))
1018                         goto out;
1019         page = NULL;
1020 out:
1021         spin_unlock(&n->list_lock);
1022         return page;
1023 }
1024
1025 /*
1026  * Get a page from somewhere. Search in increasing NUMA
1027  * distances.
1028  */
1029 static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1030 {
1031 #ifdef CONFIG_NUMA
1032         struct zonelist *zonelist;
1033         struct zone **z;
1034         struct page *page;
1035
1036         /*
1037          * The defrag ratio allows to configure the tradeoffs between
1038          * inter node defragmentation and node local allocations.
1039          * A lower defrag_ratio increases the tendency to do local
1040          * allocations instead of scanning throught the partial
1041          * lists on other nodes.
1042          *
1043          * If defrag_ratio is set to 0 then kmalloc() always
1044          * returns node local objects. If its higher then kmalloc()
1045          * may return off node objects in order to avoid fragmentation.
1046          *
1047          * A higher ratio means slabs may be taken from other nodes
1048          * thus reducing the number of partial slabs on those nodes.
1049          *
1050          * If /sys/slab/xx/defrag_ratio is set to 100 (which makes
1051          * defrag_ratio = 1000) then every (well almost) allocation
1052          * will first attempt to defrag slab caches on other nodes. This
1053          * means scanning over all nodes to look for partial slabs which
1054          * may be a bit expensive to do on every slab allocation.
1055          */
1056         if (!s->defrag_ratio || get_cycles() % 1024 > s->defrag_ratio)
1057                 return NULL;
1058
1059         zonelist = &NODE_DATA(slab_node(current->mempolicy))
1060                                         ->node_zonelists[gfp_zone(flags)];
1061         for (z = zonelist->zones; *z; z++) {
1062                 struct kmem_cache_node *n;
1063
1064                 n = get_node(s, zone_to_nid(*z));
1065
1066                 if (n && cpuset_zone_allowed_hardwall(*z, flags) &&
1067                                 n->nr_partial > MIN_PARTIAL) {
1068                         page = get_partial_node(n);
1069                         if (page)
1070                                 return page;
1071                 }
1072         }
1073 #endif
1074         return NULL;
1075 }
1076
1077 /*
1078  * Get a partial page, lock it and return it.
1079  */
1080 static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
1081 {
1082         struct page *page;
1083         int searchnode = (node == -1) ? numa_node_id() : node;
1084
1085         page = get_partial_node(get_node(s, searchnode));
1086         if (page || (flags & __GFP_THISNODE))
1087                 return page;
1088
1089         return get_any_partial(s, flags);
1090 }
1091
1092 /*
1093  * Move a page back to the lists.
1094  *
1095  * Must be called with the slab lock held.
1096  *
1097  * On exit the slab lock will have been dropped.
1098  */
1099 static void putback_slab(struct kmem_cache *s, struct page *page)
1100 {
1101         struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1102
1103         if (page->inuse) {
1104
1105                 if (page->freelist)
1106                         add_partial(n, page);
1107                 else if (PageError(page) && (s->flags & SLAB_STORE_USER))
1108                         add_full(n, page);
1109                 slab_unlock(page);
1110
1111         } else {
1112                 if (n->nr_partial < MIN_PARTIAL) {
1113                         /*
1114                          * Adding an empty page to the partial slabs in order
1115                          * to avoid page allocator overhead. This page needs to
1116                          * come after all the others that are not fully empty
1117                          * in order to make sure that we do maximum
1118                          * defragmentation.
1119                          */
1120                         add_partial_tail(n, page);
1121                         slab_unlock(page);
1122                 } else {
1123                         slab_unlock(page);
1124                         discard_slab(s, page);
1125                 }
1126         }
1127 }
1128
1129 /*
1130  * Remove the cpu slab
1131  */
1132 static void deactivate_slab(struct kmem_cache *s, struct page *page, int cpu)
1133 {
1134         s->cpu_slab[cpu] = NULL;
1135         ClearPageActive(page);
1136
1137         putback_slab(s, page);
1138 }
1139
/*
 * Take the slab lock on @page and deactivate it as the cpu slab of @cpu.
 * The lock is released again on the deactivate_slab() path.
 */
static void flush_slab(struct kmem_cache *s, struct page *page, int cpu)
{
	slab_lock(page);
	deactivate_slab(s, page, cpu);
}
1145
1146 /*
1147  * Flush cpu slab.
1148  * Called from IPI handler with interrupts disabled.
1149  */
1150 static void __flush_cpu_slab(struct kmem_cache *s, int cpu)
1151 {
1152         struct page *page = s->cpu_slab[cpu];
1153
1154         if (likely(page))
1155                 flush_slab(s, page, cpu);
1156 }
1157
/*
 * Flush the cpu slab of the executing processor. @d is the kmem_cache,
 * passed as an opaque pointer so the function can serve as a callback.
 */
static void flush_cpu_slab(void *d)
{
	struct kmem_cache *cache = d;

	__flush_cpu_slab(cache, smp_processor_id());
}
1165
/*
 * Flush the cpu slabs of cache @s: via on_each_cpu() on SMP builds, or
 * directly with interrupts disabled otherwise.
 */
static void flush_all(struct kmem_cache *s)
{
#ifdef CONFIG_SMP
	on_each_cpu(flush_cpu_slab, s, 1, 1);
#else
	unsigned long irqflags;

	local_irq_save(irqflags);
	flush_cpu_slab(s);
	local_irq_restore(irqflags);
#endif
}
1178
1179 /*
1180  * slab_alloc is optimized to only modify two cachelines on the fast path
1181  * (aside from the stack):
1182  *
1183  * 1. The page struct
1184  * 2. The first cacheline of the object to be allocated.
1185  *
1186  * The only cache lines that are read (apart from code) is the
1187  * per cpu array in the kmem_cache struct.
1188  *
1189  * Fastpath is not possible if we need to get a new slab or have
1190  * debugging enabled (which means all slabs are marked with PageError)
1191  */
1192 static void *slab_alloc(struct kmem_cache *s,
1193                                 gfp_t gfpflags, int node, void *addr)
1194 {
1195         struct page *page;
1196         void **object;
1197         unsigned long flags;
1198         int cpu;
1199
1200         local_irq_save(flags);
1201         cpu = smp_processor_id();
1202         page = s->cpu_slab[cpu];
1203         if (!page)
1204                 goto new_slab;
1205
1206         slab_lock(page);
1207         if (unlikely(node != -1 && page_to_nid(page) != node))
1208                 goto another_slab;
1209 redo:
1210         object = page->freelist;
1211         if (unlikely(!object))
1212                 goto another_slab;
1213         if (unlikely(PageError(page)))
1214                 goto debug;
1215
1216 have_object:
1217         page->inuse++;
1218         page->freelist = object[page->offset];
1219         slab_unlock(page);
1220         local_irq_restore(flags);
1221         return object;
1222
1223 another_slab:
1224         deactivate_slab(s, page, cpu);
1225
1226 new_slab:
1227         page = get_partial(s, gfpflags, node);
1228         if (likely(page)) {
1229 have_slab:
1230                 s->cpu_slab[cpu] = page;
1231                 SetPageActive(page);
1232                 goto redo;
1233         }
1234
1235         page = new_slab(s, gfpflags, node);
1236         if (page) {
1237                 cpu = smp_processor_id();
1238                 if (s->cpu_slab[cpu]) {
1239                         /*
1240                          * Someone else populated the cpu_slab while we enabled
1241                          * interrupts, or we have got scheduled on another cpu.
1242                          * The page may not be on the requested node.
1243                          */
1244                         if (node == -1 ||
1245                                 page_to_nid(s->cpu_slab[cpu]) == node) {
1246                                 /*
1247                                  * Current cpuslab is acceptable and we
1248                                  * want the current one since its cache hot
1249                                  */
1250                                 discard_slab(s, page);
1251                                 page = s->cpu_slab[cpu];
1252                                 slab_lock(page);
1253                                 goto redo;
1254                         }
1255                         /* Dump the current slab */
1256                         flush_slab(s, s->cpu_slab[cpu], cpu);
1257                 }
1258                 slab_lock(page);
1259                 goto have_slab;
1260         }
1261         local_irq_restore(flags);
1262         return NULL;
1263 debug:
1264         if (!alloc_object_checks(s, page, object))
1265                 goto another_slab;
1266         if (s->flags & SLAB_STORE_USER)
1267                 set_track(s, object, TRACK_ALLOC, addr);
1268         if (s->flags & SLAB_TRACE) {
1269                 printk(KERN_INFO "TRACE %s alloc 0x%p inuse=%d fp=0x%p\n",
1270                         s->name, object, page->inuse,
1271                         page->freelist);
1272                 dump_stack();
1273         }
1274         init_object(s, object, 1);
1275         goto have_object;
1276 }
1277
1278 void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
1279 {
1280         return slab_alloc(s, gfpflags, -1, __builtin_return_address(0));
1281 }
1282 EXPORT_SYMBOL(kmem_cache_alloc);
1283
#ifdef CONFIG_NUMA
/*
 * Allocate an object from cache @s on NUMA node @node. The caller's
 * return address is passed on as the allocation site.
 */
void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
{
	return slab_alloc(s, gfpflags, node, __builtin_return_address(0));
}
EXPORT_SYMBOL(kmem_cache_alloc_node);
#endif
1291
1292 /*
1293  * The fastpath only writes the cacheline of the page struct and the first
1294  * cacheline of the object.
1295  *
1296  * No special cachelines need to be read
1297  */
1298 static void slab_free(struct kmem_cache *s, struct page *page,
1299                                         void *x, void *addr)
1300 {
1301         void *prior;
1302         void **object = (void *)x;
1303         unsigned long flags;
1304
1305         local_irq_save(flags);
1306         slab_lock(page);
1307
1308         if (unlikely(PageError(page)))
1309                 goto debug;
1310 checks_ok:
1311         prior = object[page->offset] = page->freelist;
1312         page->freelist = object;
1313         page->inuse--;
1314
1315         if (unlikely(PageActive(page)))
1316                 /*
1317                  * Cpu slabs are never on partial lists and are
1318                  * never freed.
1319                  */
1320                 goto out_unlock;
1321
1322         if (unlikely(!page->inuse))
1323                 goto slab_empty;
1324
1325         /*
1326          * Objects left in the slab. If it
1327          * was not on the partial list before
1328          * then add it.
1329          */
1330         if (unlikely(!prior))
1331                 add_partial(get_node(s, page_to_nid(page)), page);
1332
1333 out_unlock:
1334         slab_unlock(page);
1335         local_irq_restore(flags);
1336         return;
1337
1338 slab_empty:
1339         if (prior)
1340                 /*
1341                  * Slab on the partial list.
1342                  */
1343                 remove_partial(s, page);
1344
1345         slab_unlock(page);
1346         discard_slab(s, page);
1347         local_irq_restore(flags);
1348         return;
1349
1350 debug:
1351         if (!free_object_checks(s, page, x))
1352                 goto out_unlock;
1353         if (!PageActive(page) && !page->freelist)
1354                 remove_full(s, page);
1355         if (s->flags & SLAB_STORE_USER)
1356                 set_track(s, x, TRACK_FREE, addr);
1357         if (s->flags & SLAB_TRACE) {
1358                 printk(KERN_INFO "TRACE %s free 0x%p inuse=%d fp=0x%p\n",
1359                         s->name, object, page->inuse,
1360                         page->freelist);
1361                 print_section("Object", (void *)object, s->objsize);
1362                 dump_stack();
1363         }
1364         init_object(s, object, 0);
1365         goto checks_ok;
1366 }
1367
/*
 * Free object @x back to cache @s. The slab is looked up through the
 * head page of the object's address; the caller's return address is
 * passed on as the free site.
 */
void kmem_cache_free(struct kmem_cache *s, void *x)
{
	struct page *slab_page = virt_to_head_page(x);

	slab_free(s, slab_page, x, __builtin_return_address(0));
}
EXPORT_SYMBOL(kmem_cache_free);
1377
1378 /* Figure out on which slab object the object resides */
1379 static struct page *get_object_page(const void *x)
1380 {
1381         struct page *page = virt_to_head_page(x);
1382
1383         if (!PageSlab(page))
1384                 return NULL;
1385
1386         return page;
1387 }
1388
1389 /*
1390  * kmem_cache_open produces objects aligned at "size" and the first object
1391  * is placed at offset 0 in the slab (We have no metainformation on the
1392  * slab, all slabs are in essence "off slab").
1393  *
1394  * In order to get the desired alignment one just needs to align the
1395  * size.
1396  *
1397  * Notice that the allocation order determines the sizes of the per cpu
1398  * caches. Each processor has always one slab available for allocations.
1399  * Increasing the allocation order reduces the number of times that slabs
1400  * must be moved on and off the partial lists and therefore may influence
1401  * locking overhead.
1402  *
1403  * The offset is used to relocate the free list link in each object. It is
1404  * therefore possible to move the free list link behind the object. This
1405  * is necessary for RCU to work properly and also useful for debugging.
1406  */
1407
1408 /*
1409  * Mininum / Maximum order of slab pages. This influences locking overhead
1410  * and slab fragmentation. A higher order reduces the number of partial slabs
1411  * and increases the number of allocations possible without having to
1412  * take the list_lock.
1413  */
1414 static int slub_min_order;
1415 static int slub_max_order = DEFAULT_MAX_ORDER;
1416
1417 /*
1418  * Minimum number of objects per slab. This is necessary in order to
1419  * reduce locking overhead. Similar to the queue size in SLAB.
1420  */
1421 static int slub_min_objects = DEFAULT_MIN_OBJECTS;
1422
1423 /*
1424  * Merge control. If this is set then no merging of slab caches will occur.
1425  */
1426 static int slub_nomerge;
1427
1428 /*
1429  * Debug settings:
1430  */
1431 static int slub_debug;
1432
1433 static char *slub_debug_slabs;
1434
1435 /*
1436  * Calculate the order of allocation given an slab object size.
1437  *
1438  * The order of allocation has significant impact on other elements
1439  * of the system. Generally order 0 allocations should be preferred
1440  * since they do not cause fragmentation in the page allocator. Larger
1441  * objects may have problems with order 0 because there may be too much
1442  * space left unused in a slab. We go to a higher order if more than 1/8th
1443  * of the slab would be wasted.
1444  *
1445  * In order to reach satisfactory performance we must ensure that
1446  * a minimum number of objects is in one slab. Otherwise we may
1447  * generate too much activity on the partial lists. This is less a
1448  * concern for large slabs though. slub_max_order specifies the order
1449  * where we begin to stop considering the number of objects in a slab.
1450  *
1451  * Higher order allocations also allow the placement of more objects
1452  * in a slab and thereby reduce object handling overhead. If the user
1453  * has requested a higher mininum order then we start with that one
1454  * instead of zero.
1455  */
1456 static int calculate_order(int size)
1457 {
1458         int order;
1459         int rem;
1460
1461         for (order = max(slub_min_order, fls(size - 1) - PAGE_SHIFT);
1462                         order < MAX_ORDER; order++) {
1463                 unsigned long slab_size = PAGE_SIZE << order;
1464
1465                 if (slub_max_order > order &&
1466                                 slab_size < slub_min_objects * size)
1467                         continue;
1468
1469                 if (slab_size < size)
1470                         continue;
1471
1472                 rem = slab_size % size;
1473
1474                 if (rem <= (PAGE_SIZE << order) / 8)
1475                         break;
1476
1477         }
1478         if (order >= MAX_ORDER)
1479                 return -E2BIG;
1480         return order;
1481 }
1482
1483 /*
1484  * Function to figure out which alignment to use from the
1485  * various ways of specifying it.
1486  */
1487 static unsigned long calculate_alignment(unsigned long flags,
1488                 unsigned long align, unsigned long size)
1489 {
1490         /*
1491          * If the user wants hardware cache aligned objects then
1492          * follow that suggestion if the object is sufficiently
1493          * large.
1494          *
1495          * The hardware cache alignment cannot override the
1496          * specified alignment though. If that is greater
1497          * then use it.
1498          */
1499         if ((flags & (SLAB_MUST_HWCACHE_ALIGN | SLAB_HWCACHE_ALIGN)) &&
1500                         size > L1_CACHE_BYTES / 2)
1501                 return max_t(unsigned long, align, L1_CACHE_BYTES);
1502
1503         if (align < ARCH_SLAB_MINALIGN)
1504                 return ARCH_SLAB_MINALIGN;
1505
1506         return ALIGN(align, sizeof(void *));
1507 }
1508
1509 static void init_kmem_cache_node(struct kmem_cache_node *n)
1510 {
1511         n->nr_partial = 0;
1512         atomic_long_set(&n->nr_slabs, 0);
1513         spin_lock_init(&n->list_lock);
1514         INIT_LIST_HEAD(&n->partial);
1515         INIT_LIST_HEAD(&n->full);
1516 }
1517
1518 #ifdef CONFIG_NUMA
1519 /*
1520  * No kmalloc_node yet so do it by hand. We know that this is the first
1521  * slab on the node for this slabcache. There are no concurrent accesses
1522  * possible.
1523  *
1524  * Note that this function only works on the kmalloc_node_cache
1525  * when allocating for the kmalloc_node_cache.
1526  */
1527 static struct kmem_cache_node * __init early_kmem_cache_node_alloc(gfp_t gfpflags,
1528                                                                 int node)
1529 {
1530         struct page *page;
1531         struct kmem_cache_node *n;
1532
1533         BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node));
1534
1535         page = new_slab(kmalloc_caches, gfpflags | GFP_THISNODE, node);
1536         /* new_slab() disables interupts */
1537         local_irq_enable();
1538
1539         BUG_ON(!page);
1540         n = page->freelist;
1541         BUG_ON(!n);
1542         page->freelist = get_freepointer(kmalloc_caches, n);
1543         page->inuse++;
1544         kmalloc_caches->node[node] = n;
1545         init_object(kmalloc_caches, n, 1);
1546         init_kmem_cache_node(n);
1547         atomic_long_inc(&n->nr_slabs);
1548         add_partial(n, page);
1549         return n;
1550 }
1551
1552 static void free_kmem_cache_nodes(struct kmem_cache *s)
1553 {
1554         int node;
1555
1556         for_each_online_node(node) {
1557                 struct kmem_cache_node *n = s->node[node];
1558                 if (n && n != &s->local_node)
1559                         kmem_cache_free(kmalloc_caches, n);
1560                 s->node[node] = NULL;
1561         }
1562 }
1563
1564 static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
1565 {
1566         int node;
1567         int local_node;
1568
1569         if (slab_state >= UP)
1570                 local_node = page_to_nid(virt_to_page(s));
1571         else
1572                 local_node = 0;
1573
1574         for_each_online_node(node) {
1575                 struct kmem_cache_node *n;
1576
1577                 if (local_node == node)
1578                         n = &s->local_node;
1579                 else {
1580                         if (slab_state == DOWN) {
1581                                 n = early_kmem_cache_node_alloc(gfpflags,
1582                                                                 node);
1583                                 continue;
1584                         }
1585                         n = kmem_cache_alloc_node(kmalloc_caches,
1586                                                         gfpflags, node);
1587
1588                         if (!n) {
1589                                 free_kmem_cache_nodes(s);
1590                                 return 0;
1591                         }
1592
1593                 }
1594                 s->node[node] = n;
1595                 init_kmem_cache_node(n);
1596         }
1597         return 1;
1598 }
1599 #else
/*
 * Without CONFIG_NUMA only the embedded local_node is set up (see
 * init_kmem_cache_nodes() below), so there is nothing to release.
 */
static void free_kmem_cache_nodes(struct kmem_cache *s)
{
}
1603
1604 static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
1605 {
1606         init_kmem_cache_node(&s->local_node);
1607         return 1;
1608 }
1609 #endif
1610
1611 /*
1612  * calculate_sizes() determines the order and the distribution of data within
1613  * a slab object.
1614  */
1615 static int calculate_sizes(struct kmem_cache *s)
1616 {
1617         unsigned long flags = s->flags;
1618         unsigned long size = s->objsize;
1619         unsigned long align = s->align;
1620
1621         /*
1622          * Determine if we can poison the object itself. If the user of
1623          * the slab may touch the object after free or before allocation
1624          * then we should never poison the object itself.
1625          */
1626         if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) &&
1627                         !s->ctor && !s->dtor)
1628                 s->flags |= __OBJECT_POISON;
1629         else
1630                 s->flags &= ~__OBJECT_POISON;
1631
1632         /*
1633          * Round up object size to the next word boundary. We can only
1634          * place the free pointer at word boundaries and this determines
1635          * the possible location of the free pointer.
1636          */
1637         size = ALIGN(size, sizeof(void *));
1638
1639         /*
1640          * If we are redzoning then check if there is some space between the
1641          * end of the object and the free pointer. If not then add an
1642          * additional word, so that we can establish a redzone between
1643          * the object and the freepointer to be able to check for overwrites.
1644          */
1645         if ((flags & SLAB_RED_ZONE) && size == s->objsize)
1646                 size += sizeof(void *);
1647
1648         /*
1649          * With that we have determined how much of the slab is in actual
1650          * use by the object. This is the potential offset to the free
1651          * pointer.
1652          */
1653         s->inuse = size;
1654
1655         if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) ||
1656                 s->ctor || s->dtor)) {
1657                 /*
1658                  * Relocate free pointer after the object if it is not
1659                  * permitted to overwrite the first word of the object on
1660                  * kmem_cache_free.
1661                  *
1662                  * This is the case if we do RCU, have a constructor or
1663                  * destructor or are poisoning the objects.
1664                  */
1665                 s->offset = size;
1666                 size += sizeof(void *);
1667         }
1668
1669         if (flags & SLAB_STORE_USER)
1670                 /*
1671                  * Need to store information about allocs and frees after
1672                  * the object.
1673                  */
1674                 size += 2 * sizeof(struct track);
1675
1676         if (flags & DEBUG_DEFAULT_FLAGS)
1677                 /*
1678                  * Add some empty padding so that we can catch
1679                  * overwrites from earlier objects rather than let
1680                  * tracking information or the free pointer be
1681                  * corrupted if an user writes before the start
1682                  * of the object.
1683                  */
1684                 size += sizeof(void *);
1685         /*
1686          * Determine the alignment based on various parameters that the
1687          * user specified (this is unecessarily complex due to the attempt
1688          * to be compatible with SLAB. Should be cleaned up some day).
1689          */
1690         align = calculate_alignment(flags, align, s->objsize);
1691
1692         /*
1693          * SLUB stores one object immediately after another beginning from
1694          * offset 0. In order to align the objects we have to simply size
1695          * each object to conform to the alignment.
1696          */
1697         size = ALIGN(size, align);
1698         s->size = size;
1699
1700         s->order = calculate_order(size);
1701         if (s->order < 0)
1702                 return 0;
1703
1704         /*
1705          * Determine the number of objects per slab
1706          */
1707         s->objects = (PAGE_SIZE << s->order) / size;
1708
1709         /*
1710          * Verify that the number of objects is within permitted limits.
1711          * The page->inuse field is only 16 bit wide! So we cannot have
1712          * more than 64k objects per slab.
1713          */
1714         if (!s->objects || s->objects > 65535)
1715                 return 0;
1716         return 1;
1717
1718 }
1719
1720 static int __init finish_bootstrap(void)
1721 {
1722         struct list_head *h;
1723         int err;
1724
1725         slab_state = SYSFS;
1726
1727         list_for_each(h, &slab_caches) {
1728                 struct kmem_cache *s =
1729                         container_of(h, struct kmem_cache, list);
1730
1731                 err = sysfs_slab_add(s);
1732                 BUG_ON(err);
1733         }
1734         return 0;
1735 }
1736
1737 static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
1738                 const char *name, size_t size,
1739                 size_t align, unsigned long flags,
1740                 void (*ctor)(void *, struct kmem_cache *, unsigned long),
1741                 void (*dtor)(void *, struct kmem_cache *, unsigned long))
1742 {
1743         memset(s, 0, kmem_size);
1744         s->name = name;
1745         s->ctor = ctor;
1746         s->dtor = dtor;
1747         s->objsize = size;
1748         s->flags = flags;
1749         s->align = align;
1750
1751         BUG_ON(flags & SLUB_UNIMPLEMENTED);
1752
1753         /*
1754          * The page->offset field is only 16 bit wide. This is an offset
1755          * in units of words from the beginning of an object. If the slab
1756          * size is bigger then we cannot move the free pointer behind the
1757          * object anymore.
1758          *
1759          * On 32 bit platforms the limit is 256k. On 64bit platforms
1760          * the limit is 512k.
1761          *
1762          * Debugging or ctor/dtors may create a need to move the free
1763          * pointer. Fail if this happens.
1764          */
1765         if (s->size >= 65535 * sizeof(void *)) {
1766                 BUG_ON(flags & (SLAB_RED_ZONE | SLAB_POISON |
1767                                 SLAB_STORE_USER | SLAB_DESTROY_BY_RCU));
1768                 BUG_ON(ctor || dtor);
1769         }
1770         else
1771                 /*
1772                  * Enable debugging if selected on the kernel commandline.
1773                  */
1774                 if (slub_debug && (!slub_debug_slabs ||
1775                     strncmp(slub_debug_slabs, name,
1776                         strlen(slub_debug_slabs)) == 0))
1777                                 s->flags |= slub_debug;
1778
1779         if (!calculate_sizes(s))
1780                 goto error;
1781
1782         s->refcount = 1;
1783 #ifdef CONFIG_NUMA
1784         s->defrag_ratio = 100;
1785 #endif
1786
1787         if (init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA))
1788                 return 1;
1789 error:
1790         if (flags & SLAB_PANIC)
1791                 panic("Cannot create slab %s size=%lu realsize=%u "
1792                         "order=%u offset=%u flags=%lx\n",
1793                         s->name, (unsigned long)size, s->size, s->order,
1794                         s->offset, flags);
1795         return 0;
1796 }
1797 EXPORT_SYMBOL(kmem_cache_open);
1798
1799 /*
1800  * Check if a given pointer is valid
1801  */
1802 int kmem_ptr_validate(struct kmem_cache *s, const void *object)
1803 {
1804         struct page * page;
1805         void *addr;
1806
1807         page = get_object_page(object);
1808
1809         if (!page || s != page->slab)
1810                 /* No slab or wrong slab */
1811                 return 0;
1812
1813         addr = page_address(page);
1814         if (object < addr || object >= addr + s->objects * s->size)
1815                 /* Out of bounds */
1816                 return 0;
1817
1818         if ((object - addr) % s->size)
1819                 /* Improperly aligned */
1820                 return 0;
1821
1822         /*
1823          * We could also check if the object is on the slabs freelist.
1824          * But this would be too expensive and it seems that the main
1825          * purpose of kmem_ptr_valid is to check if the object belongs
1826          * to a certain slab.
1827          */
1828         return 1;
1829 }
1830 EXPORT_SYMBOL(kmem_ptr_validate);
1831
1832 /*
1833  * Determine the size of a slab object
1834  */
1835 unsigned int kmem_cache_size(struct kmem_cache *s)
1836 {
1837         return s->objsize;
1838 }
1839 EXPORT_SYMBOL(kmem_cache_size);
1840
1841 const char *kmem_cache_name(struct kmem_cache *s)
1842 {
1843         return s->name;
1844 }
1845 EXPORT_SYMBOL(kmem_cache_name);
1846
1847 /*
1848  * Attempt to free all slabs on a node
1849  */
1850 static int free_list(struct kmem_cache *s, struct kmem_cache_node *n,
1851                         struct list_head *list)
1852 {
1853         int slabs_inuse = 0;
1854         unsigned long flags;
1855         struct page *page, *h;
1856
1857         spin_lock_irqsave(&n->list_lock, flags);
1858         list_for_each_entry_safe(page, h, list, lru)
1859                 if (!page->inuse) {
1860                         list_del(&page->lru);
1861                         discard_slab(s, page);
1862                 } else
1863                         slabs_inuse++;
1864         spin_unlock_irqrestore(&n->list_lock, flags);
1865         return slabs_inuse;
1866 }
1867
1868 /*
1869  * Release all resources used by slab cache
1870  */
1871 static int kmem_cache_close(struct kmem_cache *s)
1872 {
1873         int node;
1874
1875         flush_all(s);
1876
1877         /* Attempt to free all objects */
1878         for_each_online_node(node) {
1879                 struct kmem_cache_node *n = get_node(s, node);
1880
1881                 n->nr_partial -= free_list(s, n, &n->partial);
1882                 if (atomic_long_read(&n->nr_slabs))
1883                         return 1;
1884         }
1885         free_kmem_cache_nodes(s);
1886         return 0;
1887 }
1888
1889 /*
1890  * Close a cache and release the kmem_cache structure
1891  * (must be used for caches created using kmem_cache_create)
1892  */
1893 void kmem_cache_destroy(struct kmem_cache *s)
1894 {
1895         down_write(&slub_lock);
1896         s->refcount--;
1897         if (!s->refcount) {
1898                 list_del(&s->list);
1899                 if (kmem_cache_close(s))
1900                         WARN_ON(1);
1901                 sysfs_slab_remove(s);
1902                 kfree(s);
1903         }
1904         up_write(&slub_lock);
1905 }
1906 EXPORT_SYMBOL(kmem_cache_destroy);
1907
1908 /********************************************************************
1909  *              Kmalloc subsystem
1910  *******************************************************************/
1911
1912 struct kmem_cache kmalloc_caches[KMALLOC_SHIFT_HIGH + 1] __cacheline_aligned;
1913 EXPORT_SYMBOL(kmalloc_caches);
1914
1915 #ifdef CONFIG_ZONE_DMA
1916 static struct kmem_cache *kmalloc_caches_dma[KMALLOC_SHIFT_HIGH + 1];
1917 #endif
1918
1919 static int __init setup_slub_min_order(char *str)
1920 {
1921         get_option (&str, &slub_min_order);
1922
1923         return 1;
1924 }
1925
1926 __setup("slub_min_order=", setup_slub_min_order);
1927
1928 static int __init setup_slub_max_order(char *str)
1929 {
1930         get_option (&str, &slub_max_order);
1931
1932         return 1;
1933 }
1934
1935 __setup("slub_max_order=", setup_slub_max_order);
1936
1937 static int __init setup_slub_min_objects(char *str)
1938 {
1939         get_option (&str, &slub_min_objects);
1940
1941         return 1;
1942 }
1943
1944 __setup("slub_min_objects=", setup_slub_min_objects);
1945
1946 static int __init setup_slub_nomerge(char *str)
1947 {
1948         slub_nomerge = 1;
1949         return 1;
1950 }
1951
1952 __setup("slub_nomerge", setup_slub_nomerge);
1953
1954 static int __init setup_slub_debug(char *str)
1955 {
1956         if (!str || *str != '=')
1957                 slub_debug = DEBUG_DEFAULT_FLAGS;
1958         else {
1959                 str++;
1960                 if (*str == 0 || *str == ',')
1961                         slub_debug = DEBUG_DEFAULT_FLAGS;
1962                 else
1963                 for( ;*str && *str != ','; str++)
1964                         switch (*str) {
1965                         case 'f' : case 'F' :
1966                                 slub_debug |= SLAB_DEBUG_FREE;
1967                                 break;
1968                         case 'z' : case 'Z' :
1969                                 slub_debug |= SLAB_RED_ZONE;
1970                                 break;
1971                         case 'p' : case 'P' :
1972                                 slub_debug |= SLAB_POISON;
1973                                 break;
1974                         case 'u' : case 'U' :
1975                                 slub_debug |= SLAB_STORE_USER;
1976                                 break;
1977                         case 't' : case 'T' :
1978                                 slub_debug |= SLAB_TRACE;
1979                                 break;
1980                         default:
1981                                 printk(KERN_ERR "slub_debug option '%c' "
1982                                         "unknown. skipped\n",*str);
1983                         }
1984         }
1985
1986         if (*str == ',')
1987                 slub_debug_slabs = str + 1;
1988         return 1;
1989 }
1990
1991 __setup("slub_debug", setup_slub_debug);
1992
1993 static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s,
1994                 const char *name, int size, gfp_t gfp_flags)
1995 {
1996         unsigned int flags = 0;
1997
1998         if (gfp_flags & SLUB_DMA)
1999                 flags = SLAB_CACHE_DMA;
2000
2001         down_write(&slub_lock);
2002         if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN,
2003                         flags, NULL, NULL))
2004                 goto panic;
2005
2006         list_add(&s->list, &slab_caches);
2007         up_write(&slub_lock);
2008         if (sysfs_slab_add(s))
2009                 goto panic;
2010         return s;
2011
2012 panic:
2013         panic("Creation of kmalloc slab %s size=%d failed.\n", name, size);
2014 }
2015
2016 static struct kmem_cache *get_slab(size_t size, gfp_t flags)
2017 {
2018         int index = kmalloc_index(size);
2019
2020         if (!index)
2021                 return NULL;
2022
2023         /* Allocation too large? */
2024         BUG_ON(index < 0);
2025
2026 #ifdef CONFIG_ZONE_DMA
2027         if ((flags & SLUB_DMA)) {
2028                 struct kmem_cache *s;
2029                 struct kmem_cache *x;
2030                 char *text;
2031                 size_t realsize;
2032
2033                 s = kmalloc_caches_dma[index];
2034                 if (s)
2035                         return s;
2036
2037                 /* Dynamically create dma cache */
2038                 x = kmalloc(kmem_size, flags & ~SLUB_DMA);
2039                 if (!x)
2040                         panic("Unable to allocate memory for dma cache\n");
2041
2042                 if (index <= KMALLOC_SHIFT_HIGH)
2043                         realsize = 1 << index;
2044                 else {
2045                         if (index == 1)
2046                                 realsize = 96;
2047                         else
2048                                 realsize = 192;
2049                 }
2050
2051                 text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d",
2052                                 (unsigned int)realsize);
2053                 s = create_kmalloc_cache(x, text, realsize, flags);
2054                 kmalloc_caches_dma[index] = s;
2055                 return s;
2056         }
2057 #endif
2058         return &kmalloc_caches[index];
2059 }
2060
2061 void *__kmalloc(size_t size, gfp_t flags)
2062 {
2063         struct kmem_cache *s = get_slab(size, flags);
2064
2065         if (s)
2066                 return slab_alloc(s, flags, -1, __builtin_return_address(0));
2067         return NULL;
2068 }
2069 EXPORT_SYMBOL(__kmalloc);
2070
#ifdef CONFIG_NUMA
/*
 * Like __kmalloc(), but passes @node on to slab_alloc().
 *
 * Returns NULL if get_slab() provides no cache, otherwise whatever
 * slab_alloc() returns.
 */
void *__kmalloc_node(size_t size, gfp_t flags, int node)
{
	struct kmem_cache *cache = get_slab(size, flags);

	if (!cache)
		return NULL;

	return slab_alloc(cache, flags, node, __builtin_return_address(0));
}
EXPORT_SYMBOL(__kmalloc_node);
#endif
2082
2083 size_t ksize(const void *object)
2084 {
2085         struct page *page = get_object_page(object);
2086         struct kmem_cache *s;
2087
2088         BUG_ON(!page);
2089         s = page->slab;
2090         BUG_ON(!s);
2091
2092         /*
2093          * Debugging requires use of the padding between object
2094          * and whatever may come after it.
2095          */
2096         if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
2097                 return s->objsize;
2098
2099         /*
2100          * If we have the need to store the freelist pointer
2101          * back there or track user information then we can
2102          * only use the space before that information.
2103          */
2104         if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
2105                 return s->inuse;
2106
2107         /*
2108          * Else we can use all the padding etc for the allocation
2109          */
2110         return s->size;
2111 }
2112 EXPORT_SYMBOL(ksize);
2113
2114 void kfree(const void *x)
2115 {
2116         struct kmem_cache *s;
2117         struct page *page;
2118
2119         if (!x)
2120                 return;
2121
2122         page = virt_to_head_page(x);
2123         s = page->slab;
2124
2125         slab_free(s, page, (void *)x, __builtin_return_address(0));
2126 }
2127 EXPORT_SYMBOL(kfree);
2128
2129 /*
2130  *  kmem_cache_shrink removes empty slabs from the partial lists
2131  *  and then sorts the partially allocated slabs by the number
2132  *  of items in use. The slabs with the most items in use
2133  *  come first. New allocations will remove these from the
2134  *  partial list because they are full. The slabs with the
2135  *  least items are placed last. If it happens that the objects
2136  *  are freed then the page can be returned to the page allocator.
2137  */
2138 int kmem_cache_shrink(struct kmem_cache *s)
2139 {
2140         int node;
2141         int i;
2142         struct kmem_cache_node *n;
2143         struct page *page;
2144         struct page *t;
2145         struct list_head *slabs_by_inuse =
2146                 kmalloc(sizeof(struct list_head) * s->objects, GFP_KERNEL);
2147         unsigned long flags;
2148
2149         if (!slabs_by_inuse)
2150                 return -ENOMEM;
2151
2152         flush_all(s);
2153         for_each_online_node(node) {
2154                 n = get_node(s, node);
2155
2156                 if (!n->nr_partial)
2157                         continue;
2158
2159                 for (i = 0; i < s->objects; i++)
2160                         INIT_LIST_HEAD(slabs_by_inuse + i);
2161
2162                 spin_lock_irqsave(&n->list_lock, flags);
2163
2164                 /*
2165                  * Build lists indexed by the items in use in
2166                  * each slab or free slabs if empty.
2167                  *
2168                  * Note that concurrent frees may occur while
2169                  * we hold the list_lock. page->inuse here is
2170                  * the upper limit.
2171                  */
2172                 list_for_each_entry_safe(page, t, &n->partial, lru) {
2173                         if (!page->inuse && slab_trylock(page)) {
2174                                 /*
2175                                  * Must hold slab lock here because slab_free
2176                                  * may have freed the last object and be
2177                                  * waiting to release the slab.
2178                                  */
2179                                 list_del(&page->lru);
2180                                 n->nr_partial--;
2181                                 slab_unlock(page);
2182                                 discard_slab(s, page);
2183                         } else {
2184                                 if (n->nr_partial > MAX_PARTIAL)
2185                                         list_move(&page->lru,
2186                                         slabs_by_inuse + page->inuse);
2187                         }
2188                 }
2189
2190                 if (n->nr_partial <= MAX_PARTIAL)
2191                         goto out;
2192
2193                 /*
2194                  * Rebuild the partial list with the slabs filled up
2195                  * most first and the least used slabs at the end.
2196                  */
2197                 for (i = s->objects - 1; i >= 0; i--)
2198                         list_splice(slabs_by_inuse + i, n->partial.prev);
2199
2200         out:
2201                 spin_unlock_irqrestore(&n->list_lock, flags);
2202         }
2203
2204         kfree(slabs_by_inuse);
2205         return 0;
2206 }
2207 EXPORT_SYMBOL(kmem_cache_shrink);
2208
2209 /**
2210  * krealloc - reallocate memory. The contents will remain unchanged.
2211  *
2212  * @p: object to reallocate memory for.
2213  * @new_size: how many bytes of memory are required.
2214  * @flags: the type of memory to allocate.
2215  *
2216  * The contents of the object pointed to are preserved up to the
2217  * lesser of the new and old sizes.  If @p is %NULL, krealloc()
2218  * behaves exactly like kmalloc().  If @size is 0 and @p is not a
2219  * %NULL pointer, the object pointed to is freed.
2220  */
2221 void *krealloc(const void *p, size_t new_size, gfp_t flags)
2222 {
2223         struct kmem_cache *new_cache;
2224         void *ret;
2225         struct page *page;
2226
2227         if (unlikely(!p))
2228                 return kmalloc(new_size, flags);
2229
2230         if (unlikely(!new_size)) {
2231                 kfree(p);
2232                 return NULL;
2233         }
2234
2235         page = virt_to_head_page(p);
2236
2237         new_cache = get_slab(new_size, flags);
2238
2239         /*
2240          * If new size fits in the current cache, bail out.
2241          */
2242         if (likely(page->slab == new_cache))
2243                 return (void *)p;
2244
2245         ret = kmalloc(new_size, flags);
2246         if (ret) {
2247                 memcpy(ret, p, min(new_size, ksize(p)));
2248                 kfree(p);
2249         }
2250         return ret;
2251 }
2252 EXPORT_SYMBOL(krealloc);
2253
2254 /********************************************************************
2255  *                      Basic setup of slabs
2256  *******************************************************************/
2257
2258 void __init kmem_cache_init(void)
2259 {
2260         int i;
2261
2262 #ifdef CONFIG_NUMA
2263         /*
2264          * Must first have the slab cache available for the allocations of the
2265          * struct kmalloc_cache_node's. There is special bootstrap code in
2266          * kmem_cache_open for slab_state == DOWN.
2267          */
2268         create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node",
2269                 sizeof(struct kmem_cache_node), GFP_KERNEL);
2270 #endif
2271
2272         /* Able to allocate the per node structures */
2273         slab_state = PARTIAL;
2274
2275         /* Caches that are not of the two-to-the-power-of size */
2276         create_kmalloc_cache(&kmalloc_caches[1],
2277                                 "kmalloc-96", 96, GFP_KERNEL);
2278         create_kmalloc_cache(&kmalloc_caches[2],
2279                                 "kmalloc-192", 192, GFP_KERNEL);
2280
2281         for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++)
2282                 create_kmalloc_cache(&kmalloc_caches[i],
2283                         "kmalloc", 1 << i, GFP_KERNEL);
2284
2285         slab_state = UP;
2286
2287         /* Provide the correct kmalloc names now that the caches are up */
2288         for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++)
2289                 kmalloc_caches[i]. name =
2290                         kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i);
2291
2292 #ifdef CONFIG_SMP
2293         register_cpu_notifier(&slab_notifier);
2294 #endif
2295
2296         if (nr_cpu_ids) /* Remove when nr_cpu_ids is fixed upstream ! */
2297                 kmem_size = offsetof(struct kmem_cache, cpu_slab)
2298                          + nr_cpu_ids * sizeof(struct page *);
2299
2300         printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d,"
2301                 " Processors=%d, Nodes=%d\n",
2302                 KMALLOC_SHIFT_HIGH, L1_CACHE_BYTES,
2303                 slub_min_order, slub_max_order, slub_min_objects,
2304                 nr_cpu_ids, nr_node_ids);
2305 }
2306
2307 /*
2308  * Find a mergeable slab cache
2309  */
2310 static int slab_unmergeable(struct kmem_cache *s)
2311 {
2312         if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE))
2313                 return 1;
2314
2315         if (s->ctor || s->dtor)
2316                 return 1;
2317
2318         return 0;
2319 }
2320
2321 static struct kmem_cache *find_mergeable(size_t size,
2322                 size_t align, unsigned long flags,
2323                 void (*ctor)(void *, struct kmem_cache *, unsigned long),
2324                 void (*dtor)(void *, struct kmem_cache *, unsigned long))
2325 {
2326         struct list_head *h;
2327
2328         if (slub_nomerge || (flags & SLUB_NEVER_MERGE))
2329                 return NULL;
2330
2331         if (ctor || dtor)
2332                 return NULL;
2333
2334         size = ALIGN(size, sizeof(void *));
2335         align = calculate_alignment(flags, align, size);
2336         size = ALIGN(size, align);
2337
2338         list_for_each(h, &slab_caches) {
2339                 struct kmem_cache *s =
2340                         container_of(h, struct kmem_cache, list);
2341
2342                 if (slab_unmergeable(s))
2343                         continue;
2344
2345                 if (size > s->size)
2346                         continue;
2347
2348                 if (((flags | slub_debug) & SLUB_MERGE_SAME) !=
2349                         (s->flags & SLUB_MERGE_SAME))
2350                                 continue;
2351                 /*
2352                  * Check if alignment is compatible.
2353                  * Courtesy of Adrian Drzewiecki
2354                  */
2355                 if ((s->size & ~(align -1)) != s->size)
2356                         continue;
2357
2358                 if (s->size - size >= sizeof(void *))
2359                         continue;
2360
2361                 return s;
2362         }
2363         return NULL;
2364 }
2365
2366 struct kmem_cache *kmem_cache_create(const char *name, size_t size,
2367                 size_t align, unsigned long flags,
2368                 void (*ctor)(void *, struct kmem_cache *, unsigned long),
2369                 void (*dtor)(void *, struct kmem_cache *, unsigned long))
2370 {
2371         struct kmem_cache *s;
2372
2373         down_write(&slub_lock);
2374         s = find_mergeable(size, align, flags, dtor, ctor);
2375         if (s) {
2376                 s->refcount++;
2377                 /*
2378                  * Adjust the object sizes so that we clear
2379                  * the complete object on kzalloc.
2380                  */
2381                 s->objsize = max(s->objsize, (int)size);
2382                 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
2383                 if (sysfs_slab_alias(s, name))
2384                         goto err;
2385         } else {
2386                 s = kmalloc(kmem_size, GFP_KERNEL);
2387                 if (s && kmem_cache_open(s, GFP_KERNEL, name,
2388                                 size, align, flags, ctor, dtor)) {
2389                         if (sysfs_slab_add(s)) {
2390                                 kfree(s);
2391                                 goto err;
2392                         }
2393                         list_add(&s->list, &slab_caches);
2394                 } else
2395                         kfree(s);
2396         }
2397         up_write(&slub_lock);
2398         return s;
2399
2400 err:
2401         up_write(&slub_lock);
2402         if (flags & SLAB_PANIC)
2403                 panic("Cannot create slabcache %s\n", name);
2404         else
2405                 s = NULL;
2406         return s;
2407 }
2408 EXPORT_SYMBOL(kmem_cache_create);
2409
2410 void *kmem_cache_zalloc(struct kmem_cache *s, gfp_t flags)
2411 {
2412         void *x;
2413
2414         x = slab_alloc(s, flags, -1, __builtin_return_address(0));
2415         if (x)
2416                 memset(x, 0, s->objsize);
2417         return x;
2418 }
2419 EXPORT_SYMBOL(kmem_cache_zalloc);
2420
2421 #ifdef CONFIG_SMP
2422 static void for_all_slabs(void (*func)(struct kmem_cache *, int), int cpu)
2423 {
2424         struct list_head *h;
2425
2426         down_read(&slub_lock);
2427         list_for_each(h, &slab_caches) {
2428                 struct kmem_cache *s =
2429                         container_of(h, struct kmem_cache, list);
2430
2431                 func(s, cpu);
2432         }
2433         up_read(&slub_lock);
2434 }
2435
2436 /*
2437  * Use the cpu notifier to insure that the slab are flushed
2438  * when necessary.
2439  */
2440 static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
2441                 unsigned long action, void *hcpu)
2442 {
2443         long cpu = (long)hcpu;
2444
2445         switch (action) {
2446         case CPU_UP_CANCELED:
2447         case CPU_DEAD:
2448                 for_all_slabs(__flush_cpu_slab, cpu);
2449                 break;
2450         default:
2451                 break;
2452         }
2453         return NOTIFY_OK;
2454 }
2455
2456 static struct notifier_block __cpuinitdata slab_notifier =
2457         { &slab_cpuup_callback, NULL, 0 };
2458
2459 #endif
2460
2461 #ifdef CONFIG_NUMA
2462
2463 /*****************************************************************
2464  * Generic reaper used to support the page allocator
2465  * (the cpu slabs are reaped by a per slab workqueue).
2466  *
2467  * Maybe move this to the page allocator?
2468  ****************************************************************/
2469
2470 static DEFINE_PER_CPU(unsigned long, reap_node);
2471
2472 static void init_reap_node(int cpu)
2473 {
2474         int node;
2475
2476         node = next_node(cpu_to_node(cpu), node_online_map);
2477         if (node == MAX_NUMNODES)
2478                 node = first_node(node_online_map);
2479
2480         __get_cpu_var(reap_node) = node;
2481 }
2482
2483 static void next_reap_node(void)
2484 {
2485         int node = __get_cpu_var(reap_node);
2486
2487         /*
2488          * Also drain per cpu pages on remote zones
2489          */
2490         if (node != numa_node_id())
2491                 drain_node_pages(node);
2492
2493         node = next_node(node, node_online_map);
2494         if (unlikely(node >= MAX_NUMNODES))
2495                 node = first_node(node_online_map);
2496         __get_cpu_var(reap_node) = node;
2497 }
2498 #else
2499 #define init_reap_node(cpu) do { } while (0)
2500 #define next_reap_node(void) do { } while (0)
2501 #endif
2502
2503 #define REAPTIMEOUT_CPUC        (2*HZ)
2504
2505 #ifdef CONFIG_SMP
2506 static DEFINE_PER_CPU(struct delayed_work, reap_work);
2507
2508 static void cache_reap(struct work_struct *unused)
2509 {
2510         next_reap_node();
2511         refresh_cpu_vm_stats(smp_processor_id());
2512         schedule_delayed_work(&__get_cpu_var(reap_work),
2513                                       REAPTIMEOUT_CPUC);
2514 }
2515
2516 static void __devinit start_cpu_timer(int cpu)
2517 {
2518         struct delayed_work *reap_work = &per_cpu(reap_work, cpu);
2519
2520         /*
2521          * When this gets called from do_initcalls via cpucache_init(),
2522          * init_workqueues() has already run, so keventd will be setup
2523          * at that time.
2524          */
2525         if (keventd_up() && reap_work->work.func == NULL) {
2526                 init_reap_node(cpu);
2527                 INIT_DELAYED_WORK(reap_work, cache_reap);
2528                 schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
2529         }
2530 }
2531
2532 static int __init cpucache_init(void)
2533 {
2534         int cpu;
2535
2536         /*
2537          * Register the timers that drain pcp pages and update vm statistics
2538          */
2539         for_each_online_cpu(cpu)
2540                 start_cpu_timer(cpu);
2541         return 0;
2542 }
2543 __initcall(cpucache_init);
2544 #endif
2545
#ifdef SLUB_RESILIENCY_TEST
static unsigned long validate_slab_cache(struct kmem_cache *s);

/*
 * Self test: deliberately corrupt memory in and around kmalloc objects
 * and run validate_slab_cache() on the affected kmalloc cache after each
 * step. The writes below are intentionally out of bounds or after free.
 */
static void resiliency_test(void)
{
	u8 *victim;

	printk(KERN_ERR "SLUB resiliency testing\n");
	printk(KERN_ERR "-----------------------\n");
	printk(KERN_ERR "A. Corruption after allocation\n");

	/* A1: one byte directly behind a 16 byte object */
	victim = kzalloc(16, GFP_KERNEL);
	victim[16] = 0x12;
	printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer"
			" 0x12->0x%p\n\n", victim + 16);

	validate_slab_cache(&kmalloc_caches[4]);

	/* Hmmm... The next two are dangerous */

	/* A2: one byte a pointer size behind a 32 byte object */
	victim = kzalloc(32, GFP_KERNEL);
	victim[32 + sizeof(void *)] = 0x34;
	printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab"
			" 0x34 -> -0x%p\n", victim);
	printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n");

	validate_slab_cache(&kmalloc_caches[5]);

	/* A3: one byte at a cycle counter dependent offset behind the object */
	victim = kzalloc(64, GFP_KERNEL);
	victim += 64 + (get_cycles() & 0xff) * sizeof(void *);
	*victim = 0x56;
	printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
			victim);
	printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n");
	validate_slab_cache(&kmalloc_caches[6]);

	printk(KERN_ERR "\nB. Corruption after free\n");

	/* B1: first byte of a freed object */
	victim = kzalloc(128, GFP_KERNEL);
	kfree(victim);
	*victim = 0x78;
	printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", victim);
	validate_slab_cache(&kmalloc_caches[7]);

	/* B2: a byte in the middle of a freed object */
	victim = kzalloc(256, GFP_KERNEL);
	kfree(victim);
	victim[50] = 0x9a;
	printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", victim);
	validate_slab_cache(&kmalloc_caches[8]);

	/* B3: one byte directly behind a freed object */
	victim = kzalloc(512, GFP_KERNEL);
	kfree(victim);
	victim[512] = 0xab;
	printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", victim);
	validate_slab_cache(&kmalloc_caches[9]);
}
#else
/* Resiliency testing not compiled in: nothing to do */
static void resiliency_test(void)
{
}
#endif
2602
2603 /*
2604  * These are not as efficient as kmalloc for the non debug case.
2605  * We do not have the page struct available so we have to touch one
2606  * cacheline in struct kmem_cache to check slab flags.
2607  */
2608 void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller)
2609 {
2610         struct kmem_cache *s = get_slab(size, gfpflags);
2611
2612         if (!s)
2613                 return NULL;
2614
2615         return slab_alloc(s, gfpflags, -1, caller);
2616 }
2617
2618 void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
2619                                         int node, void *caller)
2620 {
2621         struct kmem_cache *s = get_slab(size, gfpflags);
2622
2623         if (!s)
2624                 return NULL;
2625
2626         return slab_alloc(s, gfpflags, node, caller);
2627 }
2628
2629 #ifdef CONFIG_SYSFS
2630
2631 static int validate_slab(struct kmem_cache *s, struct page *page)
2632 {
2633         void *p;
2634         void *addr = page_address(page);
2635         unsigned long map[BITS_TO_LONGS(s->objects)];
2636
2637         if (!check_slab(s, page) ||
2638                         !on_freelist(s, page, NULL))
2639                 return 0;
2640
2641         /* Now we know that a valid freelist exists */
2642         bitmap_zero(map, s->objects);
2643
2644         for(p = page->freelist; p; p = get_freepointer(s, p)) {
2645                 set_bit((p - addr) / s->size, map);
2646                 if (!check_object(s, page, p, 0))
2647                         return 0;
2648         }
2649
2650         for(p = addr; p < addr + s->objects * s->size; p += s->size)
2651                 if (!test_bit((p - addr) / s->size, map))
2652                         if (!check_object(s, page, p, 1))
2653                                 return 0;
2654         return 1;
2655 }
2656
2657 static void validate_slab_slab(struct kmem_cache *s, struct page *page)
2658 {
2659         if (slab_trylock(page)) {
2660                 validate_slab(s, page);
2661                 slab_unlock(page);
2662         } else
2663                 printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n",
2664                         s->name, page);
2665
2666         if (s->flags & DEBUG_DEFAULT_FLAGS) {
2667                 if (!PageError(page))
2668                         printk(KERN_ERR "SLUB %s: PageError not set "
2669                                 "on slab 0x%p\n", s->name, page);
2670         } else {
2671                 if (PageError(page))
2672                         printk(KERN_ERR "SLUB %s: PageError set on "
2673                                 "slab 0x%p\n", s->name, page);
2674         }
2675 }
2676
2677 static int validate_slab_node(struct kmem_cache *s, struct kmem_cache_node *n)
2678 {
2679         unsigned long count = 0;
2680         struct page *page;
2681         unsigned long flags;
2682
2683         spin_lock_irqsave(&n->list_lock, flags);
2684
2685         list_for_each_entry(page, &n->partial, lru) {
2686                 validate_slab_slab(s, page);
2687                 count++;
2688         }
2689         if (count != n->nr_partial)
2690                 printk(KERN_ERR "SLUB %s: %ld partial slabs counted but "
2691                         "counter=%ld\n", s->name, count, n->nr_partial);
2692
2693         if (!(s->flags & SLAB_STORE_USER))
2694                 goto out;
2695
2696         list_for_each_entry(page, &n->full, lru) {
2697                 validate_slab_slab(s, page);
2698                 count++;
2699         }
2700         if (count != atomic_long_read(&n->nr_slabs))
2701                 printk(KERN_ERR "SLUB: %s %ld slabs counted but "
2702                         "counter=%ld\n", s->name, count,
2703                         atomic_long_read(&n->nr_slabs));
2704
2705 out:
2706         spin_unlock_irqrestore(&n->list_lock, flags);
2707         return count;
2708 }
2709
2710 static unsigned long validate_slab_cache(struct kmem_cache *s)
2711 {
2712         int node;
2713         unsigned long count = 0;
2714
2715         flush_all(s);
2716         for_each_online_node(node) {
2717                 struct kmem_cache_node *n = get_node(s, node);
2718
2719                 count += validate_slab_node(s, n);
2720         }
2721         return count;
2722 }
2723
2724 /*
2725  * Generate lists of locations where slabcache objects are allocated
2726  * and freed.
2727  */
2728
/* One distinct call site recorded by add_location(). */
struct location {
	unsigned long count;	/* number of objects attributed to addr */
	void *addr;		/* call site address taken from get_track() */
};
2733
/*
 * Growable table of locations. add_location() keeps the entries ordered
 * by addr so it can binary-search them.
 */
struct loc_track {
	unsigned long max;	/* capacity of loc[] in entries; 0 = not allocated */
	unsigned long count;	/* entries of loc[] currently in use */
	struct location *loc;	/* page-allocated array, see alloc_loc_track() */
};
2739
2740 static void free_loc_track(struct loc_track *t)
2741 {
2742         if (t->max)
2743                 free_pages((unsigned long)t->loc,
2744                         get_order(sizeof(struct location) * t->max));
2745 }
2746
2747 static int alloc_loc_track(struct loc_track *t, unsigned long max)
2748 {
2749         struct location *l;
2750         int order;
2751
2752         if (!max)
2753                 max = PAGE_SIZE / sizeof(struct location);
2754
2755         order = get_order(sizeof(struct location) * max);
2756
2757         l = (void *)__get_free_pages(GFP_KERNEL, order);
2758
2759         if (!l)
2760                 return 0;
2761
2762         if (t->count) {
2763                 memcpy(l, t->loc, sizeof(struct location) * t->count);
2764                 free_loc_track(t);
2765         }
2766         t->max = max;
2767         t->loc = l;
2768         return 1;
2769 }
2770
2771 static int add_location(struct loc_track *t, struct kmem_cache *s,
2772                                                 void *addr)
2773 {
2774         long start, end, pos;
2775         struct location *l;
2776         void *caddr;
2777
2778         start = -1;
2779         end = t->count;
2780
2781         for ( ; ; ) {
2782                 pos = start + (end - start + 1) / 2;
2783
2784                 /*
2785                  * There is nothing at "end". If we end up there
2786                  * we need to add something to before end.
2787                  */
2788                 if (pos == end)
2789                         break;
2790
2791                 caddr = t->loc[pos].addr;
2792                 if (addr == caddr) {
2793                         t->loc[pos].count++;
2794                         return 1;
2795                 }
2796
2797                 if (addr < caddr)
2798                         end = pos;
2799                 else
2800                         start = pos;
2801         }
2802
2803         /*
2804          * Not found. Insert new tracking element
2805          */
2806         if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max))
2807                 return 0;
2808
2809         l = t->loc + pos;
2810         if (pos < t->count)
2811                 memmove(l + 1, l,
2812                         (t->count - pos) * sizeof(struct location));
2813         t->count++;
2814         l->count = 1;
2815         l->addr = addr;
2816         return 1;
2817 }
2818
2819 static void process_slab(struct loc_track *t, struct kmem_cache *s,
2820                 struct page *page, enum track_item alloc)
2821 {
2822         void *addr = page_address(page);
2823         unsigned long map[BITS_TO_LONGS(s->objects)];
2824         void *p;
2825
2826         bitmap_zero(map, s->objects);
2827         for (p = page->freelist; p; p = get_freepointer(s, p))
2828                 set_bit((p - addr) / s->size, map);
2829
2830         for (p = addr; p < addr + s->objects * s->size; p += s->size)
2831                 if (!test_bit((p - addr) / s->size, map)) {
2832                         void *addr = get_track(s, p, alloc)->addr;
2833
2834                         add_location(t, s, addr);
2835                 }
2836 }
2837
2838 static int list_locations(struct kmem_cache *s, char *buf,
2839                                         enum track_item alloc)
2840 {
2841         int n = 0;
2842         unsigned long i;
2843         struct loc_track t;
2844         int node;
2845
2846         t.count = 0;
2847         t.max = 0;
2848
2849         /* Push back cpu slabs */
2850         flush_all(s);
2851
2852         for_each_online_node(node) {
2853                 struct kmem_cache_node *n = get_node(s, node);
2854                 unsigned long flags;
2855                 struct page *page;
2856
2857                 if (!atomic_read(&n->nr_slabs))
2858                         continue;
2859
2860                 spin_lock_irqsave(&n->list_lock, flags);
2861                 list_for_each_entry(page, &n->partial, lru)
2862                         process_slab(&t, s, page, alloc);
2863                 list_for_each_entry(page, &n->full, lru)
2864                         process_slab(&t, s, page, alloc);
2865                 spin_unlock_irqrestore(&n->list_lock, flags);
2866         }
2867
2868         for (i = 0; i < t.count; i++) {
2869                 void *addr = t.loc[i].addr;
2870
2871                 if (n > PAGE_SIZE - 100)
2872                         break;
2873                 n += sprintf(buf + n, "%7ld ", t.loc[i].count);
2874                 if (addr)
2875                         n += sprint_symbol(buf + n, (unsigned long)t.loc[i].addr);
2876                 else
2877                         n += sprintf(buf + n, "<not-available>");
2878                 n += sprintf(buf + n, "\n");
2879         }
2880
2881         free_loc_track(&t);
2882         if (!t.count)
2883                 n += sprintf(buf, "No data\n");
2884         return n;
2885 }
2886
2887 static unsigned long count_partial(struct kmem_cache_node *n)
2888 {
2889         unsigned long flags;
2890         unsigned long x = 0;
2891         struct page *page;
2892
2893         spin_lock_irqsave(&n->list_lock, flags);
2894         list_for_each_entry(page, &n->partial, lru)
2895                 x += page->inuse;
2896         spin_unlock_irqrestore(&n->list_lock, flags);
2897         return x;
2898 }
2899
/*
 * Bit numbers for the selection mask taken by slab_objects(). The SO_*
 * macros below turn each bit number into its mask value.
 */
enum slab_stat_type {
	SL_FULL = 0,		/* include slabs that are neither partial nor cpu slabs */
	SL_PARTIAL = 1,		/* include slabs on the partial lists */
	SL_CPU = 2,		/* include the per cpu slabs */
	SL_OBJECTS = 3		/* report objects instead of slabs */
};

#define SO_FULL		(1 << SL_FULL)
#define SO_PARTIAL	(1 << SL_PARTIAL)
#define SO_CPU		(1 << SL_CPU)
#define SO_OBJECTS	(1 << SL_OBJECTS)
2911
2912 static unsigned long slab_objects(struct kmem_cache *s,
2913                         char *buf, unsigned long flags)
2914 {
2915         unsigned long total = 0;
2916         int cpu;
2917         int node;
2918         int x;
2919         unsigned long *nodes;
2920         unsigned long *per_cpu;
2921
2922         nodes = kzalloc(2 * sizeof(unsigned long) * nr_node_ids, GFP_KERNEL);
2923         per_cpu = nodes + nr_node_ids;
2924
2925         for_each_possible_cpu(cpu) {
2926                 struct page *page = s->cpu_slab[cpu];
2927                 int node;
2928
2929                 if (page) {
2930                         node = page_to_nid(page);
2931                         if (flags & SO_CPU) {
2932                                 int x = 0;
2933
2934                                 if (flags & SO_OBJECTS)
2935                                         x = page->inuse;
2936                                 else
2937                                         x = 1;
2938                                 total += x;
2939                                 nodes[node] += x;
2940                         }
2941                         per_cpu[node]++;
2942                 }
2943         }
2944
2945         for_each_online_node(node) {
2946                 struct kmem_cache_node *n = get_node(s, node);
2947
2948                 if (flags & SO_PARTIAL) {
2949                         if (flags & SO_OBJECTS)
2950                                 x = count_partial(n);
2951                         else
2952                                 x = n->nr_partial;
2953                         total += x;
2954                         nodes[node] += x;
2955                 }
2956
2957                 if (flags & SO_FULL) {
2958                         int full_slabs = atomic_read(&n->nr_slabs)
2959                                         - per_cpu[node]
2960                                         - n->nr_partial;
2961
2962                         if (flags & SO_OBJECTS)
2963                                 x = full_slabs * s->objects;
2964                         else
2965                                 x = full_slabs;
2966                         total += x;
2967                         nodes[node] += x;
2968                 }
2969         }
2970
2971         x = sprintf(buf, "%lu", total);
2972 #ifdef CONFIG_NUMA
2973         for_each_online_node(node)
2974                 if (nodes[node])
2975                         x += sprintf(buf + x, " N%d=%lu",
2976                                         node, nodes[node]);
2977 #endif
2978         kfree(nodes);
2979         return x + sprintf(buf + x, "\n");
2980 }
2981
2982 static int any_slab_objects(struct kmem_cache *s)
2983 {
2984         int node;
2985         int cpu;
2986
2987         for_each_possible_cpu(cpu)
2988                 if (s->cpu_slab[cpu])
2989                         return 1;
2990
2991         for_each_node(node) {
2992                 struct kmem_cache_node *n = get_node(s, node);
2993
2994                 if (n->nr_partial || atomic_read(&n->nr_slabs))
2995                         return 1;
2996         }
2997         return 0;
2998 }
2999
/*
 * Map an embedded member back to its container:
 *   to_slab_attr(): struct attribute -> struct slab_attribute
 *   to_slab():      struct kobject   -> struct kmem_cache
 *
 * Neither macro ends in a semicolon, so both can be used inside larger
 * expressions as well as in plain assignments.
 */
#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
#define to_slab(n) container_of(n, struct kmem_cache, kobj)
3002
/*
 * A sysfs attribute of a slab cache. slab_attr_show() and
 * slab_attr_store() recover this structure from the embedded attr and
 * call the handlers with the owning kmem_cache.
 */
struct slab_attribute {
	struct attribute attr;
	/* Fills buf; the return value is passed back by slab_attr_show() */
	ssize_t (*show)(struct kmem_cache *s, char *buf);
	/* Consumes count bytes at x; NULL makes slab_attr_store() return -EIO */
	ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count);
};
3008
/*
 * Define the slab_attribute object <name>_attr.
 * SLAB_ATTR_RO relies on __ATTR_RO (presumably wiring up <name>_show
 * only; confirm against the sysfs headers).
 */
#define SLAB_ATTR_RO(_name) \
	static struct slab_attribute _name##_attr = __ATTR_RO(_name)

/* Mode 0644 attribute backed by <name>_show and <name>_store. */
#define SLAB_ATTR(_name) \
	static struct slab_attribute _name##_attr =  \
	__ATTR(_name, 0644, _name##_show, _name##_store)
3015
3016 static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
3017 {
3018         return sprintf(buf, "%d\n", s->size);
3019 }
3020 SLAB_ATTR_RO(slab_size);
3021
3022 static ssize_t align_show(struct kmem_cache *s, char *buf)
3023 {
3024         return sprintf(buf, "%d\n", s->align);
3025 }
3026 SLAB_ATTR_RO(align);
3027
3028 static ssize_t object_size_show(struct kmem_cache *s, char *buf)
3029 {
3030         return sprintf(buf, "%d\n", s->objsize);
3031 }
3032 SLAB_ATTR_RO(object_size);
3033
3034 static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf)
3035 {
3036         return sprintf(buf, "%d\n", s->objects);
3037 }
3038 SLAB_ATTR_RO(objs_per_slab);
3039
3040 static ssize_t order_show(struct kmem_cache *s, char *buf)
3041 {
3042         return sprintf(buf, "%d\n", s->order);
3043 }
3044 SLAB_ATTR_RO(order);
3045
3046 static ssize_t ctor_show(struct kmem_cache *s, char *buf)
3047 {
3048         if (s->ctor) {
3049                 int n = sprint_symbol(buf, (unsigned long)s->ctor);
3050
3051                 return n + sprintf(buf + n, "\n");
3052         }
3053         return 0;
3054 }
3055 SLAB_ATTR_RO(ctor);
3056
3057 static ssize_t dtor_show(struct kmem_cache *s, char *buf)
3058 {
3059         if (s->dtor) {
3060                 int n = sprint_symbol(buf, (unsigned long)s->dtor);
3061
3062                 return n + sprintf(buf + n, "\n");
3063         }
3064         return 0;
3065 }
3066 SLAB_ATTR_RO(dtor);
3067
3068 static ssize_t aliases_show(struct kmem_cache *s, char *buf)
3069 {
3070         return sprintf(buf, "%d\n", s->refcount - 1);
3071 }
3072 SLAB_ATTR_RO(aliases);
3073
3074 static ssize_t slabs_show(struct kmem_cache *s, char *buf)
3075 {
3076         return slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU);
3077 }
3078 SLAB_ATTR_RO(slabs);
3079
3080 static ssize_t partial_show(struct kmem_cache *s, char *buf)
3081 {
3082         return slab_objects(s, buf, SO_PARTIAL);
3083 }
3084 SLAB_ATTR_RO(partial);
3085
3086 static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf)
3087 {
3088         return slab_objects(s, buf, SO_CPU);
3089 }
3090 SLAB_ATTR_RO(cpu_slabs);
3091
3092 static ssize_t objects_show(struct kmem_cache *s, char *buf)
3093 {
3094         return slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU|SO_OBJECTS);
3095 }
3096 SLAB_ATTR_RO(objects);
3097
3098 static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
3099 {
3100         return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE));
3101 }
3102
3103 static ssize_t sanity_checks_store(struct kmem_cache *s,
3104                                 const char *buf, size_t length)
3105 {
3106         s->flags &= ~SLAB_DEBUG_FREE;
3107         if (buf[0] == '1')
3108                 s->flags |= SLAB_DEBUG_FREE;
3109         return length;
3110 }
3111 SLAB_ATTR(sanity_checks);
3112
3113 static ssize_t trace_show(struct kmem_cache *s, char *buf)
3114 {
3115         return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE));
3116 }
3117
3118 static ssize_t trace_store(struct kmem_cache *s, const char *buf,
3119                                                         size_t length)
3120 {
3121         s->flags &= ~SLAB_TRACE;
3122         if (buf[0] == '1')
3123                 s->flags |= SLAB_TRACE;
3124         return length;
3125 }
3126 SLAB_ATTR(trace);
3127
3128 static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
3129 {
3130         return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
3131 }
3132
3133 static ssize_t reclaim_account_store(struct kmem_cache *s,
3134                                 const char *buf, size_t length)
3135 {
3136         s->flags &= ~SLAB_RECLAIM_ACCOUNT;
3137         if (buf[0] == '1')
3138                 s->flags |= SLAB_RECLAIM_ACCOUNT;
3139         return length;
3140 }
3141 SLAB_ATTR(reclaim_account);
3142
3143 static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
3144 {
3145         return sprintf(buf, "%d\n", !!(s->flags &
3146                 (SLAB_HWCACHE_ALIGN|SLAB_MUST_HWCACHE_ALIGN)));
3147 }
3148 SLAB_ATTR_RO(hwcache_align);
3149
3150 #ifdef CONFIG_ZONE_DMA
3151 static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
3152 {
3153         return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
3154 }
3155 SLAB_ATTR_RO(cache_dma);
3156 #endif
3157
3158 static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
3159 {
3160         return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU));
3161 }
3162 SLAB_ATTR_RO(destroy_by_rcu);
3163
3164 static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
3165 {
3166         return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
3167 }
3168
3169 static ssize_t red_zone_store(struct kmem_cache *s,
3170                                 const char *buf, size_t length)
3171 {
3172         if (any_slab_objects(s))
3173                 return -EBUSY;
3174
3175         s->flags &= ~SLAB_RED_ZONE;
3176         if (buf[0] == '1')
3177                 s->flags |= SLAB_RED_ZONE;
3178         calculate_sizes(s);
3179         return length;
3180 }
3181 SLAB_ATTR(red_zone);
3182
3183 static ssize_t poison_show(struct kmem_cache *s, char *buf)
3184 {
3185         return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON));
3186 }
3187
3188 static ssize_t poison_store(struct kmem_cache *s,
3189                                 const char *buf, size_t length)
3190 {
3191         if (any_slab_objects(s))
3192                 return -EBUSY;
3193
3194         s->flags &= ~SLAB_POISON;
3195         if (buf[0] == '1')
3196                 s->flags |= SLAB_POISON;
3197         calculate_sizes(s);
3198         return length;
3199 }
3200 SLAB_ATTR(poison);
3201
3202 static ssize_t store_user_show(struct kmem_cache *s, char *buf)
3203 {
3204         return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
3205 }
3206
3207 static ssize_t store_user_store(struct kmem_cache *s,
3208                                 const char *buf, size_t length)
3209 {
3210         if (any_slab_objects(s))
3211                 return -EBUSY;
3212
3213         s->flags &= ~SLAB_STORE_USER;
3214         if (buf[0] == '1')
3215                 s->flags |= SLAB_STORE_USER;
3216         calculate_sizes(s);
3217         return length;
3218 }
3219 SLAB_ATTR(store_user);
3220
/* sysfs "validate" has nothing to report when read. */
static ssize_t validate_show(struct kmem_cache *s, char *buf)
{
	return 0;
}

/*
 * Writing a string starting with '1' runs validate_slab_cache() on the
 * cache and consumes the whole input. Any other input yields -EINVAL.
 */
static ssize_t validate_store(struct kmem_cache *s,
			const char *buf, size_t length)
{
	if (buf[0] != '1')
		return -EINVAL;

	validate_slab_cache(s);
	return length;
}
SLAB_ATTR(validate);
3236
/* sysfs "shrink" has nothing to report when read. */
static ssize_t shrink_show(struct kmem_cache *s, char *buf)
{
	return 0;
}

/*
 * Writing a string starting with '1' calls kmem_cache_shrink().
 *
 * Returns -EINVAL for any other input, the non-zero result of
 * kmem_cache_shrink() if it reports one, otherwise the input length.
 */
static ssize_t shrink_store(struct kmem_cache *s,
			const char *buf, size_t length)
{
	int rc;

	if (buf[0] != '1')
		return -EINVAL;

	rc = kmem_cache_shrink(s);
	if (rc)
		return rc;

	return length;
}
SLAB_ATTR(shrink);
3255
3256 static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf)
3257 {
3258         if (!(s->flags & SLAB_STORE_USER))
3259                 return -ENOSYS;
3260         return list_locations(s, buf, TRACK_ALLOC);
3261 }
3262 SLAB_ATTR_RO(alloc_calls);
3263
3264 static ssize_t free_calls_show(struct kmem_cache *s, char *buf)
3265 {
3266         if (!(s->flags & SLAB_STORE_USER))
3267                 return -ENOSYS;
3268         return list_locations(s, buf, TRACK_FREE);
3269 }
3270 SLAB_ATTR_RO(free_calls);
3271
3272 #ifdef CONFIG_NUMA
3273 static ssize_t defrag_ratio_show(struct kmem_cache *s, char *buf)
3274 {
3275         return sprintf(buf, "%d\n", s->defrag_ratio / 10);
3276 }
3277
3278 static ssize_t defrag_ratio_store(struct kmem_cache *s,
3279                                 const char *buf, size_t length)
3280 {
3281         int n = simple_strtoul(buf, NULL, 10);
3282
3283         if (n < 100)
3284                 s->defrag_ratio = n * 10;
3285         return length;
3286 }
3287 SLAB_ATTR(defrag_ratio);
3288 #endif
3289
/*
 * NULL-terminated list of the attributes defined above; referenced by
 * slab_attr_group. cache_dma and defrag_ratio are only present when
 * their config option is enabled, matching the #ifdefs around their
 * definitions.
 */
static struct attribute * slab_attrs[] = {
	&slab_size_attr.attr,
	&object_size_attr.attr,
	&objs_per_slab_attr.attr,
	&order_attr.attr,
	&objects_attr.attr,
	&slabs_attr.attr,
	&partial_attr.attr,
	&cpu_slabs_attr.attr,
	&ctor_attr.attr,
	&dtor_attr.attr,
	&aliases_attr.attr,
	&align_attr.attr,
	&sanity_checks_attr.attr,
	&trace_attr.attr,
	&hwcache_align_attr.attr,
	&reclaim_account_attr.attr,
	&destroy_by_rcu_attr.attr,
	&red_zone_attr.attr,
	&poison_attr.attr,
	&store_user_attr.attr,
	&validate_attr.attr,
	&shrink_attr.attr,
	&alloc_calls_attr.attr,
	&free_calls_attr.attr,
#ifdef CONFIG_ZONE_DMA
	&cache_dma_attr.attr,
#endif
#ifdef CONFIG_NUMA
	&defrag_ratio_attr.attr,
#endif
	NULL
};
3323
/* Attribute group that sysfs_slab_add() creates on each cache kobject. */
static struct attribute_group slab_attr_group = {
	.attrs = slab_attrs,
};
3327
3328 static ssize_t slab_attr_show(struct kobject *kobj,
3329                                 struct attribute *attr,
3330                                 char *buf)
3331 {
3332         struct slab_attribute *attribute;
3333         struct kmem_cache *s;
3334         int err;
3335
3336         attribute = to_slab_attr(attr);
3337         s = to_slab(kobj);
3338
3339         if (!attribute->show)
3340                 return -EIO;
3341
3342         err = attribute->show(s, buf);
3343
3344         return err;
3345 }
3346
3347 static ssize_t slab_attr_store(struct kobject *kobj,
3348                                 struct attribute *attr,
3349                                 const char *buf, size_t len)
3350 {
3351         struct slab_attribute *attribute;
3352         struct kmem_cache *s;
3353         int err;
3354
3355         attribute = to_slab_attr(attr);
3356         s = to_slab(kobj);
3357
3358         if (!attribute->store)
3359                 return -EIO;
3360
3361         err = attribute->store(s, buf, len);
3362
3363         return err;
3364 }
3365
/* Routes sysfs reads and writes to the generic slab attribute handlers. */
static struct sysfs_ops slab_sysfs_ops = {
	.show = slab_attr_show,
	.store = slab_attr_store,
};
3370
/*
 * kobject type of slab caches. uevent_filter() compares against its
 * address to recognise slab kobjects.
 */
static struct kobj_type slab_ktype = {
	.sysfs_ops = &slab_sysfs_ops,
};
3374
3375 static int uevent_filter(struct kset *kset, struct kobject *kobj)
3376 {
3377         struct kobj_type *ktype = get_ktype(kobj);
3378
3379         if (ktype == &slab_ktype)
3380                 return 1;
3381         return 0;
3382 }
3383
/* uevent operations handed to decl_subsys() for the slab subsystem. */
static struct kset_uevent_ops slab_uevent_ops = {
	.filter = uevent_filter,
};
3387
/*
 * Declares slab_subsys, which slab_sysfs_init() registers and whose kset
 * kobject holds the alias links.
 */
decl_subsys(slab, &slab_ktype, &slab_uevent_ops);
3389
3390 #define ID_STR_LENGTH 64
3391
3392 /* Create a unique string id for a slab cache:
3393  * format
3394  * :[flags-]size:[memory address of kmemcache]
3395  */
3396 static char *create_unique_id(struct kmem_cache *s)
3397 {
3398         char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL);
3399         char *p = name;
3400
3401         BUG_ON(!name);
3402
3403         *p++ = ':';
3404         /*
3405          * First flags affecting slabcache operations. We will only
3406          * get here for aliasable slabs so we do not need to support
3407          * too many flags. The flags here must cover all flags that
3408          * are matched during merging to guarantee that the id is
3409          * unique.
3410          */
3411         if (s->flags & SLAB_CACHE_DMA)
3412                 *p++ = 'd';
3413         if (s->flags & SLAB_RECLAIM_ACCOUNT)
3414                 *p++ = 'a';
3415         if (s->flags & SLAB_DEBUG_FREE)
3416                 *p++ = 'F';
3417         if (p != name + 1)
3418                 *p++ = '-';
3419         p += sprintf(p, "%07d", s->size);
3420         BUG_ON(p > name + ID_STR_LENGTH - 1);
3421         return name;
3422 }
3423
3424 static int sysfs_slab_add(struct kmem_cache *s)
3425 {
3426         int err;
3427         const char *name;
3428         int unmergeable;
3429
3430         if (slab_state < SYSFS)
3431                 /* Defer until later */
3432                 return 0;
3433
3434         unmergeable = slab_unmergeable(s);
3435         if (unmergeable) {
3436                 /*
3437                  * Slabcache can never be merged so we can use the name proper.
3438                  * This is typically the case for debug situations. In that
3439                  * case we can catch duplicate names easily.
3440                  */
3441                 sysfs_remove_link(&slab_subsys.kset.kobj, s->name);
3442                 name = s->name;
3443         } else {
3444                 /*
3445                  * Create a unique name for the slab as a target
3446                  * for the symlinks.
3447                  */
3448                 name = create_unique_id(s);
3449         }
3450
3451         kobj_set_kset_s(s, slab_subsys);
3452         kobject_set_name(&s->kobj, name);
3453         kobject_init(&s->kobj);
3454         err = kobject_add(&s->kobj);
3455         if (err)
3456                 return err;
3457
3458         err = sysfs_create_group(&s->kobj, &slab_attr_group);
3459         if (err)
3460                 return err;
3461         kobject_uevent(&s->kobj, KOBJ_ADD);
3462         if (!unmergeable) {
3463                 /* Setup first alias */
3464                 sysfs_slab_alias(s, s->name);
3465                 kfree(name);
3466         }
3467         return 0;
3468 }
3469
3470 static void sysfs_slab_remove(struct kmem_cache *s)
3471 {
3472         kobject_uevent(&s->kobj, KOBJ_REMOVE);
3473         kobject_del(&s->kobj);
3474 }
3475
/*
 * Need to buffer aliases during bootup until sysfs becomes
 * available lest we lose that information.
 */
/* One alias recorded by sysfs_slab_alias() before sysfs was usable. */
struct saved_alias {
	struct kmem_cache *s;		/* cache the alias refers to */
	const char *name;		/* alias name; the pointer is stored, not copied */
	struct saved_alias *next;	/* next entry, NULL terminates the list */
};

/*
 * Head of the list of buffered aliases. New entries are pushed at the
 * front; slab_sysfs_init() drains the list. Only used within this file,
 * hence static.
 */
static struct saved_alias *alias_list;
3487
3488 static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
3489 {
3490         struct saved_alias *al;
3491
3492         if (slab_state == SYSFS) {
3493                 /*
3494                  * If we have a leftover link then remove it.
3495                  */
3496                 sysfs_remove_link(&slab_subsys.kset.kobj, name);
3497                 return sysfs_create_link(&slab_subsys.kset.kobj,
3498                                                 &s->kobj, name);
3499         }
3500
3501         al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL);
3502         if (!al)
3503                 return -ENOMEM;
3504
3505         al->s = s;
3506         al->name = name;
3507         al->next = alias_list;
3508         alias_list = al;
3509         return 0;
3510 }
3511
3512 static int __init slab_sysfs_init(void)
3513 {
3514         int err;
3515
3516         err = subsystem_register(&slab_subsys);
3517         if (err) {
3518                 printk(KERN_ERR "Cannot register slab subsystem.\n");
3519                 return -ENOSYS;
3520         }
3521
3522         finish_bootstrap();
3523
3524         while (alias_list) {
3525                 struct saved_alias *al = alias_list;
3526
3527                 alias_list = alias_list->next;
3528                 err = sysfs_slab_alias(al->s, al->name);
3529                 BUG_ON(err);
3530                 kfree(al);
3531         }
3532
3533         resiliency_test();
3534         return 0;
3535 }
3536
3537 __initcall(slab_sysfs_init);
3538 #else
/*
 * Without CONFIG_SYSFS there is no slab_sysfs_init(), so
 * finish_bootstrap() is registered as the initcall directly.
 */
__initcall(finish_bootstrap);
3540 #endif