kernel/kexec.c

   1 /*
   2  * kexec.c - kexec system call
   3  * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xmission.com>
   4  *
   5  * This source code is licensed under the GNU General Public License,
   6  * Version 2.  See the file COPYING for more details.
   7  */
   8
   9 #include <linux/capability.h>
  10 #include <linux/mm.h>
  11 #include <linux/file.h>
  12 #include <linux/slab.h>
  13 #include <linux/fs.h>
  14 #include <linux/kexec.h>
  15 #include <linux/spinlock.h>
  16 #include <linux/list.h>
  17 #include <linux/highmem.h>
  18 #include <linux/syscalls.h>
  19 #include <linux/reboot.h>
  20 #include <linux/syscalls.h>
  21 #include <linux/ioport.h>
  22 #include <linux/hardirq.h>
  23 #include <linux/elf.h>
  24 #include <linux/elfcore.h>
  25
  26 #include <asm/page.h>
  27 #include <asm/uaccess.h>
  28 #include <asm/io.h>
  29 #include <asm/system.h>
  30 #include <asm/semaphore.h>
  31
  32 /* Per cpu memory for storing cpu states in case of system crash. */
  33 note_buf_t* crash_notes;
  34
  35 /* Location of the reserved area for the crash kernel */
  36 struct resource crashk_res = {
  37         .name  = "Crash kernel",
  38         .start = 0,
  39         .end   = 0,
  40         .flags = IORESOURCE_BUSY | IORESOURCE_MEM
  41 };
  42
  43 int kexec_should_crash(struct task_struct *p)
  44 {
  45         if (in_interrupt() || !p->pid || is_init(p) || panic_on_oops)
  46                 return 1;
  47         return 0;
  48 }
  49
  50 /*
  51  * When kexec transitions to the new kernel there is a one-to-one
  52  * mapping between physical and virtual addresses.  On processors
  53  * where you can disable the MMU this is trivial, and easy.  For
  54  * others it is still a simple predictable page table to setup.
  55  *
  56  * In that environment kexec copies the new kernel to its final
  57  * resting place.  This means I can only support memory whose
  58  * physical address can fit in an unsigned long.  In particular
  59  * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
  60  * If the assembly stub has more restrictive requirements
  61  * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
  62  * defined more restrictively in <asm/kexec.h>.
  63  *
  64  * The code for the transition from the current kernel to the
   65  * new kernel is placed in the control_code_buffer, whose size
  66  * is given by KEXEC_CONTROL_CODE_SIZE.  In the best case only a single
  67  * page of memory is necessary, but some architectures require more.
  68  * Because this memory must be identity mapped in the transition from
  69  * virtual to physical addresses it must live in the range
  70  * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
  71  * modifiable.
  72  *
  73  * The assembly stub in the control code buffer is passed a linked list
  74  * of descriptor pages detailing the source pages of the new kernel,
  75  * and the destination addresses of those source pages.  As this data
  76  * structure is not used in the context of the current OS, it must
  77  * be self-contained.
  78  *
  79  * The code has been made to work with highmem pages and will use a
  80  * destination page in its final resting place (if it happens
  81  * to allocate it).  The end product of this is that most of the
  82  * physical address space, and most of RAM can be used.
  83  *
  84  * Future directions include:
  85  *  - allocating a page table with the control code buffer identity
  86  *    mapped, to simplify machine_kexec and make kexec_on_panic more
  87  *    reliable.
  88  */
  89
  90 /*
  91  * KIMAGE_NO_DEST is an impossible destination address..., for
  92  * allocating pages whose destination address we do not care about.
  93  */
  94 #define KIMAGE_NO_DEST (-1UL)
  95
  96 static int kimage_is_destination_range(struct kimage *image,
  97                                        unsigned long start, unsigned long end);
  98 static struct page *kimage_alloc_page(struct kimage *image,
  99                                        gfp_t gfp_mask,
 100                                        unsigned long dest);
 101
 102 static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
 103                             unsigned long nr_segments,
 104                             struct kexec_segment __user *segments)
 105 {
 106         size_t segment_bytes;
 107         struct kimage *image;
 108         unsigned long i;
 109         int result;
 110
 111         /* Allocate a controlling structure */
 112         result = -ENOMEM;
 113         image = kzalloc(sizeof(*image), GFP_KERNEL);
 114         if (!image)
 115                 goto out;
 116
 117         image->head = 0;
 118         image->entry = &image->head;
 119         image->last_entry = &image->head;
 120         image->control_page = ~0; /* By default this does not apply */
 121         image->start = entry;
 122         image->type = KEXEC_TYPE_DEFAULT;
 123
 124         /* Initialize the list of control pages */
 125         INIT_LIST_HEAD(&image->control_pages);
 126
 127         /* Initialize the list of destination pages */
 128         INIT_LIST_HEAD(&image->dest_pages);
 129
 130         /* Initialize the list of unuseable pages */
 131         INIT_LIST_HEAD(&image->unuseable_pages);
 132
 133         /* Read in the segments */
 134         image->nr_segments = nr_segments;
 135         segment_bytes = nr_segments * sizeof(*segments);
 136         result = copy_from_user(image->segment, segments, segment_bytes);
 137         if (result)
 138                 goto out;
 139
 140         /*
 141          * Verify we have good destination addresses.  The caller is
 142          * responsible for making certain we don't attempt to load
 143          * the new image into invalid or reserved areas of RAM.  This
 144          * just verifies it is an address we can use.
 145          *
 146          * Since the kernel does everything in page size chunks ensure
 147          * the destination addreses are page aligned.  Too many
 148          * special cases crop of when we don't do this.  The most
 149          * insidious is getting overlapping destination addresses
 150          * simply because addresses are changed to page size
 151          * granularity.
 152          */
 153         result = -EADDRNOTAVAIL;
 154         for (i = 0; i < nr_segments; i++) {
 155                 unsigned long mstart, mend;
 156
 157                 mstart = image->segment[i].mem;
 158                 mend   = mstart + image->segment[i].memsz;
 159                 if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
 160                         goto out;
 161                 if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
 162                         goto out;
 163         }
 164
 165         /* Verify our destination addresses do not overlap.
 166          * If we alloed overlapping destination addresses
 167          * through very weird things can happen with no
 168          * easy explanation as one segment stops on another.
 169          */
 170         result = -EINVAL;
 171         for (i = 0; i < nr_segments; i++) {
 172                 unsigned long mstart, mend;
 173                 unsigned long j;
 174
 175                 mstart = image->segment[i].mem;
 176                 mend   = mstart + image->segment[i].memsz;
 177                 for (j = 0; j < i; j++) {
 178                         unsigned long pstart, pend;
 179                         pstart = image->segment[j].mem;
 180                         pend   = pstart + image->segment[j].memsz;
 181                         /* Do the segments overlap ? */
 182                         if ((mend > pstart) && (mstart < pend))
 183                                 goto out;
 184                 }
 185         }
 186
 187         /* Ensure our buffer sizes are strictly less than
 188          * our memory sizes.  This should always be the case,
 189          * and it is easier to check up front than to be surprised
 190          * later on.
 191          */
 192         result = -EINVAL;
 193         for (i = 0; i < nr_segments; i++) {
 194                 if (image->segment[i].bufsz > image->segment[i].memsz)
 195                         goto out;
 196         }
 197
 198         result = 0;
 199 out:
 200         if (result == 0)
 201                 *rimage = image;
 202         else
 203                 kfree(image);
 204
 205         return result;
 206
 207 }
 208
 209 static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
 210                                 unsigned long nr_segments,
 211                                 struct kexec_segment __user *segments)
 212 {
 213         int result;
 214         struct kimage *image;
 215
 216         /* Allocate and initialize a controlling structure */
 217         image = NULL;
 218         result = do_kimage_alloc(&image, entry, nr_segments, segments);
 219         if (result)
 220                 goto out;
 221
 222         *rimage = image;
 223
 224         /*
 225          * Find a location for the control code buffer, and add it
 226          * the vector of segments so that it's pages will also be
 227          * counted as destination pages.
 228          */
 229         result = -ENOMEM;
 230         image->control_code_page = kimage_alloc_control_pages(image,
 231                                            get_order(KEXEC_CONTROL_CODE_SIZE));
 232         if (!image->control_code_page) {
 233                 printk(KERN_ERR "Could not allocate control_code_buffer\n");
 234                 goto out;
 235         }
 236
 237         result = 0;
 238  out:
 239         if (result == 0)
 240                 *rimage = image;
 241         else
 242                 kfree(image);
 243
 244         return result;
 245 }
 246
 247 static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
 248                                 unsigned long nr_segments,
 249                                 struct kexec_segment __user *segments)
 250 {
 251         int result;
 252         struct kimage *image;
 253         unsigned long i;
 254
 255         image = NULL;
 256         /* Verify we have a valid entry point */
 257         if ((entry < crashk_res.start) || (entry > crashk_res.end)) {
 258                 result = -EADDRNOTAVAIL;
 259                 goto out;
 260         }
 261
 262         /* Allocate and initialize a controlling structure */
 263         result = do_kimage_alloc(&image, entry, nr_segments, segments);
 264         if (result)
 265                 goto out;
 266
 267         /* Enable the special crash kernel control page
 268          * allocation policy.
 269          */
 270         image->control_page = crashk_res.start;
 271         image->type = KEXEC_TYPE_CRASH;
 272
 273         /*
 274          * Verify we have good destination addresses.  Normally
 275          * the caller is responsible for making certain we don't
 276          * attempt to load the new image into invalid or reserved
 277          * areas of RAM.  But crash kernels are preloaded into a
 278          * reserved area of ram.  We must ensure the addresses
 279          * are in the reserved area otherwise preloading the
 280          * kernel could corrupt things.
 281          */
 282         result = -EADDRNOTAVAIL;
 283         for (i = 0; i < nr_segments; i++) {
 284                 unsigned long mstart, mend;
 285
 286                 mstart = image->segment[i].mem;
 287                 mend = mstart + image->segment[i].memsz - 1;
 288                 /* Ensure we are within the crash kernel limits */
 289                 if ((mstart < crashk_res.start) || (mend > crashk_res.end))
 290                         goto out;
 291         }
 292
 293         /*
 294          * Find a location for the control code buffer, and add
 295          * the vector of segments so that it's pages will also be
 296          * counted as destination pages.
 297          */
 298         result = -ENOMEM;
 299         image->control_code_page = kimage_alloc_control_pages(image,
 300                                            get_order(KEXEC_CONTROL_CODE_SIZE));
 301         if (!image->control_code_page) {
 302                 printk(KERN_ERR "Could not allocate control_code_buffer\n");
 303                 goto out;
 304         }
 305
 306         result = 0;
 307 out:
 308         if (result == 0)
 309                 *rimage = image;
 310         else
 311                 kfree(image);
 312
 313         return result;
 314 }
 315
 316 static int kimage_is_destination_range(struct kimage *image,
 317                                         unsigned long start,
 318                                         unsigned long end)
 319 {
 320         unsigned long i;
 321
 322         for (i = 0; i < image->nr_segments; i++) {
 323                 unsigned long mstart, mend;
 324
 325                 mstart = image->segment[i].mem;
 326                 mend = mstart + image->segment[i].memsz;
 327                 if ((end > mstart) && (start < mend))
 328                         return 1;
 329         }
 330
 331         return 0;
 332 }
 333
 334 static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
 335 {
 336         struct page *pages;
 337
 338         pages = alloc_pages(gfp_mask, order);
 339         if (pages) {
 340                 unsigned int count, i;
 341                 pages->mapping = NULL;
 342                 set_page_private(pages, order);
 343                 count = 1 << order;
 344                 for (i = 0; i < count; i++)
 345                         SetPageReserved(pages + i);
 346         }
 347
 348         return pages;
 349 }
 350
 351 static void kimage_free_pages(struct page *page)
 352 {
 353         unsigned int order, count, i;
 354
 355         order = page_private(page);
 356         count = 1 << order;
 357         for (i = 0; i < count; i++)
 358                 ClearPageReserved(page + i);
 359         __free_pages(page, order);
 360 }
 361
 362 static void kimage_free_page_list(struct list_head *list)
 363 {
 364         struct list_head *pos, *next;
 365
 366         list_for_each_safe(pos, next, list) {
 367                 struct page *page;
 368
 369                 page = list_entry(pos, struct page, lru);
 370                 list_del(&page->lru);
 371                 kimage_free_pages(page);
 372         }
 373 }
 374
 375 static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
 376                                                         unsigned int order)
 377 {
 378         /* Control pages are special, they are the intermediaries
 379          * that are needed while we copy the rest of the pages
 380          * to their final resting place.  As such they must
 381          * not conflict with either the destination addresses
 382          * or memory the kernel is already using.
 383          *
 384          * The only case where we really need more than one of
 385          * these are for architectures where we cannot disable
 386          * the MMU and must instead generate an identity mapped
 387          * page table for all of the memory.
 388          *
 389          * At worst this runs in O(N) of the image size.
 390          */
 391         struct list_head extra_pages;
 392         struct page *pages;
 393         unsigned int count;
 394
 395         count = 1 << order;
 396         INIT_LIST_HEAD(&extra_pages);
 397
 398         /* Loop while I can allocate a page and the page allocated
 399          * is a destination page.
 400          */
 401         do {
 402                 unsigned long pfn, epfn, addr, eaddr;
 403
 404                 pages = kimage_alloc_pages(GFP_KERNEL, order);
 405                 if (!pages)
 406                         break;
 407                 pfn   = page_to_pfn(pages);
 408                 epfn  = pfn + count;
 409                 addr  = pfn << PAGE_SHIFT;
 410                 eaddr = epfn << PAGE_SHIFT;
 411                 if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
 412                               kimage_is_destination_range(image, addr, eaddr)) {
 413                         list_add(&pages->lru, &extra_pages);
 414                         pages = NULL;
 415                 }
 416         } while (!pages);
 417
 418         if (pages) {
 419                 /* Remember the allocated page... */
 420                 list_add(&pages->lru, &image->control_pages);
 421
 422                 /* Because the page is already in it's destination
 423                  * location we will never allocate another page at
 424                  * that address.  Therefore kimage_alloc_pages
 425                  * will not return it (again) and we don't need
 426                  * to give it an entry in image->segment[].
 427                  */
 428         }
 429         /* Deal with the destination pages I have inadvertently allocated.
 430          *
 431          * Ideally I would convert multi-page allocations into single
 432          * page allocations, and add everyting to image->dest_pages.
 433          *
 434          * For now it is simpler to just free the pages.
 435          */
 436         kimage_free_page_list(&extra_pages);
 437
 438         return pages;
 439 }
 440
 441 static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
 442                                                       unsigned int order)
 443 {
 444         /* Control pages are special, they are the intermediaries
 445          * that are needed while we copy the rest of the pages
 446          * to their final resting place.  As such they must
 447          * not conflict with either the destination addresses
 448          * or memory the kernel is already using.
 449          *
 450          * Control pages are also the only pags we must allocate
 451          * when loading a crash kernel.  All of the other pages
 452          * are specified by the segments and we just memcpy
 453          * into them directly.
 454          *
 455          * The only case where we really need more than one of
 456          * these are for architectures where we cannot disable
 457          * the MMU and must instead generate an identity mapped
 458          * page table for all of the memory.
 459          *
 460          * Given the low demand this implements a very simple
 461          * allocator that finds the first hole of the appropriate
 462          * size in the reserved memory region, and allocates all
 463          * of the memory up to and including the hole.
 464          */
 465         unsigned long hole_start, hole_end, size;
 466         struct page *pages;
 467
 468         pages = NULL;
 469         size = (1 << order) << PAGE_SHIFT;
 470         hole_start = (image->control_page + (size - 1)) & ~(size - 1);
 471         hole_end   = hole_start + size - 1;
 472         while (hole_end <= crashk_res.end) {
 473                 unsigned long i;
 474
 475                 if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT)
 476                         break;
 477                 if (hole_end > crashk_res.end)
 478                         break;
 479                 /* See if I overlap any of the segments */
 480                 for (i = 0; i < image->nr_segments; i++) {
 481                         unsigned long mstart, mend;
 482
 483                         mstart = image->segment[i].mem;
 484                         mend   = mstart + image->segment[i].memsz - 1;
 485                         if ((hole_end >= mstart) && (hole_start <= mend)) {
 486                                 /* Advance the hole to the end of the segment */
 487                                 hole_start = (mend + (size - 1)) & ~(size - 1);
 488                                 hole_end   = hole_start + size - 1;
 489                                 break;
 490                         }
 491                 }
 492                 /* If I don't overlap any segments I have found my hole! */
 493                 if (i == image->nr_segments) {
 494                         pages = pfn_to_page(hole_start >> PAGE_SHIFT);
 495                         break;
 496                 }
 497         }
 498         if (pages)
 499                 image->control_page = hole_end;
 500
 501         return pages;
 502 }
 503
 504
 505 struct page *kimage_alloc_control_pages(struct kimage *image,
 506                                          unsigned int order)
 507 {
 508         struct page *pages = NULL;
 509
 510         switch (image->type) {
 511         case KEXEC_TYPE_DEFAULT:
 512                 pages = kimage_alloc_normal_control_pages(image, order);
 513                 break;
 514         case KEXEC_TYPE_CRASH:
 515                 pages = kimage_alloc_crash_control_pages(image, order);
 516                 break;
 517         }
 518
 519         return pages;
 520 }
 521
 522 static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
 523 {
 524         if (*image->entry != 0)
 525                 image->entry++;
 526
 527         if (image->entry == image->last_entry) {
 528                 kimage_entry_t *ind_page;
 529                 struct page *page;
 530
 531                 page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
 532                 if (!page)
 533                         return -ENOMEM;
 534
 535                 ind_page = page_address(page);
 536                 *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
 537                 image->entry = ind_page;
 538                 image->last_entry = ind_page +
 539                                       ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
 540         }
 541         *image->entry = entry;
 542         image->entry++;
 543         *image->entry = 0;
 544
 545         return 0;
 546 }
 547
 548 static int kimage_set_destination(struct kimage *image,
 549                                    unsigned long destination)
 550 {
 551         int result;
 552
 553         destination &= PAGE_MASK;
 554         result = kimage_add_entry(image, destination | IND_DESTINATION);
 555         if (result == 0)
 556                 image->destination = destination;
 557
 558         return result;
 559 }
 560
 561
 562 static int kimage_add_page(struct kimage *image, unsigned long page)
 563 {
 564         int result;
 565
 566         page &= PAGE_MASK;
 567         result = kimage_add_entry(image, page | IND_SOURCE);
 568         if (result == 0)
 569                 image->destination += PAGE_SIZE;
 570
 571         return result;
 572 }
 573
 574
 575 static void kimage_free_extra_pages(struct kimage *image)
 576 {
 577         /* Walk through and free any extra destination pages I may have */
 578         kimage_free_page_list(&image->dest_pages);
 579
 580         /* Walk through and free any unuseable pages I have cached */
 581         kimage_free_page_list(&image->unuseable_pages);
 582
 583 }
 584 static int kimage_terminate(struct kimage *image)
 585 {
 586         if (*image->entry != 0)
 587                 image->entry++;
 588
 589         *image->entry = IND_DONE;
 590
 591         return 0;
 592 }
 593
 594 #define for_each_kimage_entry(image, ptr, entry) \
 595         for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
 596                 ptr = (entry & IND_INDIRECTION)? \
 597                         phys_to_virt((entry & PAGE_MASK)): ptr +1)
 598
 599 static void kimage_free_entry(kimage_entry_t entry)
 600 {
 601         struct page *page;
 602
 603         page = pfn_to_page(entry >> PAGE_SHIFT);
 604         kimage_free_pages(page);
 605 }
 606
 607 static void kimage_free(struct kimage *image)
 608 {
 609         kimage_entry_t *ptr, entry;
 610         kimage_entry_t ind = 0;
 611
 612         if (!image)
 613                 return;
 614
 615         kimage_free_extra_pages(image);
 616         for_each_kimage_entry(image, ptr, entry) {
 617                 if (entry & IND_INDIRECTION) {
 618                         /* Free the previous indirection page */
 619                         if (ind & IND_INDIRECTION)
 620                                 kimage_free_entry(ind);
 621                         /* Save this indirection page until we are
 622                          * done with it.
 623                          */
 624                         ind = entry;
 625                 }
 626                 else if (entry & IND_SOURCE)
 627                         kimage_free_entry(entry);
 628         }
 629         /* Free the final indirection page */
 630         if (ind & IND_INDIRECTION)
 631                 kimage_free_entry(ind);
 632
 633         /* Handle any machine specific cleanup */
 634         machine_kexec_cleanup(image);
 635
 636         /* Free the kexec control pages... */
 637         kimage_free_page_list(&image->control_pages);
 638         kfree(image);
 639 }
 640
 641 static kimage_entry_t *kimage_dst_used(struct kimage *image,
 642                                         unsigned long page)
 643 {
 644         kimage_entry_t *ptr, entry;
 645         unsigned long destination = 0;
 646
 647         for_each_kimage_entry(image, ptr, entry) {
 648                 if (entry & IND_DESTINATION)
 649                         destination = entry & PAGE_MASK;
 650                 else if (entry & IND_SOURCE) {
 651                         if (page == destination)
 652                                 return ptr;
 653                         destination += PAGE_SIZE;
 654                 }
 655         }
 656
 657         return NULL;
 658 }
 659
 660 static struct page *kimage_alloc_page(struct kimage *image,
 661                                         gfp_t gfp_mask,
 662                                         unsigned long destination)
 663 {
 664         /*
 665          * Here we implement safeguards to ensure that a source page
 666          * is not copied to its destination page before the data on
 667          * the destination page is no longer useful.
 668          *
 669          * To do this we maintain the invariant that a source page is
 670          * either its own destination page, or it is not a
 671          * destination page at all.
 672          *
 673          * That is slightly stronger than required, but the proof
 674          * that no problems will not occur is trivial, and the
 675          * implementation is simply to verify.
 676          *
 677          * When allocating all pages normally this algorithm will run
 678          * in O(N) time, but in the worst case it will run in O(N^2)
 679          * time.   If the runtime is a problem the data structures can
 680          * be fixed.
 681          */
 682         struct page *page;
 683         unsigned long addr;
 684
 685         /*
 686          * Walk through the list of destination pages, and see if I
 687          * have a match.
 688          */
 689         list_for_each_entry(page, &image->dest_pages, lru) {
 690                 addr = page_to_pfn(page) << PAGE_SHIFT;
 691                 if (addr == destination) {
 692                         list_del(&page->lru);
 693                         return page;
 694                 }
 695         }
 696         page = NULL;
 697         while (1) {
 698                 kimage_entry_t *old;
 699
 700                 /* Allocate a page, if we run out of memory give up */
 701                 page = kimage_alloc_pages(gfp_mask, 0);
 702                 if (!page)
 703                         return NULL;
 704                 /* If the page cannot be used file it away */
 705                 if (page_to_pfn(page) >
 706                                 (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
 707                         list_add(&page->lru, &image->unuseable_pages);
 708                         continue;
 709                 }
 710                 addr = page_to_pfn(page) << PAGE_SHIFT;
 711
 712                 /* If it is the destination page we want use it */
 713                 if (addr == destination)
 714                         break;
 715
 716                 /* If the page is not a destination page use it */
 717                 if (!kimage_is_destination_range(image, addr,
 718                                                   addr + PAGE_SIZE))
 719                         break;
 720
 721                 /*
 722                  * I know that the page is someones destination page.
 723                  * See if there is already a source page for this
 724                  * destination page.  And if so swap the source pages.
 725                  */
 726                 old = kimage_dst_used(image, addr);
 727                 if (old) {
 728                         /* If so move it */
 729                         unsigned long old_addr;
 730                         struct page *old_page;
 731
 732                         old_addr = *old & PAGE_MASK;
 733                         old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
 734                         copy_highpage(page, old_page);
 735                         *old = addr | (*old & ~PAGE_MASK);
 736
 737                         /* The old page I have found cannot be a
 738                          * destination page, so return it.
 739                          */
 740                         addr = old_addr;
 741                         page = old_page;
 742                         break;
 743                 }
 744                 else {
 745                         /* Place the page on the destination list I
 746                          * will use it later.
 747                          */
 748                         list_add(&page->lru, &image->dest_pages);
 749                 }
 750         }
 751
 752         return page;
 753 }
 754
 755 static int kimage_load_normal_segment(struct kimage *image,
 756                                          struct kexec_segment *segment)
 757 {
 758         unsigned long maddr;
 759         unsigned long ubytes, mbytes;
 760         int result;
 761         unsigned char __user *buf;
 762
 763         result = 0;
 764         buf = segment->buf;
 765         ubytes = segment->bufsz;
 766         mbytes = segment->memsz;
 767         maddr = segment->mem;
 768
 769         result = kimage_set_destination(image, maddr);
 770         if (result < 0)
 771                 goto out;
 772
 773         while (mbytes) {
 774                 struct page *page;
 775                 char *ptr;
 776                 size_t uchunk, mchunk;
 777
 778                 page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
 779                 if (page == 0) {
 780                         result  = -ENOMEM;
 781                         goto out;
 782                 }
 783                 result = kimage_add_page(image, page_to_pfn(page)
 784                                                                 << PAGE_SHIFT);
 785                 if (result < 0)
 786                         goto out;
 787
 788                 ptr = kmap(page);
 789                 /* Start with a clear page */
 790                 memset(ptr, 0, PAGE_SIZE);
 791                 ptr += maddr & ~PAGE_MASK;
 792                 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
 793                 if (mchunk > mbytes)
 794                         mchunk = mbytes;
 795
 796                 uchunk = mchunk;
 797                 if (uchunk > ubytes)
 798                         uchunk = ubytes;
 799
 800                 result = copy_from_user(ptr, buf, uchunk);
 801                 kunmap(page);
 802                 if (result) {
 803                         result = (result < 0) ? result : -EIO;
 804                         goto out;
 805                 }
 806                 ubytes -= uchunk;
 807                 maddr  += mchunk;
 808                 buf    += mchunk;
 809                 mbytes -= mchunk;
 810         }
 811 out:
 812         return result;
 813 }
 814
 815 static int kimage_load_crash_segment(struct kimage *image,
 816                                         struct kexec_segment *segment)
 817 {
 818         /* For crash dumps kernels we simply copy the data from
 819          * user space to it's destination.
 820          * We do things a page at a time for the sake of kmap.
 821          */
 822         unsigned long maddr;
 823         unsigned long ubytes, mbytes;
 824         int result;
 825         unsigned char __user *buf;
 826
 827         result = 0;
 828         buf = segment->buf;
 829         ubytes = segment->bufsz;
 830         mbytes = segment->memsz;
 831         maddr = segment->mem;
 832         while (mbytes) {
 833                 struct page *page;
 834                 char *ptr;
 835                 size_t uchunk, mchunk;
 836
 837                 page = pfn_to_page(maddr >> PAGE_SHIFT);
 838                 if (page == 0) {
 839                         result  = -ENOMEM;
 840                         goto out;
 841                 }
 842                 ptr = kmap(page);
 843                 ptr += maddr & ~PAGE_MASK;
 844                 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
 845                 if (mchunk > mbytes)
 846                         mchunk = mbytes;
 847
 848                 uchunk = mchunk;
 849                 if (uchunk > ubytes) {
 850                         uchunk = ubytes;
 851                         /* Zero the trailing part of the page */
 852                         memset(ptr + uchunk, 0, mchunk - uchunk);
 853                 }
 854                 result = copy_from_user(ptr, buf, uchunk);
 855                 kunmap(page);
 856                 if (result) {
 857                         result = (result < 0) ? result : -EIO;
 858                         goto out;
 859                 }
 860                 ubytes -= uchunk;
 861                 maddr  += mchunk;
 862                 buf    += mchunk;
 863                 mbytes -= mchunk;
 864         }
 865 out:
 866         return result;
 867 }
 868
 869 static int kimage_load_segment(struct kimage *image,
 870                                 struct kexec_segment *segment)
 871 {
 872         int result = -ENOMEM;
 873
 874         switch (image->type) {
 875         case KEXEC_TYPE_DEFAULT:
 876                 result = kimage_load_normal_segment(image, segment);
 877                 break;
 878         case KEXEC_TYPE_CRASH:
 879                 result = kimage_load_crash_segment(image, segment);
 880                 break;
 881         }
 882
 883         return result;
 884 }
 885
 886 /*
 887  * Exec Kernel system call: for obvious reasons only root may call it.
 888  *
 889  * This call breaks up into three pieces.
 890  * - A generic part which loads the new kernel from the current
 891  *   address space, and very carefully places the data in the
 892  *   allocated pages.
 893  *
 894  * - A generic part that interacts with the kernel and tells all of
 895  *   the devices to shut down.  Preventing on-going dmas, and placing
 896  *   the devices in a consistent state so a later kernel can
 897  *   reinitialize them.
 898  *
 899  * - A machine specific part that includes the syscall number
 900  *   and then copies the image to its final destination, and
 901  *   jumps into the image at entry.
 902  *
 903  * kexec does not sync, or unmount filesystems so if you need
 904  * that to happen you need to do that yourself.
 905  */
 906 struct kimage *kexec_image;
 907 struct kimage *kexec_crash_image;
 908 /*
 909  * A home grown binary mutex.
 910  * Nothing can wait so this mutex is safe to use
 911  * in interrupt context :)
 912  */
 913 static int kexec_lock;
 914
 915 asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
 916                                 struct kexec_segment __user *segments,
 917                                 unsigned long flags)
 918 {
 919         struct kimage **dest_image, *image;
 920         int locked;
 921         int result;
 922
 923         /* We only trust the superuser with rebooting the system. */
 924         if (!capable(CAP_SYS_BOOT))
 925                 return -EPERM;
 926
 927         /*
 928          * Verify we have a legal set of flags
 929          * This leaves us room for future extensions.
 930          */
 931         if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
 932                 return -EINVAL;
 933
 934         /* Verify we are on the appropriate architecture */
 935         if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
 936                 ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
 937                 return -EINVAL;
 938
 939         /* Put an artificial cap on the number
 940          * of segments passed to kexec_load.
 941          */
 942         if (nr_segments > KEXEC_SEGMENT_MAX)
 943                 return -EINVAL;
 944
 945         image = NULL;
 946         result = 0;
 947
 948         /* Because we write directly to the reserved memory
 949          * region when loading crash kernels we need a mutex here to
 950          * prevent multiple crash  kernels from attempting to load
 951          * simultaneously, and to prevent a crash kernel from loading
 952          * over the top of a in use crash kernel.
 953          *
 954          * KISS: always take the mutex.
 955          */
 956         locked = xchg(&kexec_lock, 1);
 957         if (locked)
 958                 return -EBUSY;
 959
 960         dest_image = &kexec_image;
 961         if (flags & KEXEC_ON_CRASH)
 962                 dest_image = &kexec_crash_image;
 963         if (nr_segments > 0) {
 964                 unsigned long i;
 965
 966                 /* Loading another kernel to reboot into */
 967                 if ((flags & KEXEC_ON_CRASH) == 0)
 968                         result = kimage_normal_alloc(&image, entry,
 969                                                         nr_segments, segments);
 970                 /* Loading another kernel to switch to if this one crashes */
 971                 else if (flags & KEXEC_ON_CRASH) {
 972                         /* Free any current crash dump kernel before
 973                          * we corrupt it.
 974                          */
 975                         kimage_free(xchg(&kexec_crash_image, NULL));
 976                         result = kimage_crash_alloc(&image, entry,
 977                                                      nr_segments, segments);
 978                 }
 979                 if (result)
 980                         goto out;
 981
 982                 result = machine_kexec_prepare(image);
 983                 if (result)
 984                         goto out;
 985
 986                 for (i = 0; i < nr_segments; i++) {
 987                         result = kimage_load_segment(image, &image->segment[i]);
 988                         if (result)
 989                                 goto out;
 990                 }
 991                 result = kimage_terminate(image);
 992                 if (result)
 993                         goto out;
 994         }
 995         /* Install the new kernel, and  Uninstall the old */
 996         image = xchg(dest_image, image);
 997
 998 out:
 999         locked = xchg(&kexec_lock, 0); /* Release the mutex */
1000         BUG_ON(!locked);
1001         kimage_free(image);
1002
1003         return result;
1004 }
1005
#ifdef CONFIG_COMPAT
/*
 * Compat entry point for kexec_load.
 *
 * Each compat_kexec_segment supplied by the caller is converted into
 * a native kexec_segment stored in scratch user space obtained from
 * compat_alloc_user_space(); the converted array is then handed to
 * sys_kexec_load().
 *
 * Returns -EINVAL when the architecture field is KEXEC_ARCH_DEFAULT
 * or there are too many segments, -EFAULT when a segment cannot be
 * copied, otherwise the result of sys_kexec_load().
 */
asmlinkage long compat_sys_kexec_load(unsigned long entry,
				unsigned long nr_segments,
				struct compat_kexec_segment __user *segments,
				unsigned long flags)
{
	struct kexec_segment __user *native_segs;
	unsigned long idx;

	/*
	 * Don't allow clients that don't understand the native
	 * architecture to do anything.
	 */
	if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
		return -EINVAL;

	if (nr_segments > KEXEC_SEGMENT_MAX)
		return -EINVAL;

	native_segs = compat_alloc_user_space(nr_segments *
					      sizeof(*native_segs));

	for (idx = 0; idx < nr_segments; idx++) {
		struct compat_kexec_segment in;
		struct kexec_segment out;

		if (copy_from_user(&in, &segments[idx], sizeof(in)))
			return -EFAULT;

		/* Widen the compat fields to their native counterparts */
		out.buf   = compat_ptr(in.buf);
		out.bufsz = in.bufsz;
		out.mem   = in.mem;
		out.memsz = in.memsz;

		if (copy_to_user(&native_segs[idx], &out, sizeof(out)))
			return -EFAULT;
	}

	return sys_kexec_load(entry, nr_segments, native_segs, flags);
}
#endif
1044
1045 void crash_kexec(struct pt_regs *regs)
1046 {
1047         int locked;
1048
1049
1050         /* Take the kexec_lock here to prevent sys_kexec_load
1051          * running on one cpu from replacing the crash kernel
1052          * we are using after a panic on a different cpu.
1053          *
1054          * If the crash kernel was not located in a fixed area
1055          * of memory the xchg(&kexec_crash_image) would be
1056          * sufficient.  But since I reuse the memory...
1057          */
1058         locked = xchg(&kexec_lock, 1);
1059         if (!locked) {
1060                 if (kexec_crash_image) {
1061                         struct pt_regs fixed_regs;
1062                         crash_setup_regs(&fixed_regs, regs);
1063                         machine_crash_shutdown(&fixed_regs);
1064                         machine_kexec(kexec_crash_image);
1065                 }
1066                 locked = xchg(&kexec_lock, 0);
1067                 BUG_ON(!locked);
1068         }
1069 }
1070
1071 static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
1072                             size_t data_len)
1073 {
1074         struct elf_note note;
1075
1076         note.n_namesz = strlen(name) + 1;
1077         note.n_descsz = data_len;
1078         note.n_type   = type;
1079         memcpy(buf, &note, sizeof(note));
1080         buf += (sizeof(note) + 3)/4;
1081         memcpy(buf, name, note.n_namesz);
1082         buf += (note.n_namesz + 3)/4;
1083         memcpy(buf, data, note.n_descsz);
1084         buf += (note.n_descsz + 3)/4;
1085
1086         return buf;
1087 }
1088
1089 static void final_note(u32 *buf)
1090 {
1091         struct elf_note note;
1092
1093         note.n_namesz = 0;
1094         note.n_descsz = 0;
1095         note.n_type   = 0;
1096         memcpy(buf, &note, sizeof(note));
1097 }
1098
1099 void crash_save_cpu(struct pt_regs *regs, int cpu)
1100 {
1101         struct elf_prstatus prstatus;
1102         u32 *buf;
1103
1104         if ((cpu < 0) || (cpu >= NR_CPUS))
1105                 return;
1106
1107         /* Using ELF notes here is opportunistic.
1108          * I need a well defined structure format
1109          * for the data I pass, and I need tags
1110          * on the data to indicate what information I have
1111          * squirrelled away.  ELF notes happen to provide
1112          * all of that, so there is no need to invent something new.
1113          */
1114         buf = (u32*)per_cpu_ptr(crash_notes, cpu);
1115         if (!buf)
1116                 return;
1117         memset(&prstatus, 0, sizeof(prstatus));
1118         prstatus.pr_pid = current->pid;
1119         elf_core_copy_regs(&prstatus.pr_reg, regs);
1120         buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus,
1121                                 sizeof(prstatus));
1122         final_note(buf);
1123 }
1124
1125 static int __init crash_notes_memory_init(void)
1126 {
1127         /* Allocate memory for saving cpu registers. */
1128         crash_notes = alloc_percpu(note_buf_t);
1129         if (!crash_notes) {
1130                 printk("Kexec: Memory allocation for saving cpu register"
1131                 " states failed\n");
1132                 return -ENOMEM;
1133         }
1134         return 0;
1135 }
1136 module_init(crash_notes_memory_init)