struct tss_struct;
struct mm_struct;
struct desc_struct;
+struct task_struct;
/*
* Wrapper type for pointers to code which uses the non-standard
void (*swapgs)(void);
- struct pv_lazy_ops lazy_mode;
+ void (*start_context_switch)(struct task_struct *prev);
+ void (*end_context_switch)(struct task_struct *next);
};
struct pv_irq_ops {
#if PAGETABLE_LEVELS >= 3
#ifdef CONFIG_X86_PAE
void (*set_pte_atomic)(pte_t *ptep, pte_t pteval);
- void (*set_pte_present)(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pte);
void (*pte_clear)(struct mm_struct *mm, unsigned long addr,
pte_t *ptep);
void (*pmd_clear)(pmd_t *pmdp);
#define paravirt_type(op) \
[paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \
- [paravirt_opptr] "m" (op)
+ [paravirt_opptr] "i" (&(op))
#define paravirt_clobber(clobber) \
[paravirt_clobber] "i" (clobber)
* offset into the paravirt_patch_template structure, and can therefore be
* freely converted back into a structure offset.
*/
- #define PARAVIRT_CALL "call *%[paravirt_opptr];"
+ #define PARAVIRT_CALL "call *%c[paravirt_opptr];"
/*
* These macros are intended to wrap calls through one of the paravirt
pte.pte, pte.pte >> 32);
}
- static inline void set_pte_present(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pte)
- {
- /* 5 arg words */
- pv_mmu_ops.set_pte_present(mm, addr, ptep, pte);
- }
-
static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
{
set_pte(ptep, pte);
}
- static inline void set_pte_present(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pte)
- {
- set_pte(ptep, pte);
- }
-
static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
{
};
enum paravirt_lazy_mode paravirt_get_lazy_mode(void);
-void paravirt_enter_lazy_cpu(void);
-void paravirt_leave_lazy_cpu(void);
+void paravirt_start_context_switch(struct task_struct *prev);
+void paravirt_end_context_switch(struct task_struct *next);
+
void paravirt_enter_lazy_mmu(void);
void paravirt_leave_lazy_mmu(void);
-void paravirt_leave_lazy(enum paravirt_lazy_mode mode);
-#define __HAVE_ARCH_ENTER_LAZY_CPU_MODE
-static inline void arch_enter_lazy_cpu_mode(void)
+#define __HAVE_ARCH_START_CONTEXT_SWITCH
+static inline void arch_start_context_switch(struct task_struct *prev)
{
- PVOP_VCALL0(pv_cpu_ops.lazy_mode.enter);
+ PVOP_VCALL1(pv_cpu_ops.start_context_switch, prev);
}
-static inline void arch_leave_lazy_cpu_mode(void)
+static inline void arch_end_context_switch(struct task_struct *next)
{
- PVOP_VCALL0(pv_cpu_ops.lazy_mode.leave);
+ PVOP_VCALL1(pv_cpu_ops.end_context_switch, next);
}
-void arch_flush_lazy_cpu_mode(void);
-
#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE
static inline void arch_enter_lazy_mmu_mode(void)
{
#define set_pte(ptep, pte) native_set_pte(ptep, pte)
#define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte)
- #define set_pte_present(mm, addr, ptep, pte) \
- native_set_pte_present(mm, addr, ptep, pte)
#define set_pte_atomic(ptep, pte) \
native_set_pte_atomic(ptep, pte)
#define pte_val(x) native_pte_val(x)
#define __pte(x) native_make_pte(x)
+#define arch_end_context_switch(prev) do {} while(0)
+
#endif /* CONFIG_PARAVIRT */
/*
#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */
#define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */
#define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */
-#define TIF_SYSCALL_FTRACE 27 /* for ftrace syscall instrumentation */
+#define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */
++#define TIF_SYSCALL_FTRACE 28 /* for ftrace syscall instrumentation */
#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
#define _TIF_FORCED_TF (1 << TIF_FORCED_TF)
#define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR)
#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR)
+#define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES)
+ #define _TIF_SYSCALL_FTRACE (1 << TIF_SYSCALL_FTRACE)
/* work to do in syscall_trace_enter() */
#define _TIF_WORK_SYSCALL_ENTRY \
- (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | \
+ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_FTRACE | \
_TIF_SYSCALL_AUDIT | _TIF_SECCOMP | _TIF_SINGLESTEP)
/* work to do in syscall_trace_leave() */
#define _TIF_WORK_SYSCALL_EXIT \
- (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP)
+ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP | \
+ _TIF_SYSCALL_FTRACE)
/* work to do on interrupt/exception return */
#define _TIF_WORK_MASK \
_TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU))
/* work to do on any return to user space */
- #define _TIF_ALLWORK_MASK (0x0000FFFF & ~_TIF_SECCOMP)
+ #define _TIF_ALLWORK_MASK ((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_FTRACE)
/* Only used for 64 bit */
#define _TIF_DO_NOTIFY_MASK \
kvm_mmu_write(ptep, pte_val(pte));
}
- static void kvm_set_pte_present(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pte)
- {
- kvm_mmu_write(ptep, pte_val(pte));
- }
-
static void kvm_pte_clear(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
struct kvm_para_state *state = kvm_para_state();
mmu_queue_flush(state);
- paravirt_leave_lazy(paravirt_get_lazy_mode());
+ paravirt_leave_lazy_mmu();
state->mode = paravirt_get_lazy_mode();
}
#if PAGETABLE_LEVELS >= 3
#ifdef CONFIG_X86_PAE
pv_mmu_ops.set_pte_atomic = kvm_set_pte_atomic;
- pv_mmu_ops.set_pte_present = kvm_set_pte_present;
pv_mmu_ops.pte_clear = kvm_pte_clear;
pv_mmu_ops.pmd_clear = kvm_pmd_clear;
#endif
static inline void enter_lazy(enum paravirt_lazy_mode mode)
{
- BUG_ON(__get_cpu_var(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
- BUG_ON(preemptible());
+ BUG_ON(percpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
- __get_cpu_var(paravirt_lazy_mode) = mode;
+ percpu_write(paravirt_lazy_mode, mode);
}
-void paravirt_leave_lazy(enum paravirt_lazy_mode mode)
+static void leave_lazy(enum paravirt_lazy_mode mode)
{
- BUG_ON(__get_cpu_var(paravirt_lazy_mode) != mode);
- BUG_ON(preemptible());
+ BUG_ON(percpu_read(paravirt_lazy_mode) != mode);
- __get_cpu_var(paravirt_lazy_mode) = PARAVIRT_LAZY_NONE;
+ percpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE);
}
void paravirt_enter_lazy_mmu(void)
void paravirt_leave_lazy_mmu(void)
{
- paravirt_leave_lazy(PARAVIRT_LAZY_MMU);
+ leave_lazy(PARAVIRT_LAZY_MMU);
}
-void paravirt_enter_lazy_cpu(void)
+void paravirt_start_context_switch(struct task_struct *prev)
{
+ BUG_ON(preemptible());
+
+ if (percpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) {
+ arch_leave_lazy_mmu_mode();
+ set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES);
+ }
enter_lazy(PARAVIRT_LAZY_CPU);
}
-void paravirt_leave_lazy_cpu(void)
+void paravirt_end_context_switch(struct task_struct *next)
{
- paravirt_leave_lazy(PARAVIRT_LAZY_CPU);
+ BUG_ON(preemptible());
+
+ leave_lazy(PARAVIRT_LAZY_CPU);
+
+ if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES))
+ arch_enter_lazy_mmu_mode();
}
enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
{
- return __get_cpu_var(paravirt_lazy_mode);
+ if (in_interrupt())
+ return PARAVIRT_LAZY_NONE;
+
+ return percpu_read(paravirt_lazy_mode);
}
void arch_flush_lazy_mmu_mode(void)
preempt_disable();
if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
- WARN_ON(preempt_count() == 1);
arch_leave_lazy_mmu_mode();
arch_enter_lazy_mmu_mode();
}
preempt_enable();
}
-void arch_flush_lazy_cpu_mode(void)
-{
- preempt_disable();
-
- if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
- WARN_ON(preempt_count() == 1);
- arch_leave_lazy_cpu_mode();
- arch_enter_lazy_cpu_mode();
- }
-
- preempt_enable();
-}
-
struct pv_info pv_info = {
.name = "bare hardware",
.paravirt_enabled = 0,
.set_iopl_mask = native_set_iopl_mask,
.io_delay = native_io_delay,
- .lazy_mode = {
- .enter = paravirt_nop,
- .leave = paravirt_nop,
- },
+ .start_context_switch = paravirt_nop,
+ .end_context_switch = paravirt_nop,
};
struct pv_apic_ops pv_apic_ops = {
#if PAGETABLE_LEVELS >= 3
#ifdef CONFIG_X86_PAE
.set_pte_atomic = native_set_pte_atomic,
- .set_pte_present = native_set_pte_present,
.pte_clear = native_pte_clear,
.pmd_clear = native_pmd_clear,
#endif
unlazy_fpu(tsk);
}
- int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
+ int copy_thread(unsigned long clone_flags, unsigned long sp,
unsigned long unused,
struct task_struct *p, struct pt_regs *regs)
{
* done before math_state_restore, so the TS bit is up
* to date.
*/
- arch_leave_lazy_cpu_mode();
+ arch_end_context_switch(next_p);
/* If the task has used fpu the last 5 timeslices, just do a full
* restore of the math state immediately to avoid the trap; the
unlazy_fpu(tsk);
}
- int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
+ int copy_thread(unsigned long clone_flags, unsigned long sp,
unsigned long unused,
struct task_struct *p, struct pt_regs *regs)
{
* done before math_state_restore, so the TS bit is up
* to date.
*/
- arch_leave_lazy_cpu_mode();
+ arch_end_context_switch(next_p);
/*
* Switch FS and GS.
vmi_ops.update_pte(ptep, VMI_PAGE_PT);
}
- static void vmi_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
- {
- vmi_ops.set_pte(pte, ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 1));
- }
-
static void vmi_set_pud(pud_t *pudp, pud_t pudval)
{
/* Um, eww */
}
#endif
-static void vmi_enter_lazy_cpu(void)
+static void vmi_start_context_switch(struct task_struct *prev)
{
- paravirt_enter_lazy_cpu();
+ paravirt_start_context_switch(prev);
vmi_ops.set_lazy_mode(2);
}
+static void vmi_end_context_switch(struct task_struct *next)
+{
+ vmi_ops.set_lazy_mode(0);
+ paravirt_end_context_switch(next);
+}
+
static void vmi_enter_lazy_mmu(void)
{
paravirt_enter_lazy_mmu();
vmi_ops.set_lazy_mode(1);
}
-static void vmi_leave_lazy(void)
+static void vmi_leave_lazy_mmu(void)
{
- paravirt_leave_lazy(paravirt_get_lazy_mode());
vmi_ops.set_lazy_mode(0);
+ paravirt_leave_lazy_mmu();
}
static inline int __init check_vmi_rom(struct vrom_header *rom)
para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask);
para_fill(pv_cpu_ops.io_delay, IODelay);
- para_wrap(pv_cpu_ops.lazy_mode.enter, vmi_enter_lazy_cpu,
+ para_wrap(pv_cpu_ops.start_context_switch, vmi_start_context_switch,
set_lazy_mode, SetLazyMode);
- para_wrap(pv_cpu_ops.lazy_mode.leave, vmi_leave_lazy,
+ para_wrap(pv_cpu_ops.end_context_switch, vmi_end_context_switch,
set_lazy_mode, SetLazyMode);
para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu,
set_lazy_mode, SetLazyMode);
- para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy,
+ para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy_mmu,
set_lazy_mode, SetLazyMode);
/* user and kernel flush are just handled with different flags to FlushTLB */
pv_mmu_ops.set_pmd = vmi_set_pmd;
#ifdef CONFIG_X86_PAE
pv_mmu_ops.set_pte_atomic = vmi_set_pte_atomic;
- pv_mmu_ops.set_pte_present = vmi_set_pte_present;
pv_mmu_ops.set_pud = vmi_set_pud;
pv_mmu_ops.pte_clear = vmi_pte_clear;
pv_mmu_ops.pmd_clear = vmi_pmd_clear;
local_irq_save(flags);
if (lguest_data.hcall_status[next_call] != 0xFF) {
/* Table full, so do normal hcall which will flush table. */
- hcall(call, arg1, arg2, arg3);
+ kvm_hypercall3(call, arg1, arg2, arg3);
} else {
lguest_data.hcalls[next_call].arg0 = call;
lguest_data.hcalls[next_call].arg1 = arg1;
*
* So, when we're in lazy mode, we call async_hcall() to store the call for
* future processing: */
- static void lazy_hcall(unsigned long call,
+ static void lazy_hcall1(unsigned long call,
+ unsigned long arg1)
+ {
+ if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
+ kvm_hypercall1(call, arg1);
+ else
+ async_hcall(call, arg1, 0, 0);
+ }
+
+ static void lazy_hcall2(unsigned long call,
+ unsigned long arg1,
+ unsigned long arg2)
+ {
+ if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
+ kvm_hypercall2(call, arg1, arg2);
+ else
+ async_hcall(call, arg1, arg2, 0);
+ }
+
+ static void lazy_hcall3(unsigned long call,
unsigned long arg1,
unsigned long arg2,
unsigned long arg3)
{
if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
- hcall(call, arg1, arg2, arg3);
+ kvm_hypercall3(call, arg1, arg2, arg3);
else
async_hcall(call, arg1, arg2, arg3);
}
/* When lazy mode is turned off reset the per-cpu lazy mode variable and then
* issue the do-nothing hypercall to flush any stored calls. */
-static void lguest_leave_lazy_mode(void)
+static void lguest_leave_lazy_mmu_mode(void)
+{
+ hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0);
+ paravirt_leave_lazy_mmu();
+}
+
+static void lguest_end_context_switch(struct task_struct *next)
{
- hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0);
- paravirt_leave_lazy(paravirt_get_lazy_mode());
+ kvm_hypercall0(LHCALL_FLUSH_ASYNC);
+ paravirt_end_context_switch(next);
}
/*G:033
/* Keep the local copy up to date. */
native_write_idt_entry(dt, entrynum, g);
/* Tell Host about this new entry. */
- hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1]);
+ kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1]);
}
/* Changing to a different IDT is very rare: we keep the IDT up-to-date every
struct desc_struct *idt = (void *)desc->address;
for (i = 0; i < (desc->size+1)/8; i++)
- hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b);
+ kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b);
}
/*
*/
static void lguest_load_gdt(const struct desc_ptr *desc)
{
- BUG_ON((desc->size+1)/8 != GDT_ENTRIES);
- hcall(LHCALL_LOAD_GDT, __pa(desc->address), GDT_ENTRIES, 0);
+ BUG_ON((desc->size + 1) / 8 != GDT_ENTRIES);
+ kvm_hypercall2(LHCALL_LOAD_GDT, __pa(desc->address), GDT_ENTRIES);
}
/* For a single GDT entry which changes, we do the lazy thing: alter our GDT,
const void *desc, int type)
{
native_write_gdt_entry(dt, entrynum, desc, type);
- hcall(LHCALL_LOAD_GDT, __pa(dt), GDT_ENTRIES, 0);
+ kvm_hypercall2(LHCALL_LOAD_GDT, __pa(dt), GDT_ENTRIES);
}
/* OK, I lied. There are three "thread local storage" GDT entries which change
* can't handle us removing entries we're currently using. So we clear
* the GS register here: if it's needed it'll be reloaded anyway. */
lazy_load_gs(0);
- lazy_hcall(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu, 0);
+ lazy_hcall2(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu);
}
/*G:038 That's enough excitement for now, back to ploughing through each of
static unsigned long current_cr0;
static void lguest_write_cr0(unsigned long val)
{
- lazy_hcall(LHCALL_TS, val & X86_CR0_TS, 0, 0);
+ lazy_hcall1(LHCALL_TS, val & X86_CR0_TS);
current_cr0 = val;
}
* the vowels have been optimized out. */
static void lguest_clts(void)
{
- lazy_hcall(LHCALL_TS, 0, 0, 0);
+ lazy_hcall1(LHCALL_TS, 0);
current_cr0 &= ~X86_CR0_TS;
}
static void lguest_write_cr3(unsigned long cr3)
{
lguest_data.pgdir = cr3;
- lazy_hcall(LHCALL_NEW_PGTABLE, cr3, 0, 0);
+ lazy_hcall1(LHCALL_NEW_PGTABLE, cr3);
cr3_changed = true;
}
* into a process' address space. We set the entry then tell the Host the
* toplevel and address this corresponds to. The Guest uses one pagetable per
* process, so we need to tell the Host which one we're changing (mm->pgd). */
+ static void lguest_pte_update(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep)
+ {
+ lazy_hcall3(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low);
+ }
+
static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pteval)
{
*ptep = pteval;
- lazy_hcall(LHCALL_SET_PTE, __pa(mm->pgd), addr, pteval.pte_low);
+ lguest_pte_update(mm, addr, ptep);
}
/* The Guest calls this to set a top-level entry. Again, we set the entry then
static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
{
*pmdp = pmdval;
- lazy_hcall(LHCALL_SET_PMD, __pa(pmdp)&PAGE_MASK,
- (__pa(pmdp)&(PAGE_SIZE-1))/4, 0);
+ lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK,
+ (__pa(pmdp) & (PAGE_SIZE - 1)) / 4);
}
/* There are a couple of legacy places where the kernel sets a PTE, but we
{
*ptep = pteval;
if (cr3_changed)
- lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0);
+ lazy_hcall1(LHCALL_FLUSH_TLB, 1);
}
/* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on
static void lguest_flush_tlb_single(unsigned long addr)
{
/* Simply set it to zero: if it was not, it will fault back in. */
- lazy_hcall(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0);
+ lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0);
}
/* This is what happens after the Guest has removed a large number of entries.
* have changed, ie. virtual addresses below PAGE_OFFSET. */
static void lguest_flush_tlb_user(void)
{
- lazy_hcall(LHCALL_FLUSH_TLB, 0, 0, 0);
+ lazy_hcall1(LHCALL_FLUSH_TLB, 0);
}
/* This is called when the kernel page tables have changed. That's not very
* slow), so it's worth separating this from the user flushing above. */
static void lguest_flush_tlb_kernel(void)
{
- lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0);
+ lazy_hcall1(LHCALL_FLUSH_TLB, 1);
}
/*
}
/* Please wake us this far in the future. */
- hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0);
+ kvm_hypercall1(LHCALL_SET_CLOCKEVENT, delta);
return 0;
}
case CLOCK_EVT_MODE_UNUSED:
case CLOCK_EVT_MODE_SHUTDOWN:
/* A 0 argument shuts the clock down. */
- hcall(LHCALL_SET_CLOCKEVENT, 0, 0, 0);
+ kvm_hypercall0(LHCALL_SET_CLOCKEVENT);
break;
case CLOCK_EVT_MODE_ONESHOT:
/* This is what we expect. */
static void lguest_load_sp0(struct tss_struct *tss,
struct thread_struct *thread)
{
- lazy_hcall(LHCALL_SET_STACK, __KERNEL_DS|0x1, thread->sp0,
- THREAD_SIZE/PAGE_SIZE);
+ lazy_hcall3(LHCALL_SET_STACK, __KERNEL_DS | 0x1, thread->sp0,
+ THREAD_SIZE / PAGE_SIZE);
}
/* Let's just say, I wouldn't do debugging under a Guest. */
/* STOP! Until an interrupt comes in. */
static void lguest_safe_halt(void)
{
- hcall(LHCALL_HALT, 0, 0, 0);
+ kvm_hypercall0(LHCALL_HALT);
}
/* The SHUTDOWN hypercall takes a string to describe what's happening, and
* rather than virtual addresses, so we use __pa() here. */
static void lguest_power_off(void)
{
- hcall(LHCALL_SHUTDOWN, __pa("Power down"), LGUEST_SHUTDOWN_POWEROFF, 0);
+ kvm_hypercall2(LHCALL_SHUTDOWN, __pa("Power down"),
+ LGUEST_SHUTDOWN_POWEROFF);
}
/*
*/
static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p)
{
- hcall(LHCALL_SHUTDOWN, __pa(p), LGUEST_SHUTDOWN_POWEROFF, 0);
+ kvm_hypercall2(LHCALL_SHUTDOWN, __pa(p), LGUEST_SHUTDOWN_POWEROFF);
/* The hcall won't return, but to keep gcc happy, we're "done". */
return NOTIFY_DONE;
}
len = sizeof(scratch) - 1;
scratch[len] = '\0';
memcpy(scratch, buf, len);
- hcall(LHCALL_NOTIFY, __pa(scratch), 0, 0);
+ kvm_hypercall1(LHCALL_NOTIFY, __pa(scratch));
/* This routine returns the number of bytes actually written. */
return len;
* Launcher to reboot us. */
static void lguest_restart(char *reason)
{
- hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0);
+ kvm_hypercall2(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART);
}
/*G:050
pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry;
pv_cpu_ops.write_idt_entry = lguest_write_idt_entry;
pv_cpu_ops.wbinvd = lguest_wbinvd;
- pv_cpu_ops.lazy_mode.enter = paravirt_enter_lazy_cpu;
- pv_cpu_ops.lazy_mode.leave = lguest_leave_lazy_mode;
+ pv_cpu_ops.start_context_switch = paravirt_start_context_switch;
+ pv_cpu_ops.end_context_switch = lguest_end_context_switch;
/* pagetable management */
pv_mmu_ops.write_cr3 = lguest_write_cr3;
pv_mmu_ops.read_cr2 = lguest_read_cr2;
pv_mmu_ops.read_cr3 = lguest_read_cr3;
pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu;
- pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mode;
+ pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mmu_mode;
+ pv_mmu_ops.pte_update = lguest_pte_update;
+ pv_mmu_ops.pte_update_defer = lguest_pte_update;
#ifdef CONFIG_X86_LOCAL_APIC
/* apic read/write intercepts */
* lguest_init() where the rest of the fairly chaotic boot setup
* occurs. */
- /* The native boot code sets up initial page tables immediately after
- * the kernel itself, and sets init_pg_tables_end so they're not
- * clobbered. The Launcher places our initial pagetables somewhere at
- * the top of our physical memory, so we don't need extra space: set
- * init_pg_tables_end to the end of the kernel. */
- init_pg_tables_start = __pa(pg0);
- init_pg_tables_end = __pa(pg0);
-
/* As described in head_32.S, we map the first 128M of memory. */
max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT;
kunmap_high(page);
}
- static void debug_kmap_atomic_prot(enum km_type type)
- {
- #ifdef CONFIG_DEBUG_HIGHMEM
- static unsigned warn_count = 10;
-
- if (unlikely(warn_count == 0))
- return;
-
- if (unlikely(in_interrupt())) {
- if (in_irq()) {
- if (type != KM_IRQ0 && type != KM_IRQ1 &&
- type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ &&
- type != KM_BOUNCE_READ) {
- WARN_ON(1);
- warn_count--;
- }
- } else if (!irqs_disabled()) { /* softirq */
- if (type != KM_IRQ0 && type != KM_IRQ1 &&
- type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 &&
- type != KM_SKB_SUNRPC_DATA &&
- type != KM_SKB_DATA_SOFTIRQ &&
- type != KM_BOUNCE_READ) {
- WARN_ON(1);
- warn_count--;
- }
- }
- }
-
- if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ ||
- type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) {
- if (!irqs_disabled()) {
- WARN_ON(1);
- warn_count--;
- }
- } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) {
- if (irq_count() == 0 && !irqs_disabled()) {
- WARN_ON(1);
- warn_count--;
- }
- }
- #endif
- }
-
/*
* kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
* no global lock is needed and because the kmap code must perform a global TLB
if (!PageHighMem(page))
return page_address(page);
- debug_kmap_atomic_prot(type);
+ debug_kmap_atomic(type);
idx = type + KM_TYPE_NR*smp_processor_id();
vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
BUG_ON(!pte_none(*(kmap_pte-idx)));
set_pte(kmap_pte-idx, mk_pte(page, prot));
- arch_flush_lazy_mmu_mode();
return (void *)vaddr;
}
#endif
}
- arch_flush_lazy_mmu_mode();
pagefault_enable();
}
- /* This is the same as kmap_atomic() but can map memory that doesn't
+ /*
+ * This is the same as kmap_atomic() but can map memory that doesn't
* have a struct page associated with it.
*/
void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
{
- enum fixed_addresses idx;
- unsigned long vaddr;
-
- pagefault_disable();
-
- idx = type + KM_TYPE_NR*smp_processor_id();
- vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
- set_pte(kmap_pte-idx, pfn_pte(pfn, kmap_prot));
- arch_flush_lazy_mmu_mode();
-
- return (void*) vaddr;
+ return kmap_atomic_prot_pfn(pfn, type, kmap_prot);
}
EXPORT_SYMBOL_GPL(kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */
EXPORT_SYMBOL(kmap_atomic);
EXPORT_SYMBOL(kunmap_atomic);
- #ifdef CONFIG_NUMA
void __init set_highmem_pages_init(void)
{
struct zone *zone;
}
totalram_pages += totalhigh_pages;
}
- #else
- void __init set_highmem_pages_init(void)
- {
- add_highpages_with_active_regions(0, highstart_pfn, highend_pfn);
-
- totalram_pages += totalhigh_pages;
- }
- #endif /* CONFIG_NUMA */
#include <asm/iomap.h>
#include <asm/pat.h>
#include <linux/module.h>
+ #include <linux/highmem.h>
int is_io_mapping_possible(resource_size_t base, unsigned long size)
{
- #ifndef CONFIG_X86_PAE
+ #if !defined(CONFIG_X86_PAE) && defined(CONFIG_PHYS_ADDR_T_64BIT)
/* There is no way to map greater than 1 << 32 address without PAE */
if (base + size > 0x100000000ULL)
return 0;
}
EXPORT_SYMBOL_GPL(is_io_mapping_possible);
- /* Map 'pfn' using fixed map 'type' and protections 'prot'
- */
- void *
- iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
+ void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
{
enum fixed_addresses idx;
unsigned long vaddr;
pagefault_disable();
+ debug_kmap_atomic(type);
+ idx = type + KM_TYPE_NR * smp_processor_id();
+ vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
+ set_pte(kmap_pte - idx, pfn_pte(pfn, prot));
+ arch_flush_lazy_mmu_mode();
+
+ return (void *)vaddr;
+ }
+
+ /*
+ * Map 'pfn' using fixed map 'type' and protections 'prot'
+ */
+ void *
+ iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
+ {
/*
* For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS.
* PAGE_KERNEL_WC maps to PWT, which translates to uncached if the
if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC))
prot = PAGE_KERNEL_UC_MINUS;
- idx = type + KM_TYPE_NR*smp_processor_id();
- vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
- set_pte(kmap_pte-idx, pfn_pte(pfn, prot));
- arch_flush_lazy_mmu_mode();
-
- return (void*) vaddr;
+ return kmap_atomic_prot_pfn(pfn, type, prot);
}
EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn);
if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx))
kpte_clear_flush(kmap_pte-idx, vaddr);
- arch_flush_lazy_mmu_mode();
pagefault_enable();
}
EXPORT_SYMBOL_GPL(iounmap_atomic);
#include <asm/processor.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
+ #include <asm/setup.h>
#include <asm/uaccess.h>
#include <asm/pgalloc.h>
#include <asm/proto.h>
unsigned long pfn;
unsigned force_split : 1;
int curpage;
+ struct page **pages;
};
/*
#define CPA_FLUSHTLB 1
#define CPA_ARRAY 2
+ #define CPA_PAGES_ARRAY 4
#ifdef CONFIG_PROC_FS
static unsigned long direct_pages_count[PG_LEVEL_NUM];
static inline unsigned long highmap_end_pfn(void)
{
- return __pa(roundup((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT;
+ return __pa(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT;
}
#endif
}
}
- static void cpa_flush_array(unsigned long *start, int numpages, int cache)
+ static void cpa_flush_array(unsigned long *start, int numpages, int cache,
+ int in_flags, struct page **pages)
{
unsigned int i, level;
- unsigned long *addr;
BUG_ON(irqs_disabled());
* will cause all other CPUs to flush the same
* cachelines:
*/
- for (i = 0, addr = start; i < numpages; i++, addr++) {
- pte_t *pte = lookup_address(*addr, &level);
+ for (i = 0; i < numpages; i++) {
+ unsigned long addr;
+ pte_t *pte;
+
+ if (in_flags & CPA_PAGES_ARRAY)
+ addr = (unsigned long)page_address(pages[i]);
+ else
+ addr = start[i];
+
+ pte = lookup_address(addr, &level);
/*
* Only flush present addresses:
*/
if (pte && (pte_val(*pte) & _PAGE_PRESENT))
- clflush_cache_range((void *) *addr, PAGE_SIZE);
+ clflush_cache_range((void *)addr, PAGE_SIZE);
}
}
unsigned int level;
pte_t *kpte, old_pte;
- if (cpa->flags & CPA_ARRAY)
+ if (cpa->flags & CPA_PAGES_ARRAY)
+ address = (unsigned long)page_address(cpa->pages[cpa->curpage]);
+ else if (cpa->flags & CPA_ARRAY)
address = cpa->vaddr[cpa->curpage];
else
address = *cpa->vaddr;
* No need to redo, when the primary call touched the direct
* mapping already:
*/
- if (cpa->flags & CPA_ARRAY)
+ if (cpa->flags & CPA_PAGES_ARRAY)
+ vaddr = (unsigned long)page_address(cpa->pages[cpa->curpage]);
+ else if (cpa->flags & CPA_ARRAY)
vaddr = cpa->vaddr[cpa->curpage];
else
vaddr = *cpa->vaddr;
alias_cpa = *cpa;
temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
alias_cpa.vaddr = &temp_cpa_vaddr;
- alias_cpa.flags &= ~CPA_ARRAY;
+ alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
ret = __change_page_attr_set_clr(&alias_cpa, 0);
* No need to redo, when the primary call touched the high
* mapping already:
*/
- if (within(vaddr, (unsigned long) _text, (unsigned long) _end))
+ if (within(vaddr, (unsigned long) _text, _brk_end))
return 0;
/*
alias_cpa = *cpa;
temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base;
alias_cpa.vaddr = &temp_cpa_vaddr;
- alias_cpa.flags &= ~CPA_ARRAY;
+ alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
/*
* The high mapping range is imprecise, so ignore the return value.
*/
cpa->numpages = numpages;
/* for array changes, we can't use large page */
- if (cpa->flags & CPA_ARRAY)
+ if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY))
cpa->numpages = 1;
if (!debug_pagealloc)
*/
BUG_ON(cpa->numpages > numpages);
numpages -= cpa->numpages;
- if (cpa->flags & CPA_ARRAY)
+ if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY))
cpa->curpage++;
else
*cpa->vaddr += cpa->numpages * PAGE_SIZE;
static int change_page_attr_set_clr(unsigned long *addr, int numpages,
pgprot_t mask_set, pgprot_t mask_clr,
- int force_split, int array)
+ int force_split, int in_flag,
+ struct page **pages)
{
struct cpa_data cpa;
int ret, cache, checkalias;
return 0;
/* Ensure we are PAGE_SIZE aligned */
- if (!array) {
- if (*addr & ~PAGE_MASK) {
- *addr &= PAGE_MASK;
- /*
- * People should not be passing in unaligned addresses:
- */
- WARN_ON_ONCE(1);
- }
- } else {
+ if (in_flag & CPA_ARRAY) {
int i;
for (i = 0; i < numpages; i++) {
if (addr[i] & ~PAGE_MASK) {
WARN_ON_ONCE(1);
}
}
+ } else if (!(in_flag & CPA_PAGES_ARRAY)) {
+ /*
+ * in_flag of CPA_PAGES_ARRAY implies it is aligned.
+ * No need to cehck in that case
+ */
+ if (*addr & ~PAGE_MASK) {
+ *addr &= PAGE_MASK;
+ /*
+ * People should not be passing in unaligned addresses:
+ */
+ WARN_ON_ONCE(1);
+ }
}
/* Must avoid aliasing mappings in the highmem code */
vm_unmap_aliases();
- /*
- * If we're called with lazy mmu updates enabled, the
- * in-memory pte state may be stale. Flush pending updates to
- * bring them up to date.
- */
- arch_flush_lazy_mmu_mode();
-
cpa.vaddr = addr;
+ cpa.pages = pages;
cpa.numpages = numpages;
cpa.mask_set = mask_set;
cpa.mask_clr = mask_clr;
cpa.curpage = 0;
cpa.force_split = force_split;
- if (array)
- cpa.flags |= CPA_ARRAY;
+ if (in_flag & (CPA_ARRAY | CPA_PAGES_ARRAY))
+ cpa.flags |= in_flag;
/* No alias checking for _NX bit modifications */
checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
* wbindv):
*/
if (!ret && cpu_has_clflush) {
- if (cpa.flags & CPA_ARRAY)
- cpa_flush_array(addr, numpages, cache);
- else
+ if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
+ cpa_flush_array(addr, numpages, cache,
+ cpa.flags, pages);
+ } else
cpa_flush_range(*addr, numpages, cache);
} else
cpa_flush_all(cache);
- /*
- * If we've been called with lazy mmu updates enabled, then
- * make sure that everything gets flushed out before we
- * return.
- */
- arch_flush_lazy_mmu_mode();
-
out:
return ret;
}
pgprot_t mask, int array)
{
return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0,
- array);
+ (array ? CPA_ARRAY : 0), NULL);
}
static inline int change_page_attr_clear(unsigned long *addr, int numpages,
pgprot_t mask, int array)
{
return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0,
- array);
+ (array ? CPA_ARRAY : 0), NULL);
+ }
+
+ static inline int cpa_set_pages_array(struct page **pages, int numpages,
+ pgprot_t mask)
+ {
+ return change_page_attr_set_clr(NULL, numpages, mask, __pgprot(0), 0,
+ CPA_PAGES_ARRAY, pages);
+ }
+
+ static inline int cpa_clear_pages_array(struct page **pages, int numpages,
+ pgprot_t mask)
+ {
+ return change_page_attr_set_clr(NULL, numpages, __pgprot(0), mask, 0,
+ CPA_PAGES_ARRAY, pages);
}
int _set_memory_uc(unsigned long addr, int numpages)
int set_memory_4k(unsigned long addr, int numpages)
{
return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
- __pgprot(0), 1, 0);
+ __pgprot(0), 1, 0, NULL);
}
int set_pages_uc(struct page *page, int numpages)
}
EXPORT_SYMBOL(set_pages_uc);
+ int set_pages_array_uc(struct page **pages, int addrinarray)
+ {
+ unsigned long start;
+ unsigned long end;
+ int i;
+ int free_idx;
+
+ for (i = 0; i < addrinarray; i++) {
+ start = (unsigned long)page_address(pages[i]);
+ end = start + PAGE_SIZE;
+ if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL))
+ goto err_out;
+ }
+
+ if (cpa_set_pages_array(pages, addrinarray,
+ __pgprot(_PAGE_CACHE_UC_MINUS)) == 0) {
+ return 0; /* Success */
+ }
+ err_out:
+ free_idx = i;
+ for (i = 0; i < free_idx; i++) {
+ start = (unsigned long)page_address(pages[i]);
+ end = start + PAGE_SIZE;
+ free_memtype(start, end);
+ }
+ return -EINVAL;
+ }
+ EXPORT_SYMBOL(set_pages_array_uc);
+
int set_pages_wb(struct page *page, int numpages)
{
unsigned long addr = (unsigned long)page_address(page);
}
EXPORT_SYMBOL(set_pages_wb);
+ int set_pages_array_wb(struct page **pages, int addrinarray)
+ {
+ int retval;
+ unsigned long start;
+ unsigned long end;
+ int i;
+
+ retval = cpa_clear_pages_array(pages, addrinarray,
+ __pgprot(_PAGE_CACHE_MASK));
+
+ for (i = 0; i < addrinarray; i++) {
+ start = (unsigned long)page_address(pages[i]);
+ end = start + PAGE_SIZE;
+ free_memtype(start, end);
+ }
+
+ return retval;
+ }
+ EXPORT_SYMBOL(set_pages_array_wb);
+
int set_pages_x(struct page *page, int numpages)
{
unsigned long addr = (unsigned long)page_address(page);
}
/* Build the parallel p2m_top_mfn structures */
-void xen_setup_mfn_list_list(void)
+static void __init xen_build_mfn_list_list(void)
{
unsigned pfn, idx;
unsigned topidx = idx * P2M_ENTRIES_PER_PAGE;
p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]);
}
+}
+void xen_setup_mfn_list_list(void)
+{
BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
p2m_top[topidx] = &mfn_list[pfn];
}
+
+ xen_build_mfn_list_list();
}
unsigned long get_phys_to_machine(unsigned long pfn)
}
EXPORT_SYMBOL_GPL(get_phys_to_machine);
-static void alloc_p2m(unsigned long **pp, unsigned long *mfnp)
+/* install a new p2m_top page */
+bool install_p2mtop_page(unsigned long pfn, unsigned long *p)
{
- unsigned long *p;
+ unsigned topidx = p2m_top_index(pfn);
+ unsigned long **pfnp, *mfnp;
unsigned i;
- p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
- BUG_ON(p == NULL);
+ pfnp = &p2m_top[topidx];
+ mfnp = &p2m_top_mfn[topidx];
for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
p[i] = INVALID_P2M_ENTRY;
- if (cmpxchg(pp, p2m_missing, p) != p2m_missing)
- free_page((unsigned long)p);
- else
+ if (cmpxchg(pfnp, p2m_missing, p) == p2m_missing) {
*mfnp = virt_to_mfn(p);
+ return true;
+ }
+
+ return false;
}
-void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
+static void alloc_p2m(unsigned long pfn)
{
- unsigned topidx, idx;
+ unsigned long *p;
- if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
- BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
- return;
- }
+ p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
+ BUG_ON(p == NULL);
+
+ if (!install_p2mtop_page(pfn, p))
+ free_page((unsigned long)p);
+}
+
+/* Try to install p2m mapping; fail if intermediate bits missing */
+bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
+{
+ unsigned topidx, idx;
if (unlikely(pfn >= MAX_DOMAIN_PAGES)) {
BUG_ON(mfn != INVALID_P2M_ENTRY);
- return;
+ return true;
}
topidx = p2m_top_index(pfn);
if (p2m_top[topidx] == p2m_missing) {
- /* no need to allocate a page to store an invalid entry */
if (mfn == INVALID_P2M_ENTRY)
- return;
- alloc_p2m(&p2m_top[topidx], &p2m_top_mfn[topidx]);
+ return true;
+ return false;
}
idx = p2m_index(pfn);
p2m_top[topidx][idx] = mfn;
+
+ return true;
+}
+
+void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
+{
+ if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
+ BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
+ return;
+ }
+
+ if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
+ alloc_p2m(pfn);
+
+ if (!__set_phys_to_machine(pfn, mfn))
+ BUG();
+ }
}
unsigned long arbitrary_virt_to_mfn(void *vaddr)
void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pteval)
{
- /* updates to init_mm may be done without lock */
- if (mm == &init_mm)
- preempt_disable();
-
ADD_STATS(set_pte_at, 1);
// ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
ADD_STATS(set_pte_at_current, mm == current->mm);
}
xen_set_pte(ptep, pteval);
-out:
- if (mm == &init_mm)
- preempt_enable();
+out: return;
}
pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
return 0;
}
-void __init xen_mark_init_mm_pinned(void)
+static void __init xen_mark_init_mm_pinned(void)
{
xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
}
/* If this cpu still has a stale cr3 reference, then make sure
it has been flushed. */
- if (percpu_read(xen_current_cr3) == __pa(mm->pgd)) {
+ if (percpu_read(xen_current_cr3) == __pa(mm->pgd))
load_cr3(swapper_pg_dir);
- arch_flush_lazy_cpu_mode();
- }
}
static void xen_drop_mm_ref(struct mm_struct *mm)
load_cr3(swapper_pg_dir);
else
leave_mm(smp_processor_id());
- arch_flush_lazy_cpu_mode();
}
/* Get the "official" set of cpus referring to our pagetable. */
} *args;
struct multicall_space mcs;
- BUG_ON(cpumask_empty(cpus));
- BUG_ON(!mm);
+ if (cpumask_empty(cpus))
+ return; /* nothing to do */
mcs = xen_mc_entry(sizeof(*args));
args = mcs.args;
}
#endif
+static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
+{
+ struct mmuext_op op;
+ op.cmd = cmd;
+ op.arg1.mfn = pfn_to_mfn(pfn);
+ if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
+ BUG();
+}
+
/* Early in boot, while setting up the initial pagetable, assume
everything is pinned. */
static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
{
+#ifdef CONFIG_FLATMEM
+ BUG_ON(mem_map); /* should only be used early */
+#endif
+ make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
+ pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
+}
+
+/* Used for pmd and pud */
+static __init void xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
+{
#ifdef CONFIG_FLATMEM
BUG_ON(mem_map); /* should only be used early */
#endif
/* Early release_pte assumes that all pts are pinned, since there's
only init_mm and anything attached to that is pinned. */
-static void xen_release_pte_init(unsigned long pfn)
+static __init void xen_release_pte_init(unsigned long pfn)
{
+ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
}
-static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
+static __init void xen_release_pmd_init(unsigned long pfn)
{
- struct mmuext_op op;
- op.cmd = cmd;
- op.arg1.mfn = pfn_to_mfn(pfn);
- if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
- BUG();
+ make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
}
/* This needs to make sure the new pte page is pinned iff its being
{
pmd_t *kernel_pmd;
- init_pg_tables_start = __pa(pgd);
- init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
- max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
+ max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
+ xen_start_info->nr_pt_frames * PAGE_SIZE +
+ 512*1024);
kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
xen_mark_init_mm_pinned();
}
+static void xen_leave_lazy_mmu(void)
+{
+ preempt_disable();
+ xen_mc_flush();
+ paravirt_leave_lazy_mmu();
+ preempt_enable();
+}
const struct pv_mmu_ops xen_mmu_ops __initdata = {
.pagetable_setup_start = xen_pagetable_setup_start,
.alloc_pte = xen_alloc_pte_init,
.release_pte = xen_release_pte_init,
- .alloc_pmd = xen_alloc_pte_init,
+ .alloc_pmd = xen_alloc_pmd_init,
.alloc_pmd_clone = paravirt_nop,
- .release_pmd = xen_release_pte_init,
+ .release_pmd = xen_release_pmd_init,
#ifdef CONFIG_HIGHPTE
.kmap_atomic_pte = xen_kmap_atomic_pte,
#ifdef CONFIG_X86_PAE
.set_pte_atomic = xen_set_pte_atomic,
- .set_pte_present = xen_set_pte_at,
.pte_clear = xen_pte_clear,
.pmd_clear = xen_pmd_clear,
#endif /* CONFIG_X86_PAE */
.make_pud = PV_CALLEE_SAVE(xen_make_pud),
.set_pgd = xen_set_pgd_hyper,
- .alloc_pud = xen_alloc_pte_init,
- .release_pud = xen_release_pte_init,
+ .alloc_pud = xen_alloc_pmd_init,
+ .release_pud = xen_release_pmd_init,
#endif /* PAGETABLE_LEVELS == 4 */
.activate_mm = xen_activate_mm,
.lazy_mode = {
.enter = paravirt_enter_lazy_mmu,
- .leave = xen_leave_lazy,
+ .leave = xen_leave_lazy_mmu,
},
.set_fixmap = xen_set_fixmap,
rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
if (rc >= 0) {
num_processors++;
- cpu_set(i, cpu_possible_map);
+ set_cpu_possible(i, true);
}
}
}
while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) {
for (cpu = nr_cpu_ids - 1; !cpu_possible(cpu); cpu--)
continue;
- cpu_clear(cpu, cpu_possible_map);
+ set_cpu_possible(cpu, false);
}
for_each_possible_cpu (cpu) {
if (IS_ERR(idle))
panic("failed fork for CPU %d", cpu);
- cpu_set(cpu, cpu_present_map);
+ set_cpu_present(cpu, true);
}
}
BUG_ON(rc);
while(per_cpu(cpu_state, cpu) != CPU_ONLINE) {
- HYPERVISOR_sched_op(SCHEDOP_yield, 0);
+ HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
barrier();
}
/* Make sure other vcpus get a chance to run if they need to. */
for_each_cpu(cpu, mask) {
if (xen_vcpu_stolen(cpu)) {
- HYPERVISOR_sched_op(SCHEDOP_yield, 0);
+ HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
break;
}
}
secure, but slightly less efficient.
If in doubt, say yes.
+config XEN_DEV_EVTCHN
+ tristate "Xen /dev/xen/evtchn device"
+ depends on XEN
+ default y
+ help
+ The evtchn driver allows a userspace process to triger event
+ channels and to receive notification of an event channel
+ firing.
+ If in doubt, say yes.
+
config XENFS
tristate "Xen filesystem"
depends on XEN
The old xenstore userspace tools expect to find "xenbus"
under /proc/xen, but "xenbus" is now found at the root of the
xenfs filesystem. Selecting this causes the kernel to create
- the compatibilty mount point /proc/xen if it is running on
+ the compatibility mount point /proc/xen if it is running on
a xen platform.
If in doubt, say yes.
+config XEN_SYS_HYPERVISOR
+ bool "Create xen entries under /sys/hypervisor"
+ depends on XEN && SYSFS
+ select SYS_HYPERVISOR
+ default y
+ help
+ Create entries under /sys/hypervisor describing the Xen
+ hypervisor environment. When running native or in another
+ virtual environment, /sys/hypervisor will still be present,
+ but will have no xen contents.
if (!cpu_present(cpu))
arch_register_cpu(cpu);
- cpu_set(cpu, cpu_present_map);
+ set_cpu_present(cpu, true);
}
static void disable_hotplug_cpu(int cpu)
if (cpu_present(cpu))
arch_unregister_cpu(cpu);
- cpu_clear(cpu, cpu_present_map);
+ set_cpu_present(cpu, false);
}
-static void vcpu_hotplug(unsigned int cpu)
+static int vcpu_online(unsigned int cpu)
{
int err;
char dir[32], state[32];
- if (!cpu_possible(cpu))
- return;
-
sprintf(dir, "cpu/%u", cpu);
err = xenbus_scanf(XBT_NIL, dir, "availability", "%s", state);
if (err != 1) {
printk(KERN_ERR "XENBUS: Unable to read cpu state\n");
- return;
+ return err;
}
- if (strcmp(state, "online") == 0) {
+ if (strcmp(state, "online") == 0)
+ return 1;
+ else if (strcmp(state, "offline") == 0)
+ return 0;
+
+ printk(KERN_ERR "XENBUS: unknown state(%s) on CPU%d\n", state, cpu);
+ return -EINVAL;
+}
+static void vcpu_hotplug(unsigned int cpu)
+{
+ if (!cpu_possible(cpu))
+ return;
+
+ switch (vcpu_online(cpu)) {
+ case 1:
enable_hotplug_cpu(cpu);
- } else if (strcmp(state, "offline") == 0) {
+ break;
+ case 0:
(void)cpu_down(cpu);
disable_hotplug_cpu(cpu);
- } else {
- printk(KERN_ERR "XENBUS: unknown state(%s) on CPU%d\n",
- state, cpu);
+ break;
+ default:
+ break;
}
}
static int setup_cpu_watcher(struct notifier_block *notifier,
unsigned long event, void *data)
{
+ int cpu;
static struct xenbus_watch cpu_watch = {
.node = "cpu",
.callback = handle_vcpu_hotplug_event};
(void)register_xenbus_watch(&cpu_watch);
+ for_each_possible_cpu(cpu) {
+ if (vcpu_online(cpu) == 0) {
+ (void)cpu_down(cpu);
+ cpu_clear(cpu, cpu_present_map);
+ }
+ }
+
return NOTIFY_DONE;
}
BUG_ON(!irqs_disabled());
- err = device_power_down(PMSG_SUSPEND);
- if (err) {
- printk(KERN_ERR "xen_suspend: device_power_down failed: %d\n",
- err);
- return err;
- }
err = sysdev_suspend(PMSG_SUSPEND);
if (err) {
printk(KERN_ERR "xen_suspend: sysdev_suspend failed: %d\n",
gnttab_resume();
xen_mm_unpin_all();
- sysdev_resume();
-
if (!*cancelled) {
xen_irq_resume();
xen_console_resume();
xen_timer_resume();
}
+ sysdev_resume();
+ device_power_up(PMSG_RESUME);
+
return 0;
}
goto out;
}
- printk("suspending xenbus...\n");
- /* XXX use normal device tree? */
- xenbus_suspend();
+ printk(KERN_DEBUG "suspending xenstore...\n");
+ xs_suspend();
+ err = device_power_down(PMSG_SUSPEND);
+ if (err) {
+ printk(KERN_ERR "device_power_down failed: %d\n", err);
+ goto resume_devices;
+ }
+
err = stop_machine(xen_suspend, &cancelled, cpumask_of(0));
if (err) {
printk(KERN_ERR "failed to start xen_suspend: %d\n", err);
if (!cancelled) {
xen_arch_resume();
- xenbus_resume();
+ xs_resume();
} else
- xenbus_suspend_cancel();
+ xs_suspend_cancel();
+ device_power_up(PMSG_RESUME);
+
+ resume_devices:
device_resume(PMSG_RESUME);
/* Make sure timer events get retriggered on all CPUs */
spin_lock(&rt_b->rt_runtime_lock);
for (;;) {
+ unsigned long delta;
+ ktime_t soft, hard;
+
if (hrtimer_active(&rt_b->rt_period_timer))
break;
now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
- hrtimer_start_expires(&rt_b->rt_period_timer,
- HRTIMER_MODE_ABS);
+
+ soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
+ hard = hrtimer_get_expires(&rt_b->rt_period_timer);
+ delta = ktime_to_ns(ktime_sub(hard, soft));
+ __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
+ HRTIMER_MODE_ABS, 0);
}
spin_unlock(&rt_b->rt_runtime_lock);
}
if (rq == this_rq()) {
hrtimer_restart(timer);
} else if (!rq->hrtick_csd_pending) {
- __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd);
+ __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
rq->hrtick_csd_pending = 1;
}
}
*/
static void hrtick_start(struct rq *rq, u64 delay)
{
- hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
+ __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
+ HRTIMER_MODE_REL, 0);
}
static inline void init_hrtick(void)
* combine the page table reload and the switch backend into
* one hypercall.
*/
- arch_enter_lazy_cpu_mode();
+ arch_start_context_switch(prev);
if (unlikely(!mm)) {
next->active_mm = oldmm;
*/
#define MAX_PINNED_INTERVAL 512
+ /* Working cpumask for load_balance and load_balance_newidle. */
+ static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
+
/*
* Check this_cpu to ensure it is balanced within domain. Attempt to move
* tasks if there is an imbalance.
*/
static int load_balance(int this_cpu, struct rq *this_rq,
struct sched_domain *sd, enum cpu_idle_type idle,
- int *balance, struct cpumask *cpus)
+ int *balance)
{
int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
struct sched_group *group;
unsigned long imbalance;
struct rq *busiest;
unsigned long flags;
+ struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
cpumask_setall(cpus);
* this_rq is locked.
*/
static int
- load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
- struct cpumask *cpus)
+ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
{
struct sched_group *group;
struct rq *busiest = NULL;
int ld_moved = 0;
int sd_idle = 0;
int all_pinned = 0;
+ struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
cpumask_setall(cpus);
struct sched_domain *sd;
int pulled_task = 0;
unsigned long next_balance = jiffies + HZ;
- cpumask_var_t tmpmask;
-
- if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC))
- return;
for_each_domain(this_cpu, sd) {
unsigned long interval;
if (sd->flags & SD_BALANCE_NEWIDLE)
/* If we've pulled tasks over stop searching: */
pulled_task = load_balance_newidle(this_cpu, this_rq,
- sd, tmpmask);
+ sd);
interval = msecs_to_jiffies(sd->balance_interval);
if (time_after(next_balance, sd->last_balance + interval))
*/
this_rq->next_balance = next_balance;
}
- free_cpumask_var(tmpmask);
}
/*
unsigned long next_balance = jiffies + 60*HZ;
int update_next_balance = 0;
int need_serialize;
- cpumask_var_t tmp;
-
- /* Fails alloc? Rebalancing probably not a priority right now. */
- if (!alloc_cpumask_var(&tmp, GFP_ATOMIC))
- return;
for_each_domain(cpu, sd) {
if (!(sd->flags & SD_LOAD_BALANCE))
}
if (time_after_eq(jiffies, sd->last_balance + interval)) {
- if (load_balance(cpu, rq, sd, idle, &balance, tmp)) {
+ if (load_balance(cpu, rq, sd, idle, &balance)) {
/*
* We've pulled tasks over so either we're no
* longer idle, or one of our SMT siblings is
*/
if (likely(update_next_balance))
rq->next_balance = next_balance;
-
- free_cpumask_var(tmp);
}
/*
#endif
}
- #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
- defined(CONFIG_PREEMPT_TRACER))
-
- static inline unsigned long get_parent_ip(unsigned long addr)
+ unsigned long get_parent_ip(unsigned long addr)
{
if (in_lock_functions(addr)) {
addr = CALLER_ADDR2;
return addr;
}
+ #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
+ defined(CONFIG_PREEMPT_TRACER))
+
void __kprobes add_preempt_count(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
/*
* schedule() is the main scheduler function.
*/
- asmlinkage void __sched schedule(void)
+ asmlinkage void __sched __schedule(void)
{
struct task_struct *prev, *next;
unsigned long *switch_count;
struct rq *rq;
int cpu;
- need_resched:
- preempt_disable();
cpu = smp_processor_id();
rq = cpu_rq(cpu);
rcu_qsctr_inc(cpu);
if (unlikely(reacquire_kernel_lock(current) < 0))
goto need_resched_nonpreemptible;
+ }
+ asmlinkage void __sched schedule(void)
+ {
+ need_resched:
+ preempt_disable();
+ __schedule();
preempt_enable_no_resched();
if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
goto need_resched;
}
EXPORT_SYMBOL(schedule);
+ #ifdef CONFIG_SMP
+ /*
+ * Look out! "owner" is an entirely speculative pointer
+ * access and not reliable.
+ */
+ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
+ {
+ unsigned int cpu;
+ struct rq *rq;
+
+ if (!sched_feat(OWNER_SPIN))
+ return 0;
+
+ #ifdef CONFIG_DEBUG_PAGEALLOC
+ /*
+ * Need to access the cpu field knowing that
+ * DEBUG_PAGEALLOC could have unmapped it if
+ * the mutex owner just released it and exited.
+ */
+ if (probe_kernel_address(&owner->cpu, cpu))
+ goto out;
+ #else
+ cpu = owner->cpu;
+ #endif
+
+ /*
+ * Even if the access succeeded (likely case),
+ * the cpu field may no longer be valid.
+ */
+ if (cpu >= nr_cpumask_bits)
+ goto out;
+
+ /*
+ * We need to validate that we can do a
+ * get_cpu() and that we have the percpu area.
+ */
+ if (!cpu_online(cpu))
+ goto out;
+
+ rq = cpu_rq(cpu);
+
+ for (;;) {
+ /*
+ * Owner changed, break to re-assess state.
+ */
+ if (lock->owner != owner)
+ break;
+
+ /*
+ * Is that owner really running on that cpu?
+ */
+ if (task_thread_info(rq->curr) != owner || need_resched())
+ return 0;
+
+ cpu_relax();
+ }
+ out:
+ return 1;
+ }
+ #endif
+
#ifdef CONFIG_PREEMPT
/*
* this is the entry point to schedule() from in-kernel preemption
__wake_up_common(q, mode, 1, 0, NULL);
}
+ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
+ {
+ __wake_up_common(q, mode, 1, 0, key);
+ }
+
/**
- * __wake_up_sync - wake up threads blocked on a waitqueue.
+ * __wake_up_sync_key - wake up threads blocked on a waitqueue.
* @q: the waitqueue
* @mode: which threads
* @nr_exclusive: how many wake-one or wake-many threads to wake up
+ * @key: opaque value to be passed to wakeup targets
*
* The sync wakeup differs that the waker knows that it will schedule
* away soon, so while the target thread will be woken up, it will not
*
* On UP it can prevent extra preemption.
*/
- void
- __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
+ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
+ int nr_exclusive, void *key)
{
unsigned long flags;
int sync = 1;
sync = 0;
spin_lock_irqsave(&q->lock, flags);
- __wake_up_common(q, mode, nr_exclusive, sync, NULL);
+ __wake_up_common(q, mode, nr_exclusive, sync, key);
spin_unlock_irqrestore(&q->lock, flags);
}
+ EXPORT_SYMBOL_GPL(__wake_up_sync_key);
+
+ /*
+ * __wake_up_sync - see __wake_up_sync_key()
+ */
+ void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
+ {
+ __wake_up_sync_key(q, mode, nr_exclusive, NULL);
+ }
EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
/**
{
int group;
- cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
+ cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
group = cpumask_first(mask);
if (sg)
*sg = &per_cpu(sched_group_core, group).sg;
cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
group = cpumask_first(mask);
#elif defined(CONFIG_SCHED_SMT)
- cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
+ cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
group = cpumask_first(mask);
#else
group = cpu;
SD_INIT(sd, SIBLING);
set_domain_attribute(sd, attr);
cpumask_and(sched_domain_span(sd),
- &per_cpu(cpu_sibling_map, i), cpu_map);
+ topology_thread_cpumask(i), cpu_map);
sd->parent = p;
p->child = sd;
cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
/* Set up CPU (sibling) groups */
for_each_cpu(i, cpu_map) {
cpumask_and(this_sibling_map,
- &per_cpu(cpu_sibling_map, i), cpu_map);
+ topology_thread_cpumask(i), cpu_map);
if (i != cpumask_first(this_sibling_map))
continue;
#endif
#ifdef CONFIG_USER_SCHED
alloc_size *= 2;
+ #endif
+ #ifdef CONFIG_CPUMASK_OFFSTACK
+ alloc_size += num_possible_cpus() * cpumask_size();
#endif
/*
* As sched_init() is called before page_alloc is setup,
ptr += nr_cpu_ids * sizeof(void **);
#endif /* CONFIG_USER_SCHED */
#endif /* CONFIG_RT_GROUP_SCHED */
+ #ifdef CONFIG_CPUMASK_OFFSTACK
+ for_each_possible_cpu(i) {
+ per_cpu(load_balance_tmpmask, i) = (void *)ptr;
+ ptr += cpumask_size();
+ }
+ #endif /* CONFIG_CPUMASK_OFFSTACK */
}
#ifdef CONFIG_SMP