Merge tag 'powerpc-4.20-2' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc...
author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 2 Nov 2018 16:19:35 +0000 (09:19 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 2 Nov 2018 16:19:35 +0000 (09:19 -0700)
Pull powerpc fixes from Michael Ellerman:
 "Some things that I missed due to travel, or that came in late.

  Two fixes also going to stable:

   - A revert of a buggy change to the 8xx TLB miss handlers.

   - Our flushing of SPE (Signal Processing Engine) registers on fork
     was broken.

  Other changes:

   - A change to the KVM decrementer emulation to use proper APIs.

   - Some cleanups to the way we do code patching in the 8xx code.

   - Expose the maximum possible memory for the system in
     /proc/powerpc/lparcfg.

   - Merge some updates from Scott: "a couple device tree updates, and a
     fix for a missing prototype warning"

  A few other minor fixes and a handful of fixes for our selftests.

  Thanks to: Aravinda Prasad, Breno Leitao, Camelia Groza, Christophe
  Leroy, Felipe Rechia, Joel Stanley, Naveen N. Rao, Paul Mackerras,
  Scott Wood, Tyrel Datwyler"
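
One of the changes above exposes the maximum possible memory for the
partition in /proc/powerpc/lparcfg.  As a quick way to inspect the new
field, here is a minimal userspace sketch; the key name "MaxMem" is
inferred from the commit subject ("powerpc/pseries: Export maximum memory
value") and should be treated as an assumption, not the confirmed format
of the lparcfg line.

    /* Minimal sketch: print the assumed "MaxMem" line from lparcfg. */
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            char line[256];
            FILE *f = fopen("/proc/powerpc/lparcfg", "r");

            if (!f) {
                    perror("lparcfg");
                    return 1;
            }
            while (fgets(line, sizeof(line), f))
                    if (!strncmp(line, "MaxMem", 6))  /* assumed key name */
                            fputs(line, stdout);
            fclose(f);
            return 0;
    }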

* tag 'powerpc-4.20-2' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux: (21 commits)
  selftests/powerpc: Fix compilation issue due to asm label
  selftests/powerpc/cache_shape: Fix out-of-tree build
  selftests/powerpc/switch_endian: Fix out-of-tree build
  selftests/powerpc/pmu: Link ebb tests with -no-pie
  selftests/powerpc/signal: Fix out-of-tree build
  selftests/powerpc/ptrace: Fix out-of-tree build
  powerpc/xmon: Relax frame size for clang
  selftests: powerpc: Fix warning for security subdir
  selftests/powerpc: Relax L1d miss targets for rfi_flush test
  powerpc/process: Fix flush_all_to_thread for SPE
  powerpc/pseries: add missing cpumask.h include file
  selftests/powerpc: Fix ptrace tm failure
  KVM: PPC: Use exported tb_to_ns() function in decrementer emulation
  powerpc/pseries: Export maximum memory value
  powerpc/8xx: Use patch_site for perf counters setup
  powerpc/8xx: Use patch_site for memory setup patching
  powerpc/code-patching: Add a helper to get the address of a patch_site
  Revert "powerpc/8xx: Use L1 entry APG to handle _PAGE_ACCESSED for CONFIG_SWAP"
  powerpc/8xx: add missing header in 8xx_mmu.c
  powerpc/8xx: Add DT node for using the SEC engine of the MPC885
  ...

arch/powerpc/kernel/process.c
arch/powerpc/kvm/book3s_hv.c

diff --cc arch/powerpc/kernel/process.c
index 4d5322cfad25c7b543cc2e80c1f355f2d75030fc,bcb36229d4fdc3858d3a842e23dc12afca259e51..96f34730010fe3f5f778400a14a7a470d4d38142
@@@ -590,12 -590,11 +590,11 @@@ void flush_all_to_thread(struct task_st
        if (tsk->thread.regs) {
                preempt_disable();
                BUG_ON(tsk != current);
-               save_all(tsk);
  #ifdef CONFIG_SPE
                if (tsk->thread.regs->msr & MSR_SPE)
                        tsk->thread.spefscr = mfspr(SPRN_SPEFSCR);
  #endif
+               save_all(tsk);
  
                preempt_enable();
        }
@@@ -619,6 -618,8 +618,6 @@@ void do_send_trap(struct pt_regs *regs
  void do_break (struct pt_regs *regs, unsigned long address,
                    unsigned long error_code)
  {
 -      siginfo_t info;
 -
        current->thread.trap_nr = TRAP_HWBKPT;
        if (notify_die(DIE_DABR_MATCH, "dabr_match", regs, error_code,
                        11, SIGSEGV) == NOTIFY_STOP)
        hw_breakpoint_disable();
  
        /* Deliver the signal to userspace */
 -      clear_siginfo(&info);
 -      info.si_signo = SIGTRAP;
 -      info.si_errno = 0;
 -      info.si_code = TRAP_HWBKPT;
 -      info.si_addr = (void __user *)address;
 -      force_sig_info(SIGTRAP, &info, current);
 +      force_sig_fault(SIGTRAP, TRAP_HWBKPT, (void __user *)address, current);
  }
  #endif        /* CONFIG_PPC_ADV_DEBUG_REGS */
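
The first hunk above is the SPE fix called out in the pull message:
save_all() gives up the SPE unit and clears MSR_SPE in the thread's saved
MSR, so with the old ordering the MSR_SPE check never fired and SPEFSCR
was never captured.  The stand-alone model below (not kernel code; the
names only mirror the diff) shows why the flush has to come after the
SPEFSCR save:

    #include <stdio.h>

    #define MSR_SPE 0x02000000u

    struct thread {
            unsigned int msr;
            unsigned int spefscr_live;   /* value still held in the unit */
            unsigned int spefscr_saved;  /* value captured into the thread */
    };

    /* Stands in for save_all(): flushing the unit clears MSR_SPE. */
    static void save_all(struct thread *t)
    {
            t->msr &= ~MSR_SPE;
    }

    static void flush_all_to_thread(struct thread *t)
    {
            if (t->msr & MSR_SPE)   /* must run before save_all() ... */
                    t->spefscr_saved = t->spefscr_live;
            save_all(t);            /* ... which clears MSR_SPE */
    }

    int main(void)
    {
            struct thread t = { .msr = MSR_SPE, .spefscr_live = 0x3f };

            flush_all_to_thread(&t);
            printf("spefscr_saved = %#x\n", t.spefscr_saved);  /* 0x3f, not 0 */
            return 0;
    }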
  
diff --cc arch/powerpc/kvm/book3s_hv.c
index bf8def2159c31e3e921394464e1491a5097f23b4,a6d948b6425bd23e367e544a809781c56155736d..d65b961661fbf6d9075b34c523269bb1261845fe
@@@ -50,7 -50,6 +50,7 @@@
  #include <asm/reg.h>
  #include <asm/ppc-opcode.h>
  #include <asm/asm-prototypes.h>
 +#include <asm/archrandom.h>
  #include <asm/debug.h>
  #include <asm/disassemble.h>
  #include <asm/cputable.h>
@@@ -105,10 -104,6 +105,10 @@@ static bool indep_threads_mode = true
  module_param(indep_threads_mode, bool, S_IRUGO | S_IWUSR);
  MODULE_PARM_DESC(indep_threads_mode, "Independent-threads mode (only on POWER9)");
  
 +static bool one_vm_per_core;
 +module_param(one_vm_per_core, bool, S_IRUGO | S_IWUSR);
 +MODULE_PARM_DESC(one_vm_per_core, "Only run vCPUs from the same VM on a core (requires indep_threads_mode=N)");
 +
  #ifdef CONFIG_KVM_XICS
  static struct kernel_param_ops module_param_ops = {
        .set = param_set_int,
@@@ -122,16 -117,6 +122,16 @@@ module_param_cb(h_ipi_redirect, &module
  MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");
  #endif
  
 +/* If set, guests are allowed to create and control nested guests */
 +static bool nested = true;
 +module_param(nested, bool, S_IRUGO | S_IWUSR);
 +MODULE_PARM_DESC(nested, "Enable nested virtualization (only on POWER9)");
 +
 +static inline bool nesting_enabled(struct kvm *kvm)
 +{
 +      return kvm->arch.nested_enable && kvm_is_radix(kvm);
 +}
 +
  /* If set, the threads on each CPU core have to be in the same MMU mode */
  static bool no_mixing_hpt_and_radix;
  
@@@ -188,10 -173,6 +188,10 @@@ static bool kvmppc_ipi_thread(int cpu
  {
        unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
  
 +      /* If we're a nested hypervisor, fall back to ordinary IPIs for now */
 +      if (kvmhv_on_pseries())
 +              return false;
 +
        /* On POWER9 we can use msgsnd to IPI any cpu */
        if (cpu_has_feature(CPU_FTR_ARCH_300)) {
                msg |= get_hard_smp_processor_id(cpu);
@@@ -429,8 -410,8 +429,8 @@@ static void kvmppc_dump_regs(struct kvm
               vcpu->arch.shregs.sprg0, vcpu->arch.shregs.sprg1);
        pr_err("sprg2 = %.16llx sprg3 = %.16llx\n",
               vcpu->arch.shregs.sprg2, vcpu->arch.shregs.sprg3);
 -      pr_err("cr = %.8x  xer = %.16lx  dsisr = %.8x\n",
 -             vcpu->arch.cr, vcpu->arch.regs.xer, vcpu->arch.shregs.dsisr);
 +      pr_err("cr = %.8lx  xer = %.16lx  dsisr = %.8x\n",
 +             vcpu->arch.regs.ccr, vcpu->arch.regs.xer, vcpu->arch.shregs.dsisr);
        pr_err("dar = %.16llx\n", vcpu->arch.shregs.dar);
        pr_err("fault dar = %.16lx dsisr = %.8x\n",
               vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
@@@ -749,7 -730,8 +749,7 @@@ static bool kvmppc_doorbell_pending(str
        /*
         * Ensure that the read of vcore->dpdes comes after the read
         * of vcpu->doorbell_request.  This barrier matches the
 -       * lwsync in book3s_hv_rmhandlers.S just before the
 -       * fast_guest_return label.
 +       * smp_wmb() in kvmppc_guest_entry_inject().
         */
        smp_rmb();
        vc = vcpu->arch.vcore;
@@@ -930,19 -912,6 +930,19 @@@ int kvmppc_pseries_do_hcall(struct kvm_
                        break;
                }
                return RESUME_HOST;
 +      case H_SET_DABR:
 +              ret = kvmppc_h_set_dabr(vcpu, kvmppc_get_gpr(vcpu, 4));
 +              break;
 +      case H_SET_XDABR:
 +              ret = kvmppc_h_set_xdabr(vcpu, kvmppc_get_gpr(vcpu, 4),
 +                                              kvmppc_get_gpr(vcpu, 5));
 +              break;
 +      case H_GET_TCE:
 +              ret = kvmppc_h_get_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
 +                                              kvmppc_get_gpr(vcpu, 5));
 +              if (ret == H_TOO_HARD)
 +                      return RESUME_HOST;
 +              break;
        case H_PUT_TCE:
                ret = kvmppc_h_put_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
                                                kvmppc_get_gpr(vcpu, 5),
                if (ret == H_TOO_HARD)
                        return RESUME_HOST;
                break;
 +      case H_RANDOM:
 +              if (!powernv_get_random_long(&vcpu->arch.regs.gpr[4]))
 +                      ret = H_HARDWARE;
 +              break;
 +
 +      case H_SET_PARTITION_TABLE:
 +              ret = H_FUNCTION;
 +              if (nesting_enabled(vcpu->kvm))
 +                      ret = kvmhv_set_partition_table(vcpu);
 +              break;
 +      case H_ENTER_NESTED:
 +              ret = H_FUNCTION;
 +              if (!nesting_enabled(vcpu->kvm))
 +                      break;
 +              ret = kvmhv_enter_nested_guest(vcpu);
 +              if (ret == H_INTERRUPT) {
 +                      kvmppc_set_gpr(vcpu, 3, 0);
 +                      return -EINTR;
 +              }
 +              break;
 +      case H_TLB_INVALIDATE:
 +              ret = H_FUNCTION;
 +              if (nesting_enabled(vcpu->kvm))
 +                      ret = kvmhv_do_nested_tlbie(vcpu);
 +              break;
 +
        default:
                return RESUME_HOST;
        }
        return RESUME_GUEST;
  }
  
 +/*
 + * Handle H_CEDE in the nested virtualization case where we haven't
 + * called the real-mode hcall handlers in book3s_hv_rmhandlers.S.
 + * This has to be done early, not in kvmppc_pseries_do_hcall(), so
 + * that the cede logic in kvmppc_run_single_vcpu() works properly.
 + */
 +static void kvmppc_nested_cede(struct kvm_vcpu *vcpu)
 +{
 +      vcpu->arch.shregs.msr |= MSR_EE;
 +      vcpu->arch.ceded = 1;
 +      smp_mb();
 +      if (vcpu->arch.prodded) {
 +              vcpu->arch.prodded = 0;
 +              smp_mb();
 +              vcpu->arch.ceded = 0;
 +      }
 +}
 +
  static int kvmppc_hcall_impl_hv(unsigned long cmd)
  {
        switch (cmd) {
@@@ -1160,6 -1085,7 +1160,6 @@@ static int kvmppc_emulate_doorbell_inst
        return RESUME_GUEST;
  }
  
 -/* Called with vcpu->arch.vcore->lock held */
  static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
                                 struct task_struct *tsk)
  {
                break;
        case BOOK3S_INTERRUPT_H_INST_STORAGE:
                vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
 -              vcpu->arch.fault_dsisr = 0;
 +              vcpu->arch.fault_dsisr = vcpu->arch.shregs.msr &
 +                      DSISR_SRR1_MATCH_64S;
 +              if (vcpu->arch.shregs.msr & HSRR1_HISI_WRITE)
 +                      vcpu->arch.fault_dsisr |= DSISR_ISSTORE;
                r = RESUME_PAGE_FAULT;
                break;
        /*
                                swab32(vcpu->arch.emul_inst) :
                                vcpu->arch.emul_inst;
                if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) {
 -                      /* Need vcore unlocked to call kvmppc_get_last_inst */
 -                      spin_unlock(&vcpu->arch.vcore->lock);
                        r = kvmppc_emulate_debug_inst(run, vcpu);
 -                      spin_lock(&vcpu->arch.vcore->lock);
                } else {
                        kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
                        r = RESUME_GUEST;
        case BOOK3S_INTERRUPT_H_FAC_UNAVAIL:
                r = EMULATE_FAIL;
                if (((vcpu->arch.hfscr >> 56) == FSCR_MSGP_LG) &&
 -                  cpu_has_feature(CPU_FTR_ARCH_300)) {
 -                      /* Need vcore unlocked to call kvmppc_get_last_inst */
 -                      spin_unlock(&vcpu->arch.vcore->lock);
 +                  cpu_has_feature(CPU_FTR_ARCH_300))
                        r = kvmppc_emulate_doorbell_instr(vcpu);
 -                      spin_lock(&vcpu->arch.vcore->lock);
 -              }
                if (r == EMULATE_FAIL) {
                        kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
                        r = RESUME_GUEST;
        return r;
  }
  
 +static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu)
 +{
 +      int r;
 +      int srcu_idx;
 +
 +      vcpu->stat.sum_exits++;
 +
 +      /*
 +       * This can happen if an interrupt occurs in the last stages
 +       * of guest entry or the first stages of guest exit (i.e. after
 +       * setting paca->kvm_hstate.in_guest to KVM_GUEST_MODE_GUEST_HV
 +       * and before setting it to KVM_GUEST_MODE_HOST_HV).
 +       * That can happen due to a bug, or due to a machine check
 +       * occurring at just the wrong time.
 +       */
 +      if (vcpu->arch.shregs.msr & MSR_HV) {
 +              pr_emerg("KVM trap in HV mode while nested!\n");
 +              pr_emerg("trap=0x%x | pc=0x%lx | msr=0x%llx\n",
 +                       vcpu->arch.trap, kvmppc_get_pc(vcpu),
 +                       vcpu->arch.shregs.msr);
 +              kvmppc_dump_regs(vcpu);
 +              return RESUME_HOST;
 +      }
 +      switch (vcpu->arch.trap) {
 +      /* We're good on these - the host merely wanted to get our attention */
 +      case BOOK3S_INTERRUPT_HV_DECREMENTER:
 +              vcpu->stat.dec_exits++;
 +              r = RESUME_GUEST;
 +              break;
 +      case BOOK3S_INTERRUPT_EXTERNAL:
 +              vcpu->stat.ext_intr_exits++;
 +              r = RESUME_HOST;
 +              break;
 +      case BOOK3S_INTERRUPT_H_DOORBELL:
 +      case BOOK3S_INTERRUPT_H_VIRT:
 +              vcpu->stat.ext_intr_exits++;
 +              r = RESUME_GUEST;
 +              break;
 +      /* SR/HMI/PMI are HV interrupts that host has handled. Resume guest.*/
 +      case BOOK3S_INTERRUPT_HMI:
 +      case BOOK3S_INTERRUPT_PERFMON:
 +      case BOOK3S_INTERRUPT_SYSTEM_RESET:
 +              r = RESUME_GUEST;
 +              break;
 +      case BOOK3S_INTERRUPT_MACHINE_CHECK:
 +              /* Pass the machine check to the L1 guest */
 +              r = RESUME_HOST;
 +              /* Print the MCE event to host console. */
 +              machine_check_print_event_info(&vcpu->arch.mce_evt, false);
 +              break;
 +      /*
 +       * We get these next two if the guest accesses a page which it thinks
 +       * it has mapped but which is not actually present, either because
 +       * it is for an emulated I/O device or because the corresponding
 +       * host page has been paged out.
 +       */
 +      case BOOK3S_INTERRUPT_H_DATA_STORAGE:
 +              srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
 +              r = kvmhv_nested_page_fault(vcpu);
 +              srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
 +              break;
 +      case BOOK3S_INTERRUPT_H_INST_STORAGE:
 +              vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
 +              vcpu->arch.fault_dsisr = kvmppc_get_msr(vcpu) &
 +                                       DSISR_SRR1_MATCH_64S;
 +              if (vcpu->arch.shregs.msr & HSRR1_HISI_WRITE)
 +                      vcpu->arch.fault_dsisr |= DSISR_ISSTORE;
 +              srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
 +              r = kvmhv_nested_page_fault(vcpu);
 +              srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
 +              break;
 +
 +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 +      case BOOK3S_INTERRUPT_HV_SOFTPATCH:
 +              /*
 +               * This occurs for various TM-related instructions that
 +               * we need to emulate on POWER9 DD2.2.  We have already
 +               * handled the cases where the guest was in real-suspend
 +               * mode and was transitioning to transactional state.
 +               */
 +              r = kvmhv_p9_tm_emulation(vcpu);
 +              break;
 +#endif
 +
 +      case BOOK3S_INTERRUPT_HV_RM_HARD:
 +              vcpu->arch.trap = 0;
 +              r = RESUME_GUEST;
 +              if (!xive_enabled())
 +                      kvmppc_xics_rm_complete(vcpu, 0);
 +              break;
 +      default:
 +              r = RESUME_HOST;
 +              break;
 +      }
 +
 +      return r;
 +}
 +
  static int kvm_arch_vcpu_ioctl_get_sregs_hv(struct kvm_vcpu *vcpu,
                                            struct kvm_sregs *sregs)
  {
@@@ -1723,9 -1555,6 +1723,9 @@@ static int kvmppc_get_one_reg_hv(struc
        case KVM_REG_PPC_ONLINE:
                *val = get_reg_val(id, vcpu->arch.online);
                break;
 +      case KVM_REG_PPC_PTCR:
 +              *val = get_reg_val(id, vcpu->kvm->arch.l1_ptcr);
 +              break;
        default:
                r = -EINVAL;
                break;
@@@ -1957,9 -1786,6 +1957,9 @@@ static int kvmppc_set_one_reg_hv(struc
                        atomic_dec(&vcpu->arch.vcore->online_count);
                vcpu->arch.online = i;
                break;
 +      case KVM_REG_PPC_PTCR:
 +              vcpu->kvm->arch.l1_ptcr = set_reg_val(id, *val);
 +              break;
        default:
                r = -EINVAL;
                break;
@@@ -2193,18 -2019,15 +2193,18 @@@ static struct kvm_vcpu *kvmppc_core_vcp
         * Set the default HFSCR for the guest from the host value.
         * This value is only used on POWER9.
         * On POWER9, we want to virtualize the doorbell facility, so we
 -       * turn off the HFSCR bit, which causes those instructions to trap.
 +       * don't set the HFSCR_MSGP bit, and that causes those instructions
 +       * to trap and then we emulate them.
         */
 -      vcpu->arch.hfscr = mfspr(SPRN_HFSCR);
 -      if (cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
 +      vcpu->arch.hfscr = HFSCR_TAR | HFSCR_EBB | HFSCR_PM | HFSCR_BHRB |
 +              HFSCR_DSCR | HFSCR_VECVSX | HFSCR_FP;
 +      if (cpu_has_feature(CPU_FTR_HVMODE)) {
 +              vcpu->arch.hfscr &= mfspr(SPRN_HFSCR);
 +              if (cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
 +                      vcpu->arch.hfscr |= HFSCR_TM;
 +      }
 +      if (cpu_has_feature(CPU_FTR_TM_COMP))
                vcpu->arch.hfscr |= HFSCR_TM;
 -      else if (!cpu_has_feature(CPU_FTR_TM_COMP))
 -              vcpu->arch.hfscr &= ~HFSCR_TM;
 -      if (cpu_has_feature(CPU_FTR_ARCH_300))
 -              vcpu->arch.hfscr &= ~HFSCR_MSGP;
  
        kvmppc_mmu_book3s_hv_init(vcpu);
  
@@@ -2337,8 -2160,7 +2337,7 @@@ static void kvmppc_set_timer(struct kvm
                kvmppc_core_prepare_to_enter(vcpu);
                return;
        }
-       dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC
-                  / tb_ticks_per_sec;
+       dec_nsec = tb_to_ns(vcpu->arch.dec_expires - now);
        hrtimer_start(&vcpu->arch.dec_timer, dec_nsec, HRTIMER_MODE_REL);
        vcpu->arch.timer_running = 1;
  }
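
The two-line change above is the "use proper APIs" item from the pull
message: the open-coded timebase-to-nanosecond conversion is replaced by
the exported tb_to_ns() helper.  The sketch below shows the arithmetic
being delegated; the 512 MHz timebase frequency and the 128-bit
intermediate are illustrative assumptions (the in-kernel helper does its
own scaling), but they illustrate why multiplying by NSEC_PER_SEC in
64 bits first, as the old line did, can overflow for large tick counts:

    #include <stdio.h>
    #include <stdint.h>

    #define NSEC_PER_SEC      1000000000ULL
    #define TB_TICKS_PER_SEC   512000000ULL  /* typical POWER timebase, assumed */

    /* Convert timebase ticks to nanoseconds, widening before the multiply. */
    static uint64_t tb_to_ns_sketch(uint64_t ticks)
    {
            return (uint64_t)(((unsigned __int128)ticks * NSEC_PER_SEC) /
                              TB_TICKS_PER_SEC);
    }

    int main(void)
    {
            uint64_t ticks = 5 * TB_TICKS_PER_SEC;  /* five seconds of ticks */

            printf("%llu ticks -> %llu ns\n",
                   (unsigned long long)ticks,
                   (unsigned long long)tb_to_ns_sketch(ticks));
            return 0;
    }
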
@@@ -2419,18 -2241,10 +2418,18 @@@ static void kvmppc_release_hwthread(in
  
  static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
  {
 +      struct kvm_nested_guest *nested = vcpu->arch.nested;
 +      cpumask_t *cpu_in_guest;
        int i;
  
        cpu = cpu_first_thread_sibling(cpu);
 -      cpumask_set_cpu(cpu, &kvm->arch.need_tlb_flush);
 +      if (nested) {
 +              cpumask_set_cpu(cpu, &nested->need_tlb_flush);
 +              cpu_in_guest = &nested->cpu_in_guest;
 +      } else {
 +              cpumask_set_cpu(cpu, &kvm->arch.need_tlb_flush);
 +              cpu_in_guest = &kvm->arch.cpu_in_guest;
 +      }
        /*
         * Make sure setting of bit in need_tlb_flush precedes
         * testing of cpu_in_guest bits.  The matching barrier on
         */
        smp_mb();
        for (i = 0; i < threads_per_core; ++i)
 -              if (cpumask_test_cpu(cpu + i, &kvm->arch.cpu_in_guest))
 +              if (cpumask_test_cpu(cpu + i, cpu_in_guest))
                        smp_call_function_single(cpu + i, do_nothing, NULL, 1);
  }
  
  static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu)
  {
 +      struct kvm_nested_guest *nested = vcpu->arch.nested;
        struct kvm *kvm = vcpu->kvm;
 +      int prev_cpu;
 +
 +      if (!cpu_has_feature(CPU_FTR_HVMODE))
 +              return;
 +
 +      if (nested)
 +              prev_cpu = nested->prev_cpu[vcpu->arch.nested_vcpu_id];
 +      else
 +              prev_cpu = vcpu->arch.prev_cpu;
  
        /*
         * With radix, the guest can do TLB invalidations itself,
         * ran to flush the TLB.  The TLB is shared between threads,
         * so we use a single bit in .need_tlb_flush for all 4 threads.
         */
 -      if (vcpu->arch.prev_cpu != pcpu) {
 -              if (vcpu->arch.prev_cpu >= 0 &&
 -                  cpu_first_thread_sibling(vcpu->arch.prev_cpu) !=
 +      if (prev_cpu != pcpu) {
 +              if (prev_cpu >= 0 &&
 +                  cpu_first_thread_sibling(prev_cpu) !=
                    cpu_first_thread_sibling(pcpu))
 -                      radix_flush_cpu(kvm, vcpu->arch.prev_cpu, vcpu);
 -              vcpu->arch.prev_cpu = pcpu;
 +                      radix_flush_cpu(kvm, prev_cpu, vcpu);
 +              if (nested)
 +                      nested->prev_cpu[vcpu->arch.nested_vcpu_id] = pcpu;
 +              else
 +                      vcpu->arch.prev_cpu = pcpu;
 +      }
 +}
 +
 +static void kvmppc_radix_check_need_tlb_flush(struct kvm *kvm, int pcpu,
 +                                            struct kvm_nested_guest *nested)
 +{
 +      cpumask_t *need_tlb_flush;
 +      int lpid;
 +
 +      if (!cpu_has_feature(CPU_FTR_HVMODE))
 +              return;
 +
 +      if (cpu_has_feature(CPU_FTR_ARCH_300))
 +              pcpu &= ~0x3UL;
 +
 +      if (nested) {
 +              lpid = nested->shadow_lpid;
 +              need_tlb_flush = &nested->need_tlb_flush;
 +      } else {
 +              lpid = kvm->arch.lpid;
 +              need_tlb_flush = &kvm->arch.need_tlb_flush;
 +      }
 +
 +      mtspr(SPRN_LPID, lpid);
 +      isync();
 +      smp_mb();
 +
 +      if (cpumask_test_cpu(pcpu, need_tlb_flush)) {
 +              radix__local_flush_tlb_lpid_guest(lpid);
 +              /* Clear the bit after the TLB flush */
 +              cpumask_clear_cpu(pcpu, need_tlb_flush);
        }
  }
  
@@@ -2722,10 -2492,6 +2721,10 @@@ static bool can_dynamic_split(struct kv
        if (!cpu_has_feature(CPU_FTR_ARCH_207S))
                return false;
  
 +      /* In one_vm_per_core mode, require all vcores to be from the same vm */
 +      if (one_vm_per_core && vc->kvm != cip->vc[0]->kvm)
 +              return false;
 +
        /* Some POWER9 chips require all threads to be in the same MMU mode */
        if (no_mixing_hpt_and_radix &&
            kvm_is_radix(vc->kvm) != kvm_is_radix(cip->vc[0]->kvm))
@@@ -2833,14 -2599,6 +2832,14 @@@ static void post_guest_process(struct k
        spin_lock(&vc->lock);
        now = get_tb();
        for_each_runnable_thread(i, vcpu, vc) {
 +              /*
 +               * It's safe to unlock the vcore in the loop here, because
 +               * for_each_runnable_thread() is safe against removal of
 +               * the vcpu, and the vcore state is VCORE_EXITING here,
 +               * so any vcpus becoming runnable will have their arch.trap
 +               * set to zero and can't actually run in the guest.
 +               */
 +              spin_unlock(&vc->lock);
                /* cancel pending dec exception if dec is positive */
                if (now < vcpu->arch.dec_expires &&
                    kvmppc_core_pending_dec(vcpu))
                vcpu->arch.ret = ret;
                vcpu->arch.trap = 0;
  
 +              spin_lock(&vc->lock);
                if (is_kvmppc_resume_guest(vcpu->arch.ret)) {
                        if (vcpu->arch.pending_exceptions)
                                kvmppc_core_prepare_to_enter(vcpu);
@@@ -3205,6 -2962,8 +3204,6 @@@ static noinline void kvmppc_run_core(st
                spin_unlock(&core_info.vc[sub]->lock);
  
        if (kvm_is_radix(vc->kvm)) {
 -              int tmp = pcpu;
 -
                /*
                 * Do we need to flush the process scoped TLB for the LPAR?
                 *
                 *
                 * Hash must be flushed in realmode in order to use tlbiel.
                 */
 -              mtspr(SPRN_LPID, vc->kvm->arch.lpid);
 -              isync();
 -
 -              if (cpu_has_feature(CPU_FTR_ARCH_300))
 -                      tmp &= ~0x3UL;
 -
 -              if (cpumask_test_cpu(tmp, &vc->kvm->arch.need_tlb_flush)) {
 -                      radix__local_flush_tlb_lpid_guest(vc->kvm->arch.lpid);
 -                      /* Clear the bit after the TLB flush */
 -                      cpumask_clear_cpu(tmp, &vc->kvm->arch.need_tlb_flush);
 -              }
 +              kvmppc_radix_check_need_tlb_flush(vc->kvm, pcpu, NULL);
        }
  
        /*
        trace_kvmppc_run_core(vc, 1);
  }
  
 +/*
 + * Load up hypervisor-mode registers on P9.
 + */
 +static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
 +                                   unsigned long lpcr)
 +{
 +      struct kvmppc_vcore *vc = vcpu->arch.vcore;
 +      s64 hdec;
 +      u64 tb, purr, spurr;
 +      int trap;
 +      unsigned long host_hfscr = mfspr(SPRN_HFSCR);
 +      unsigned long host_ciabr = mfspr(SPRN_CIABR);
 +      unsigned long host_dawr = mfspr(SPRN_DAWR);
 +      unsigned long host_dawrx = mfspr(SPRN_DAWRX);
 +      unsigned long host_psscr = mfspr(SPRN_PSSCR);
 +      unsigned long host_pidr = mfspr(SPRN_PID);
 +
 +      hdec = time_limit - mftb();
 +      if (hdec < 0)
 +              return BOOK3S_INTERRUPT_HV_DECREMENTER;
 +      mtspr(SPRN_HDEC, hdec);
 +
 +      if (vc->tb_offset) {
 +              u64 new_tb = mftb() + vc->tb_offset;
 +              mtspr(SPRN_TBU40, new_tb);
 +              tb = mftb();
 +              if ((tb & 0xffffff) < (new_tb & 0xffffff))
 +                      mtspr(SPRN_TBU40, new_tb + 0x1000000);
 +              vc->tb_offset_applied = vc->tb_offset;
 +      }
 +
 +      if (vc->pcr)
 +              mtspr(SPRN_PCR, vc->pcr);
 +      mtspr(SPRN_DPDES, vc->dpdes);
 +      mtspr(SPRN_VTB, vc->vtb);
 +
 +      local_paca->kvm_hstate.host_purr = mfspr(SPRN_PURR);
 +      local_paca->kvm_hstate.host_spurr = mfspr(SPRN_SPURR);
 +      mtspr(SPRN_PURR, vcpu->arch.purr);
 +      mtspr(SPRN_SPURR, vcpu->arch.spurr);
 +
 +      if (cpu_has_feature(CPU_FTR_DAWR)) {
 +              mtspr(SPRN_DAWR, vcpu->arch.dawr);
 +              mtspr(SPRN_DAWRX, vcpu->arch.dawrx);
 +      }
 +      mtspr(SPRN_CIABR, vcpu->arch.ciabr);
 +      mtspr(SPRN_IC, vcpu->arch.ic);
 +      mtspr(SPRN_PID, vcpu->arch.pid);
 +
 +      mtspr(SPRN_PSSCR, vcpu->arch.psscr | PSSCR_EC |
 +            (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG));
 +
 +      mtspr(SPRN_HFSCR, vcpu->arch.hfscr);
 +
 +      mtspr(SPRN_SPRG0, vcpu->arch.shregs.sprg0);
 +      mtspr(SPRN_SPRG1, vcpu->arch.shregs.sprg1);
 +      mtspr(SPRN_SPRG2, vcpu->arch.shregs.sprg2);
 +      mtspr(SPRN_SPRG3, vcpu->arch.shregs.sprg3);
 +
 +      mtspr(SPRN_AMOR, ~0UL);
 +
 +      mtspr(SPRN_LPCR, lpcr);
 +      isync();
 +
 +      kvmppc_xive_push_vcpu(vcpu);
 +
 +      mtspr(SPRN_SRR0, vcpu->arch.shregs.srr0);
 +      mtspr(SPRN_SRR1, vcpu->arch.shregs.srr1);
 +
 +      trap = __kvmhv_vcpu_entry_p9(vcpu);
 +
 +      /* Advance host PURR/SPURR by the amount used by guest */
 +      purr = mfspr(SPRN_PURR);
 +      spurr = mfspr(SPRN_SPURR);
 +      mtspr(SPRN_PURR, local_paca->kvm_hstate.host_purr +
 +            purr - vcpu->arch.purr);
 +      mtspr(SPRN_SPURR, local_paca->kvm_hstate.host_spurr +
 +            spurr - vcpu->arch.spurr);
 +      vcpu->arch.purr = purr;
 +      vcpu->arch.spurr = spurr;
 +
 +      vcpu->arch.ic = mfspr(SPRN_IC);
 +      vcpu->arch.pid = mfspr(SPRN_PID);
 +      vcpu->arch.psscr = mfspr(SPRN_PSSCR) & PSSCR_GUEST_VIS;
 +
 +      vcpu->arch.shregs.sprg0 = mfspr(SPRN_SPRG0);
 +      vcpu->arch.shregs.sprg1 = mfspr(SPRN_SPRG1);
 +      vcpu->arch.shregs.sprg2 = mfspr(SPRN_SPRG2);
 +      vcpu->arch.shregs.sprg3 = mfspr(SPRN_SPRG3);
 +
 +      mtspr(SPRN_PSSCR, host_psscr);
 +      mtspr(SPRN_HFSCR, host_hfscr);
 +      mtspr(SPRN_CIABR, host_ciabr);
 +      mtspr(SPRN_DAWR, host_dawr);
 +      mtspr(SPRN_DAWRX, host_dawrx);
 +      mtspr(SPRN_PID, host_pidr);
 +
 +      /*
 +       * Since this is radix, do an eieio; tlbsync; ptesync sequence in
 +       * case we interrupted the guest between a tlbie and a ptesync.
 +       */
 +      asm volatile("eieio; tlbsync; ptesync");
 +
 +      mtspr(SPRN_LPID, vcpu->kvm->arch.host_lpid);    /* restore host LPID */
 +      isync();
 +
 +      vc->dpdes = mfspr(SPRN_DPDES);
 +      vc->vtb = mfspr(SPRN_VTB);
 +      mtspr(SPRN_DPDES, 0);
 +      if (vc->pcr)
 +              mtspr(SPRN_PCR, 0);
 +
 +      if (vc->tb_offset_applied) {
 +              u64 new_tb = mftb() - vc->tb_offset_applied;
 +              mtspr(SPRN_TBU40, new_tb);
 +              tb = mftb();
 +              if ((tb & 0xffffff) < (new_tb & 0xffffff))
 +                      mtspr(SPRN_TBU40, new_tb + 0x1000000);
 +              vc->tb_offset_applied = 0;
 +      }
 +
 +      mtspr(SPRN_HDEC, 0x7fffffff);
 +      mtspr(SPRN_LPCR, vcpu->kvm->arch.host_lpcr);
 +
 +      return trap;
 +}
 +
 +/*
 + * Virtual-mode guest entry for POWER9 and later when the host and
 + * guest are both using the radix MMU.  The LPIDR has already been set.
 + */
 +int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
 +                       unsigned long lpcr)
 +{
 +      struct kvmppc_vcore *vc = vcpu->arch.vcore;
 +      unsigned long host_dscr = mfspr(SPRN_DSCR);
 +      unsigned long host_tidr = mfspr(SPRN_TIDR);
 +      unsigned long host_iamr = mfspr(SPRN_IAMR);
 +      s64 dec;
 +      u64 tb;
 +      int trap, save_pmu;
 +
 +      dec = mfspr(SPRN_DEC);
 +      tb = mftb();
 +      if (dec < 512)
 +              return BOOK3S_INTERRUPT_HV_DECREMENTER;
 +      local_paca->kvm_hstate.dec_expires = dec + tb;
 +      if (local_paca->kvm_hstate.dec_expires < time_limit)
 +              time_limit = local_paca->kvm_hstate.dec_expires;
 +
 +      vcpu->arch.ceded = 0;
 +
 +      kvmhv_save_host_pmu();          /* saves it to PACA kvm_hstate */
 +
 +      kvmppc_subcore_enter_guest();
 +
 +      vc->entry_exit_map = 1;
 +      vc->in_guest = 1;
 +
 +      if (vcpu->arch.vpa.pinned_addr) {
 +              struct lppaca *lp = vcpu->arch.vpa.pinned_addr;
 +              u32 yield_count = be32_to_cpu(lp->yield_count) + 1;
 +              lp->yield_count = cpu_to_be32(yield_count);
 +              vcpu->arch.vpa.dirty = 1;
 +      }
 +
 +      if (cpu_has_feature(CPU_FTR_TM) ||
 +          cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
 +              kvmppc_restore_tm_hv(vcpu, vcpu->arch.shregs.msr, true);
 +
 +      kvmhv_load_guest_pmu(vcpu);
 +
 +      msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX);
 +      load_fp_state(&vcpu->arch.fp);
 +#ifdef CONFIG_ALTIVEC
 +      load_vr_state(&vcpu->arch.vr);
 +#endif
 +
 +      mtspr(SPRN_DSCR, vcpu->arch.dscr);
 +      mtspr(SPRN_IAMR, vcpu->arch.iamr);
 +      mtspr(SPRN_PSPB, vcpu->arch.pspb);
 +      mtspr(SPRN_FSCR, vcpu->arch.fscr);
 +      mtspr(SPRN_TAR, vcpu->arch.tar);
 +      mtspr(SPRN_EBBHR, vcpu->arch.ebbhr);
 +      mtspr(SPRN_EBBRR, vcpu->arch.ebbrr);
 +      mtspr(SPRN_BESCR, vcpu->arch.bescr);
 +      mtspr(SPRN_WORT, vcpu->arch.wort);
 +      mtspr(SPRN_TIDR, vcpu->arch.tid);
 +      mtspr(SPRN_DAR, vcpu->arch.shregs.dar);
 +      mtspr(SPRN_DSISR, vcpu->arch.shregs.dsisr);
 +      mtspr(SPRN_AMR, vcpu->arch.amr);
 +      mtspr(SPRN_UAMOR, vcpu->arch.uamor);
 +
 +      if (!(vcpu->arch.ctrl & 1))
 +              mtspr(SPRN_CTRLT, mfspr(SPRN_CTRLF) & ~1);
 +
 +      mtspr(SPRN_DEC, vcpu->arch.dec_expires - mftb());
 +
 +      if (kvmhv_on_pseries()) {
 +              /* call our hypervisor to load up HV regs and go */
 +              struct hv_guest_state hvregs;
 +
 +              kvmhv_save_hv_regs(vcpu, &hvregs);
 +              hvregs.lpcr = lpcr;
 +              vcpu->arch.regs.msr = vcpu->arch.shregs.msr;
 +              hvregs.version = HV_GUEST_STATE_VERSION;
 +              if (vcpu->arch.nested) {
 +                      hvregs.lpid = vcpu->arch.nested->shadow_lpid;
 +                      hvregs.vcpu_token = vcpu->arch.nested_vcpu_id;
 +              } else {
 +                      hvregs.lpid = vcpu->kvm->arch.lpid;
 +                      hvregs.vcpu_token = vcpu->vcpu_id;
 +              }
 +              hvregs.hdec_expiry = time_limit;
 +              trap = plpar_hcall_norets(H_ENTER_NESTED, __pa(&hvregs),
 +                                        __pa(&vcpu->arch.regs));
 +              kvmhv_restore_hv_return_state(vcpu, &hvregs);
 +              vcpu->arch.shregs.msr = vcpu->arch.regs.msr;
 +              vcpu->arch.shregs.dar = mfspr(SPRN_DAR);
 +              vcpu->arch.shregs.dsisr = mfspr(SPRN_DSISR);
 +
 +              /* H_CEDE has to be handled now, not later */
 +              if (trap == BOOK3S_INTERRUPT_SYSCALL && !vcpu->arch.nested &&
 +                  kvmppc_get_gpr(vcpu, 3) == H_CEDE) {
 +                      kvmppc_nested_cede(vcpu);
 +                      trap = 0;
 +              }
 +      } else {
 +              trap = kvmhv_load_hv_regs_and_go(vcpu, time_limit, lpcr);
 +      }
 +
 +      vcpu->arch.slb_max = 0;
 +      dec = mfspr(SPRN_DEC);
 +      tb = mftb();
 +      vcpu->arch.dec_expires = dec + tb;
 +      vcpu->cpu = -1;
 +      vcpu->arch.thread_cpu = -1;
 +      vcpu->arch.ctrl = mfspr(SPRN_CTRLF);
 +
 +      vcpu->arch.iamr = mfspr(SPRN_IAMR);
 +      vcpu->arch.pspb = mfspr(SPRN_PSPB);
 +      vcpu->arch.fscr = mfspr(SPRN_FSCR);
 +      vcpu->arch.tar = mfspr(SPRN_TAR);
 +      vcpu->arch.ebbhr = mfspr(SPRN_EBBHR);
 +      vcpu->arch.ebbrr = mfspr(SPRN_EBBRR);
 +      vcpu->arch.bescr = mfspr(SPRN_BESCR);
 +      vcpu->arch.wort = mfspr(SPRN_WORT);
 +      vcpu->arch.tid = mfspr(SPRN_TIDR);
 +      vcpu->arch.amr = mfspr(SPRN_AMR);
 +      vcpu->arch.uamor = mfspr(SPRN_UAMOR);
 +      vcpu->arch.dscr = mfspr(SPRN_DSCR);
 +
 +      mtspr(SPRN_PSPB, 0);
 +      mtspr(SPRN_WORT, 0);
 +      mtspr(SPRN_AMR, 0);
 +      mtspr(SPRN_UAMOR, 0);
 +      mtspr(SPRN_DSCR, host_dscr);
 +      mtspr(SPRN_TIDR, host_tidr);
 +      mtspr(SPRN_IAMR, host_iamr);
 +      mtspr(SPRN_PSPB, 0);
 +
 +      msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX);
 +      store_fp_state(&vcpu->arch.fp);
 +#ifdef CONFIG_ALTIVEC
 +      store_vr_state(&vcpu->arch.vr);
 +#endif
 +
 +      if (cpu_has_feature(CPU_FTR_TM) ||
 +          cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
 +              kvmppc_save_tm_hv(vcpu, vcpu->arch.shregs.msr, true);
 +
 +      save_pmu = 1;
 +      if (vcpu->arch.vpa.pinned_addr) {
 +              struct lppaca *lp = vcpu->arch.vpa.pinned_addr;
 +              u32 yield_count = be32_to_cpu(lp->yield_count) + 1;
 +              lp->yield_count = cpu_to_be32(yield_count);
 +              vcpu->arch.vpa.dirty = 1;
 +              save_pmu = lp->pmcregs_in_use;
 +      }
 +
 +      kvmhv_save_guest_pmu(vcpu, save_pmu);
 +
 +      vc->entry_exit_map = 0x101;
 +      vc->in_guest = 0;
 +
 +      mtspr(SPRN_DEC, local_paca->kvm_hstate.dec_expires - mftb());
 +
 +      kvmhv_load_host_pmu();
 +
 +      kvmppc_subcore_exit_guest();
 +
 +      return trap;
 +}
 +
  /*
   * Wait for some other vcpu thread to execute us, and
   * wake us up when we need to handle something in the host.
        trace_kvmppc_vcore_wakeup(do_sleep, block_ns);
  }
  
 +/*
 + * This never fails for a radix guest, as none of the operations it does
 + * for a radix guest can fail or have a way to report failure.
 + * kvmhv_run_single_vcpu() relies on this fact.
 + */
  static int kvmhv_setup_mmu(struct kvm_vcpu *vcpu)
  {
        int r = 0;
@@@ -3934,171 -3404,6 +3933,171 @@@ static int kvmppc_run_vcpu(struct kvm_r
        return vcpu->arch.ret;
  }
  
 +int kvmhv_run_single_vcpu(struct kvm_run *kvm_run,
 +                        struct kvm_vcpu *vcpu, u64 time_limit,
 +                        unsigned long lpcr)
 +{
 +      int trap, r, pcpu;
 +      int srcu_idx;
 +      struct kvmppc_vcore *vc;
 +      struct kvm *kvm = vcpu->kvm;
 +      struct kvm_nested_guest *nested = vcpu->arch.nested;
 +
 +      trace_kvmppc_run_vcpu_enter(vcpu);
 +
 +      kvm_run->exit_reason = 0;
 +      vcpu->arch.ret = RESUME_GUEST;
 +      vcpu->arch.trap = 0;
 +
 +      vc = vcpu->arch.vcore;
 +      vcpu->arch.ceded = 0;
 +      vcpu->arch.run_task = current;
 +      vcpu->arch.kvm_run = kvm_run;
 +      vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb());
 +      vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
 +      vcpu->arch.busy_preempt = TB_NIL;
 +      vcpu->arch.last_inst = KVM_INST_FETCH_FAILED;
 +      vc->runnable_threads[0] = vcpu;
 +      vc->n_runnable = 1;
 +      vc->runner = vcpu;
 +
 +      /* See if the MMU is ready to go */
 +      if (!kvm->arch.mmu_ready)
 +              kvmhv_setup_mmu(vcpu);
 +
 +      if (need_resched())
 +              cond_resched();
 +
 +      kvmppc_update_vpas(vcpu);
 +
 +      init_vcore_to_run(vc);
 +      vc->preempt_tb = TB_NIL;
 +
 +      preempt_disable();
 +      pcpu = smp_processor_id();
 +      vc->pcpu = pcpu;
 +      kvmppc_prepare_radix_vcpu(vcpu, pcpu);
 +
 +      local_irq_disable();
 +      hard_irq_disable();
 +      if (signal_pending(current))
 +              goto sigpend;
 +      if (lazy_irq_pending() || need_resched() || !kvm->arch.mmu_ready)
 +              goto out;
 +
 +      if (!nested) {
 +              kvmppc_core_prepare_to_enter(vcpu);
 +              if (vcpu->arch.doorbell_request) {
 +                      vc->dpdes = 1;
 +                      smp_wmb();
 +                      vcpu->arch.doorbell_request = 0;
 +              }
 +              if (test_bit(BOOK3S_IRQPRIO_EXTERNAL,
 +                           &vcpu->arch.pending_exceptions))
 +                      lpcr |= LPCR_MER;
 +      } else if (vcpu->arch.pending_exceptions ||
 +                 vcpu->arch.doorbell_request ||
 +                 xive_interrupt_pending(vcpu)) {
 +              vcpu->arch.ret = RESUME_HOST;
 +              goto out;
 +      }
 +
 +      kvmppc_clear_host_core(pcpu);
 +
 +      local_paca->kvm_hstate.tid = 0;
 +      local_paca->kvm_hstate.napping = 0;
 +      local_paca->kvm_hstate.kvm_split_mode = NULL;
 +      kvmppc_start_thread(vcpu, vc);
 +      kvmppc_create_dtl_entry(vcpu, vc);
 +      trace_kvm_guest_enter(vcpu);
 +
 +      vc->vcore_state = VCORE_RUNNING;
 +      trace_kvmppc_run_core(vc, 0);
 +
 +      if (cpu_has_feature(CPU_FTR_HVMODE))
 +              kvmppc_radix_check_need_tlb_flush(kvm, pcpu, nested);
 +
 +      trace_hardirqs_on();
 +      guest_enter_irqoff();
 +
 +      srcu_idx = srcu_read_lock(&kvm->srcu);
 +
 +      this_cpu_disable_ftrace();
 +
 +      trap = kvmhv_p9_guest_entry(vcpu, time_limit, lpcr);
 +      vcpu->arch.trap = trap;
 +
 +      this_cpu_enable_ftrace();
 +
 +      srcu_read_unlock(&kvm->srcu, srcu_idx);
 +
 +      if (cpu_has_feature(CPU_FTR_HVMODE)) {
 +              mtspr(SPRN_LPID, kvm->arch.host_lpid);
 +              isync();
 +      }
 +
 +      trace_hardirqs_off();
 +      set_irq_happened(trap);
 +
 +      kvmppc_set_host_core(pcpu);
 +
 +      local_irq_enable();
 +      guest_exit();
 +
 +      cpumask_clear_cpu(pcpu, &kvm->arch.cpu_in_guest);
 +
 +      preempt_enable();
 +
 +      /* cancel pending decrementer exception if DEC is now positive */
 +      if (get_tb() < vcpu->arch.dec_expires && kvmppc_core_pending_dec(vcpu))
 +              kvmppc_core_dequeue_dec(vcpu);
 +
 +      trace_kvm_guest_exit(vcpu);
 +      r = RESUME_GUEST;
 +      if (trap) {
 +              if (!nested)
 +                      r = kvmppc_handle_exit_hv(kvm_run, vcpu, current);
 +              else
 +                      r = kvmppc_handle_nested_exit(vcpu);
 +      }
 +      vcpu->arch.ret = r;
 +
 +      if (is_kvmppc_resume_guest(r) && vcpu->arch.ceded &&
 +          !kvmppc_vcpu_woken(vcpu)) {
 +              kvmppc_set_timer(vcpu);
 +              while (vcpu->arch.ceded && !kvmppc_vcpu_woken(vcpu)) {
 +                      if (signal_pending(current)) {
 +                              vcpu->stat.signal_exits++;
 +                              kvm_run->exit_reason = KVM_EXIT_INTR;
 +                              vcpu->arch.ret = -EINTR;
 +                              break;
 +                      }
 +                      spin_lock(&vc->lock);
 +                      kvmppc_vcore_blocked(vc);
 +                      spin_unlock(&vc->lock);
 +              }
 +      }
 +      vcpu->arch.ceded = 0;
 +
 +      vc->vcore_state = VCORE_INACTIVE;
 +      trace_kvmppc_run_core(vc, 1);
 +
 + done:
 +      kvmppc_remove_runnable(vc, vcpu);
 +      trace_kvmppc_run_vcpu_exit(vcpu, kvm_run);
 +
 +      return vcpu->arch.ret;
 +
 + sigpend:
 +      vcpu->stat.signal_exits++;
 +      kvm_run->exit_reason = KVM_EXIT_INTR;
 +      vcpu->arch.ret = -EINTR;
 + out:
 +      local_irq_enable();
 +      preempt_enable();
 +      goto done;
 +}
 +
  static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
  {
        int r;
        vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
  
        do {
 -              r = kvmppc_run_vcpu(run, vcpu);
 +              /*
 +               * The early POWER9 chips that can't mix radix and HPT threads
 +               * on the same core also need the workaround for the problem
 +               * where the TLB would prefetch entries in the guest exit path
 +               * for radix guests using the guest PIDR value and LPID 0.
 +               * The workaround is in the old path (kvmppc_run_vcpu())
 +               * but not the new path (kvmhv_run_single_vcpu()).
 +               */
 +              if (kvm->arch.threads_indep && kvm_is_radix(kvm) &&
 +                  !no_mixing_hpt_and_radix)
 +                      r = kvmhv_run_single_vcpu(run, vcpu, ~(u64)0,
 +                                                vcpu->arch.vcore->lpcr);
 +              else
 +                      r = kvmppc_run_vcpu(run, vcpu);
  
                if (run->exit_reason == KVM_EXIT_PAPR_HCALL &&
                    !(vcpu->arch.shregs.msr & MSR_PR)) {
@@@ -4266,10 -3558,6 +4265,10 @@@ static int kvm_vm_ioctl_get_smmu_info_h
        kvmppc_add_seg_page_size(&sps, 16, SLB_VSID_L | SLB_VSID_LP_01);
        kvmppc_add_seg_page_size(&sps, 24, SLB_VSID_L);
  
 +      /* If running as a nested hypervisor, we don't support HPT guests */
 +      if (kvmhv_on_pseries())
 +              info->flags |= KVM_PPC_NO_HASH;
 +
        return 0;
  }
  
@@@ -4434,7 -3722,8 +4433,7 @@@ void kvmppc_setup_partition_table(struc
                        __pa(kvm->arch.pgtable) | RADIX_PGD_INDEX_SIZE;
                dw1 = PATB_GR | kvm->arch.process_table;
        }
 -
 -      mmu_partition_table_set_entry(kvm->arch.lpid, dw0, dw1);
 +      kvmhv_set_ptbl_entry(kvm->arch.lpid, dw0, dw1);
  }
  
  /*
@@@ -4530,8 -3819,6 +4529,8 @@@ static int kvmppc_hv_setup_htab_rma(str
  /* Must be called with kvm->lock held and mmu_ready = 0 and no vcpus running */
  int kvmppc_switch_mmu_to_hpt(struct kvm *kvm)
  {
 +      if (nesting_enabled(kvm))
 +              kvmhv_release_all_nested(kvm);
        kvmppc_free_radix(kvm);
        kvmppc_update_lpcr(kvm, LPCR_VPM1,
                           LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR);
@@@ -4553,7 -3840,6 +4552,7 @@@ int kvmppc_switch_mmu_to_radix(struct k
        kvmppc_free_hpt(&kvm->arch.hpt);
        kvmppc_update_lpcr(kvm, LPCR_UPRT | LPCR_GTSE | LPCR_HR,
                           LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR);
 +      kvmppc_rmap_reset(kvm);
        kvm->arch.radix = 1;
        return 0;
  }
@@@ -4653,8 -3939,6 +4652,8 @@@ static int kvmppc_core_init_vm_hv(struc
  
        kvmppc_alloc_host_rm_ops();
  
 +      kvmhv_vm_nested_init(kvm);
 +
        /*
         * Since we don't flush the TLB when tearing down a VM,
         * and this lpid might have previously been used,
                kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
  
        /* Init LPCR for virtual RMA mode */
 -      kvm->arch.host_lpid = mfspr(SPRN_LPID);
 -      kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR);
 -      lpcr &= LPCR_PECE | LPCR_LPES;
 +      if (cpu_has_feature(CPU_FTR_HVMODE)) {
 +              kvm->arch.host_lpid = mfspr(SPRN_LPID);
 +              kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR);
 +              lpcr &= LPCR_PECE | LPCR_LPES;
 +      } else {
 +              lpcr = 0;
 +      }
        lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE |
                LPCR_VPM0 | LPCR_VPM1;
        kvm->arch.vrma_slb_v = SLB_VSID_B_1T |
         * On POWER9, we only need to do this if the "indep_threads_mode"
         * module parameter has been set to N.
         */
 -      if (cpu_has_feature(CPU_FTR_ARCH_300))
 -              kvm->arch.threads_indep = indep_threads_mode;
 +      if (cpu_has_feature(CPU_FTR_ARCH_300)) {
 +              if (!indep_threads_mode && !cpu_has_feature(CPU_FTR_HVMODE)) {
 +                      pr_warn("KVM: Ignoring indep_threads_mode=N in nested hypervisor\n");
 +                      kvm->arch.threads_indep = true;
 +              } else {
 +                      kvm->arch.threads_indep = indep_threads_mode;
 +              }
 +      }
        if (!kvm->arch.threads_indep)
                kvm_hv_vm_activated();
  
        snprintf(buf, sizeof(buf), "vm%d", current->pid);
        kvm->arch.debugfs_dir = debugfs_create_dir(buf, kvm_debugfs_dir);
        kvmppc_mmu_debugfs_init(kvm);
 +      if (radix_enabled())
 +              kvmhv_radix_debugfs_init(kvm);
  
        return 0;
  }
@@@ -4800,21 -4072,13 +4799,21 @@@ static void kvmppc_core_destroy_vm_hv(s
  
        kvmppc_free_vcores(kvm);
  
 -      kvmppc_free_lpid(kvm->arch.lpid);
  
        if (kvm_is_radix(kvm))
                kvmppc_free_radix(kvm);
        else
                kvmppc_free_hpt(&kvm->arch.hpt);
  
 +      /* Perform global invalidation and return lpid to the pool */
 +      if (cpu_has_feature(CPU_FTR_ARCH_300)) {
 +              if (nesting_enabled(kvm))
 +                      kvmhv_release_all_nested(kvm);
 +              kvm->arch.process_table = 0;
 +              kvmhv_set_ptbl_entry(kvm->arch.lpid, 0, 0);
 +      }
 +      kvmppc_free_lpid(kvm->arch.lpid);
 +
        kvmppc_free_pimap(kvm);
  }
  
@@@ -4839,15 -4103,11 +4838,15 @@@ static int kvmppc_core_emulate_mfspr_hv
  
  static int kvmppc_core_check_processor_compat_hv(void)
  {
 -      if (!cpu_has_feature(CPU_FTR_HVMODE) ||
 -          !cpu_has_feature(CPU_FTR_ARCH_206))
 -              return -EIO;
 +      if (cpu_has_feature(CPU_FTR_HVMODE) &&
 +          cpu_has_feature(CPU_FTR_ARCH_206))
 +              return 0;
  
 -      return 0;
 +      /* POWER9 in radix mode is capable of being a nested hypervisor. */
 +      if (cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled())
 +              return 0;
 +
 +      return -EIO;
  }
  
  #ifdef CONFIG_KVM_XICS
@@@ -5165,10 -4425,6 +5164,10 @@@ static int kvmhv_configure_mmu(struct k
        if (radix && !radix_enabled())
                return -EINVAL;
  
 +      /* If we're a nested hypervisor, we currently only support radix */
 +      if (kvmhv_on_pseries() && !radix)
 +              return -EINVAL;
 +
        mutex_lock(&kvm->lock);
        if (radix != kvm_is_radix(kvm)) {
                if (kvm->arch.mmu_ready) {
        return err;
  }
  
 +static int kvmhv_enable_nested(struct kvm *kvm)
 +{
 +      if (!nested)
 +              return -EPERM;
 +      if (!cpu_has_feature(CPU_FTR_ARCH_300) || no_mixing_hpt_and_radix)
 +              return -ENODEV;
 +
 +      /* kvm == NULL means the caller is testing if the capability exists */
 +      if (kvm)
 +              kvm->arch.nested_enable = true;
 +      return 0;
 +}
 +
  static struct kvmppc_ops kvm_ops_hv = {
        .get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv,
        .set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv,
        .configure_mmu = kvmhv_configure_mmu,
        .get_rmmu_info = kvmhv_get_rmmu_info,
        .set_smt_mode = kvmhv_set_smt_mode,
 +      .enable_nested = kvmhv_enable_nested,
  };
  
  static int kvm_init_subcore_bitmap(void)
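
The .enable_nested hook added just above is how userspace opts a VM into
the new nested-virtualization support.  A rough userspace sketch follows;
it assumes the capability is exposed as KVM_CAP_PPC_NESTED_HV and is
enabled with the standard KVM_ENABLE_CAP vm ioctl (neither name appears
in this diff, so treat both as inferred):

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    int main(void)
    {
            int kvm = open("/dev/kvm", O_RDWR);
            int vm = kvm < 0 ? -1 : ioctl(kvm, KVM_CREATE_VM, 0);
            struct kvm_enable_cap cap;

            if (vm < 0) {
                    perror("kvm");
                    return 1;
            }
            memset(&cap, 0, sizeof(cap));
            cap.cap = KVM_CAP_PPC_NESTED_HV;  /* assumed capability name */

            if (ioctl(vm, KVM_CHECK_EXTENSION, KVM_CAP_PPC_NESTED_HV) > 0 &&
                ioctl(vm, KVM_ENABLE_CAP, &cap) == 0)
                    printf("nested HV enabled for this VM\n");
            else
                    perror("nested HV not available");
            return 0;
    }
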
@@@ -5304,10 -4546,6 +5303,10 @@@ static int kvmppc_book3s_init_hv(void
        if (r < 0)
                return -ENODEV;
  
 +      r = kvmhv_nested_init();
 +      if (r)
 +              return r;
 +
        r = kvm_init_subcore_bitmap();
        if (r)
                return r;
         * indirectly, via OPAL.
         */
  #ifdef CONFIG_SMP
 -      if (!xive_enabled() && !local_paca->kvm_hstate.xics_phys) {
 +      if (!xive_enabled() && !kvmhv_on_pseries() &&
 +          !local_paca->kvm_hstate.xics_phys) {
                struct device_node *np;
  
                np = of_find_compatible_node(NULL, NULL, "ibm,opal-intc");
@@@ -5367,7 -4604,6 +5366,7 @@@ static void kvmppc_book3s_exit_hv(void
        if (kvmppc_radix_possible())
                kvmppc_radix_exit();
        kvmppc_hv_ops = NULL;
 +      kvmhv_nested_exit();
  }
  
  module_init(kvmppc_book3s_init_hv);