Merge branch 'akpm' (patches from Andrew)
authorLinus Torvalds <torvalds@linux-foundation.org>
Fri, 8 Mar 2019 03:25:37 +0000 (19:25 -0800)
committerLinus Torvalds <torvalds@linux-foundation.org>
Fri, 8 Mar 2019 03:25:37 +0000 (19:25 -0800)
Merge more updates from Andrew Morton:

 - some of the rest of MM

 - various misc things

 - dynamic-debug updates

 - checkpatch

 - some epoll speedups

 - autofs

 - rapidio

 - lib/, lib/lzo/ updates

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (83 commits)
  samples/mic/mpssd/mpssd.h: remove duplicate header
  kernel/fork.c: remove duplicated include
  include/linux/relay.h: fix percpu annotation in struct rchan
  arch/nios2/mm/fault.c: remove duplicate include
  unicore32: stop printing the virtual memory layout
  MAINTAINERS: fix GTA02 entry and mark as orphan
  mm: create the new vm_fault_t type
  arm, s390, unicore32: remove oneliner wrappers for memblock_alloc()
  arch: simplify several early memory allocations
  openrisc: simplify pte_alloc_one_kernel()
  sh: prefer memblock APIs returning virtual address
  microblaze: prefer memblock API returning virtual address
  powerpc: prefer memblock APIs returning virtual address
  lib/lzo: separate lzo-rle from lzo
  lib/lzo: implement run-length encoding
  lib/lzo: fast 8-byte copy on arm64
  lib/lzo: 64-bit CTZ on arm64
  lib/lzo: tidy-up ifdefs
  ipc/sem.c: replace kvmalloc/memset with kvzalloc and use struct_size
  ipc: annotate implicit fall through
  ...

547 files changed:
Documentation/admin-guide/LSM/SafeSetID.rst [new file with mode: 0644]
Documentation/admin-guide/LSM/index.rst
Documentation/admin-guide/cgroup-v2.rst
Documentation/admin-guide/kernel-parameters.txt
Documentation/cgroup-v1/pids.txt
Documentation/filesystems/xfs.txt
Documentation/kdump/vmcoreinfo.txt [new file with mode: 0644]
Documentation/xtensa/booting.txt [new file with mode: 0644]
MAINTAINERS
arch/powerpc/Kconfig
arch/powerpc/Kconfig.debug
arch/powerpc/Makefile
arch/powerpc/boot/dts/Makefile
arch/powerpc/boot/dts/akebono.dts
arch/powerpc/boot/dts/bluestone.dts
arch/powerpc/boot/dts/currituck.dts
arch/powerpc/boot/dts/iss4xx-mpic.dts
arch/powerpc/boot/dts/wii.dts
arch/powerpc/include/asm/asm-prototypes.h
arch/powerpc/include/asm/book3s/32/mmu-hash.h
arch/powerpc/include/asm/book3s/32/pgtable.h
arch/powerpc/include/asm/book3s/64/hash.h
arch/powerpc/include/asm/book3s/64/mmu-hash.h
arch/powerpc/include/asm/book3s/64/pgalloc.h
arch/powerpc/include/asm/book3s/64/pgtable.h
arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
arch/powerpc/include/asm/checksum.h
arch/powerpc/include/asm/device.h
arch/powerpc/include/asm/dma-direct.h
arch/powerpc/include/asm/dma-mapping.h
arch/powerpc/include/asm/eeh.h
arch/powerpc/include/asm/eeh_event.h
arch/powerpc/include/asm/exception-64s.h
arch/powerpc/include/asm/hvsi.h
arch/powerpc/include/asm/iommu.h
arch/powerpc/include/asm/ipic.h
arch/powerpc/include/asm/irq.h
arch/powerpc/include/asm/kvm_ppc.h
arch/powerpc/include/asm/livepatch.h
arch/powerpc/include/asm/machdep.h
arch/powerpc/include/asm/mce.h
arch/powerpc/include/asm/mmu.h
arch/powerpc/include/asm/nmi.h
arch/powerpc/include/asm/nohash/32/mmu-8xx.h
arch/powerpc/include/asm/page.h
arch/powerpc/include/asm/pci-bridge.h
arch/powerpc/include/asm/pci.h
arch/powerpc/include/asm/pgtable.h
arch/powerpc/include/asm/powernv.h
arch/powerpc/include/asm/ppc-opcode.h
arch/powerpc/include/asm/ppc-pci.h
arch/powerpc/include/asm/processor.h
arch/powerpc/include/asm/ptrace.h
arch/powerpc/include/asm/reg.h
arch/powerpc/include/asm/sections.h
arch/powerpc/include/asm/smp.h
arch/powerpc/include/asm/swiotlb.h
arch/powerpc/include/asm/task_size_32.h [new file with mode: 0644]
arch/powerpc/include/asm/task_size_64.h [new file with mode: 0644]
arch/powerpc/include/asm/thread_info.h
arch/powerpc/include/asm/topology.h
arch/powerpc/kernel/Makefile
arch/powerpc/kernel/asm-offsets.c
arch/powerpc/kernel/cpu_setup_6xx.S
arch/powerpc/kernel/dma-iommu.c
arch/powerpc/kernel/dma-mask.c [new file with mode: 0644]
arch/powerpc/kernel/dma-swiotlb.c
arch/powerpc/kernel/dma.c [deleted file]
arch/powerpc/kernel/dt_cpu_ftrs.c
arch/powerpc/kernel/eeh.c
arch/powerpc/kernel/eeh_cache.c
arch/powerpc/kernel/eeh_driver.c
arch/powerpc/kernel/eeh_event.c
arch/powerpc/kernel/eeh_pe.c
arch/powerpc/kernel/eeh_sysfs.c
arch/powerpc/kernel/entry_32.S
arch/powerpc/kernel/entry_64.S
arch/powerpc/kernel/epapr_hcalls.S
arch/powerpc/kernel/exceptions-64e.S
arch/powerpc/kernel/exceptions-64s.S
arch/powerpc/kernel/head_32.S
arch/powerpc/kernel/head_40x.S
arch/powerpc/kernel/head_44x.S
arch/powerpc/kernel/head_64.S
arch/powerpc/kernel/head_8xx.S
arch/powerpc/kernel/head_booke.h
arch/powerpc/kernel/head_fsl_booke.S
arch/powerpc/kernel/idle_6xx.S
arch/powerpc/kernel/idle_book3e.S
arch/powerpc/kernel/idle_e500.S
arch/powerpc/kernel/idle_power4.S
arch/powerpc/kernel/irq.c
arch/powerpc/kernel/kgdb.c
arch/powerpc/kernel/machine_kexec_64.c
arch/powerpc/kernel/mce.c
arch/powerpc/kernel/misc_32.S
arch/powerpc/kernel/pci-common.c
arch/powerpc/kernel/process.c
arch/powerpc/kernel/ptrace.c
arch/powerpc/kernel/setup-common.c
arch/powerpc/kernel/setup_32.c
arch/powerpc/kernel/setup_64.c
arch/powerpc/kernel/smp.c
arch/powerpc/kernel/stacktrace.c
arch/powerpc/kernel/syscalls.c
arch/powerpc/kernel/syscalls/syscalltbl.sh
arch/powerpc/kernel/systbl.S
arch/powerpc/kernel/time.c
arch/powerpc/kernel/trace/Makefile
arch/powerpc/kernel/trace/ftrace_64_mprofile.S
arch/powerpc/kernel/traps.c
arch/powerpc/kernel/udbg.c
arch/powerpc/kernel/vdso32/Makefile
arch/powerpc/kernel/vdso64/Makefile
arch/powerpc/kernel/vmlinux.lds.S
arch/powerpc/kvm/Makefile
arch/powerpc/kvm/book3s.c
arch/powerpc/kvm/book3s_hv.c
arch/powerpc/kvm/book3s_hv_hmi.c
arch/powerpc/kvm/book3s_hv_ras.c
arch/powerpc/kvm/book3s_hv_rmhandlers.S
arch/powerpc/lib/Makefile
arch/powerpc/lib/sstep.c
arch/powerpc/lib/test_emulate_step.c
arch/powerpc/lib/test_emulate_step_exec_instr.S [new file with mode: 0644]
arch/powerpc/math-emu/Makefile
arch/powerpc/mm/40x_mmu.c
arch/powerpc/mm/44x_mmu.c
arch/powerpc/mm/8xx_mmu.c
arch/powerpc/mm/Makefile
arch/powerpc/mm/dma-noncoherent.c
arch/powerpc/mm/fsl_booke_mmu.c
arch/powerpc/mm/hash_low_32.S
arch/powerpc/mm/hash_utils_64.c
arch/powerpc/mm/hugetlbpage-hash64.c
arch/powerpc/mm/hugetlbpage-radix.c
arch/powerpc/mm/init_32.c
arch/powerpc/mm/init_64.c
arch/powerpc/mm/mem.c
arch/powerpc/mm/mmu_decl.h
arch/powerpc/mm/numa.c
arch/powerpc/mm/pgtable_32.c
arch/powerpc/mm/ppc_mmu_32.c
arch/powerpc/mm/ptdump/8xx.c [moved from arch/powerpc/mm/dump_linuxpagetables-8xx.c with 97% similarity]
arch/powerpc/mm/ptdump/Makefile [new file with mode: 0644]
arch/powerpc/mm/ptdump/bats.c [moved from arch/powerpc/mm/dump_bats.c with 100% similarity]
arch/powerpc/mm/ptdump/book3s64.c [moved from arch/powerpc/mm/dump_linuxpagetables-book3s64.c with 98% similarity]
arch/powerpc/mm/ptdump/hashpagetable.c [moved from arch/powerpc/mm/dump_hashpagetable.c with 99% similarity]
arch/powerpc/mm/ptdump/ptdump.c [moved from arch/powerpc/mm/dump_linuxpagetables.c with 94% similarity]
arch/powerpc/mm/ptdump/ptdump.h [moved from arch/powerpc/mm/dump_linuxpagetables.h with 100% similarity]
arch/powerpc/mm/ptdump/segment_regs.c [moved from arch/powerpc/mm/dump_sr.c with 100% similarity]
arch/powerpc/mm/ptdump/shared.c [moved from arch/powerpc/mm/dump_linuxpagetables-generic.c with 97% similarity]
arch/powerpc/mm/slb.c
arch/powerpc/mm/slice.c
arch/powerpc/mm/tlb_nohash.c
arch/powerpc/net/bpf_jit32.h
arch/powerpc/perf/power9-events-list.h
arch/powerpc/perf/power9-pmu.c
arch/powerpc/platforms/44x/Kconfig
arch/powerpc/platforms/44x/ppc476.c
arch/powerpc/platforms/44x/warp.c
arch/powerpc/platforms/83xx/suspend-asm.S
arch/powerpc/platforms/85xx/corenet_generic.c
arch/powerpc/platforms/85xx/ge_imp3a.c
arch/powerpc/platforms/85xx/mpc8536_ds.c
arch/powerpc/platforms/85xx/mpc85xx_ds.c
arch/powerpc/platforms/85xx/mpc85xx_mds.c
arch/powerpc/platforms/85xx/p1010rdb.c
arch/powerpc/platforms/85xx/p1022_ds.c
arch/powerpc/platforms/85xx/p1022_rdk.c
arch/powerpc/platforms/85xx/qemu_e500.c
arch/powerpc/platforms/86xx/mpc86xx_hpcn.c
arch/powerpc/platforms/Kconfig.cputype
arch/powerpc/platforms/cell/iommu.c
arch/powerpc/platforms/cell/spu_callbacks.c
arch/powerpc/platforms/cell/spu_syscalls.c
arch/powerpc/platforms/cell/spufs/file.c
arch/powerpc/platforms/embedded6xx/wii.c
arch/powerpc/platforms/pasemi/iommu.c
arch/powerpc/platforms/pasemi/setup.c
arch/powerpc/platforms/powernv/Makefile
arch/powerpc/platforms/powernv/idle.c
arch/powerpc/platforms/powernv/npu-dma.c
arch/powerpc/platforms/powernv/opal-call.c [new file with mode: 0644]
arch/powerpc/platforms/powernv/opal-msglog.c
arch/powerpc/platforms/powernv/opal-wrappers.S
arch/powerpc/platforms/powernv/opal.c
arch/powerpc/platforms/powernv/pci-ioda-tce.c
arch/powerpc/platforms/powernv/pci-ioda.c
arch/powerpc/platforms/powernv/smp.c
arch/powerpc/platforms/ps3/device-init.c
arch/powerpc/platforms/ps3/os-area.c
arch/powerpc/platforms/ps3/system-bus.c
arch/powerpc/platforms/pseries/hotplug-cpu.c
arch/powerpc/platforms/pseries/iommu.c
arch/powerpc/platforms/pseries/lparcfg.c
arch/powerpc/platforms/pseries/vio.c
arch/powerpc/sysdev/6xx-suspend.S
arch/powerpc/sysdev/dart_iommu.c
arch/powerpc/sysdev/fsl_pci.c
arch/powerpc/sysdev/ipic.c
arch/powerpc/sysdev/tsi108_dev.c
arch/powerpc/sysdev/xive/common.c
arch/powerpc/xmon/Makefile
arch/powerpc/xmon/ppc-dis.c
arch/powerpc/xmon/xmon.c
arch/riscv/Kconfig
arch/riscv/include/asm/fixmap.h [new file with mode: 0644]
arch/riscv/include/asm/pgtable.h
arch/riscv/include/asm/smp.h
arch/riscv/kernel/cpu.c
arch/riscv/kernel/cpufeature.c
arch/riscv/kernel/ftrace.c
arch/riscv/kernel/setup.c
arch/riscv/kernel/smp.c
arch/riscv/kernel/smpboot.c
arch/riscv/mm/init.c
arch/x86/Makefile
arch/x86/boot/Makefile
arch/x86/boot/compressed/Makefile
arch/x86/boot/compressed/acpi.c [new file with mode: 0644]
arch/x86/boot/compressed/cmdline.c
arch/x86/boot/compressed/head_64.S
arch/x86/boot/compressed/kaslr.c
arch/x86/boot/compressed/misc.c
arch/x86/boot/compressed/misc.h
arch/x86/boot/compressed/pgtable_64.c
arch/x86/boot/compressed/vmlinux.lds.S
arch/x86/boot/setup.ld
arch/x86/boot/string.c
arch/x86/boot/string.h
arch/x86/configs/i386_defconfig
arch/x86/configs/x86_64_defconfig
arch/x86/events/intel/core.c
arch/x86/events/intel/lbr.c
arch/x86/include/asm/asm-prototypes.h
arch/x86/include/asm/fpu/internal.h
arch/x86/include/asm/fpu/types.h
arch/x86/include/asm/pgtable_64.h
arch/x86/include/asm/processor.h
arch/x86/include/asm/uaccess.h
arch/x86/include/asm/uv/bios.h
arch/x86/kernel/acpi/wakeup_32.S
arch/x86/kernel/acpi/wakeup_64.S
arch/x86/kernel/apic/io_apic.c
arch/x86/kernel/cpu/cacheinfo.c
arch/x86/kernel/cpu/mtrr/cleanup.c
arch/x86/kernel/cpu/resctrl/pseudo_lock.c
arch/x86/kernel/e820.c
arch/x86/kernel/fpu/xstate.c
arch/x86/kernel/hw_breakpoint.c
arch/x86/kernel/kexec-bzimage64.c
arch/x86/kernel/kgdb.c
arch/x86/kernel/machine_kexec_64.c
arch/x86/kernel/smpboot.c
arch/x86/kernel/traps.c
arch/x86/kernel/uprobes.c
arch/x86/kernel/vmlinux.lds.S
arch/x86/lib/insn-eval.c
arch/x86/mm/cpu_entry_area.c
arch/x86/mm/dump_pagetables.c
arch/x86/mm/tlb.c
arch/x86/platform/uv/bios_uv.c
arch/x86/platform/uv/tlb_uv.c
arch/x86/realmode/rm/Makefile
arch/x86/realmode/rm/realmode.lds.S
arch/xtensa/Kconfig
arch/xtensa/include/asm/Kbuild
arch/xtensa/include/asm/cmpxchg.h
arch/xtensa/include/asm/spinlock.h
arch/xtensa/include/asm/spinlock_types.h
arch/xtensa/include/asm/thread_info.h
arch/xtensa/kernel/process.c
arch/xtensa/kernel/smp.c
arch/xtensa/kernel/time.c
arch/xtensa/kernel/traps.c
drivers/misc/cxl/guest.c
drivers/misc/cxl/pci.c
drivers/misc/cxl/vphb.c
drivers/net/ethernet/pasemi/pasemi_mac.c
drivers/tty/tty_audit.c
drivers/vfio/vfio_spapr_eeh.c
fs/Makefile
fs/btrfs/acl.c
fs/btrfs/async-thread.c
fs/btrfs/backref.c
fs/btrfs/compression.c
fs/btrfs/compression.h
fs/btrfs/ctree.c
fs/btrfs/ctree.h
fs/btrfs/delayed-ref.c
fs/btrfs/delayed-ref.h
fs/btrfs/dev-replace.c
fs/btrfs/disk-io.c
fs/btrfs/extent-tree.c
fs/btrfs/extent_io.c
fs/btrfs/extent_io.h
fs/btrfs/extent_map.c
fs/btrfs/extent_map.h
fs/btrfs/file.c
fs/btrfs/inode.c
fs/btrfs/ioctl.c
fs/btrfs/locking.c
fs/btrfs/locking.h
fs/btrfs/lzo.c
fs/btrfs/qgroup.c
fs/btrfs/qgroup.h
fs/btrfs/ref-verify.c
fs/btrfs/relocation.c
fs/btrfs/root-tree.c
fs/btrfs/scrub.c
fs/btrfs/super.c
fs/btrfs/transaction.c
fs/btrfs/tree-defrag.c
fs/btrfs/tree-log.c
fs/btrfs/volumes.c
fs/btrfs/volumes.h
fs/btrfs/zlib.c
fs/btrfs/zstd.c
fs/ext2/dir.c
fs/ext2/ext2.h
fs/ext2/file.c
fs/ext2/ialloc.c
fs/ext2/inode.c
fs/ext2/namei.c
fs/ext2/super.c
fs/ext2/symlink.c
fs/ext2/xattr.c
fs/fs_types.c [new file with mode: 0644]
fs/namei.c
fs/namespace.c
fs/notify/fanotify/Kconfig
fs/notify/fanotify/fanotify.c
fs/notify/fanotify/fanotify.h
fs/notify/fanotify/fanotify_user.c
fs/notify/fsnotify.c
fs/notify/inotify/inotify.h
fs/notify/inotify/inotify_fsnotify.c
fs/notify/inotify/inotify_user.c
fs/notify/mark.c
fs/notify/notification.c
fs/proc/base.c
fs/proc/internal.h
fs/statfs.c
fs/udf/super.c
fs/xfs/libxfs/xfs_ag.c
fs/xfs/libxfs/xfs_ag_resv.c
fs/xfs/libxfs/xfs_alloc.c
fs/xfs/libxfs/xfs_alloc_btree.c
fs/xfs/libxfs/xfs_attr.c
fs/xfs/libxfs/xfs_attr.h
fs/xfs/libxfs/xfs_attr_leaf.c
fs/xfs/libxfs/xfs_attr_remote.c
fs/xfs/libxfs/xfs_bmap.c
fs/xfs/libxfs/xfs_bmap.h
fs/xfs/libxfs/xfs_bmap_btree.c
fs/xfs/libxfs/xfs_da_btree.c
fs/xfs/libxfs/xfs_da_format.h
fs/xfs/libxfs/xfs_dir2.c
fs/xfs/libxfs/xfs_dir2.h
fs/xfs/libxfs/xfs_dir2_block.c
fs/xfs/libxfs/xfs_dir2_data.c
fs/xfs/libxfs/xfs_dir2_leaf.c
fs/xfs/libxfs/xfs_dir2_node.c
fs/xfs/libxfs/xfs_dquot_buf.c
fs/xfs/libxfs/xfs_errortag.h
fs/xfs/libxfs/xfs_ialloc.c
fs/xfs/libxfs/xfs_ialloc_btree.c
fs/xfs/libxfs/xfs_iext_tree.c
fs/xfs/libxfs/xfs_inode_buf.c
fs/xfs/libxfs/xfs_inode_fork.h
fs/xfs/libxfs/xfs_refcount_btree.c
fs/xfs/libxfs/xfs_rmap_btree.c
fs/xfs/libxfs/xfs_sb.c
fs/xfs/libxfs/xfs_shared.h
fs/xfs/libxfs/xfs_symlink_remote.c
fs/xfs/libxfs/xfs_types.c
fs/xfs/libxfs/xfs_types.h
fs/xfs/scrub/agheader.c
fs/xfs/scrub/agheader_repair.c
fs/xfs/scrub/attr.c
fs/xfs/scrub/bmap.c
fs/xfs/scrub/dir.c
fs/xfs/scrub/ialloc.c
fs/xfs/scrub/repair.c
fs/xfs/scrub/repair.h
fs/xfs/scrub/rtbitmap.c
fs/xfs/scrub/trace.h
fs/xfs/xfs_aops.c
fs/xfs/xfs_aops.h
fs/xfs/xfs_attr_list.c
fs/xfs/xfs_bmap_util.c
fs/xfs/xfs_buf.c
fs/xfs/xfs_buf.h
fs/xfs/xfs_error.c
fs/xfs/xfs_error.h
fs/xfs/xfs_file.c
fs/xfs/xfs_fsops.c
fs/xfs/xfs_globals.c
fs/xfs/xfs_inode.c
fs/xfs/xfs_inode.h
fs/xfs/xfs_iomap.c
fs/xfs/xfs_iomap.h
fs/xfs/xfs_iops.c
fs/xfs/xfs_log_recover.c
fs/xfs/xfs_mount.c
fs/xfs/xfs_mount.h
fs/xfs/xfs_ondisk.h
fs/xfs/xfs_pnfs.c
fs/xfs/xfs_reflink.c
fs/xfs/xfs_reflink.h
fs/xfs/xfs_super.c
fs/xfs/xfs_sysctl.h
fs/xfs/xfs_sysfs.c
fs/xfs/xfs_trace.h
fs/xfs/xfs_trans_bmap.c
fs/xfs/xfs_trans_buf.c
fs/xfs/xfs_trans_extfree.c
fs/xfs/xfs_trans_refcount.c
fs/xfs/xfs_trans_rmap.c
fs/xfs/xfs_xattr.c
include/linux/audit.h
include/linux/capability.h
include/linux/cgroup-defs.h
include/linux/cgroup.h
include/linux/cred.h
include/linux/fanotify.h
include/linux/fs.h
include/linux/fs_types.h [new file with mode: 0644]
include/linux/fsnotify.h
include/linux/fsnotify_backend.h
include/linux/kprobes.h
include/linux/lsm_hooks.h
include/linux/namei.h
include/linux/sched.h
include/linux/security.h
include/linux/selinux.h [deleted file]
include/linux/statfs.h
include/linux/swiotlb.h
include/trace/events/btrfs.h
include/uapi/linux/btrfs.h
include/uapi/linux/fanotify.h
init/init_task.c
kernel/audit.c
kernel/audit.h
kernel/audit_fsnotify.c
kernel/audit_tree.c
kernel/audit_watch.c
kernel/auditfilter.c
kernel/auditsc.c
kernel/capability.c
kernel/cgroup/cgroup.c
kernel/cgroup/cpuset.c
kernel/cgroup/pids.c
kernel/cgroup/rstat.c
kernel/cred.c
kernel/dma/Kconfig
kernel/dma/direct.c
kernel/dma/mapping.c
kernel/dma/swiotlb.c
kernel/exit.c
kernel/resource.c
kernel/seccomp.c
kernel/sys.c
kernel/trace/trace_events_filter.c
kernel/trace/trace_kprobe.c
kernel/workqueue.c
mm/gup.c
mm/percpu-km.c
mm/percpu.c
security/Kconfig
security/Makefile
security/apparmor/Kconfig
security/apparmor/audit.c
security/apparmor/capability.c
security/apparmor/domain.c
security/apparmor/include/audit.h
security/apparmor/include/capability.h
security/apparmor/include/cred.h
security/apparmor/include/file.h
security/apparmor/include/lib.h
security/apparmor/include/task.h
security/apparmor/ipc.c
security/apparmor/lsm.c
security/apparmor/resource.c
security/apparmor/task.c
security/commoncap.c
security/integrity/ima/ima.h
security/integrity/ima/ima_appraise.c
security/integrity/ima/ima_policy.c
security/integrity/ima/ima_template_lib.c
security/keys/keyctl.c
security/keys/keyring.c
security/keys/process_keys.c
security/keys/request_key.c
security/loadpin/loadpin.c
security/safesetid/Kconfig [new file with mode: 0644]
security/safesetid/Makefile [new file with mode: 0644]
security/safesetid/lsm.c [new file with mode: 0644]
security/safesetid/lsm.h [new file with mode: 0644]
security/safesetid/securityfs.c [new file with mode: 0644]
security/security.c
security/selinux/Kconfig
security/selinux/Makefile
security/selinux/avc.c
security/selinux/exports.c [deleted file]
security/selinux/hooks.c
security/selinux/include/audit.h
security/selinux/include/avc.h
security/selinux/include/objsec.h
security/selinux/include/security.h
security/selinux/selinuxfs.c
security/selinux/ss/services.c
security/selinux/xfrm.c
security/smack/smack.h
security/smack/smack_access.c
security/smack/smack_lsm.c
security/smack/smackfs.c
security/tomoyo/audit.c
security/tomoyo/common.c
security/tomoyo/common.h
security/tomoyo/condition.c
security/tomoyo/domain.c
security/tomoyo/file.c
security/tomoyo/gc.c
security/tomoyo/group.c
security/tomoyo/load_policy.c
security/tomoyo/memory.c
security/tomoyo/mount.c
security/tomoyo/realpath.c
security/tomoyo/securityfs_if.c
security/tomoyo/tomoyo.c
security/tomoyo/util.c
security/yama/yama_lsm.c
tools/testing/selftests/powerpc/benchmarks/null_syscall.c
tools/testing/selftests/powerpc/include/reg.h
tools/testing/selftests/powerpc/include/utils.h
tools/testing/selftests/powerpc/pmu/ebb/fork_cleanup_test.c
tools/testing/selftests/powerpc/tm/.gitignore
tools/testing/selftests/powerpc/tm/Makefile
tools/testing/selftests/powerpc/tm/tm-signal-context-force-tm.c [new file with mode: 0644]
tools/testing/selftests/safesetid/.gitignore [new file with mode: 0644]
tools/testing/selftests/safesetid/Makefile [new file with mode: 0644]
tools/testing/selftests/safesetid/config [new file with mode: 0644]
tools/testing/selftests/safesetid/safesetid-test.c [new file with mode: 0644]
tools/testing/selftests/safesetid/safesetid-test.sh [new file with mode: 0755]
tools/testing/selftests/vm/map_hugetlb.c

diff --git a/Documentation/admin-guide/LSM/SafeSetID.rst b/Documentation/admin-guide/LSM/SafeSetID.rst
new file mode 100644 (file)
index 0000000..212434e
--- /dev/null
@@ -0,0 +1,107 @@
+=========
+SafeSetID
+=========
+SafeSetID is an LSM module that gates the setid family of syscalls to restrict
+UID/GID transitions from a given UID/GID to only those approved by a
+system-wide whitelist. These restrictions also prohibit the given UIDs/GIDs
+from obtaining auxiliary privileges associated with CAP_SET{U/G}ID, such as
+allowing a user to set up user namespace UID mappings.
+
+
+Background
+==========
+In the absence of file capabilities, processes spawned on a Linux system that
+need to switch to a different user must be spawned with CAP_SETUID privileges.
+CAP_SETUID is granted to programs running as root or to non-root programs that
+have been explicitly given the CAP_SETUID runtime capability. It is often
+preferable to use Linux runtime capabilities rather than file capabilities,
+since using file capabilities to run a program with elevated privileges opens
+up possible security holes: any user with access to the file can exec() that
+program to gain the elevated privileges.
+
+While it is possible to implement a tree of processes by giving full
+CAP_SET{U/G}ID capabilities, this is often at odds with the goals of running a
+tree of processes under non-root user(s) in the first place. Specifically,
+since CAP_SETUID allows changing to any user on the system, including the root
+user, it is an overpowered capability for what is needed in this scenario,
+especially since programs often only call setuid() to drop privileges to a
+lesser-privileged user -- not elevate privileges. Unfortunately, there is no
+generally feasible way in Linux to restrict the potential UIDs that a user can
+switch to through setuid() beyond allowing a switch to any user on the system.
+This SafeSetID LSM seeks to provide a solution for restricting setid
+capabilities in such a way.
+
+The main use case for this LSM is to allow a non-root program to transition to
+other untrusted uids without full blown CAP_SETUID capabilities. The non-root
+program would still need CAP_SETUID to do any kind of transition, but the
+additional restrictions imposed by this LSM would mean it is a "safer" version
+of CAP_SETUID since the non-root program cannot take advantage of CAP_SETUID to
+do any unapproved actions (e.g. setuid to uid 0 or create/enter new user
+namespace). The higher level goal is to allow for uid-based sandboxing of system
+services without having to give out CAP_SETUID all over the place just so that
+non-root programs can drop to even-lesser-privileged uids. This is especially
+relevant when one non-root daemon on the system should be allowed to spawn other
+processes as different uids, but it's undesirable to give the daemon a
+basically-root-equivalent CAP_SETUID.
+
+
+Other Approaches Considered
+===========================
+
+Solve this problem in userspace
+-------------------------------
+For candidate applications that would like to have restricted setid capabilities
+as implemented in this LSM, an alternative option would be to simply take away
+setid capabilities from the application completely and refactor the process
+spawning semantics in the application (e.g. by using a privileged helper program
+to do process spawning and UID/GID transitions). Unfortunately, there are a
+number of semantics around process spawning that would be affected by this, such
+as fork() calls where the program doesn't immediately call exec() after the
+fork(), parent processes specifying custom environment variables or command line
+args for spawned child processes, or inheritance of file handles across a
+fork()/exec(). Because of this, a solution that uses a privileged helper in
+userspace would likely be less appealing to incorporate into existing projects
+that rely on certain process-spawning semantics in Linux.
+
+Use user namespaces
+-------------------
+Another possible approach would be to run a given process tree in its own user
+namespace and give programs in the tree setid capabilities. In this way,
+programs in the tree could change to any desired UID/GID in the context of their
+own user namespace, and only approved UIDs/GIDs could be mapped back to the
+initial system user namespace, effectively preventing privilege escalation.
+Unfortunately, it is not generally feasible to use user namespaces in isolation,
+without pairing them with other namespace types, which is not always an option.
+Linux checks for capabilities based off of the user namespace that "owns" some
+entity. For example, Linux has the notion that network namespaces are owned by
+the user namespace in which they were created. A consequence of this is that
+capability checks for access to a given network namespace are done by checking
+whether a task has the given capability in the context of the user namespace
+that owns the network namespace -- not necessarily the user namespace under
+which the given task runs. Therefore spawning a process in a new user namespace
+effectively prevents it from accessing the network namespace owned by the
+initial namespace. This is a deal-breaker for any application that expects to
+retain the CAP_NET_ADMIN capability for the purpose of adjusting network
+configurations. Using user namespaces in isolation causes problems regarding
+other system interactions, including use of pid namespaces and device creation.
+
+Use an existing LSM
+-------------------
+None of the other in-tree LSMs have the capability to gate setid transitions, or
+even employ the security_task_fix_setuid hook at all. SELinux says of that hook:
+"Since setuid only affects the current process, and since the SELinux controls
+are not based on the Linux identity attributes, SELinux does not need to control
+this operation."
+
+
+Directions for use
+==================
+This LSM hooks the setid syscalls to make sure transitions are allowed if an
+applicable restriction policy is in place. Policies are configured through
+securityfs by writing to the safesetid/add_whitelist_policy and
+safesetid/flush_whitelist_policies files at the location where securityfs is
+mounted. The format for adding a policy is '<UID>:<UID>', using literal
+numbers, such as '123:456'. To flush the policies, any write to the file is
+sufficient. Again, configuring a policy for a UID will prevent that UID from
+obtaining auxiliary setid privileges, such as allowing a user to set up user
+namespace UID mappings.
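
A minimal user-space sketch of the policy interface described above: it
whitelists the UID 123 -> 456 transition by writing the '<UID>:<UID>' string to
the add_whitelist_policy file. The /sys/kernel/security path is an assumption
(the text only says "at the location where securityfs is mounted").

    /* Sketch: add a SafeSetID whitelist policy allowing UID 123 -> UID 456.
     * Assumes securityfs is mounted at /sys/kernel/security. */
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
            const char *policy = "123:456";   /* '<UID>:<UID>' format */
            int fd = open("/sys/kernel/security/safesetid/add_whitelist_policy",
                          O_WRONLY);

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            if (write(fd, policy, strlen(policy)) < 0)
                    perror("write");
            close(fd);
            return 0;
    }
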
index c980dfe9abf17afc5431048fd7df58b69562955b..a6ba95fbaa9f105a00cc6594b018661de9075aaa 100644 (file)
@@ -17,9 +17,8 @@ MAC extensions, other extensions can be built using the LSM to provide
 specific changes to system operation when these tweaks are not available
 in the core functionality of Linux itself.
 
-Without a specific LSM built into the kernel, the default LSM will be the
-Linux capabilities system. Most LSMs choose to extend the capabilities
-system, building their checks on top of the defined capability hooks.
+The Linux capabilities modules will always be included. This may be
+followed by any number of "minor" modules and at most one "major" module.
 For more details on capabilities, see ``capabilities(7)`` in the Linux
 man-pages project.
 
@@ -30,6 +29,14 @@ order in which checks are made. The capability module will always
 be first, followed by any "minor" modules (e.g. Yama) and then
 the one "major" module (e.g. SELinux) if there is one configured.
 
+Process attributes associated with "major" security modules should
+be accessed and maintained using the special files in ``/proc/.../attr``.
+A security module may maintain a module specific subdirectory there,
+named after the module. ``/proc/.../attr/smack`` is provided by the Smack
+security module and contains all its special files. The files directly
+in ``/proc/.../attr`` remain as legacy interfaces for modules that provide
+subdirectories.
+
 .. toctree::
    :maxdepth: 1
 
@@ -39,3 +46,4 @@ the one "major" module (e.g. SELinux) if there is one configured.
    Smack
    tomoyo
    Yama
+   SafeSetID
index 53d3288c328bcfb15723669759776f2bfbacc4e9..20f92c16ffbf2c6e89ae090719d2e7225383985f 100644 (file)
@@ -1519,7 +1519,7 @@ protected workload.
 
 The limits are only applied at the peer level in the hierarchy.  This means that
 in the diagram below, only groups A, B, and C will influence each other, and
-groups D and F will influence each other.  Group G will influence nobody.
+groups D and F will influence each other.  Group G will influence nobody::
 
                        [root]
                /          |            \
index a422560fbc15a7c6fb66afd85db48ed4e53e2b27..42379633801f4741a1af5a03c1153b1230d948eb 100644 (file)
 
        lsm.debug       [SECURITY] Enable LSM initialization debugging output.
 
+       lsm=lsm1,...,lsmN
+                       [SECURITY] Choose order of LSM initialization. This
+                       overrides CONFIG_LSM, and the "security=" parameter.
+
        machvec=        [IA-64] Force the use of a particular machine-vector
                        (machvec) in a generic kernel.
                        Example: machvec=hpzx1_swiotlb
                        Note: increases power consumption, thus should only be
                        enabled if running jitter sensitive (HPC/RT) workloads.
 
-       security=       [SECURITY] Choose a security module to enable at boot.
-                       If this boot parameter is not specified, only the first
-                       security module asking for security registration will be
-                       loaded. An invalid security module name will be treated
-                       as if no module has been chosen.
+       security=       [SECURITY] Choose a legacy "major" security module to
+                       enable at boot. This has been deprecated by the
+                       "lsm=" parameter.
 
        selinux=        [SELINUX] Disable or enable SELinux at boot time.
                        Format: { "0" | "1" }
index 1a078b5d281ae5e22e8ab88a944eb39611cec427..e105d708ccde3630e58e41d4d69e24b6f6512aaa 100644 (file)
@@ -33,6 +33,9 @@ limit in the hierarchy is followed).
 pids.current tracks all child cgroup hierarchies, so parent/pids.current is a
 superset of parent/child/pids.current.
 
+The pids.events file contains event counters:
+  - max: Number of times fork failed because limit was hit.
+
 Example
 -------
 
index 9ccfd1bc6201862e4eae6a5be252287a512151be..a5cbb5e0e3db48b6f6275cc5098845cabd109c4f 100644 (file)
@@ -272,7 +272,7 @@ The following sysctls are available for the XFS filesystem:
                XFS_ERRLEVEL_LOW:       1
                XFS_ERRLEVEL_HIGH:      5
 
-  fs.xfs.panic_mask            (Min: 0  Default: 0  Max: 255)
+  fs.xfs.panic_mask            (Min: 0  Default: 0  Max: 256)
        Causes certain error conditions to call BUG(). Value is a bitmask;
        OR together the tags which represent errors which should cause panics:
 
@@ -285,6 +285,7 @@ The following sysctls are available for the XFS filesystem:
                XFS_PTAG_SHUTDOWN_IOERROR       0x00000020
                XFS_PTAG_SHUTDOWN_LOGERROR      0x00000040
                XFS_PTAG_FSBLOCK_ZERO           0x00000080
+               XFS_PTAG_VERIFIER_ERROR         0x00000100
 
        This option is intended for debugging only.
 
diff --git a/Documentation/kdump/vmcoreinfo.txt b/Documentation/kdump/vmcoreinfo.txt
new file mode 100644 (file)
index 0000000..bb94a4b
--- /dev/null
@@ -0,0 +1,495 @@
+================================================================
+                       VMCOREINFO
+================================================================
+
+===========
+What is it?
+===========
+
+VMCOREINFO is a special ELF note section. It contains various
+information from the kernel like structure size, page size, symbol
+values, field offsets, etc. These data are packed into an ELF note
+section and used by user-space tools like crash and makedumpfile to
+analyze a kernel's memory layout.
+
+================
+Common variables
+================
+
+init_uts_ns.name.release
+------------------------
+
+The version of the Linux kernel. Used to find the corresponding source
+code from which the kernel has been built. For example, crash uses it to
+find the corresponding vmlinux in order to process vmcore.
+
+PAGE_SIZE
+---------
+
+The size of a page. It is the smallest unit of data used by the memory
+management facilities. It is usually 4096 bytes of size and a page is
+aligned on 4096 bytes. Used for computing page addresses.
+
+init_uts_ns
+-----------
+
+The UTS namespace which is used to isolate two specific elements of the
+system that relate to the uname(2) system call. It is named after the
+data structure used to store information returned by the uname(2) system
+call.
+
+User-space tools can get the kernel name, host name, kernel release
+number, kernel version, architecture name and OS type from it.
+
+node_online_map
+---------------
+
+An array node_states[N_ONLINE] which represents the set of online nodes
+in a system, one bit position per node number. Used to keep track of
+which nodes are in the system and online.
+
+swapper_pg_dir
+--------------
+
+The global page directory pointer of the kernel. Used to translate
+virtual to physical addresses.
+
+_stext
+------
+
+Defines the beginning of the text section. In general, _stext indicates
+the kernel start address. Used to convert a virtual address from the
+direct kernel map to a physical address.
+
+vmap_area_list
+--------------
+
+Stores the virtual area list. makedumpfile gets the vmalloc start value
+from this variable and its value is necessary for vmalloc translation.
+
+mem_map
+-------
+
+Physical addresses are translated to struct pages by treating them as
+an index into the mem_map array. Right-shifting a physical address
+PAGE_SHIFT bits converts it into a page frame number which is an index
+into that mem_map array.
+
+Used to map an address to the corresponding struct page.
+
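
To make the translation above concrete, a small sketch of the flat-model
lookup follows; PAGE_SHIFT and the struct page layout here are placeholders,
not the kernel's actual definitions.

    /* Sketch: flat (non-sparse) physical-address-to-page translation. */
    #include <stdint.h>

    #define PAGE_SHIFT 12                     /* assumed 4 KiB pages */

    struct page { unsigned long flags; };     /* stand-in for the real struct */
    extern struct page mem_map[];             /* flat array of page structs */

    static inline struct page *phys_to_page(uint64_t paddr)
    {
            uint64_t pfn = paddr >> PAGE_SHIFT;   /* page frame number */
            return &mem_map[pfn];                 /* pfn indexes mem_map */
    }
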
+contig_page_data
+----------------
+
+Makedumpfile gets the pglist_data structure from this symbol, which is
+used to describe the memory layout.
+
+User-space tools use this to exclude free pages when dumping memory.
+
+mem_section|(mem_section, NR_SECTION_ROOTS)|(mem_section, section_mem_map)
+--------------------------------------------------------------------------
+
+The address of the mem_section array, its length, structure size, and
+the section_mem_map offset.
+
+It exists in the sparse memory mapping model, and it is somewhat similar
+to the mem_map variable: both are used to translate an address.
+
+page
+----
+
+The size of a page structure. struct page is an important data structure
+and it is widely used to compute contiguous memory.
+
+pglist_data
+-----------
+
+The size of a pglist_data structure. This value is used to check if the
+pglist_data structure is valid. It is also used for checking the memory
+type.
+
+zone
+----
+
+The size of a zone structure. This value is used to check if the zone
+structure has been found. It is also used for excluding free pages.
+
+free_area
+---------
+
+The size of a free_area structure. It indicates whether the free_area
+structure is valid or not. Useful when excluding free pages.
+
+list_head
+---------
+
+The size of a list_head structure. Used when iterating lists in a
+post-mortem analysis session.
+
+nodemask_t
+----------
+
+The size of a nodemask_t type. Used to compute the number of online
+nodes.
+
+(page, flags|_refcount|mapping|lru|_mapcount|private|compound_dtor|
+       compound_order|compound_head)
+-------------------------------------------------------------------
+
+User-space tools compute their values based on the offset of these
+variables. The variables are used when excluding unnecessary pages.
+
+(pglist_data, node_zones|nr_zones|node_mem_map|node_start_pfn|node_
+              spanned_pages|node_id)
+-------------------------------------------------------------------
+
+On NUMA machines, each NUMA node has a pg_data_t to describe its memory
+layout. On UMA machines there is a single pglist_data which describes the
+whole memory.
+
+These values are used to check the memory type and to compute the
+virtual address for memory map.
+
+(zone, free_area|vm_stat|spanned_pages)
+---------------------------------------
+
+Each node is divided into a number of blocks called zones which
+represent ranges within memory. A zone is described by a structure zone.
+
+User-space tools compute required values based on the offset of these
+variables.
+
+(free_area, free_list)
+----------------------
+
+Offset of the free_list member. This value is used to compute the number
+of free pages.
+
+Each zone has a free_area structure array called free_area[MAX_ORDER].
+The free_list represents a linked list of free page blocks.
+
+(list_head, next|prev)
+----------------------
+
+Offsets of the list_head's members. list_head is used to define a
+circular linked list. User-space tools need these in order to traverse
+lists.
+
+(vmap_area, va_start|list)
+--------------------------
+
+Offsets of the vmap_area's members. They carry vmalloc-specific
+information. Makedumpfile gets the start address of the vmalloc region
+from this.
+
+(zone.free_area, MAX_ORDER)
+---------------------------
+
+Free areas descriptor. User-space tools use this value to iterate the
+free_area ranges. MAX_ORDER is used by the zone buddy allocator.
+
+log_first_idx
+-------------
+
+Index of the first record stored in the buffer log_buf. Used by
+user-space tools to read the strings in the log_buf.
+
+log_buf
+-------
+
+Console output is written to the ring buffer log_buf at index
+log_first_idx. Used to get the kernel log.
+
+log_buf_len
+-----------
+
+log_buf's length.
+
+clear_idx
+---------
+
+The index of the next printk() record to read after the last clear
+command. It indicates the first record after the last
+SYSLOG_ACTION_CLEAR, as issued by 'dmesg -c'. Used by user-space tools to dump
+the dmesg log.
+
+log_next_idx
+------------
+
+The index of the next record to store in the buffer log_buf. Used to
+compute the index of the current buffer position.
+
+printk_log
+----------
+
+The size of a structure printk_log. Used to compute the size of
+messages, and extract dmesg log. It encapsulates header information for
+log_buf, such as timestamp, syslog level, etc.
+
+(printk_log, ts_nsec|len|text_len|dict_len)
+-------------------------------------------
+
+It represents field offsets in struct printk_log. User space tools
+parse it and check whether the values of printk_log's members have been
+changed.
+
+(free_area.free_list, MIGRATE_TYPES)
+------------------------------------
+
+The number of migrate types for pages. The free_list is described by the
+array. Used by tools to compute the number of free pages.
+
+NR_FREE_PAGES
+-------------
+
+On linux-2.6.21 or later, the number of free pages is in
+vm_stat[NR_FREE_PAGES]. Used to get the number of free pages.
+
+PG_lru|PG_private|PG_swapcache|PG_swapbacked|PG_slab|PG_hwpoison
+|PG_head_mask|PAGE_BUDDY_MAPCOUNT_VALUE(~PG_buddy)
+|PAGE_OFFLINE_MAPCOUNT_VALUE(~PG_offline)
+-----------------------------------------------------------------
+
+Page attributes. These flags are used to filter out pages that are
+unnecessary for dumping.
+
+HUGETLB_PAGE_DTOR
+-----------------
+
+The HUGETLB_PAGE_DTOR flag denotes hugetlbfs pages. Makedumpfile
+excludes these pages.
+
+======
+x86_64
+======
+
+phys_base
+---------
+
+Used to convert the virtual address of an exported kernel symbol to its
+corresponding physical address.
+
+init_top_pgt
+------------
+
+Used to walk through the whole page table and convert virtual addresses
+to physical addresses. The init_top_pgt is somewhat similar to
+swapper_pg_dir, but it is only used in x86_64.
+
+pgtable_l5_enabled
+------------------
+
+User-space tools need to know whether the crash kernel was in 5-level
+paging mode.
+
+node_data
+---------
+
+This is a struct pglist_data array and stores all NUMA nodes
+information. Makedumpfile gets the pglist_data structure from it.
+
+(node_data, MAX_NUMNODES)
+-------------------------
+
+The maximum number of nodes in the system.
+
+KERNELOFFSET
+------------
+
+The kernel randomization offset. Used to compute the page offset. If
+KASLR is disabled, this value is zero.
+
+KERNEL_IMAGE_SIZE
+-----------------
+
+Currently unused by Makedumpfile. Used by Crash to compute the module
+virtual address.
+
+sme_mask
+--------
+
+AMD-specific with SME support: it indicates the secure memory encryption
+mask. Makedumpfile tools need to know whether the crash kernel was
+encrypted. If SME is enabled in the first kernel, the crash kernel's
+page table entries (pgd/pud/pmd/pte) contain the memory encryption
+mask. This is used to remove the SME mask and obtain the true physical
+address.
+
+Currently, sme_mask stores the value of the C-bit position. If needed,
+additional SME-relevant info can be placed in that variable.
+
+For example:
+[ misc         ][ enc bit  ][ other misc SME info       ]
+0000_0000_0000_0000_1000_0000_0000_0000_0000_0000_..._0000
+63   59   55   51   47   43   39   35   31   27   ... 3
+
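
As a small sketch of how a dump tool might apply this, per the description
above (the helper name is illustrative):

    #include <stdint.h>

    /* Clear the SME encryption mask from a page-table entry to recover the
     * true physical address bits. */
    static inline uint64_t pte_strip_sme(uint64_t pte, uint64_t sme_mask)
    {
            return pte & ~sme_mask;
    }
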
+======
+x86_32
+======
+
+X86_PAE
+-------
+
+Denotes whether physical address extensions are enabled. It has the cost
+of a higher page table lookup overhead, and also consumes more page
+table space per process. Used to check whether PAE was enabled in the
+crash kernel when converting virtual addresses to physical addresses.
+
+====
+ia64
+====
+
+pgdat_list|(pgdat_list, MAX_NUMNODES)
+-------------------------------------
+
+pg_data_t array storing all NUMA nodes information. MAX_NUMNODES
+indicates the number of nodes.
+
+node_memblk|(node_memblk, NR_NODE_MEMBLKS)
+------------------------------------------
+
+List of node memory chunks. Filled when parsing the SRAT table to obtain
+information about memory nodes. NR_NODE_MEMBLKS indicates the number of
+node memory chunks.
+
+These values are used to compute the number of nodes the crashed kernel used.
+
+node_memblk_s|(node_memblk_s, start_paddr)|(node_memblk_s, size)
+----------------------------------------------------------------
+
+The size of a struct node_memblk_s and the offsets of the
+node_memblk_s's members. Used to compute the number of nodes.
+
+PGTABLE_3|PGTABLE_4
+-------------------
+
+User-space tools need to know whether the crash kernel was in 3-level or
+4-level paging mode. Used to distinguish the page table.
+
+=====
+ARM64
+=====
+
+VA_BITS
+-------
+
+The maximum number of bits for virtual addresses. Used to compute the
+virtual memory ranges.
+
+kimage_voffset
+--------------
+
+The offset between the kernel virtual and physical mappings. Used to
+translate virtual to physical addresses.
+
+PHYS_OFFSET
+-----------
+
+Indicates the physical address of the start of memory. Similar to
+kimage_voffset, which is used to translate virtual to physical
+addresses.
+
+KERNELOFFSET
+------------
+
+The kernel randomization offset. Used to compute the page offset. If
+KASLR is disabled, this value is zero.
+
+====
+arm
+====
+
+ARM_LPAE
+--------
+
+It indicates whether the crash kernel supports large physical address
+extensions. Used to translate virtual to physical addresses.
+
+====
+s390
+====
+
+lowcore_ptr
+-----------
+
+An array with a pointer to the lowcore of every CPU. Used to print the
+psw and all registers information.
+
+high_memory
+-----------
+
+Used to get the vmalloc_start address from the high_memory symbol.
+
+(lowcore_ptr, NR_CPUS)
+----------------------
+
+The maximum number of CPUs.
+
+=======
+powerpc
+=======
+
+
+node_data|(node_data, MAX_NUMNODES)
+-----------------------------------
+
+See above.
+
+contig_page_data
+----------------
+
+See above.
+
+vmemmap_list
+------------
+
+The vmemmap_list maintains the entire vmemmap physical mapping. Used
+to get vmemmap list count and populated vmemmap regions info. If the
+vmemmap address translation information is stored in the crash kernel,
+it is used to translate vmemmap kernel virtual addresses.
+
+mmu_vmemmap_psize
+-----------------
+
+The size of a page. Used to translate virtual to physical addresses.
+
+mmu_psize_defs
+--------------
+
+Page size definitions, i.e. 4k, 64k, or 16M.
+
+Used to make vtop translations.
+
+vmemmap_backing|(vmemmap_backing, list)|(vmemmap_backing, phys)|
+(vmemmap_backing, virt_addr)
+----------------------------------------------------------------
+
+The vmemmap virtual address space management does not have a traditional
+page table to track which virtual struct pages are backed by a physical
+mapping. The virtual to physical mappings are tracked in a simple linked
+list format.
+
+User-space tools need to know the offset of list, phys and virt_addr
+when computing the count of vmemmap regions.
+
+mmu_psize_def|(mmu_psize_def, shift)
+------------------------------------
+
+The size of a struct mmu_psize_def and the offset of mmu_psize_def's
+member.
+
+Used in vtop translations.
+
+==
+sh
+==
+
+node_data|(node_data, MAX_NUMNODES)
+-----------------------------------
+
+See above.
+
+X2TLB
+-----
+
+Indicates whether the crashed kernel enabled SH extended mode.
diff --git a/Documentation/xtensa/booting.txt b/Documentation/xtensa/booting.txt
new file mode 100644 (file)
index 0000000..402b33a
--- /dev/null
@@ -0,0 +1,19 @@
+Passing boot parameters to the kernel.
+
+Boot parameters are represented as a TLV list in memory. Please see
+arch/xtensa/include/asm/bootparam.h for the definition of the bp_tag structure
+and the tag value constants. The first entry in the list must have type
+BP_TAG_FIRST and the last entry must have type BP_TAG_LAST. The address of the
+first list entry is passed to the kernel in register a2 (see the sketch after
+this list). The address type depends on the MMU type:
+- For configurations without MMU, with region protection or with MPU the
+  address must be the physical address.
+- For configurations with region translation MMU or with MMUv3 and CONFIG_MMU=n
+  the address must be a valid address in the current mapping. The kernel will
+  not change the mapping on its own.
+- For configurations with MMUv2 the address must be a virtual address in the
+  default virtual mapping (0xd0000000..0xffffffff).
+- For configurations with MMUv3 and CONFIG_MMU=y the address may be either a
+  virtual or physical address. In either case it must be within the default
+  virtual mapping. It is considered physical if it is within the range of
+  physical addresses covered by the default KSEG mapping (XCHAL_KSEG_PADDR..
+  XCHAL_KSEG_PADDR + XCHAL_KSEG_SIZE), otherwise it is considered virtual.
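
A rough sketch of walking the boot-parameter list described at the top of this
file follows. The struct layout and the BP_TAG_LAST value approximate the
bp_tag definition in arch/xtensa/include/asm/bootparam.h and should be checked
against that header.

    /* Sketch: walk the TLV boot-parameter list.  Field names and the
     * BP_TAG_LAST value are assumptions; see bootparam.h for the real
     * definitions. */
    #define BP_TAG_LAST 0x7E0B          /* assumed terminator value */

    struct bp_tag {
            unsigned short id;          /* tag type (BP_TAG_*) */
            unsigned short size;        /* payload size in bytes */
            unsigned long  data[];      /* tag payload */
    };

    static void walk_bootparams(struct bp_tag *tag)  /* address from a2 */
    {
            while (tag->id != BP_TAG_LAST) {
                    /* dispatch on tag->id and consume tag->data here */
                    tag = (struct bp_tag *)((char *)(tag + 1) + tag->size);
            }
    }
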
index 3e90641e012eb37b8e653e61de53511620989377..fce33cc179b0b5be94bf07c5c47a91255626f079 100644 (file)
@@ -3971,9 +3971,10 @@ M:       Johannes Weiner <hannes@cmpxchg.org>
 L:     cgroups@vger.kernel.org
 T:     git git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git
 S:     Maintained
-F:     Documentation/cgroup*
+F:     Documentation/admin-guide/cgroup-v2.rst
+F:     Documentation/cgroup-v1/
 F:     include/linux/cgroup*
-F:     kernel/cgroup*
+F:     kernel/cgroup/
 
 CONTROL GROUP - CPUSET
 M:     Li Zefan <lizefan@huawei.com>
@@ -5948,6 +5949,7 @@ L:        linux-fsdevel@vger.kernel.org
 S:     Maintained
 F:     fs/*
 F:     include/linux/fs.h
+F:     include/linux/fs_types.h
 F:     include/uapi/linux/fs.h
 
 FINTEK F75375S HARDWARE MONITOR AND FAN CONTROLLER DRIVER
@@ -15556,12 +15558,11 @@ F:    mm/shmem.c
 TOMOYO SECURITY MODULE
 M:     Kentaro Takeda <takedakn@nttdata.co.jp>
 M:     Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
-L:     tomoyo-dev-en@lists.sourceforge.jp (subscribers-only, for developers in English)
-L:     tomoyo-users-en@lists.sourceforge.jp (subscribers-only, for users in English)
-L:     tomoyo-dev@lists.sourceforge.jp (subscribers-only, for developers in Japanese)
-L:     tomoyo-users@lists.sourceforge.jp (subscribers-only, for users in Japanese)
-W:     http://tomoyo.sourceforge.jp/
-T:     quilt http://svn.sourceforge.jp/svnroot/tomoyo/trunk/2.5.x/tomoyo-lsm/patches/
+L:     tomoyo-dev-en@lists.osdn.me (subscribers-only, for developers in English)
+L:     tomoyo-users-en@lists.osdn.me (subscribers-only, for users in English)
+L:     tomoyo-dev@lists.osdn.me (subscribers-only, for developers in Japanese)
+L:     tomoyo-users@lists.osdn.me (subscribers-only, for users in Japanese)
+W:     https://tomoyo.osdn.jp/
 S:     Maintained
 F:     security/tomoyo/
 
index 7deb3ea2dd3fac6335d955b5b457299807efead2..b5dce13a61321693f466f019d22ad962481be70b 100644 (file)
@@ -119,9 +119,6 @@ config GENERIC_HWEIGHT
        bool
        default y
 
-config ARCH_HAS_DMA_SET_COHERENT_MASK
-        bool
-
 config PPC
        bool
        default y
@@ -131,10 +128,10 @@ config PPC
        select ARCH_32BIT_OFF_T if PPC32
        select ARCH_HAS_DEBUG_VIRTUAL
        select ARCH_HAS_DEVMEM_IS_ALLOWED
-       select ARCH_HAS_DMA_SET_COHERENT_MASK
        select ARCH_HAS_ELF_RANDOMIZE
        select ARCH_HAS_FORTIFY_SOURCE
        select ARCH_HAS_GCOV_PROFILE_ALL
+       select ARCH_HAS_KCOV
        select ARCH_HAS_PHYS_TO_DMA
        select ARCH_HAS_PMEM_API                if PPC64
        select ARCH_HAS_PTE_SPECIAL
@@ -203,7 +200,7 @@ config PPC
        select HAVE_IOREMAP_PROT
        select HAVE_IRQ_EXIT_ON_IRQ_STACK
        select HAVE_KERNEL_GZIP
-       select HAVE_KERNEL_XZ                   if PPC_BOOK3S
+       select HAVE_KERNEL_XZ                   if PPC_BOOK3S || 44x
        select HAVE_KPROBES
        select HAVE_KPROBES_ON_FTRACE
        select HAVE_KRETPROBES
@@ -222,7 +219,7 @@ config PPC
        select HAVE_PERF_USER_STACK_DUMP
        select HAVE_RCU_TABLE_FREE              if SMP
        select HAVE_REGS_AND_STACK_ACCESS_API
-       select HAVE_RELIABLE_STACKTRACE         if PPC64 && CPU_LITTLE_ENDIAN
+       select HAVE_RELIABLE_STACKTRACE         if PPC_BOOK3S_64 && CPU_LITTLE_ENDIAN
        select HAVE_SYSCALL_TRACEPOINTS
        select HAVE_VIRT_CPU_ACCOUNTING
        select HAVE_IRQ_TIME_ACCOUNTING
@@ -243,6 +240,7 @@ config PPC
        select RTC_LIB
        select SPARSE_IRQ
        select SYSCTL_EXCEPTION_TRACE
+       select THREAD_INFO_IN_TASK
        select VIRT_TO_BUS                      if !PPC64
        #
        # Please keep this list sorted alphabetically.
@@ -253,9 +251,6 @@ config PPC_BARRIER_NOSPEC
     default y
     depends on PPC_BOOK3S_64 || PPC_FSL_BOOK3E
 
-config GENERIC_CSUM
-       def_bool n
-
 config EARLY_PRINTK
        bool
        default y
@@ -475,9 +470,6 @@ config ARCH_CPU_PROBE_RELEASE
 config ARCH_ENABLE_MEMORY_HOTPLUG
        def_bool y
 
-config ARCH_HAS_WALK_MEMORY
-       def_bool y
-
 config ARCH_ENABLE_MEMORY_HOTREMOVE
        def_bool y
 
@@ -693,7 +685,7 @@ config PPC_16K_PAGES
 
 config PPC_64K_PAGES
        bool "64k page size"
-       depends on !PPC_FSL_BOOK3E && (44x || PPC_BOOK3S_64 || PPC_BOOK3E_64)
+       depends on 44x || PPC_BOOK3S_64
        select HAVE_ARCH_SOFT_DIRTY if PPC_BOOK3S_64
 
 config PPC_256K_PAGES
@@ -711,6 +703,13 @@ config PPC_256K_PAGES
 
 endchoice
 
+config PPC_PAGE_SHIFT
+       int
+       default 18 if PPC_256K_PAGES
+       default 16 if PPC_64K_PAGES
+       default 14 if PPC_16K_PAGES
+       default 12
+
 config THREAD_SHIFT
        int "Thread shift" if EXPERT
        range 13 15
@@ -721,6 +720,59 @@ config THREAD_SHIFT
          Used to define the stack size. The default is almost always what you
          want. Only change this if you know what you are doing.
 
+config ETEXT_SHIFT_BOOL
+       bool "Set custom etext alignment" if STRICT_KERNEL_RWX && \
+                                            (PPC_BOOK3S_32 || PPC_8xx)
+       depends on ADVANCED_OPTIONS
+       help
+         This option allows you to set the kernel end of text alignment. When
+         RAM is mapped by blocks, the alignment needs to fit the size and
+         number of possible blocks. The default should be OK for most configs.
+
+         Say N here unless you know what you are doing.
+
+config ETEXT_SHIFT
+       int "_etext shift" if ETEXT_SHIFT_BOOL
+       range 17 28 if STRICT_KERNEL_RWX && PPC_BOOK3S_32
+       range 19 23 if STRICT_KERNEL_RWX && PPC_8xx
+       default 17 if STRICT_KERNEL_RWX && PPC_BOOK3S_32
+       default 19 if STRICT_KERNEL_RWX && PPC_8xx
+       default PPC_PAGE_SHIFT
+       help
+         On Book3S 32 (603+), IBATs are used to map kernel text.
+         The smaller the alignment, the greater the number of IBATs needed.
+
+         On 8xx, large pages (512kb or 8M) are used to map kernel linear
+         memory. Aligning to 8M reduces TLB misses as only 8M pages are used
+         in that case.
+
+config DATA_SHIFT_BOOL
+       bool "Set custom data alignment" if STRICT_KERNEL_RWX && \
+                                           (PPC_BOOK3S_32 || PPC_8xx)
+       depends on ADVANCED_OPTIONS
+       help
+         This option allows you to set the kernel data alignment. When
+         RAM is mapped by blocks, the alignment needs to fit the size and
+         number of possible blocks. The default should be OK for most configs.
+
+         Say N here unless you know what you are doing.
+
+config DATA_SHIFT
+       int "Data shift" if DATA_SHIFT_BOOL
+       default 24 if STRICT_KERNEL_RWX && PPC64
+       range 17 28 if STRICT_KERNEL_RWX && PPC_BOOK3S_32
+       range 19 23 if STRICT_KERNEL_RWX && PPC_8xx
+       default 22 if STRICT_KERNEL_RWX && PPC_BOOK3S_32
+       default 23 if STRICT_KERNEL_RWX && PPC_8xx
+       default PPC_PAGE_SHIFT
+       help
+         On Book3S 32 (603+), DBATs are used to map kernel text and rodata RO.
+         The smaller the alignment, the greater the number of DBATs needed.
+
+         On 8xx, large pages (512kb or 8M) are used to map kernel linear
+         memory. Aligning to 8M reduces TLB misses as only 8M pages are used
+         in that case.
+
 config FORCE_MAX_ZONEORDER
        int "Maximum zone order"
        range 8 9 if PPC64 && PPC_64K_PAGES
@@ -887,6 +939,7 @@ config FSL_SOC
 
 config FSL_PCI
        bool
+       select ARCH_HAS_DMA_SET_MASK
        select PPC_INDIRECT_PCI
        select PCI_QUIRKS
 
index f4961fbcb48d5458a6400a83d0b173fbd6d474fa..4e00cb0a54646706f7485900847fdbacf9a5dec5 100644 (file)
@@ -361,10 +361,6 @@ config PPC_PTDUMP
 
          If you are unsure, say N.
 
-config PPC_HTDUMP
-       def_bool y
-       depends on PPC_PTDUMP && PPC_BOOK3S_64
-
 config PPC_FAST_ENDIAN_SWITCH
        bool "Deprecated fast endian-switch syscall"
         depends on DEBUG_KERNEL && PPC_BOOK3S_64
index 488c9edffa583e7de766262a515ecc1b93540f1e..7de49889bd5dc993864313f13cf61057ee05c52d 100644 (file)
@@ -213,9 +213,9 @@ endif
 asinstr := $(call as-instr,lis 9$(comma)foo@high,-DHAVE_AS_ATHIGH=1)
 
 KBUILD_CPPFLAGS        += -Iarch/$(ARCH) $(asinstr)
-KBUILD_AFLAGS  += -Iarch/$(ARCH) $(AFLAGS-y)
+KBUILD_AFLAGS  += $(AFLAGS-y)
 KBUILD_CFLAGS  += $(call cc-option,-msoft-float)
-KBUILD_CFLAGS  += -pipe -Iarch/$(ARCH) $(CFLAGS-y)
+KBUILD_CFLAGS  += -pipe $(CFLAGS-y)
 CPP            = $(CC) -E $(KBUILD_CFLAGS)
 
 CHECKFLAGS     += -m$(BITS) -D__powerpc__ -D__powerpc$(BITS)__
@@ -427,6 +427,13 @@ else
 endif
 endif
 
+ifdef CONFIG_SMP
+prepare: task_cpu_prepare
+
+task_cpu_prepare: prepare0
+       $(eval KBUILD_CFLAGS += -D_TASK_CPU=$(shell awk '{if ($$2 == "TASK_CPU") print $$3;}' include/generated/asm-offsets.h))
+endif
+
 # Check toolchain versions:
 # - gcc-4.6 is the minimum kernel-wide version so nothing required.
 checkbin:
index fb335d05aae8ff87655d56b99c42217650afacd7..1cbc0e4ce857ec252ae6ba06d5c04f35b721847e 100644 (file)
@@ -4,3 +4,4 @@ subdir-y += fsl
 
 dtstree                := $(srctree)/$(src)
 dtb-$(CONFIG_OF_ALL_DTBS) := $(patsubst $(dtstree)/%.dts,%.dtb, $(wildcard $(dtstree)/*.dts))
+dtb-$(CONFIG_XILINX_VIRTEX440_GENERIC_BOARD) += virtex440-ml507.dtb virtex440-ml510.dtb
index 8a7a10139bc9fdc05869a6b68dbec8f3e30c7680..cd9d66041a3fee93000debb92d26e07540fb4307 100644 (file)
@@ -40,7 +40,7 @@
                        d-cache-size = <32768>;
                        dcr-controller;
                        dcr-access-method = "native";
-                       status = "ok";
+                       status = "okay";
                };
                cpu@1 {
                        device_type = "cpu";
index b0b26d8d68a28a5af603fcd950829956d9c40a89..64eaf7e09d2251d20b3626f93c97b272d5575e8e 100644 (file)
 
        OCM: ocm@400040000 {
                compatible = "ibm,ocm";
-               status = "ok";
+               status = "okay";
                cell-index = <1>;
                /* configured in U-Boot */
                reg = <4 0x00040000 0x8000>; /* 32K */
index a04a4fcfde637937dbe21ae19e6a0ea0e55c27d2..b6d87b9c2cefb139f389e7c25204c4da2049452d 100644 (file)
@@ -39,7 +39,7 @@
                        d-cache-size = <32768>;
                        dcr-controller;
                        dcr-access-method = "native";
-                       status = "ok";
+                       status = "okay";
                };
                cpu@1 {
                        device_type = "cpu";
index f7063198b2dc6eabdfe829fc6b19a5ab422f8df0..c9f90f1a9c8e93577424e0a078c32a4fbb13cf52 100644 (file)
@@ -43,7 +43,7 @@
                        d-cache-size = <32768>;
                        dcr-controller;
                        dcr-access-method = "native";
-                       status = "ok";
+                       status = "okay";
                };
                cpu@1 {
                        device_type = "cpu";
index 104b1d6d56951a49d0b6589541f0982660f1a358..c406bdb4f36fbd439f572e76dc286494d702fc5e 100644 (file)
@@ -14,6 +14,7 @@
 
 /dts-v1/;
 #include <dt-bindings/gpio/gpio.h>
+#include <dt-bindings/input/input.h>
 
 /*
  * This is commented-out for now.
                                "DEBUG0", "DEBUG1", "DEBUG2", "DEBUG3",
                                "DEBUG4", "DEBUG5", "DEBUG6", "DEBUG7";
 
+                       interrupt-controller;
+                       #interrupt-cells = <2>;
+                       interrupts = <10>;
+                       interrupt-parent = <&PIC1>;
+
                        /*
                         * This is commented out while a standard binding
                         * for i2c over gpio is defined.
                        panic-indicator;
                };
        };
+
+       gpio-keys {
+               compatible = "gpio-keys";
+
+               power {
+                       label = "Power Button";
+                       gpios = <&GPIO 0 GPIO_ACTIVE_HIGH>;
+                       linux,code = <KEY_POWER>;
+               };
+
+               eject {
+                       label = "Eject Button";
+                       gpios = <&GPIO 6 GPIO_ACTIVE_HIGH>;
+                       linux,code = <KEY_EJECTCD>;
+               };
+       };
 };
 
index 1d911f68a23b3747c268c79034b5af3801cf4435..296584e6dd55a9dfc3ddb97256a7d1cda4a0164a 100644 (file)
@@ -23,8 +23,8 @@
 #include <uapi/asm/ucontext.h>
 
 /* SMP */
-extern struct thread_info *current_set[NR_CPUS];
-extern struct thread_info *secondary_ti;
+extern struct task_struct *current_set[NR_CPUS];
+extern struct task_struct *secondary_current;
 void start_secondary(void *unused);
 
 /* kexec */
@@ -37,13 +37,11 @@ void kexec_copy_flush(struct kimage *image);
 extern struct static_key hcall_tracepoint_key;
 void __trace_hcall_entry(unsigned long opcode, unsigned long *args);
 void __trace_hcall_exit(long opcode, long retval, unsigned long *retbuf);
-/* OPAL tracing */
-#ifdef CONFIG_JUMP_LABEL
-extern struct static_key opal_tracepoint_key;
-#endif
 
-void __trace_opal_entry(unsigned long opcode, unsigned long *args);
-void __trace_opal_exit(long opcode, unsigned long retval);
+/* OPAL */
+int64_t __opal_call(int64_t a0, int64_t a1, int64_t a2, int64_t a3,
+                   int64_t a4, int64_t a5, int64_t a6, int64_t a7,
+                   int64_t opcode, uint64_t msr);
 
 /* VMX copying */
 int enter_vmx_usercopy(void);
index 0c261ba2c8263b1e8dbb2afe4408511215330f9e..5cb588395fdcfdc7044dcecd0b3bcf9ecd89d73c 100644 (file)
@@ -92,6 +92,8 @@ typedef struct {
        unsigned long vdso_base;
 } mm_context_t;
 
+void update_bats(void);
+
 /* patch sites */
 extern s32 patch__hash_page_A0, patch__hash_page_A1, patch__hash_page_A2;
 extern s32 patch__hash_page_B, patch__hash_page_C;
index 49d76adb9bc58a3d931094b7a33450b9f2a8d48d..aa8406b8f7ba08cbd55220603b83ef1fd081d109 100644 (file)
@@ -174,7 +174,18 @@ static inline bool pte_user(pte_t pte)
  * of RAM.  -- Cort
  */
 #define VMALLOC_OFFSET (0x1000000) /* 16M */
+
+/*
+ * With CONFIG_STRICT_KERNEL_RWX, kernel segments are set NX. But when modules
+ * are used, NX cannot be set on VMALLOC space. So vmalloc VM space and linear
+ * memory shall not share segments.
+ */
+#if defined(CONFIG_STRICT_KERNEL_RWX) && defined(CONFIG_MODULES)
+#define VMALLOC_START ((_ALIGN((long)high_memory, 256L << 20) + VMALLOC_OFFSET) & \
+                      ~(VMALLOC_OFFSET - 1))
+#else
 #define VMALLOC_START ((((long)high_memory + VMALLOC_OFFSET) & ~(VMALLOC_OFFSET-1)))
+#endif
 #define VMALLOC_END    ioremap_bot
 
 #ifndef __ASSEMBLY__
index 247aff9cc6badebac372eec987f2239663b054fc..54b7af6cd27f312194accf08d70e160c212a0848 100644 (file)
 #else
 #define H_PUD_CACHE_INDEX      (H_PUD_INDEX_SIZE)
 #endif
+
 /*
- * Define the address range of the kernel non-linear virtual area
+ * Define the address range of the kernel non-linear virtual area. In contrast
+ * to the linear mapping, this is managed using the kernel page tables and then
+ * inserted into the hash page table to actually take effect, similarly to user
+ * mappings.
  */
 #define H_KERN_VIRT_START ASM_CONST(0xD000000000000000)
-#define H_KERN_VIRT_SIZE  ASM_CONST(0x0000400000000000) /* 64T */
 
 /*
- * The vmalloc space starts at the beginning of that region, and
- * occupies half of it on hash CPUs and a quarter of it on Book3E
- * (we keep a quarter for the virtual memmap)
+ * Allow virtual mapping of one context size.
+ * 512TB for 64K page size
+ * 64TB for 4K page size
+ */
+#define H_KERN_VIRT_SIZE (1UL << MAX_EA_BITS_PER_CONTEXT)
+
+/*
+ * 8TB IO mapping size
+ */
+#define H_KERN_IO_SIZE ASM_CONST(0x80000000000) /* 8T */
+
+/*
+ * The vmalloc space starts at the beginning of the kernel non-linear virtual
+ * region, and occupies 504T (64K) or 56T (4K)
  */
-#define H_VMALLOC_START        H_KERN_VIRT_START
-#define H_VMALLOC_SIZE ASM_CONST(0x380000000000) /* 56T */
-#define H_VMALLOC_END  (H_VMALLOC_START + H_VMALLOC_SIZE)
+#define H_VMALLOC_START H_KERN_VIRT_START
+#define H_VMALLOC_SIZE (H_KERN_VIRT_SIZE - H_KERN_IO_SIZE)
+#define H_VMALLOC_END  (H_VMALLOC_START + H_VMALLOC_SIZE)
 
-#define H_KERN_IO_START        H_VMALLOC_END
+#define H_KERN_IO_START H_VMALLOC_END
 
 /*
  * Region IDs
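
For the size comments in this hunk, the arithmetic works out as follows, assuming MAX_EA_BITS_PER_CONTEXT is 49 with 64K pages and 46 with 4K pages (those constants are defined elsewhere, not in this hunk):

	/* 64K pages: H_KERN_VIRT_SIZE = 1UL << 49 = 512TB,
	 *            H_VMALLOC_SIZE   = 512TB - 8TB (IO) = 504TB
	 *  4K pages: H_KERN_VIRT_SIZE = 1UL << 46 =  64TB,
	 *            H_VMALLOC_SIZE   =  64TB - 8TB (IO) =  56TB */
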
index 12e522807f9f29b841598f72a910ce8c2100aa48..a28a28079edba94fd2c9a4773137b2f190f8c995 100644 (file)
@@ -23,7 +23,7 @@
  */
 #include <asm/book3s/64/pgtable.h>
 #include <asm/bug.h>
-#include <asm/processor.h>
+#include <asm/task_size_64.h>
 #include <asm/cpu_has_feature.h>
 
 /*
index 9c1173283b96bfd1af86255e98ee7730dcc39f20..138bc2ecc0c4ba51f075a0e8c18d3a69872c9423 100644 (file)
@@ -111,7 +111,7 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 
 static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
 {
-       pgd_set(pgd, __pgtable_ptr_val(pud) | PGD_VAL_BITS);
+       *pgd =  __pgd(__pgtable_ptr_val(pud) | PGD_VAL_BITS);
 }
 
 static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
@@ -138,7 +138,7 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud)
 
 static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
 {
-       pud_set(pud, __pgtable_ptr_val(pmd) | PUD_VAL_BITS);
+       *pud = __pud(__pgtable_ptr_val(pmd) | PUD_VAL_BITS);
 }
 
 static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
@@ -176,13 +176,13 @@ static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd,
 static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd,
                                       pte_t *pte)
 {
-       pmd_set(pmd, __pgtable_ptr_val(pte) | PMD_VAL_BITS);
+       *pmd = __pmd(__pgtable_ptr_val(pte) | PMD_VAL_BITS);
 }
 
 static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
                                pgtable_t pte_page)
 {
-       pmd_set(pmd, __pgtable_ptr_val(pte_page) | PMD_VAL_BITS);
+       *pmd = __pmd(__pgtable_ptr_val(pte_page) | PMD_VAL_BITS);
 }
 
 static inline pgtable_t pmd_pgtable(pmd_t pmd)
index 868fcaf56f6bf20425ae03d5fa42715cffa1c809..581f91be9dd44735cd3626e21585fc9725d13d93 100644 (file)
@@ -811,7 +811,7 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
        return hash__set_pte_at(mm, addr, ptep, pte, percpu);
 }
 
-#define _PAGE_CACHE_CTL        (_PAGE_NON_IDEMPOTENT | _PAGE_TOLERANT)
+#define _PAGE_CACHE_CTL        (_PAGE_SAO | _PAGE_NON_IDEMPOTENT | _PAGE_TOLERANT)
 
 #define pgprot_noncached pgprot_noncached
 static inline pgprot_t pgprot_noncached(pgprot_t prot)
@@ -851,11 +851,6 @@ static inline bool pte_ci(pte_t pte)
        return false;
 }
 
-static inline void pmd_set(pmd_t *pmdp, unsigned long val)
-{
-       *pmdp = __pmd(val);
-}
-
 static inline void pmd_clear(pmd_t *pmdp)
 {
        *pmdp = __pmd(0);
@@ -887,11 +882,6 @@ static inline int pmd_bad(pmd_t pmd)
        return hash__pmd_bad(pmd);
 }
 
-static inline void pud_set(pud_t *pudp, unsigned long val)
-{
-       *pudp = __pud(val);
-}
-
 static inline void pud_clear(pud_t *pudp)
 {
        *pudp = __pud(0);
@@ -934,10 +924,6 @@ static inline bool pud_access_permitted(pud_t pud, bool write)
 }
 
 #define pgd_write(pgd)         pte_write(pgd_pte(pgd))
-static inline void pgd_set(pgd_t *pgdp, unsigned long val)
-{
-       *pgdp = __pgd(val);
-}
 
 static inline void pgd_clear(pgd_t *pgdp)
 {
index 671316f9e95d289c3ea74e553c1e95e672c4940e..05147cecb8df1c367978dacc9567b1e9fbcef91b 100644 (file)
@@ -13,8 +13,32 @@ static inline int mmu_get_ap(int psize)
 
 #ifdef CONFIG_PPC_RADIX_MMU
 extern void radix__tlbiel_all(unsigned int action);
+extern void radix__flush_tlb_lpid_page(unsigned int lpid,
+                                       unsigned long addr,
+                                       unsigned long page_size);
+extern void radix__flush_pwc_lpid(unsigned int lpid);
+extern void radix__flush_tlb_lpid(unsigned int lpid);
+extern void radix__local_flush_tlb_lpid_guest(unsigned int lpid);
 #else
 static inline void radix__tlbiel_all(unsigned int action) { WARN_ON(1); };
+static inline void radix__flush_tlb_lpid_page(unsigned int lpid,
+                                       unsigned long addr,
+                                       unsigned long page_size)
+{
+       WARN_ON(1);
+}
+static inline void radix__flush_pwc_lpid(unsigned int lpid)
+{
+       WARN_ON(1);
+}
+static inline void radix__flush_tlb_lpid(unsigned int lpid)
+{
+       WARN_ON(1);
+}
+static inline void radix__local_flush_tlb_lpid_guest(unsigned int lpid)
+{
+       WARN_ON(1);
+}
 #endif
 
 extern void radix__flush_hugetlb_tlb_range(struct vm_area_struct *vma,
@@ -49,12 +73,6 @@ extern void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr);
 extern void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr);
 extern void radix__flush_tlb_all(void);
 
-extern void radix__flush_tlb_lpid_page(unsigned int lpid,
-                                       unsigned long addr,
-                                       unsigned long page_size);
-extern void radix__flush_pwc_lpid(unsigned int lpid);
-extern void radix__flush_tlb_lpid(unsigned int lpid);
 extern void radix__local_flush_tlb_lpid(unsigned int lpid);
-extern void radix__local_flush_tlb_lpid_guest(unsigned int lpid);
 
 #endif
index a78a57e5058d9b6127fe250a42942920a86942fd..72a65d744a2855df344da9df862af6e80c4e945c 100644 (file)
@@ -9,9 +9,6 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#ifdef CONFIG_GENERIC_CSUM
-#include <asm-generic/checksum.h>
-#else
 #include <linux/bitops.h>
 #include <linux/in6.h>
 /*
@@ -217,6 +214,5 @@ __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
                        const struct in6_addr *daddr,
                        __u32 len, __u8 proto, __wsum sum);
 
-#endif
 #endif /* __KERNEL__ */
 #endif
index 0245bfcaac324c4ed0f42af59f8c25ab486a2b5e..a130be13ee83b9945b6b0052a80487063ebd1ede 100644 (file)
@@ -19,6 +19,11 @@ struct iommu_table;
  * drivers/macintosh/macio_asic.c
  */
 struct dev_archdata {
+       /*
+        * Set to %true if the dma_iommu_ops are requested to use a direct
+        * window instead of dynamically mapping memory.
+        */
+       bool                    iommu_bypass : 1;
        /*
         * These two used to be a union. However, with the hybrid ops we need
         * both so here we store both a DMA offset for direct mappings and
@@ -33,9 +38,6 @@ struct dev_archdata {
 #ifdef CONFIG_IOMMU_API
        void                    *iommu_domain;
 #endif
-#ifdef CONFIG_SWIOTLB
-       dma_addr_t              max_direct_dma_addr;
-#endif
 #ifdef CONFIG_PPC64
        struct pci_dn           *pci_data;
 #endif
@@ -54,6 +56,4 @@ struct pdev_archdata {
        u64 dma_mask;
 };
 
-#define ARCH_HAS_DMA_GET_REQUIRED_MASK
-
 #endif /* _ASM_POWERPC_DEVICE_H */
index 7702875aabb77d67b634ae5364af71b851cfbc9d..a2912b47102cf3a1f410726e116d060c739f44d7 100644 (file)
@@ -4,26 +4,24 @@
 
 static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
 {
-#ifdef CONFIG_SWIOTLB
-       struct dev_archdata *sd = &dev->archdata;
-
-       if (sd->max_direct_dma_addr && addr + size > sd->max_direct_dma_addr)
-               return false;
-#endif
-
        if (!dev->dma_mask)
                return false;
 
-       return addr + size - 1 <= *dev->dma_mask;
+       return addr + size - 1 <=
+               min_not_zero(*dev->dma_mask, dev->bus_dma_mask);
 }
 
 static inline dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr)
 {
-       return paddr + get_dma_offset(dev);
+       if (!dev)
+               return paddr + PCI_DRAM_OFFSET;
+       return paddr + dev->archdata.dma_offset;
 }
 
 static inline phys_addr_t __dma_to_phys(struct device *dev, dma_addr_t daddr)
 {
-       return daddr - get_dma_offset(dev);
+       if (!dev)
+               return daddr - PCI_DRAM_OFFSET;
+       return daddr - dev->archdata.dma_offset;
 }
 #endif /* ASM_POWERPC_DMA_DIRECT_H */
index ebf66809f2d35b257d7cdece75ff0e120b45cd88..565d6f74b189cc23dc5b985915da97aedf2c6c09 100644 (file)
@@ -1,74 +1,9 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /*
  * Copyright (C) 2004 IBM
- *
- * Implements the generic device dma API for powerpc.
- * the pci and vio busses
  */
 #ifndef _ASM_DMA_MAPPING_H
 #define _ASM_DMA_MAPPING_H
-#ifdef __KERNEL__
-
-#include <linux/types.h>
-#include <linux/cache.h>
-/* need struct page definitions */
-#include <linux/mm.h>
-#include <linux/scatterlist.h>
-#include <linux/dma-debug.h>
-#include <asm/io.h>
-#include <asm/swiotlb.h>
-
-/* Some dma direct funcs must be visible for use in other dma_ops */
-extern void *__dma_nommu_alloc_coherent(struct device *dev, size_t size,
-                                        dma_addr_t *dma_handle, gfp_t flag,
-                                        unsigned long attrs);
-extern void __dma_nommu_free_coherent(struct device *dev, size_t size,
-                                      void *vaddr, dma_addr_t dma_handle,
-                                      unsigned long attrs);
-extern int dma_nommu_mmap_coherent(struct device *dev,
-                                   struct vm_area_struct *vma,
-                                   void *cpu_addr, dma_addr_t handle,
-                                   size_t size, unsigned long attrs);
-
-#ifdef CONFIG_NOT_COHERENT_CACHE
-/*
- * DMA-consistent mapping functions for PowerPCs that don't support
- * cache snooping.  These allocate/free a region of uncached mapped
- * memory space for use with DMA devices.  Alternatively, you could
- * allocate the space "normally" and use the cache management functions
- * to ensure it is consistent.
- */
-struct device;
-extern void __dma_sync(void *vaddr, size_t size, int direction);
-extern void __dma_sync_page(struct page *page, unsigned long offset,
-                                size_t size, int direction);
-extern unsigned long __dma_get_coherent_pfn(unsigned long cpu_addr);
-
-#else /* ! CONFIG_NOT_COHERENT_CACHE */
-/*
- * Cache coherent cores.
- */
-
-#define __dma_sync(addr, size, rw)             ((void)0)
-#define __dma_sync_page(pg, off, sz, rw)       ((void)0)
-
-#endif /* ! CONFIG_NOT_COHERENT_CACHE */
-
-static inline unsigned long device_to_mask(struct device *dev)
-{
-       if (dev->dma_mask && *dev->dma_mask)
-               return *dev->dma_mask;
-       /* Assume devices without mask can take 32 bit addresses */
-       return 0xfffffffful;
-}
-
-/*
- * Available generic sets of operations
- */
-#ifdef CONFIG_PPC64
-extern struct dma_map_ops dma_iommu_ops;
-#endif
-extern const struct dma_map_ops dma_nommu_ops;
 
 static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
@@ -80,31 +15,4 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
        return NULL;
 }
 
-/*
- * get_dma_offset()
- *
- * Get the dma offset on configurations where the dma address can be determined
- * from the physical address by looking at a simple offset.  Direct dma and
- * swiotlb use this function, but it is typically not used by implementations
- * with an iommu.
- */
-static inline dma_addr_t get_dma_offset(struct device *dev)
-{
-       if (dev)
-               return dev->archdata.dma_offset;
-
-       return PCI_DRAM_OFFSET;
-}
-
-static inline void set_dma_offset(struct device *dev, dma_addr_t off)
-{
-       if (dev)
-               dev->archdata.dma_offset = off;
-}
-
-#define HAVE_ARCH_DMA_SET_MASK 1
-
-extern u64 __dma_get_required_mask(struct device *dev);
-
-#endif /* __KERNEL__ */
 #endif /* _ASM_DMA_MAPPING_H */
index 8b596d096ebef7c1ec70c99aed2709b10390034d..94cfcf33030aeee1d96b9b7e4b068711dfc32ae2 100644 (file)
@@ -219,7 +219,8 @@ struct eeh_ops {
 };
 
 extern int eeh_subsystem_flags;
-extern int eeh_max_freezes;
+extern u32 eeh_max_freezes;
+extern bool eeh_debugfs_no_recover;
 extern struct eeh_ops *eeh_ops;
 extern raw_spinlock_t confirm_error_lock;
 
@@ -293,14 +294,14 @@ void eeh_add_device_late(struct pci_dev *);
 void eeh_add_device_tree_late(struct pci_bus *);
 void eeh_add_sysfs_files(struct pci_bus *);
 void eeh_remove_device(struct pci_dev *);
-int eeh_unfreeze_pe(struct eeh_pe *pe, bool sw_state);
+int eeh_unfreeze_pe(struct eeh_pe *pe);
 int eeh_pe_reset_and_recover(struct eeh_pe *pe);
 int eeh_dev_open(struct pci_dev *pdev);
 void eeh_dev_release(struct pci_dev *pdev);
 struct eeh_pe *eeh_iommu_group_to_pe(struct iommu_group *group);
 int eeh_pe_set_option(struct eeh_pe *pe, int option);
 int eeh_pe_get_state(struct eeh_pe *pe);
-int eeh_pe_reset(struct eeh_pe *pe, int option);
+int eeh_pe_reset(struct eeh_pe *pe, int option, bool include_passed);
 int eeh_pe_configure(struct eeh_pe *pe);
 int eeh_pe_inject_err(struct eeh_pe *pe, int type, int func,
                      unsigned long addr, unsigned long mask);
@@ -460,6 +461,9 @@ static inline void eeh_readsl(const volatile void __iomem *addr, void * buf,
                eeh_check_failure(addr);
 }
 
+
+void eeh_cache_debugfs_init(void);
+
 #endif /* CONFIG_PPC64 */
 #endif /* __KERNEL__ */
 #endif /* _POWERPC_EEH_H */
index 9884e872686f382f2256836afc79bf474d01d99e..6d0412b846ac549e8f49ee73db619a03a7355764 100644 (file)
@@ -33,6 +33,7 @@ struct eeh_event {
 
 int eeh_event_init(void);
 int eeh_send_failure_event(struct eeh_pe *pe);
+int __eeh_send_failure_event(struct eeh_pe *pe);
 void eeh_remove_event(struct eeh_pe *pe, bool force);
 void eeh_handle_normal_event(struct eeh_pe *pe);
 void eeh_handle_special_event(void);
index 3b4767ed3ec53f04528c1cd0d651d9a55f3f929b..937bb630093f2ee49b7dd7d8552205a48fc6af0b 100644 (file)
@@ -671,7 +671,7 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 
 #define RUNLATCH_ON                            \
 BEGIN_FTR_SECTION                              \
-       CURRENT_THREAD_INFO(r3, r1);            \
+       ld      r3, PACA_THREAD_INFO(r13);      \
        ld      r4,TI_LOCAL_FLAGS(r3);          \
        andi.   r0,r4,_TLF_RUNLATCH;            \
        beql    ppc64_runlatch_on_trampoline;   \
@@ -721,7 +721,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CTRL)
 #ifdef CONFIG_PPC_970_NAP
 #define FINISH_NAP                             \
 BEGIN_FTR_SECTION                              \
-       CURRENT_THREAD_INFO(r11, r1);           \
+       ld      r11, PACA_THREAD_INFO(r13);     \
        ld      r9,TI_LOCAL_FLAGS(r11);         \
        andi.   r10,r9,_TLF_NAPPING;            \
        bnel    power4_fixup_nap;               \
index 3fdc54df63c91654c6d0d027195924578d9c9bb4..464a7519ed6443ef40f9eda29ce738c98d0b6138 100644 (file)
@@ -64,7 +64,7 @@ struct hvsi_priv {
        unsigned int    inbuf_len;      /* data in input buffer */
        unsigned char   inbuf[HVSI_INBUF_SIZE];
        unsigned int    inbuf_cur;      /* Cursor in input buffer */
-       unsigned int    inbuf_pktlen;   /* packet lenght from cursor */
+       unsigned int    inbuf_pktlen;   /* packet length from cursor */
        atomic_t        seqno;          /* packet sequence number */
        unsigned int    opened:1;       /* driver opened */
        unsigned int    established:1;  /* protocol established */
index 17524d222a7b6760c2d785d7c314485c544654ae..0ac52392ed99d021af872f8a23a4527e2418576b 100644 (file)
@@ -237,6 +237,7 @@ static inline void iommu_del_device(struct device *dev)
 }
 #endif /* !CONFIG_IOMMU_API */
 
+u64 dma_iommu_get_required_mask(struct device *dev);
 #else
 
 static inline void *get_iommu_table_base(struct device *dev)
@@ -318,5 +319,21 @@ extern void iommu_release_ownership(struct iommu_table *tbl);
 extern enum dma_data_direction iommu_tce_direction(unsigned long tce);
 extern unsigned long iommu_direction_to_tce_perm(enum dma_data_direction dir);
 
+#ifdef CONFIG_PPC_CELL_NATIVE
+extern bool iommu_fixed_is_weak;
+#else
+#define iommu_fixed_is_weak false
+#endif
+
+extern const struct dma_map_ops dma_iommu_ops;
+
+static inline unsigned long device_to_mask(struct device *dev)
+{
+       if (dev->dma_mask && *dev->dma_mask)
+               return *dev->dma_mask;
+       /* Assume devices without mask can take 32 bit addresses */
+       return 0xfffffffful;
+}
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_IOMMU_H */
index 3dbd47f2bffeceff929284293bc0ebeffb88f617..abad50a745db46cc326c33bc684fbc1ee6d336f5 100644 (file)
@@ -69,10 +69,7 @@ enum ipic_mcp_irq {
        IPIC_MCP_MU   = 7,
 };
 
-extern void ipic_set_highest_priority(unsigned int irq);
 extern void ipic_set_default_priority(void);
-extern void ipic_enable_mcp(enum ipic_mcp_irq mcp_irq);
-extern void ipic_disable_mcp(enum ipic_mcp_irq mcp_irq);
 extern u32 ipic_get_mcp_status(void);
 extern void ipic_clear_mcp_status(u32 mask);
 
index ee39ce56b2a20e3e812c70fba56d188c109fce85..c91a60cda4faa9c162f5ea2bd519b11ad64dc40c 100644 (file)
@@ -48,23 +48,19 @@ struct pt_regs;
  * Per-cpu stacks for handling critical, debug and machine check
  * level interrupts.
  */
-extern struct thread_info *critirq_ctx[NR_CPUS];
-extern struct thread_info *dbgirq_ctx[NR_CPUS];
-extern struct thread_info *mcheckirq_ctx[NR_CPUS];
-extern void exc_lvl_ctx_init(void);
-#else
-#define exc_lvl_ctx_init()
+extern void *critirq_ctx[NR_CPUS];
+extern void *dbgirq_ctx[NR_CPUS];
+extern void *mcheckirq_ctx[NR_CPUS];
 #endif
 
 /*
  * Per-cpu stacks for handling hard and soft interrupts.
  */
-extern struct thread_info *hardirq_ctx[NR_CPUS];
-extern struct thread_info *softirq_ctx[NR_CPUS];
+extern void *hardirq_ctx[NR_CPUS];
+extern void *softirq_ctx[NR_CPUS];
 
-extern void irq_ctx_init(void);
-extern void call_do_softirq(struct thread_info *tp);
-extern void call_do_irq(struct pt_regs *regs, struct thread_info *tp);
+void call_do_softirq(void *sp);
+void call_do_irq(struct pt_regs *regs, void *sp);
 extern void do_IRQ(struct pt_regs *regs);
 extern void __init init_IRQ(void);
 extern void __do_irq(struct pt_regs *regs);
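
With the prototypes above now taking plain void * stack pointers, a caller would look roughly like the sketch below (illustrative only; the example function name is made up, and the real logic lives in arch/powerpc/kernel/irq.c):

	void example_do_IRQ(struct pt_regs *regs)
	{
		/* per-cpu hard IRQ stack set up at boot */
		void *irqsp = hardirq_ctx[smp_processor_id()];

		call_do_irq(regs, irqsp);	/* switch stacks, run __do_irq() */
	}
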
index eb0d79f0ca45784fdbf1c8ef126cd432b8a9f551..a6c8548ed9faa6c9188d496950ab5ac268756ceb 100644 (file)
@@ -141,6 +141,7 @@ extern void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu);
 
 extern int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu);
 extern int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu);
+extern void kvmppc_core_queue_machine_check(struct kvm_vcpu *vcpu, ulong flags);
 extern void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags);
 extern void kvmppc_core_queue_fpunavail(struct kvm_vcpu *vcpu);
 extern void kvmppc_core_queue_vec_unavail(struct kvm_vcpu *vcpu);
@@ -632,7 +633,7 @@ long int kvmppc_rm_h_confer(struct kvm_vcpu *vcpu, int target,
                             unsigned int yield_count);
 long kvmppc_h_random(struct kvm_vcpu *vcpu);
 void kvmhv_commence_exit(int trap);
-long kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu);
+void kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu);
 void kvmppc_subcore_enter_guest(void);
 void kvmppc_subcore_exit_guest(void);
 long kvmppc_realmode_hmi_handler(void);
index 47a03b9b528b46672ece81b641ebecc13247c817..5070df19d4638e60dc9ede36058f27027e3f9a23 100644 (file)
@@ -21,6 +21,7 @@
 
 #include <linux/module.h>
 #include <linux/ftrace.h>
+#include <linux/sched/task_stack.h>
 
 #ifdef CONFIG_LIVEPATCH
 static inline int klp_check_compiler_support(void)
@@ -43,13 +44,13 @@ static inline unsigned long klp_get_ftrace_location(unsigned long faddr)
        return ftrace_location_range(faddr, faddr + 16);
 }
 
-static inline void klp_init_thread_info(struct thread_info *ti)
+static inline void klp_init_thread_info(struct task_struct *p)
 {
        /* + 1 to account for STACK_END_MAGIC */
-       ti->livepatch_sp = (unsigned long *)(ti + 1) + 1;
+       task_thread_info(p)->livepatch_sp = end_of_stack(p) + 1;
 }
 #else
-static void klp_init_thread_info(struct thread_info *ti) { }
+static inline void klp_init_thread_info(struct task_struct *p) { }
 #endif /* CONFIG_LIVEPATCH */
 
 #endif /* _ASM_POWERPC_LIVEPATCH_H */
index 8311869005fa8d4769020c7aebdddf1335da415d..2f0ca6560e47c8ff7445c998de2e376fc4f93fb3 100644 (file)
@@ -47,9 +47,7 @@ struct machdep_calls {
 #endif
 #endif /* CONFIG_PPC64 */
 
-       /* Platform set_dma_mask and dma_get_required_mask overrides */
-       int             (*dma_set_mask)(struct device *dev, u64 dma_mask);
-       u64             (*dma_get_required_mask)(struct device *dev);
+       void            (*dma_set_mask)(struct device *dev, u64 dma_mask);
 
        int             (*probe)(void);
        void            (*setup_arch)(void); /* Optional, may be NULL */
index a8b8903e184408b4623d842d9dc9f457639cccd3..17996bc9382b447e76dd212caad62fd4d97705eb 100644 (file)
@@ -209,7 +209,7 @@ extern int get_mce_event(struct machine_check_event *mce, bool release);
 extern void release_mce_event(void);
 extern void machine_check_queue_event(void);
 extern void machine_check_print_event_info(struct machine_check_event *evt,
-                                          bool user_mode);
+                                          bool user_mode, bool in_guest);
 #ifdef CONFIG_PPC_BOOK3S_64
 void flush_and_reload_slb(void);
 #endif /* CONFIG_PPC_BOOK3S_64 */
index 25607604a7a5b0cbca4eea5573ee202997afdb3b..d34ad1657d7b2c44cdb16683bfef359b651fd02d 100644 (file)
@@ -289,6 +289,17 @@ static inline u16 get_mm_addr_key(struct mm_struct *mm, unsigned long address)
 }
 #endif /* CONFIG_PPC_MEM_KEYS */
 
+#ifdef CONFIG_STRICT_KERNEL_RWX
+static inline bool strict_kernel_rwx_enabled(void)
+{
+       return rodata_enabled;
+}
+#else
+static inline bool strict_kernel_rwx_enabled(void)
+{
+       return false;
+}
+#endif
 #endif /* !__ASSEMBLY__ */
 
 /* The kernel use the constants below to index in the page sizes array.
@@ -356,6 +367,8 @@ extern void early_init_mmu_secondary(void);
 extern void setup_initial_memory_limit(phys_addr_t first_memblock_base,
                                       phys_addr_t first_memblock_size);
 static inline void mmu_early_init_devtree(void) { }
+
+extern void *abatron_pteptrs[2];
 #endif /* __ASSEMBLY__ */
 #endif
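
A hypothetical caller of the new strict_kernel_rwx_enabled() helper above; the function name is an assumption for illustration, but it shows the intended use: gate run-time protection work on whether rodata_enabled is actually set rather than on the Kconfig symbol alone.

	static void example_mark_protections(void)
	{
		if (strict_kernel_rwx_enabled())
			pr_info("applying read-only protection to kernel text\n");
	}
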
 
index bd9ba8defd7258ab6e853be0c39d7290f9f02393..84b4cfe73edddec59abd875d6e807dce7c251a2e 100644 (file)
@@ -14,4 +14,6 @@ extern void arch_trigger_cpumask_backtrace(const cpumask_t *mask,
 #define arch_trigger_cpumask_backtrace arch_trigger_cpumask_backtrace
 #endif
 
+extern void hv_nmi_check_nonrecoverable(struct pt_regs *regs);
+
 #endif /* _ASM_NMI_H */
index b0f764c827c0f4c8606d7a934eb20d331ec9003e..0a1a3fc54e540597f6440dad16670eeebb3eb254 100644 (file)
@@ -231,9 +231,10 @@ static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
 }
 
 /* patch sites */
-extern s32 patch__itlbmiss_linmem_top;
+extern s32 patch__itlbmiss_linmem_top, patch__itlbmiss_linmem_top8;
 extern s32 patch__dtlbmiss_linmem_top, patch__dtlbmiss_immr_jmp;
 extern s32 patch__fixupdar_linmem_top;
+extern s32 patch__dtlbmiss_romem_top, patch__dtlbmiss_romem_top8;
 
 extern s32 patch__itlbmiss_exit_1, patch__itlbmiss_exit_2;
 extern s32 patch__dtlbmiss_exit_1, patch__dtlbmiss_exit_2, patch__dtlbmiss_exit_3;
index 5c5ea24134133bcbb84f78a33ae1bfc304aedf8e..ed870468ef6f12562f7833fc8af56bb24d44cc85 100644 (file)
 
 /*
  * On regular PPC32 page size is 4K (but we support 4K/16K/64K/256K pages
- * on PPC44x). For PPC64 we support either 4K or 64K software
+ * on PPC44x and 4K/16K on 8xx). For PPC64 we support either 4K or 64K software
  * page size. When using 64K pages however, whether we are really supporting
  * 64K pages in HW or not is irrelevant to those definitions.
  */
-#if defined(CONFIG_PPC_256K_PAGES)
-#define PAGE_SHIFT             18
-#elif defined(CONFIG_PPC_64K_PAGES)
-#define PAGE_SHIFT             16
-#elif defined(CONFIG_PPC_16K_PAGES)
-#define PAGE_SHIFT             14
-#else
-#define PAGE_SHIFT             12
-#endif
-
+#define PAGE_SHIFT             CONFIG_PPC_PAGE_SHIFT
 #define PAGE_SIZE              (ASM_CONST(1) << PAGE_SHIFT)
 
 #ifndef __ASSEMBLY__
@@ -326,7 +317,6 @@ struct page;
 extern void clear_user_page(void *page, unsigned long vaddr, struct page *pg);
 extern void copy_user_page(void *to, void *from, unsigned long vaddr,
                struct page *p);
-extern int page_is_ram(unsigned long pfn);
 extern int devmem_is_allowed(unsigned long pfn);
 
 #ifdef CONFIG_PPC_SMLPAR
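
The single CONFIG_PPC_PAGE_SHIFT definition above replaces the removed #if chain; the correspondence follows directly from the deleted values:

	/* CONFIG_PPC_256K_PAGES -> CONFIG_PPC_PAGE_SHIFT = 18
	 * CONFIG_PPC_64K_PAGES  -> CONFIG_PPC_PAGE_SHIFT = 16
	 * CONFIG_PPC_16K_PAGES  -> CONFIG_PPC_PAGE_SHIFT = 14
	 * default (4K pages)    -> CONFIG_PPC_PAGE_SHIFT = 12 */
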
index 77fc21278fa2ad5ba944a7e825bddf8b47caf31b..fc188e0e91791a80c87d634b2b7a9fc8a4f8b72f 100644 (file)
@@ -20,6 +20,8 @@ struct device_node;
 struct pci_controller_ops {
        void            (*dma_dev_setup)(struct pci_dev *pdev);
        void            (*dma_bus_setup)(struct pci_bus *bus);
+       bool            (*iommu_bypass_supported)(struct pci_dev *pdev,
+                               u64 mask);
 
        int             (*probe_mode)(struct pci_bus *bus);
 
@@ -44,9 +46,6 @@ struct pci_controller_ops {
        void            (*teardown_msi_irqs)(struct pci_dev *pdev);
 #endif
 
-       int             (*dma_set_mask)(struct pci_dev *pdev, u64 dma_mask);
-       u64             (*dma_get_required_mask)(struct pci_dev *pdev);
-
        void            (*shutdown)(struct pci_controller *hose);
 };
 
@@ -275,6 +274,8 @@ extern int pcibios_map_io_space(struct pci_bus *bus);
 extern struct pci_controller *pci_find_hose_for_OF_device(
                        struct device_node* node);
 
+extern struct pci_controller *pci_find_controller_for_domain(int domain_nr);
+
 /* Fill up host controller resources from the OF node */
 extern void pci_process_bridge_OF_ranges(struct pci_controller *hose,
                        struct device_node *dev, int primary);
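
For the new iommu_bypass_supported hook above, a host-bridge implementation might look like the sketch below; the function name and the 64-bit-only heuristic are assumptions for illustration, not taken from this series.

	static bool example_phb_iommu_bypass_supported(struct pci_dev *pdev, u64 mask)
	{
		/* only let fully 64-bit capable devices skip the TCE tables */
		return mask == DMA_BIT_MASK(64);
	}
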
index 0c72f1897063662caf1d1416a881e079badd0a32..6a1861a6301e4361f64e6eaa2275afcfe5e596c4 100644 (file)
@@ -52,10 +52,8 @@ static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel)
 
 #ifdef CONFIG_PCI
 extern void set_pci_dma_ops(const struct dma_map_ops *dma_ops);
-extern const struct dma_map_ops *get_pci_dma_ops(void);
 #else  /* CONFIG_PCI */
 #define set_pci_dma_ops(d)
-#define get_pci_dma_ops()      NULL
 #endif
 
 #ifdef CONFIG_PPC64
index dad1d27e196d92f0e1a5ca5a2194e91af70197ed..505550fb293566d76c0fddebcc2ecf0c5ed0fc05 100644 (file)
@@ -66,7 +66,6 @@ extern unsigned long empty_zero_page[];
 
 extern pgd_t swapper_pg_dir[];
 
-int dma_pfn_limit_to_zone(u64 pfn_limit);
 extern void paging_init(void);
 
 /*
index 2f3ff7a278815a131f9ebec73bffcdaedf0abab3..05b5524185198698e55e01e817d89d7f3402727e 100644 (file)
@@ -23,6 +23,8 @@ extern int pnv_npu2_handle_fault(struct npu_context *context, uintptr_t *ea,
                                unsigned long *flags, unsigned long *status,
                                int count);
 
+void pnv_program_cpu_hotplug_lpcr(unsigned int cpu, u64 lpcr_val);
+
 void pnv_tm_init(void);
 #else
 static inline void powernv_set_nmmu_ptcr(unsigned long ptcr) { }
@@ -40,7 +42,6 @@ static inline int pnv_npu2_handle_fault(struct npu_context *context,
 }
 
 static inline void pnv_tm_init(void) { }
-static inline void pnv_power9_force_smt4(void) { }
 #endif
 
 #endif /* _ASM_POWERNV_H */
index f9513ad38fa64f62d0093b9fb9962418c4697e26..c5698a523bb189dee5650398603c4ac8a0f5bc27 100644 (file)
 #define PPC_INST_ADDI                  0x38000000
 #define PPC_INST_ADDIS                 0x3c000000
 #define PPC_INST_ADD                   0x7c000214
+#define PPC_INST_ADDC                  0x7c000014
 #define PPC_INST_SUB                   0x7c000050
 #define PPC_INST_BLR                   0x4e800020
 #define PPC_INST_BLRL                  0x4e800021
 #define PPC_INST_MULLW                 0x7c0001d6
 #define PPC_INST_MULHWU                        0x7c000016
 #define PPC_INST_MULLI                 0x1c000000
+#define PPC_INST_MADDHD                        0x10000030
+#define PPC_INST_MADDHDU               0x10000031
+#define PPC_INST_MADDLD                        0x10000033
 #define PPC_INST_DIVWU                 0x7c000396
 #define PPC_INST_DIVD                  0x7c0003d2
 #define PPC_INST_RLWINM                        0x54000000
 /* macros to insert fields into opcodes */
 #define ___PPC_RA(a)   (((a) & 0x1f) << 16)
 #define ___PPC_RB(b)   (((b) & 0x1f) << 11)
+#define ___PPC_RC(c)   (((c) & 0x1f) << 6)
 #define ___PPC_RS(s)   (((s) & 0x1f) << 21)
 #define ___PPC_RT(t)   ___PPC_RS(t)
 #define ___PPC_R(r)    (((r) & 0x1) << 16)
 #define __PPC_WS(w)    (((w) & 0x1f) << 11)
 #define __PPC_SH(s)    __PPC_WS(s)
 #define __PPC_SH64(s)  (__PPC_SH(s) | (((s) & 0x20) >> 4))
-#define __PPC_MB(s)    (((s) & 0x1f) << 6)
+#define __PPC_MB(s)    ___PPC_RC(s)
 #define __PPC_ME(s)    (((s) & 0x1f) << 1)
 #define __PPC_MB64(s)  (__PPC_MB(s) | ((s) & 0x20))
 #define __PPC_ME64(s)  __PPC_MB64(s)
 #define PPC_STQCX(t, a, b)     stringify_in_c(.long PPC_INST_STQCX | \
                                        ___PPC_RT(t) | ___PPC_RA(a) | \
                                        ___PPC_RB(b))
+#define PPC_MADDHD(t, a, b, c) stringify_in_c(.long PPC_INST_MADDHD | \
+                                       ___PPC_RT(t) | ___PPC_RA(a)  | \
+                                       ___PPC_RB(b) | ___PPC_RC(c))
+#define PPC_MADDHDU(t, a, b, c)        stringify_in_c(.long PPC_INST_MADDHDU | \
+                                       ___PPC_RT(t) | ___PPC_RA(a)   | \
+                                       ___PPC_RB(b) | ___PPC_RC(c))
+#define PPC_MADDLD(t, a, b, c) stringify_in_c(.long PPC_INST_MADDLD | \
+                                       ___PPC_RT(t) | ___PPC_RA(a)  | \
+                                       ___PPC_RB(b) | ___PPC_RC(c))
 #define PPC_MSGSND(b)          stringify_in_c(.long PPC_INST_MSGSND | \
                                        ___PPC_RB(b))
 #define PPC_MSGSYNC            stringify_in_c(.long PPC_INST_MSGSYNC)
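
As a worked example of the field macros above, PPC_MADDLD(3, 4, 5, 6) expands to a single 32-bit instruction word:

	/* 0x10000033 | (3 << 21) | (4 << 16) | (5 << 11) | (6 << 6) = 0x106429b3,
	 * i.e. maddld r3, r4, r5, r6 */
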
index f67da277d6526635eea2e34436a5fbe995b1b7a6..f191ef0d2a0a5ce48c161613760581ef0f0eda7e 100644 (file)
@@ -53,13 +53,13 @@ void eeh_addr_cache_rmv_dev(struct pci_dev *dev);
 struct eeh_dev *eeh_addr_cache_get_dev(unsigned long addr);
 void eeh_slot_error_detail(struct eeh_pe *pe, int severity);
 int eeh_pci_enable(struct eeh_pe *pe, int function);
-int eeh_pe_reset_full(struct eeh_pe *pe);
+int eeh_pe_reset_full(struct eeh_pe *pe, bool include_passed);
 void eeh_save_bars(struct eeh_dev *edev);
 int rtas_write_config(struct pci_dn *, int where, int size, u32 val);
 int rtas_read_config(struct pci_dn *, int where, int size, u32 *val);
 void eeh_pe_state_mark(struct eeh_pe *pe, int state);
 void eeh_pe_mark_isolated(struct eeh_pe *pe);
-void eeh_pe_state_clear(struct eeh_pe *pe, int state);
+void eeh_pe_state_clear(struct eeh_pe *pe, int state, bool include_passed);
 void eeh_pe_state_mark_with_cfg(struct eeh_pe *pe, int state);
 void eeh_pe_dev_mode_mark(struct eeh_pe *pe, int mode);
 
index ee58526cb6c276d20c029ccab1fddf7d5575c4fb..3351bcf42f2dbea4d747518205fd8a3fce214766 100644 (file)
@@ -40,7 +40,7 @@
 
 #ifndef __ASSEMBLY__
 #include <linux/types.h>
-#include <asm/thread_info.h>
+#include <linux/thread_info.h>
 #include <asm/ptrace.h>
 #include <asm/hw_breakpoint.h>
 
@@ -77,105 +77,15 @@ extern int _chrp_type;
 
 #ifdef __KERNEL__
 
-struct task_struct;
-void start_thread(struct pt_regs *regs, unsigned long fdptr, unsigned long sp);
-void release_thread(struct task_struct *);
-
-#ifdef CONFIG_PPC32
-
-#if CONFIG_TASK_SIZE > CONFIG_KERNEL_START
-#error User TASK_SIZE overlaps with KERNEL_START address
-#endif
-#define TASK_SIZE      (CONFIG_TASK_SIZE)
-
-/* This decides where the kernel will search for a free chunk of vm
- * space during mmap's.
- */
-#define TASK_UNMAPPED_BASE     (TASK_SIZE / 8 * 3)
-#endif
-
 #ifdef CONFIG_PPC64
-/*
- * 64-bit user address space can have multiple limits
- * For now supported values are:
- */
-#define TASK_SIZE_64TB  (0x0000400000000000UL)
-#define TASK_SIZE_128TB (0x0000800000000000UL)
-#define TASK_SIZE_512TB (0x0002000000000000UL)
-#define TASK_SIZE_1PB   (0x0004000000000000UL)
-#define TASK_SIZE_2PB   (0x0008000000000000UL)
-/*
- * With 52 bits in the address we can support
- * upto 4PB of range.
- */
-#define TASK_SIZE_4PB   (0x0010000000000000UL)
-
-/*
- * For now 512TB is only supported with book3s and 64K linux page size.
- */
-#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_PPC_64K_PAGES)
-/*
- * Max value currently used:
- */
-#define TASK_SIZE_USER64               TASK_SIZE_4PB
-#define DEFAULT_MAP_WINDOW_USER64      TASK_SIZE_128TB
-#define TASK_CONTEXT_SIZE              TASK_SIZE_512TB
+#include <asm/task_size_64.h>
 #else
-#define TASK_SIZE_USER64               TASK_SIZE_64TB
-#define DEFAULT_MAP_WINDOW_USER64      TASK_SIZE_64TB
-/*
- * We don't need to allocate extended context ids for 4K page size, because
- * we limit the max effective address on this config to 64TB.
- */
-#define TASK_CONTEXT_SIZE              TASK_SIZE_64TB
+#include <asm/task_size_32.h>
 #endif
 
-/*
- * 32-bit user address space is 4GB - 1 page
- * (this 1 page is needed so referencing of 0xFFFFFFFF generates EFAULT
- */
-#define TASK_SIZE_USER32 (0x0000000100000000UL - (1*PAGE_SIZE))
-
-#define TASK_SIZE_OF(tsk) (test_tsk_thread_flag(tsk, TIF_32BIT) ? \
-               TASK_SIZE_USER32 : TASK_SIZE_USER64)
-#define TASK_SIZE        TASK_SIZE_OF(current)
-/* This decides where the kernel will search for a free chunk of vm
- * space during mmap's.
- */
-#define TASK_UNMAPPED_BASE_USER32 (PAGE_ALIGN(TASK_SIZE_USER32 / 4))
-#define TASK_UNMAPPED_BASE_USER64 (PAGE_ALIGN(DEFAULT_MAP_WINDOW_USER64 / 4))
-
-#define TASK_UNMAPPED_BASE ((is_32bit_task()) ? \
-               TASK_UNMAPPED_BASE_USER32 : TASK_UNMAPPED_BASE_USER64 )
-#endif
-
-/*
- * Initial task size value for user applications. For book3s 64 we start
- * with 128TB and conditionally enable upto 512TB
- */
-#ifdef CONFIG_PPC_BOOK3S_64
-#define DEFAULT_MAP_WINDOW     ((is_32bit_task()) ?                    \
-                                TASK_SIZE_USER32 : DEFAULT_MAP_WINDOW_USER64)
-#else
-#define DEFAULT_MAP_WINDOW     TASK_SIZE
-#endif
-
-#ifdef __powerpc64__
-
-#define STACK_TOP_USER64 DEFAULT_MAP_WINDOW_USER64
-#define STACK_TOP_USER32 TASK_SIZE_USER32
-
-#define STACK_TOP (is_32bit_task() ? \
-                  STACK_TOP_USER32 : STACK_TOP_USER64)
-
-#define STACK_TOP_MAX TASK_SIZE_USER64
-
-#else /* __powerpc64__ */
-
-#define STACK_TOP TASK_SIZE
-#define STACK_TOP_MAX  STACK_TOP
-
-#endif /* __powerpc64__ */
+struct task_struct;
+void start_thread(struct pt_regs *regs, unsigned long fdptr, unsigned long sp);
+void release_thread(struct task_struct *);
 
 typedef struct {
        unsigned long seg;
@@ -250,6 +160,9 @@ struct thread_struct {
 #ifdef CONFIG_PPC32
        void            *pgdir;         /* root of page-table tree */
        unsigned long   ksp_limit;      /* if ksp <= ksp_limit stack overflow */
+#ifdef CONFIG_PPC_RTAS
+       unsigned long   rtas_sp;        /* stack pointer for when in RTAS */
+#endif
 #endif
        /* Debug Registers */
        struct debug_reg debug;
@@ -357,8 +270,7 @@ struct thread_struct {
 #define ARCH_MIN_TASKALIGN 16
 
 #define INIT_SP                (sizeof(init_stack) + (unsigned long) &init_stack)
-#define INIT_SP_LIMIT \
-       (_ALIGN_UP(sizeof(init_thread_info), 16) + (unsigned long) &init_stack)
+#define INIT_SP_LIMIT  ((unsigned long)&init_stack)
 
 #ifdef CONFIG_SPE
 #define SPEFSCR_INIT \
index 0b8a735b6d85f08512143b539c5ee5329598c48c..64271e562fed324fe25b8b2743c34ac5ee1fd390 100644 (file)
@@ -157,7 +157,7 @@ extern int ptrace_put_reg(struct task_struct *task, int regno,
                          unsigned long data);
 
 #define current_pt_regs() \
-       ((struct pt_regs *)((unsigned long)current_thread_info() + THREAD_SIZE) - 1)
+       ((struct pt_regs *)((unsigned long)task_stack_page(current) + THREAD_SIZE) - 1)
 /*
  * We use the least-significant bit of the trap field to indicate
  * whether we have saved the full set of registers, or only a
index 1c98ef1f2d5b14c4347b5b27f648d611016dd15e..c5b2aff0ce8e1b5c5980028de4e0c9965f6069b0 100644 (file)
  *     - SPRG9 debug exception scratch
  *
  * All 32-bit:
- *     - SPRG3 current thread_info pointer
+ *     - SPRG3 current thread_struct physical addr pointer
  *        (virtual on BookE, physical on others)
  *
  * 32-bit classic:
 #ifdef CONFIG_PPC_BOOK3S_32
 #define SPRN_SPRG_SCRATCH0     SPRN_SPRG0
 #define SPRN_SPRG_SCRATCH1     SPRN_SPRG1
-#define SPRN_SPRG_RTAS         SPRN_SPRG2
+#define SPRN_SPRG_PGDIR                SPRN_SPRG2
 #define SPRN_SPRG_603_LRU      SPRN_SPRG4
 #endif
 
@@ -1425,6 +1425,11 @@ static inline void msr_check_and_clear(unsigned long bits)
 #define mfsrin(v)      ({unsigned int rval; \
                        asm volatile("mfsrin %0,%1" : "=r" (rval) : "r" (v)); \
                                        rval;})
+
+static inline void mtsrin(u32 val, u32 idx)
+{
+       asm volatile("mtsrin %0, %1" : : "r" (val), "r" (idx));
+}
 #endif
 
 #define proc_trap()    asm volatile("trap")
index e335a8f846afdd0d8388e782b02c5dba563bf206..4a1664a8658d7b4c1a3f0b70c50ec69b6363cdae 100644 (file)
@@ -17,6 +17,13 @@ extern char __end_interrupts[];
 extern char __prom_init_toc_start[];
 extern char __prom_init_toc_end[];
 
+#ifdef CONFIG_PPC_POWERNV
+extern char start_real_trampolines[];
+extern char end_real_trampolines[];
+extern char start_virt_trampolines[];
+extern char end_virt_trampolines[];
+#endif
+
 static inline int in_kernel_text(unsigned long addr)
 {
        if (addr >= (unsigned long)_stext && addr < (unsigned long)__init_end)
index 41695745032cd6625ec0660edea8a1cdf144840c..0de717e16dd6d06a1de9de594164e237ce690d11 100644 (file)
@@ -83,7 +83,22 @@ int is_cpu_dead(unsigned int cpu);
 /* 32-bit */
 extern int smp_hw_index[];
 
-#define raw_smp_processor_id() (current_thread_info()->cpu)
+/*
+ * This is particularly ugly: it appears we can't actually get the definition
+ * of task_struct here, but we need access to the CPU this task is running on.
+ * Instead of using task_struct we're using _TASK_CPU which is extracted from
+ * asm-offsets.h by kbuild to get the current processor ID.
+ *
+ * This also needs to be safeguarded when building asm-offsets.s because at
+ * that time _TASK_CPU is not defined yet. It could have been guarded by
+ * _TASK_CPU itself, but we want the build to fail if _TASK_CPU is missing
+ * when building something else than asm-offsets.s
+ */
+#ifdef GENERATING_ASM_OFFSETS
+#define raw_smp_processor_id()         (0)
+#else
+#define raw_smp_processor_id()         (*(unsigned int *)((void *)current + _TASK_CPU))
+#endif
 #define hard_smp_processor_id()        (smp_hw_index[smp_processor_id()])
 
 static inline int get_hard_smp_processor_id(int cpu)
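
Spelled out, the macro above is an open-coded current->cpu: _TASK_CPU is the byte offset of the cpu field exported through asm-offsets.h, so the load is equivalent to the sketch below (the helper name is hypothetical).

	static inline unsigned int example_cpu_of_current(void)
	{
		/* same as current->cpu, without struct task_struct in this header */
		return *(unsigned int *)((void *)current + _TASK_CPU);
	}
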
index f65ecf57b66c0a6feaaca6708daab0087a25b94c..b7d082c0ec25f7eadd5664cd36a190ac98659ad2 100644 (file)
 
 #include <linux/swiotlb.h>
 
-extern const struct dma_map_ops powerpc_swiotlb_dma_ops;
-
 extern unsigned int ppc_swiotlb_enable;
-int __init swiotlb_setup_bus_notifier(void);
-
-extern void pci_dma_dev_setup_swiotlb(struct pci_dev *pdev);
 
 #ifdef CONFIG_SWIOTLB
 void swiotlb_detect_4g(void);
diff --git a/arch/powerpc/include/asm/task_size_32.h b/arch/powerpc/include/asm/task_size_32.h
new file mode 100644 (file)
index 0000000..de7290e
--- /dev/null
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_POWERPC_TASK_SIZE_32_H
+#define _ASM_POWERPC_TASK_SIZE_32_H
+
+#if CONFIG_TASK_SIZE > CONFIG_KERNEL_START
+#error User TASK_SIZE overlaps with KERNEL_START address
+#endif
+
+#define TASK_SIZE (CONFIG_TASK_SIZE)
+
+/*
+ * This decides where the kernel will search for a free chunk of vm space during
+ * mmap's.
+ */
+#define TASK_UNMAPPED_BASE (TASK_SIZE / 8 * 3)
+
+#define DEFAULT_MAP_WINDOW TASK_SIZE
+#define STACK_TOP TASK_SIZE
+#define STACK_TOP_MAX STACK_TOP
+
+#endif /* _ASM_POWERPC_TASK_SIZE_32_H */
diff --git a/arch/powerpc/include/asm/task_size_64.h b/arch/powerpc/include/asm/task_size_64.h
new file mode 100644 (file)
index 0000000..eab4779
--- /dev/null
@@ -0,0 +1,79 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_POWERPC_TASK_SIZE_64_H
+#define _ASM_POWERPC_TASK_SIZE_64_H
+
+/*
+ * 64-bit user address space can have multiple limits
+ * For now supported values are:
+ */
+#define TASK_SIZE_64TB  (0x0000400000000000UL)
+#define TASK_SIZE_128TB (0x0000800000000000UL)
+#define TASK_SIZE_512TB (0x0002000000000000UL)
+#define TASK_SIZE_1PB   (0x0004000000000000UL)
+#define TASK_SIZE_2PB   (0x0008000000000000UL)
+
+/*
+ * With 52 bits in the address we can support up to 4PB of range.
+ */
+#define TASK_SIZE_4PB   (0x0010000000000000UL)
+
+/*
+ * For now 512TB is only supported with book3s and 64K linux page size.
+ */
+#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_PPC_64K_PAGES)
+/*
+ * Max value currently used:
+ */
+#define TASK_SIZE_USER64               TASK_SIZE_4PB
+#define DEFAULT_MAP_WINDOW_USER64      TASK_SIZE_128TB
+#define TASK_CONTEXT_SIZE              TASK_SIZE_512TB
+#else
+#define TASK_SIZE_USER64               TASK_SIZE_64TB
+#define DEFAULT_MAP_WINDOW_USER64      TASK_SIZE_64TB
+
+/*
+ * We don't need to allocate extended context ids for 4K page size, because we
+ * limit the max effective address on this config to 64TB.
+ */
+#define TASK_CONTEXT_SIZE TASK_SIZE_64TB
+#endif
+
+/*
+ * 32-bit user address space is 4GB - 1 page
+ * (this 1 page is needed so referencing of 0xFFFFFFFF generates EFAULT)
+ */
+#define TASK_SIZE_USER32 (0x0000000100000000UL - (1 * PAGE_SIZE))
+
+#define TASK_SIZE_OF(tsk)                                              \
+       (test_tsk_thread_flag(tsk, TIF_32BIT) ? TASK_SIZE_USER32 :      \
+                                               TASK_SIZE_USER64)
+
+#define TASK_SIZE TASK_SIZE_OF(current)
+
+#define TASK_UNMAPPED_BASE_USER32 (PAGE_ALIGN(TASK_SIZE_USER32 / 4))
+#define TASK_UNMAPPED_BASE_USER64 (PAGE_ALIGN(DEFAULT_MAP_WINDOW_USER64 / 4))
+
+/*
+ * This decides where the kernel will search for a free chunk of vm space during
+ * mmap's.
+ */
+#define TASK_UNMAPPED_BASE     \
+       ((is_32bit_task()) ? TASK_UNMAPPED_BASE_USER32 : TASK_UNMAPPED_BASE_USER64)
+
+/*
+ * Initial task size value for user applications. For book3s 64 we start
+ * with 128TB and conditionally enable up to 512TB
+ */
+#ifdef CONFIG_PPC_BOOK3S_64
+#define DEFAULT_MAP_WINDOW     \
+       ((is_32bit_task()) ? TASK_SIZE_USER32 : DEFAULT_MAP_WINDOW_USER64)
+#else
+#define DEFAULT_MAP_WINDOW     TASK_SIZE
+#endif
+
+#define STACK_TOP_USER64 DEFAULT_MAP_WINDOW_USER64
+#define STACK_TOP_USER32 TASK_SIZE_USER32
+#define STACK_TOP_MAX TASK_SIZE_USER64
+#define STACK_TOP (is_32bit_task() ? STACK_TOP_USER32 : STACK_TOP_USER64)
+
+#endif /* _ASM_POWERPC_TASK_SIZE_64_H */
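
Resolving the new header for one concrete configuration (book3s64 with 64K pages) gives the following; this just restates the selection the macros above make:

	/* Example with CONFIG_PPC_BOOK3S_64 + CONFIG_PPC_64K_PAGES:
	 *   64-bit task: TASK_SIZE = TASK_SIZE_USER64 = 4PB,
	 *                default mmap window = 128TB, STACK_TOP = 128TB
	 *   32-bit task: TASK_SIZE = STACK_TOP = 4GB - PAGE_SIZE */
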
index 544cac0474cbcca12b96ef9f19406ada562722a2..8e1d0195ac36632ccf9c001a2a91b0e3527dba77 100644 (file)
 
 #define THREAD_SIZE            (1 << THREAD_SHIFT)
 
-#ifdef CONFIG_PPC64
-#define CURRENT_THREAD_INFO(dest, sp)  stringify_in_c(clrrdi dest, sp, THREAD_SHIFT)
-#else
-#define CURRENT_THREAD_INFO(dest, sp)  stringify_in_c(rlwinm dest, sp, 0, 0, 31-THREAD_SHIFT)
-#endif
-
 #ifndef __ASSEMBLY__
 #include <linux/cache.h>
 #include <asm/processor.h>
@@ -34,8 +28,6 @@
  * low level task data.
  */
 struct thread_info {
-       struct task_struct *task;               /* main task structure */
-       int             cpu;                    /* cpu we're on */
        int             preempt_count;          /* 0 => preemptable,
                                                   <0 => BUG */
        unsigned long   local_flags;            /* private flags for thread */
@@ -58,8 +50,6 @@ struct thread_info {
  */
 #define INIT_THREAD_INFO(tsk)                  \
 {                                              \
-       .task =         &tsk,                   \
-       .cpu =          0,                      \
        .preempt_count = INIT_PREEMPT_COUNT,    \
        .flags =        0,                      \
 }
@@ -67,15 +57,6 @@ struct thread_info {
 #define THREAD_SIZE_ORDER      (THREAD_SHIFT - PAGE_SHIFT)
 
 /* how to get the thread information struct from C */
-static inline struct thread_info *current_thread_info(void)
-{
-       unsigned long val;
-
-       asm (CURRENT_THREAD_INFO(%0,1) : "=r" (val));
-
-       return (struct thread_info *)val;
-}
-
 extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src);
 
 #ifdef CONFIG_PPC_BOOK3S_64
index a4a718dbfec6e0e4dbcfbf80b020f54689258290..f85e2b01c3df2b686cba72440b4b55415d8dbfc3 100644 (file)
@@ -132,6 +132,8 @@ static inline void shared_proc_topology_init(void) {}
 #define topology_sibling_cpumask(cpu)  (per_cpu(cpu_sibling_map, cpu))
 #define topology_core_cpumask(cpu)     (per_cpu(cpu_core_map, cpu))
 #define topology_core_id(cpu)          (cpu_to_core_id(cpu))
+
+int dlpar_cpu_readd(int cpu);
 #endif
 #endif
 
index cb7f0bb9ee7122a7f08a6d83ad957e511899527a..cddadccf551d9ec018a68d37c9ab8db96a4548d7 100644 (file)
@@ -36,7 +36,7 @@ obj-y                         := cputable.o ptrace.o syscalls.o \
                                   process.o systbl.o idle.o \
                                   signal.o sysfs.o cacheinfo.o time.o \
                                   prom.o traps.o setup-common.o \
-                                  udbg.o misc.o io.o dma.o misc_$(BITS).o \
+                                  udbg.o misc.o io.o misc_$(BITS).o \
                                   of_platform.o prom_parse.o
 obj-$(CONFIG_PPC64)            += setup_64.o sys_ppc32.o \
                                   signal_64.o ptrace32.o \
@@ -105,6 +105,7 @@ obj-$(CONFIG_UPROBES)               += uprobes.o
 obj-$(CONFIG_PPC_UDBG_16550)   += legacy_serial.o udbg_16550.o
 obj-$(CONFIG_STACKTRACE)       += stacktrace.o
 obj-$(CONFIG_SWIOTLB)          += dma-swiotlb.o
+obj-$(CONFIG_ARCH_HAS_DMA_SET_MASK) += dma-mask.o
 
 pci64-$(CONFIG_PPC64)          += pci_dn.o pci-hotplug.o isa-bridge.o
 obj-$(CONFIG_PCI)              += pci_$(BITS).o $(pci64-y) \
@@ -142,19 +143,29 @@ endif
 obj-$(CONFIG_EPAPR_PARAVIRT)   += epapr_paravirt.o epapr_hcalls.o
 obj-$(CONFIG_KVM_GUEST)                += kvm.o kvm_emul.o
 
-# Disable GCOV & sanitizers in odd or sensitive code
+# Disable GCOV, KCOV & sanitizers in odd or sensitive code
 GCOV_PROFILE_prom_init.o := n
+KCOV_INSTRUMENT_prom_init.o := n
 UBSAN_SANITIZE_prom_init.o := n
 GCOV_PROFILE_machine_kexec_64.o := n
+KCOV_INSTRUMENT_machine_kexec_64.o := n
 UBSAN_SANITIZE_machine_kexec_64.o := n
 GCOV_PROFILE_machine_kexec_32.o := n
+KCOV_INSTRUMENT_machine_kexec_32.o := n
 UBSAN_SANITIZE_machine_kexec_32.o := n
 GCOV_PROFILE_kprobes.o := n
+KCOV_INSTRUMENT_kprobes.o := n
 UBSAN_SANITIZE_kprobes.o := n
 GCOV_PROFILE_kprobes-ftrace.o := n
+KCOV_INSTRUMENT_kprobes-ftrace.o := n
 UBSAN_SANITIZE_kprobes-ftrace.o := n
 UBSAN_SANITIZE_vdso.o := n
 
+# Necessary for booting with kcov enabled on book3e machines
+KCOV_INSTRUMENT_cputable.o := n
+KCOV_INSTRUMENT_setup_64.o := n
+KCOV_INSTRUMENT_paca.o := n
+
 extra-$(CONFIG_PPC_FPU)                += fpu.o
 extra-$(CONFIG_ALTIVEC)                += vector.o
 extra-$(CONFIG_PPC64)          += entry_64.o
index 9ffc72ded73add59be3f267e0f8cfc922bdf827b..86a61e5f8285bfb237abdcbb44d71df2174792a6 100644 (file)
@@ -13,6 +13,8 @@
  * 2 of the License, or (at your option) any later version.
  */
 
+#define GENERATING_ASM_OFFSETS /* asm/smp.h */
+
 #include <linux/compat.h>
 #include <linux/signal.h>
 #include <linux/sched.h>
@@ -90,10 +92,15 @@ int main(void)
        DEFINE(SIGSEGV, SIGSEGV);
        DEFINE(NMI_MASK, NMI_MASK);
 #else
-       OFFSET(THREAD_INFO, task_struct, stack);
-       DEFINE(THREAD_INFO_GAP, _ALIGN_UP(sizeof(struct thread_info), 16));
        OFFSET(KSP_LIMIT, thread_struct, ksp_limit);
+#ifdef CONFIG_PPC_RTAS
+       OFFSET(RTAS_SP, thread_struct, rtas_sp);
+#endif
 #endif /* CONFIG_PPC64 */
+       OFFSET(TASK_STACK, task_struct, stack);
+#ifdef CONFIG_SMP
+       OFFSET(TASK_CPU, task_struct, cpu);
+#endif
 
 #ifdef CONFIG_LIVEPATCH
        OFFSET(TI_livepatch_sp, thread_info, livepatch_sp);
@@ -161,8 +168,6 @@ int main(void)
        OFFSET(TI_FLAGS, thread_info, flags);
        OFFSET(TI_LOCAL_FLAGS, thread_info, local_flags);
        OFFSET(TI_PREEMPT, thread_info, preempt_count);
-       OFFSET(TI_TASK, thread_info, task);
-       OFFSET(TI_CPU, thread_info, cpu);
 
 #ifdef CONFIG_PPC64
        OFFSET(DCACHEL1BLOCKSIZE, ppc64_caches, l1d.block_size);
@@ -177,6 +182,8 @@ int main(void)
        OFFSET(PACAPROCSTART, paca_struct, cpu_start);
        OFFSET(PACAKSAVE, paca_struct, kstack);
        OFFSET(PACACURRENT, paca_struct, __current);
+       DEFINE(PACA_THREAD_INFO, offsetof(struct paca_struct, __current) +
+                                offsetof(struct task_struct, thread_info));
        OFFSET(PACASAVEDMSR, paca_struct, saved_msr);
        OFFSET(PACAR1, paca_struct, saved_r1);
        OFFSET(PACATOC, paca_struct, kernel_toc);
index 8c069e96c47893ffe79117815174aab719e5d948..6f1c11e0691f2dd937f77861f08a0778f91e8a30 100644 (file)
@@ -24,6 +24,10 @@ BEGIN_MMU_FTR_SECTION
        li      r10,0
        mtspr   SPRN_SPRG_603_LRU,r10           /* init SW LRU tracking */
 END_MMU_FTR_SECTION_IFSET(MMU_FTR_NEED_DTLB_SW_LRU)
+       lis     r10, (swapper_pg_dir - PAGE_OFFSET)@h
+       ori     r10, r10, (swapper_pg_dir - PAGE_OFFSET)@l
+       mtspr   SPRN_SPRG_PGDIR, r10
+
 BEGIN_FTR_SECTION
        bl      __init_fpu_registers
 END_FTR_SECTION_IFCLR(CPU_FTR_FPU_UNAVAILABLE)
index 9c9bcaae2f759921569001ad7b4474a5f66faecc..09231ef06d01e71645cd92722646b56621c42b9c 100644 (file)
@@ -6,12 +6,31 @@
  * busses using the iommu infrastructure
  */
 
+#include <linux/dma-direct.h>
+#include <linux/pci.h>
 #include <asm/iommu.h>
 
 /*
  * Generic iommu implementation
  */
 
+/*
+ * The coherent mask may be smaller than the real mask, check if we can
+ * really use a direct window.
+ */
+static inline bool dma_iommu_alloc_bypass(struct device *dev)
+{
+       return dev->archdata.iommu_bypass && !iommu_fixed_is_weak &&
+               dma_direct_supported(dev, dev->coherent_dma_mask);
+}
+
+static inline bool dma_iommu_map_bypass(struct device *dev,
+               unsigned long attrs)
+{
+       return dev->archdata.iommu_bypass &&
+               (!iommu_fixed_is_weak || (attrs & DMA_ATTR_WEAK_ORDERING));
+}
+
 /* Allocates a contiguous real buffer and creates mappings over it.
  * Returns the virtual address of the buffer and sets dma_handle
  * to the dma address (mapping) of the first page.
@@ -20,6 +39,8 @@ static void *dma_iommu_alloc_coherent(struct device *dev, size_t size,
                                      dma_addr_t *dma_handle, gfp_t flag,
                                      unsigned long attrs)
 {
+       if (dma_iommu_alloc_bypass(dev))
+               return dma_direct_alloc(dev, size, dma_handle, flag, attrs);
        return iommu_alloc_coherent(dev, get_iommu_table_base(dev), size,
                                    dma_handle, dev->coherent_dma_mask, flag,
                                    dev_to_node(dev));
@@ -29,7 +50,11 @@ static void dma_iommu_free_coherent(struct device *dev, size_t size,
                                    void *vaddr, dma_addr_t dma_handle,
                                    unsigned long attrs)
 {
-       iommu_free_coherent(get_iommu_table_base(dev), size, vaddr, dma_handle);
+       if (dma_iommu_alloc_bypass(dev))
+               dma_direct_free(dev, size, vaddr, dma_handle, attrs);
+       else
+               iommu_free_coherent(get_iommu_table_base(dev), size, vaddr,
+                               dma_handle);
 }
 
 /* Creates TCEs for a user provided buffer.  The user buffer must be
@@ -42,6 +67,9 @@ static dma_addr_t dma_iommu_map_page(struct device *dev, struct page *page,
                                     enum dma_data_direction direction,
                                     unsigned long attrs)
 {
+       if (dma_iommu_map_bypass(dev, attrs))
+               return dma_direct_map_page(dev, page, offset, size, direction,
+                               attrs);
        return iommu_map_page(dev, get_iommu_table_base(dev), page, offset,
                              size, device_to_mask(dev), direction, attrs);
 }
@@ -51,8 +79,9 @@ static void dma_iommu_unmap_page(struct device *dev, dma_addr_t dma_handle,
                                 size_t size, enum dma_data_direction direction,
                                 unsigned long attrs)
 {
-       iommu_unmap_page(get_iommu_table_base(dev), dma_handle, size, direction,
-                        attrs);
+       if (!dma_iommu_map_bypass(dev, attrs))
+               iommu_unmap_page(get_iommu_table_base(dev), dma_handle, size,
+                               direction,  attrs);
 }
 
 
@@ -60,6 +89,8 @@ static int dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist,
                            int nelems, enum dma_data_direction direction,
                            unsigned long attrs)
 {
+       if (dma_iommu_map_bypass(dev, attrs))
+               return dma_direct_map_sg(dev, sglist, nelems, direction, attrs);
        return ppc_iommu_map_sg(dev, get_iommu_table_base(dev), sglist, nelems,
                                device_to_mask(dev), direction, attrs);
 }
@@ -68,10 +99,20 @@ static void dma_iommu_unmap_sg(struct device *dev, struct scatterlist *sglist,
                int nelems, enum dma_data_direction direction,
                unsigned long attrs)
 {
-       ppc_iommu_unmap_sg(get_iommu_table_base(dev), sglist, nelems,
+       if (!dma_iommu_map_bypass(dev, attrs))
+               ppc_iommu_unmap_sg(get_iommu_table_base(dev), sglist, nelems,
                           direction, attrs);
 }
 
+static bool dma_iommu_bypass_supported(struct device *dev, u64 mask)
+{
+       struct pci_dev *pdev = to_pci_dev(dev);
+       struct pci_controller *phb = pci_bus_to_host(pdev->bus);
+
+       return phb->controller_ops.iommu_bypass_supported &&
+               phb->controller_ops.iommu_bypass_supported(pdev, mask);
+}
+
 /* We support DMA to/from any memory page via the iommu */
 int dma_iommu_dma_supported(struct device *dev, u64 mask)
 {
@@ -83,32 +124,48 @@ int dma_iommu_dma_supported(struct device *dev, u64 mask)
                return 0;
        }
 
+       if (dev_is_pci(dev) && dma_iommu_bypass_supported(dev, mask)) {
+               dev->archdata.iommu_bypass = true;
+               dev_dbg(dev, "iommu: 64-bit OK, using fixed ops\n");
+               return 1;
+       }
+
        if (tbl->it_offset > (mask >> tbl->it_page_shift)) {
                dev_info(dev, "Warning: IOMMU offset too big for device mask\n");
                dev_info(dev, "mask: 0x%08llx, table offset: 0x%08lx\n",
                                mask, tbl->it_offset << tbl->it_page_shift);
                return 0;
-       } else
-               return 1;
+       }
+
+       dev_dbg(dev, "iommu: not 64-bit, using default ops\n");
+       dev->archdata.iommu_bypass = false;
+       return 1;
 }
 
-static u64 dma_iommu_get_required_mask(struct device *dev)
+u64 dma_iommu_get_required_mask(struct device *dev)
 {
        struct iommu_table *tbl = get_iommu_table_base(dev);
        u64 mask;
+
        if (!tbl)
                return 0;
 
+       if (dev_is_pci(dev)) {
+               u64 bypass_mask = dma_direct_get_required_mask(dev);
+
+               if (dma_iommu_bypass_supported(dev, bypass_mask))
+                       return bypass_mask;
+       }
+
        mask = 1ULL << (fls_long(tbl->it_offset + tbl->it_size) - 1);
        mask += mask - 1;
 
        return mask;
 }
 
-struct dma_map_ops dma_iommu_ops = {
+const struct dma_map_ops dma_iommu_ops = {
        .alloc                  = dma_iommu_alloc_coherent,
        .free                   = dma_iommu_free_coherent,
-       .mmap                   = dma_nommu_mmap_coherent,
        .map_sg                 = dma_iommu_map_sg,
        .unmap_sg               = dma_iommu_unmap_sg,
        .dma_supported          = dma_iommu_dma_supported,
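For illustration, the window-mask arithmetic used by dma_iommu_get_required_mask() above (highest bit of the table end, extended to an all-ones mask) can be exercised standalone; the offset and size below are hypothetical and page-shift scaling is omitted:

#include <stdio.h>
#include <stdint.h>

/* kernel-style fls64(): 1-based index of the highest set bit */
static int fls64_demo(uint64_t x)
{
	return x ? 64 - __builtin_clzll(x) : 0;
}

int main(void)
{
	uint64_t it_offset = 0x800;	/* hypothetical table offset, in pages */
	uint64_t it_size   = 0x10000;	/* hypothetical table size, in pages */

	uint64_t mask = 1ULL << (fls64_demo(it_offset + it_size) - 1);
	mask += mask - 1;		/* 0x1ffff for the values above */

	printf("window end 0x%llx -> required mask 0x%llx\n",
	       (unsigned long long)(it_offset + it_size),
	       (unsigned long long)mask);
	return 0;
}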
diff --git a/arch/powerpc/kernel/dma-mask.c b/arch/powerpc/kernel/dma-mask.c
new file mode 100644 (file)
index 0000000..ffbbbc4
--- /dev/null
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/dma-mapping.h>
+#include <linux/export.h>
+#include <asm/machdep.h>
+
+void arch_dma_set_mask(struct device *dev, u64 dma_mask)
+{
+       if (ppc_md.dma_set_mask)
+               ppc_md.dma_set_mask(dev, dma_mask);
+}
+EXPORT_SYMBOL(arch_dma_set_mask);
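The new file above is just an optional-hook dispatcher: arch_dma_set_mask() forwards to ppc_md.dma_set_mask only when a platform has registered one. A minimal standalone sketch of that pattern (all names here are illustrative stand-ins, not kernel APIs):

#include <stdio.h>
#include <stdint.h>

struct machdep_demo {
	void (*dma_set_mask)(void *dev, uint64_t dma_mask);	/* optional hook */
};

static struct machdep_demo ppc_md_demo;	/* stays NULL unless a platform fills it in */

static void arch_dma_set_mask_demo(void *dev, uint64_t dma_mask)
{
	if (ppc_md_demo.dma_set_mask)
		ppc_md_demo.dma_set_mask(dev, dma_mask);
}

static void platform_hook_demo(void *dev, uint64_t dma_mask)
{
	printf("platform hook saw mask 0x%llx\n", (unsigned long long)dma_mask);
}

int main(void)
{
	arch_dma_set_mask_demo(NULL, 0xffffffffULL);	/* no hook registered: no-op */
	ppc_md_demo.dma_set_mask = platform_hook_demo;
	arch_dma_set_mask_demo(NULL, 0xffffffffULL);	/* hook registered: forwarded */
	return 0;
}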
index 7d5fc9751622323984a53eee4b17f95620d11b1a..132d61c91629e35608db3cb70c495dcedfcd81e0 100644 (file)
  * option) any later version.
  *
  */
-
-#include <linux/dma-direct.h>
 #include <linux/memblock.h>
-#include <linux/pfn.h>
-#include <linux/of_platform.h>
-#include <linux/platform_device.h>
-#include <linux/pci.h>
-
 #include <asm/machdep.h>
 #include <asm/swiotlb.h>
-#include <asm/dma.h>
 
 unsigned int ppc_swiotlb_enable;
 
-static u64 swiotlb_powerpc_get_required(struct device *dev)
-{
-       u64 end, mask, max_direct_dma_addr = dev->archdata.max_direct_dma_addr;
-
-       end = memblock_end_of_DRAM();
-       if (max_direct_dma_addr && end > max_direct_dma_addr)
-               end = max_direct_dma_addr;
-       end += get_dma_offset(dev);
-
-       mask = 1ULL << (fls64(end) - 1);
-       mask += mask - 1;
-
-       return mask;
-}
-
-/*
- * At the moment, all platforms that use this code only require
- * swiotlb to be used if we're operating on HIGHMEM.  Since
- * we don't ever call anything other than map_sg, unmap_sg,
- * map_page, and unmap_page on highmem, use normal dma_ops
- * for everything else.
- */
-const struct dma_map_ops powerpc_swiotlb_dma_ops = {
-       .alloc = __dma_nommu_alloc_coherent,
-       .free = __dma_nommu_free_coherent,
-       .mmap = dma_nommu_mmap_coherent,
-       .map_sg = dma_direct_map_sg,
-       .unmap_sg = dma_direct_unmap_sg,
-       .dma_supported = swiotlb_dma_supported,
-       .map_page = dma_direct_map_page,
-       .unmap_page = dma_direct_unmap_page,
-       .sync_single_for_cpu = dma_direct_sync_single_for_cpu,
-       .sync_single_for_device = dma_direct_sync_single_for_device,
-       .sync_sg_for_cpu = dma_direct_sync_sg_for_cpu,
-       .sync_sg_for_device = dma_direct_sync_sg_for_device,
-       .get_required_mask = swiotlb_powerpc_get_required,
-};
-
-void pci_dma_dev_setup_swiotlb(struct pci_dev *pdev)
-{
-       struct pci_controller *hose;
-       struct dev_archdata *sd;
-
-       hose = pci_bus_to_host(pdev->bus);
-       sd = &pdev->dev.archdata;
-       sd->max_direct_dma_addr =
-               hose->dma_window_base_cur + hose->dma_window_size;
-}
-
-static int ppc_swiotlb_bus_notify(struct notifier_block *nb,
-                                 unsigned long action, void *data)
-{
-       struct device *dev = data;
-       struct dev_archdata *sd;
-
-       /* We are only interested in device addition */
-       if (action != BUS_NOTIFY_ADD_DEVICE)
-               return 0;
-
-       sd = &dev->archdata;
-       sd->max_direct_dma_addr = 0;
-
-       /* May need to bounce if the device can't address all of DRAM */
-       if ((dma_get_mask(dev) + 1) < memblock_end_of_DRAM())
-               set_dma_ops(dev, &powerpc_swiotlb_dma_ops);
-
-       return NOTIFY_DONE;
-}
-
-static struct notifier_block ppc_swiotlb_plat_bus_notifier = {
-       .notifier_call = ppc_swiotlb_bus_notify,
-       .priority = 0,
-};
-
-int __init swiotlb_setup_bus_notifier(void)
-{
-       bus_register_notifier(&platform_bus_type,
-                             &ppc_swiotlb_plat_bus_notifier);
-       return 0;
-}
-
 void __init swiotlb_detect_4g(void)
 {
        if ((memblock_end_of_DRAM() - 1) > 0xffffffff)
diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c
deleted file mode 100644 (file)
index b1903eb..0000000
+++ /dev/null
@@ -1,362 +0,0 @@
-/*
- * Copyright (C) 2006 Benjamin Herrenschmidt, IBM Corporation
- *
- * Provide default implementations of the DMA mapping callbacks for
- * directly mapped busses.
- */
-
-#include <linux/device.h>
-#include <linux/dma-mapping.h>
-#include <linux/dma-debug.h>
-#include <linux/gfp.h>
-#include <linux/memblock.h>
-#include <linux/export.h>
-#include <linux/pci.h>
-#include <asm/vio.h>
-#include <asm/bug.h>
-#include <asm/machdep.h>
-#include <asm/swiotlb.h>
-#include <asm/iommu.h>
-
-/*
- * Generic direct DMA implementation
- *
- * This implementation supports a per-device offset that can be applied if
- * the address at which memory is visible to devices is not 0. Platform code
- * can set archdata.dma_data to an unsigned long holding the offset. By
- * default the offset is PCI_DRAM_OFFSET.
- */
-
-static u64 __maybe_unused get_pfn_limit(struct device *dev)
-{
-       u64 pfn = (dev->coherent_dma_mask >> PAGE_SHIFT) + 1;
-       struct dev_archdata __maybe_unused *sd = &dev->archdata;
-
-#ifdef CONFIG_SWIOTLB
-       if (sd->max_direct_dma_addr && dev->dma_ops == &powerpc_swiotlb_dma_ops)
-               pfn = min_t(u64, pfn, sd->max_direct_dma_addr >> PAGE_SHIFT);
-#endif
-
-       return pfn;
-}
-
-static int dma_nommu_dma_supported(struct device *dev, u64 mask)
-{
-#ifdef CONFIG_PPC64
-       u64 limit = get_dma_offset(dev) + (memblock_end_of_DRAM() - 1);
-
-       /* Limit fits in the mask, we are good */
-       if (mask >= limit)
-               return 1;
-
-#ifdef CONFIG_FSL_SOC
-       /*
-        * Freescale gets another chance via ZONE_DMA, however
-        * that will have to be refined if/when they support iommus
-        */
-       return 1;
-#endif
-       /* Sorry ... */
-       return 0;
-#else
-       return 1;
-#endif
-}
-
-#ifndef CONFIG_NOT_COHERENT_CACHE
-void *__dma_nommu_alloc_coherent(struct device *dev, size_t size,
-                                 dma_addr_t *dma_handle, gfp_t flag,
-                                 unsigned long attrs)
-{
-       void *ret;
-       struct page *page;
-       int node = dev_to_node(dev);
-#ifdef CONFIG_FSL_SOC
-       u64 pfn = get_pfn_limit(dev);
-       int zone;
-
-       /*
-        * This code should be OK on other platforms, but we have drivers that
-        * don't set coherent_dma_mask. As a workaround we just ifdef it. This
-        * whole routine needs some serious cleanup.
-        */
-
-       zone = dma_pfn_limit_to_zone(pfn);
-       if (zone < 0) {
-               dev_err(dev, "%s: No suitable zone for pfn %#llx\n",
-                       __func__, pfn);
-               return NULL;
-       }
-
-       switch (zone) {
-#ifdef CONFIG_ZONE_DMA
-       case ZONE_DMA:
-               flag |= GFP_DMA;
-               break;
-#endif
-       };
-#endif /* CONFIG_FSL_SOC */
-
-       page = alloc_pages_node(node, flag, get_order(size));
-       if (page == NULL)
-               return NULL;
-       ret = page_address(page);
-       memset(ret, 0, size);
-       *dma_handle = __pa(ret) + get_dma_offset(dev);
-
-       return ret;
-}
-
-void __dma_nommu_free_coherent(struct device *dev, size_t size,
-                               void *vaddr, dma_addr_t dma_handle,
-                               unsigned long attrs)
-{
-       free_pages((unsigned long)vaddr, get_order(size));
-}
-#endif /* !CONFIG_NOT_COHERENT_CACHE */
-
-static void *dma_nommu_alloc_coherent(struct device *dev, size_t size,
-                                      dma_addr_t *dma_handle, gfp_t flag,
-                                      unsigned long attrs)
-{
-       struct iommu_table *iommu;
-
-       /* The coherent mask may be smaller than the real mask, check if
-        * we can really use the direct ops
-        */
-       if (dma_nommu_dma_supported(dev, dev->coherent_dma_mask))
-               return __dma_nommu_alloc_coherent(dev, size, dma_handle,
-                                                  flag, attrs);
-
-       /* Ok we can't ... do we have an iommu ? If not, fail */
-       iommu = get_iommu_table_base(dev);
-       if (!iommu)
-               return NULL;
-
-       /* Try to use the iommu */
-       return iommu_alloc_coherent(dev, iommu, size, dma_handle,
-                                   dev->coherent_dma_mask, flag,
-                                   dev_to_node(dev));
-}
-
-static void dma_nommu_free_coherent(struct device *dev, size_t size,
-                                    void *vaddr, dma_addr_t dma_handle,
-                                    unsigned long attrs)
-{
-       struct iommu_table *iommu;
-
-       /* See comments in dma_nommu_alloc_coherent() */
-       if (dma_nommu_dma_supported(dev, dev->coherent_dma_mask))
-               return __dma_nommu_free_coherent(dev, size, vaddr, dma_handle,
-                                                 attrs);
-       /* Maybe we used an iommu ... */
-       iommu = get_iommu_table_base(dev);
-
-       /* If we hit that we should have never allocated in the first
-        * place so how come we are freeing ?
-        */
-       if (WARN_ON(!iommu))
-               return;
-       iommu_free_coherent(iommu, size, vaddr, dma_handle);
-}
-
-int dma_nommu_mmap_coherent(struct device *dev, struct vm_area_struct *vma,
-                            void *cpu_addr, dma_addr_t handle, size_t size,
-                            unsigned long attrs)
-{
-       unsigned long pfn;
-
-#ifdef CONFIG_NOT_COHERENT_CACHE
-       vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
-       pfn = __dma_get_coherent_pfn((unsigned long)cpu_addr);
-#else
-       pfn = page_to_pfn(virt_to_page(cpu_addr));
-#endif
-       return remap_pfn_range(vma, vma->vm_start,
-                              pfn + vma->vm_pgoff,
-                              vma->vm_end - vma->vm_start,
-                              vma->vm_page_prot);
-}
-
-static int dma_nommu_map_sg(struct device *dev, struct scatterlist *sgl,
-                            int nents, enum dma_data_direction direction,
-                            unsigned long attrs)
-{
-       struct scatterlist *sg;
-       int i;
-
-       for_each_sg(sgl, sg, nents, i) {
-               sg->dma_address = sg_phys(sg) + get_dma_offset(dev);
-               sg->dma_length = sg->length;
-
-               if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
-                       continue;
-
-               __dma_sync_page(sg_page(sg), sg->offset, sg->length, direction);
-       }
-
-       return nents;
-}
-
-static void dma_nommu_unmap_sg(struct device *dev, struct scatterlist *sgl,
-                               int nents, enum dma_data_direction direction,
-                               unsigned long attrs)
-{
-       struct scatterlist *sg;
-       int i;
-
-       for_each_sg(sgl, sg, nents, i)
-               __dma_sync_page(sg_page(sg), sg->offset, sg->length, direction);
-}
-
-static u64 dma_nommu_get_required_mask(struct device *dev)
-{
-       u64 end, mask;
-
-       end = memblock_end_of_DRAM() + get_dma_offset(dev);
-
-       mask = 1ULL << (fls64(end) - 1);
-       mask += mask - 1;
-
-       return mask;
-}
-
-static inline dma_addr_t dma_nommu_map_page(struct device *dev,
-                                            struct page *page,
-                                            unsigned long offset,
-                                            size_t size,
-                                            enum dma_data_direction dir,
-                                            unsigned long attrs)
-{
-       if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
-               __dma_sync_page(page, offset, size, dir);
-
-       return page_to_phys(page) + offset + get_dma_offset(dev);
-}
-
-static inline void dma_nommu_unmap_page(struct device *dev,
-                                        dma_addr_t dma_address,
-                                        size_t size,
-                                        enum dma_data_direction direction,
-                                        unsigned long attrs)
-{
-       if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
-               __dma_sync(bus_to_virt(dma_address), size, direction);
-}
-
-#ifdef CONFIG_NOT_COHERENT_CACHE
-static inline void dma_nommu_sync_sg(struct device *dev,
-               struct scatterlist *sgl, int nents,
-               enum dma_data_direction direction)
-{
-       struct scatterlist *sg;
-       int i;
-
-       for_each_sg(sgl, sg, nents, i)
-               __dma_sync_page(sg_page(sg), sg->offset, sg->length, direction);
-}
-
-static inline void dma_nommu_sync_single(struct device *dev,
-                                         dma_addr_t dma_handle, size_t size,
-                                         enum dma_data_direction direction)
-{
-       __dma_sync(bus_to_virt(dma_handle), size, direction);
-}
-#endif
-
-const struct dma_map_ops dma_nommu_ops = {
-       .alloc                          = dma_nommu_alloc_coherent,
-       .free                           = dma_nommu_free_coherent,
-       .mmap                           = dma_nommu_mmap_coherent,
-       .map_sg                         = dma_nommu_map_sg,
-       .unmap_sg                       = dma_nommu_unmap_sg,
-       .dma_supported                  = dma_nommu_dma_supported,
-       .map_page                       = dma_nommu_map_page,
-       .unmap_page                     = dma_nommu_unmap_page,
-       .get_required_mask              = dma_nommu_get_required_mask,
-#ifdef CONFIG_NOT_COHERENT_CACHE
-       .sync_single_for_cpu            = dma_nommu_sync_single,
-       .sync_single_for_device         = dma_nommu_sync_single,
-       .sync_sg_for_cpu                = dma_nommu_sync_sg,
-       .sync_sg_for_device             = dma_nommu_sync_sg,
-#endif
-};
-EXPORT_SYMBOL(dma_nommu_ops);
-
-int dma_set_coherent_mask(struct device *dev, u64 mask)
-{
-       if (!dma_supported(dev, mask)) {
-               /*
-                * We need to special case the direct DMA ops which can
-                * support a fallback for coherent allocations. There
-                * is no dma_op->set_coherent_mask() so we have to do
-                * things the hard way:
-                */
-               if (get_dma_ops(dev) != &dma_nommu_ops ||
-                   get_iommu_table_base(dev) == NULL ||
-                   !dma_iommu_dma_supported(dev, mask))
-                       return -EIO;
-       }
-       dev->coherent_dma_mask = mask;
-       return 0;
-}
-EXPORT_SYMBOL(dma_set_coherent_mask);
-
-int dma_set_mask(struct device *dev, u64 dma_mask)
-{
-       if (ppc_md.dma_set_mask)
-               return ppc_md.dma_set_mask(dev, dma_mask);
-
-       if (dev_is_pci(dev)) {
-               struct pci_dev *pdev = to_pci_dev(dev);
-               struct pci_controller *phb = pci_bus_to_host(pdev->bus);
-               if (phb->controller_ops.dma_set_mask)
-                       return phb->controller_ops.dma_set_mask(pdev, dma_mask);
-       }
-
-       if (!dev->dma_mask || !dma_supported(dev, dma_mask))
-               return -EIO;
-       *dev->dma_mask = dma_mask;
-       return 0;
-}
-EXPORT_SYMBOL(dma_set_mask);
-
-u64 __dma_get_required_mask(struct device *dev)
-{
-       const struct dma_map_ops *dma_ops = get_dma_ops(dev);
-
-       if (unlikely(dma_ops == NULL))
-               return 0;
-
-       if (dma_ops->get_required_mask)
-               return dma_ops->get_required_mask(dev);
-
-       return DMA_BIT_MASK(8 * sizeof(dma_addr_t));
-}
-
-u64 dma_get_required_mask(struct device *dev)
-{
-       if (ppc_md.dma_get_required_mask)
-               return ppc_md.dma_get_required_mask(dev);
-
-       if (dev_is_pci(dev)) {
-               struct pci_dev *pdev = to_pci_dev(dev);
-               struct pci_controller *phb = pci_bus_to_host(pdev->bus);
-               if (phb->controller_ops.dma_get_required_mask)
-                       return phb->controller_ops.dma_get_required_mask(pdev);
-       }
-
-       return __dma_get_required_mask(dev);
-}
-EXPORT_SYMBOL_GPL(dma_get_required_mask);
-
-static int __init dma_init(void)
-{
-#ifdef CONFIG_IBMVIO
-       dma_debug_add_bus(&vio_bus_type);
-#endif
-
-       return 0;
-}
-fs_initcall(dma_init);
-
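The deleted dma_nommu code mapped pages by adding a fixed per-device bus offset to the physical address, and considered a device capable when its mask reached the offset plus the last byte of DRAM. A standalone sketch of those two calculations, with made-up values:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t dma_offset = 0x80000000ULL;	/* hypothetical get_dma_offset(dev) */
	uint64_t dram_end   = 0x40000000ULL;	/* hypothetical memblock_end_of_DRAM() */
	uint64_t page_phys  = 0x01000000ULL;	/* hypothetical page physical address */
	uint64_t dev_mask   = 0xffffffffULL;	/* hypothetical 32-bit capable device */

	/* dma_nommu_map_page(): the bus address is simply phys + offset */
	uint64_t dma_addr = page_phys + dma_offset;

	/* dma_nommu_dma_supported(): device must reach offset + last byte of DRAM */
	uint64_t limit = dma_offset + (dram_end - 1);

	printf("dma_addr=0x%llx limit=0x%llx supported=%d\n",
	       (unsigned long long)dma_addr, (unsigned long long)limit,
	       dev_mask >= limit);
	return 0;
}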
index 8be3721d93026376fe42051461f5632ba2b79989..e49bd5efcfe66b5092f241bca00a2688ce5ac396 100644 (file)
@@ -666,8 +666,10 @@ static bool __init cpufeatures_process_feature(struct dt_cpu_feature *f)
                m = &dt_cpu_feature_match_table[i];
                if (!strcmp(f->name, m->name)) {
                        known = true;
-                       if (m->enable(f))
+                       if (m->enable(f)) {
+                               cur_cpu_spec->cpu_features |= m->cpu_ftr_bit_mask;
                                break;
+                       }
 
                        pr_info("not enabling: %s (disabled or unsupported by kernel)\n",
                                f->name);
@@ -675,17 +677,12 @@ static bool __init cpufeatures_process_feature(struct dt_cpu_feature *f)
                }
        }
 
-       if (!known && enable_unknown) {
-               if (!feat_try_enable_unknown(f)) {
-                       pr_info("not enabling: %s (unknown and unsupported by kernel)\n",
-                               f->name);
-                       return false;
-               }
+       if (!known && (!enable_unknown || !feat_try_enable_unknown(f))) {
+               pr_info("not enabling: %s (unknown and unsupported by kernel)\n",
+                       f->name);
+               return false;
        }
 
-       if (m->cpu_ftr_bit_mask)
-               cur_cpu_spec->cpu_features |= m->cpu_ftr_bit_mask;
-
        if (known)
                pr_debug("enabling: %s\n", f->name);
        else
index ae05203eb4de67ef69e5673d4111ac646049c348..289c0b37d84572aa341d14b2b36a1068abd77de3 100644 (file)
@@ -109,7 +109,14 @@ EXPORT_SYMBOL(eeh_subsystem_flags);
  * frozen count in last hour exceeds this limit, the PE will
  * be forced to be offline permanently.
  */
-int eeh_max_freezes = 5;
+u32 eeh_max_freezes = 5;
+
+/*
+ * Controls whether a recovery event should be scheduled when an
+ * isolated device is discovered. This is only really useful for
+ * debugging problems with the EEH core.
+ */
+bool eeh_debugfs_no_recover;
 
 /* Platform dependent EEH operations */
 struct eeh_ops *eeh_ops = NULL;
@@ -823,15 +830,15 @@ int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state stat
        switch (state) {
        case pcie_deassert_reset:
                eeh_ops->reset(pe, EEH_RESET_DEACTIVATE);
-               eeh_unfreeze_pe(pe, false);
+               eeh_unfreeze_pe(pe);
                if (!(pe->type & EEH_PE_VF))
-                       eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED);
+                       eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED, true);
                eeh_pe_dev_traverse(pe, eeh_restore_dev_state, dev);
-               eeh_pe_state_clear(pe, EEH_PE_ISOLATED);
+               eeh_pe_state_clear(pe, EEH_PE_ISOLATED, true);
                break;
        case pcie_hot_reset:
                eeh_pe_mark_isolated(pe);
-               eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED);
+               eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED, true);
                eeh_ops->set_option(pe, EEH_OPT_FREEZE_PE);
                eeh_pe_dev_traverse(pe, eeh_disable_and_save_dev_state, dev);
                if (!(pe->type & EEH_PE_VF))
@@ -840,7 +847,7 @@ int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state stat
                break;
        case pcie_warm_reset:
                eeh_pe_mark_isolated(pe);
-               eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED);
+               eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED, true);
                eeh_ops->set_option(pe, EEH_OPT_FREEZE_PE);
                eeh_pe_dev_traverse(pe, eeh_disable_and_save_dev_state, dev);
                if (!(pe->type & EEH_PE_VF))
@@ -848,7 +855,7 @@ int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state stat
                eeh_ops->reset(pe, EEH_RESET_FUNDAMENTAL);
                break;
        default:
-               eeh_pe_state_clear(pe, EEH_PE_ISOLATED | EEH_PE_CFG_BLOCKED);
+               eeh_pe_state_clear(pe, EEH_PE_ISOLATED | EEH_PE_CFG_BLOCKED, true);
                return -EINVAL;
        };
 
@@ -877,6 +884,24 @@ static void *eeh_set_dev_freset(struct eeh_dev *edev, void *flag)
        return NULL;
 }
 
+static void eeh_pe_refreeze_passed(struct eeh_pe *root)
+{
+       struct eeh_pe *pe;
+       int state;
+
+       eeh_for_each_pe(root, pe) {
+               if (eeh_pe_passed(pe)) {
+                       state = eeh_ops->get_state(pe, NULL);
+                       if (state &
+                          (EEH_STATE_MMIO_ACTIVE | EEH_STATE_MMIO_ENABLED)) {
+                               pr_info("EEH: Passed-through PE PHB#%x-PE#%x was thawed by reset, re-freezing for safety.\n",
+                                       pe->phb->global_number, pe->addr);
+                               eeh_pe_set_option(pe, EEH_OPT_FREEZE_PE);
+                       }
+               }
+       }
+}
+
 /**
  * eeh_pe_reset_full - Complete a full reset process on the indicated PE
  * @pe: EEH PE
@@ -889,12 +914,12 @@ static void *eeh_set_dev_freset(struct eeh_dev *edev, void *flag)
  *
  * This function will attempt to reset a PE three times before failing.
  */
-int eeh_pe_reset_full(struct eeh_pe *pe)
+int eeh_pe_reset_full(struct eeh_pe *pe, bool include_passed)
 {
        int reset_state = (EEH_PE_RESET | EEH_PE_CFG_BLOCKED);
        int type = EEH_RESET_HOT;
        unsigned int freset = 0;
-       int i, state, ret;
+       int i, state = 0, ret;
 
        /*
         * Determine the type of reset to perform - hot or fundamental.
@@ -911,32 +936,42 @@ int eeh_pe_reset_full(struct eeh_pe *pe)
 
        /* Make three attempts at resetting the bus */
        for (i = 0; i < 3; i++) {
-               ret = eeh_pe_reset(pe, type);
-               if (ret)
-                       break;
-
-               ret = eeh_pe_reset(pe, EEH_RESET_DEACTIVATE);
-               if (ret)
-                       break;
+               ret = eeh_pe_reset(pe, type, include_passed);
+               if (!ret)
+                       ret = eeh_pe_reset(pe, EEH_RESET_DEACTIVATE,
+                                          include_passed);
+               if (ret) {
+                       ret = -EIO;
+                       pr_warn("EEH: Failure %d resetting PHB#%x-PE#%x (attempt %d)\n",
+                               state, pe->phb->global_number, pe->addr, i + 1);
+                       continue;
+               }
+               if (i)
+                       pr_warn("EEH: PHB#%x-PE#%x: Successful reset (attempt %d)\n",
+                               pe->phb->global_number, pe->addr, i + 1);
 
                /* Wait until the PE is in a functioning state */
                state = eeh_wait_state(pe, PCI_BUS_RESET_WAIT_MSEC);
                if (state < 0) {
-                       pr_warn("%s: Unrecoverable slot failure on PHB#%x-PE#%x",
-                               __func__, pe->phb->global_number, pe->addr);
+                       pr_warn("EEH: Unrecoverable slot failure on PHB#%x-PE#%x",
+                               pe->phb->global_number, pe->addr);
                        ret = -ENOTRECOVERABLE;
                        break;
                }
                if (eeh_state_active(state))
                        break;
-
-               /* Set error in case this is our last attempt */
-               ret = -EIO;
-               pr_warn("%s: Failure %d resetting PHB#%x-PE#%x\n (%d)\n",
-                       __func__, state, pe->phb->global_number, pe->addr, (i + 1));
+               else
+                       pr_warn("EEH: PHB#%x-PE#%x: Slot inactive after reset: 0x%x (attempt %d)\n",
+                               pe->phb->global_number, pe->addr, state, i + 1);
        }
 
-       eeh_pe_state_clear(pe, reset_state);
+       /* Resetting the PE may have unfrozen child PEs. If those PEs have been
+        * (potentially) passed through to a guest, re-freeze them:
+        */
+       if (!include_passed)
+               eeh_pe_refreeze_passed(pe);
+
+       eeh_pe_state_clear(pe, reset_state, true);
        return ret;
 }
 
@@ -1309,7 +1344,7 @@ void eeh_remove_device(struct pci_dev *dev)
        edev->mode &= ~EEH_DEV_SYSFS;
 }
 
-int eeh_unfreeze_pe(struct eeh_pe *pe, bool sw_state)
+int eeh_unfreeze_pe(struct eeh_pe *pe)
 {
        int ret;
 
@@ -1327,10 +1362,6 @@ int eeh_unfreeze_pe(struct eeh_pe *pe, bool sw_state)
                return ret;
        }
 
-       /* Clear software isolated state */
-       if (sw_state && (pe->state & EEH_PE_ISOLATED))
-               eeh_pe_state_clear(pe, EEH_PE_ISOLATED);
-
        return ret;
 }
 
@@ -1382,7 +1413,10 @@ static int eeh_pe_change_owner(struct eeh_pe *pe)
                }
        }
 
-       return eeh_unfreeze_pe(pe, true);
+       ret = eeh_unfreeze_pe(pe);
+       if (!ret)
+               eeh_pe_state_clear(pe, EEH_PE_ISOLATED, true);
+       return ret;
 }
 
 /**
@@ -1612,13 +1646,12 @@ int eeh_pe_get_state(struct eeh_pe *pe)
 }
 EXPORT_SYMBOL_GPL(eeh_pe_get_state);
 
-static int eeh_pe_reenable_devices(struct eeh_pe *pe)
+static int eeh_pe_reenable_devices(struct eeh_pe *pe, bool include_passed)
 {
        struct eeh_dev *edev, *tmp;
        struct pci_dev *pdev;
        int ret = 0;
 
-       /* Restore config space */
        eeh_pe_restore_bars(pe);
 
        /*
@@ -1639,7 +1672,14 @@ static int eeh_pe_reenable_devices(struct eeh_pe *pe)
        }
 
        /* The PE is still in frozen state */
-       return eeh_unfreeze_pe(pe, true);
+       if (include_passed || !eeh_pe_passed(pe)) {
+               ret = eeh_unfreeze_pe(pe);
+       } else
+               pr_info("EEH: Note: Leaving passthrough PHB#%x-PE#%x frozen.\n",
+                       pe->phb->global_number, pe->addr);
+       if (!ret)
+               eeh_pe_state_clear(pe, EEH_PE_ISOLATED, include_passed);
+       return ret;
 }
 
 
@@ -1652,7 +1692,7 @@ static int eeh_pe_reenable_devices(struct eeh_pe *pe)
  * indicated type, either fundamental reset or hot reset.
  * PE reset is the most important part for error recovery.
  */
-int eeh_pe_reset(struct eeh_pe *pe, int option)
+int eeh_pe_reset(struct eeh_pe *pe, int option, bool include_passed)
 {
        int ret = 0;
 
@@ -1666,11 +1706,11 @@ int eeh_pe_reset(struct eeh_pe *pe, int option)
        switch (option) {
        case EEH_RESET_DEACTIVATE:
                ret = eeh_ops->reset(pe, option);
-               eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED);
+               eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED, include_passed);
                if (ret)
                        break;
 
-               ret = eeh_pe_reenable_devices(pe);
+               ret = eeh_pe_reenable_devices(pe, include_passed);
                break;
        case EEH_RESET_HOT:
        case EEH_RESET_FUNDAMENTAL:
@@ -1796,22 +1836,64 @@ static int eeh_enable_dbgfs_get(void *data, u64 *val)
        return 0;
 }
 
-static int eeh_freeze_dbgfs_set(void *data, u64 val)
-{
-       eeh_max_freezes = val;
-       return 0;
-}
+DEFINE_DEBUGFS_ATTRIBUTE(eeh_enable_dbgfs_ops, eeh_enable_dbgfs_get,
+                        eeh_enable_dbgfs_set, "0x%llx\n");
 
-static int eeh_freeze_dbgfs_get(void *data, u64 *val)
+static ssize_t eeh_force_recover_write(struct file *filp,
+                               const char __user *user_buf,
+                               size_t count, loff_t *ppos)
 {
-       *val = eeh_max_freezes;
-       return 0;
+       struct pci_controller *hose;
+       uint32_t phbid, pe_no;
+       struct eeh_pe *pe;
+       char buf[20];
+       int ret;
+
+       ret = simple_write_to_buffer(buf, sizeof(buf), ppos, user_buf, count);
+       if (!ret)
+               return -EFAULT;
+
+       /*
+        * When PE is NULL the event is a "special" event. Rather than
+        * recovering a specific PE it forces the EEH core to scan for failed
+        * PHBs and recovers each. This needs to be done before any device
+        * recoveries can occur.
+        */
+       if (!strncmp(buf, "hwcheck", 7)) {
+               __eeh_send_failure_event(NULL);
+               return count;
+       }
+
+       ret = sscanf(buf, "%x:%x", &phbid, &pe_no);
+       if (ret != 2)
+               return -EINVAL;
+
+       hose = pci_find_controller_for_domain(phbid);
+       if (!hose)
+               return -ENODEV;
+
+       /* Retrieve PE */
+       pe = eeh_pe_get(hose, pe_no, 0);
+       if (!pe)
+               return -ENODEV;
+
+       /*
+        * We don't do any state checking here since the detection
+        * process is async to the recovery process. The recovery
+        * thread *should* not break even if we schedule a recovery
+        * from an odd state (e.g. PE removed, or recovery of a
+        * non-isolated PE)
+        */
+       __eeh_send_failure_event(pe);
+
+       return ret < 0 ? ret : count;
 }
 
-DEFINE_DEBUGFS_ATTRIBUTE(eeh_enable_dbgfs_ops, eeh_enable_dbgfs_get,
-                        eeh_enable_dbgfs_set, "0x%llx\n");
-DEFINE_DEBUGFS_ATTRIBUTE(eeh_freeze_dbgfs_ops, eeh_freeze_dbgfs_get,
-                        eeh_freeze_dbgfs_set, "0x%llx\n");
+static const struct file_operations eeh_force_recover_fops = {
+       .open   = simple_open,
+       .llseek = no_llseek,
+       .write  = eeh_force_recover_write,
+};
 #endif
 
 static int __init eeh_init_proc(void)
@@ -1822,9 +1904,15 @@ static int __init eeh_init_proc(void)
                debugfs_create_file_unsafe("eeh_enable", 0600,
                                           powerpc_debugfs_root, NULL,
                                           &eeh_enable_dbgfs_ops);
-               debugfs_create_file_unsafe("eeh_max_freezes", 0600,
-                                          powerpc_debugfs_root, NULL,
-                                          &eeh_freeze_dbgfs_ops);
+               debugfs_create_u32("eeh_max_freezes", 0600,
+                               powerpc_debugfs_root, &eeh_max_freezes);
+               debugfs_create_bool("eeh_disable_recovery", 0600,
+                               powerpc_debugfs_root,
+                               &eeh_debugfs_no_recover);
+               debugfs_create_file_unsafe("eeh_force_recover", 0600,
+                               powerpc_debugfs_root, NULL,
+                               &eeh_force_recover_fops);
+               eeh_cache_debugfs_init();
 #endif
        }
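The new eeh_force_recover debugfs file above accepts either the literal string "hwcheck" (scan all PHBs) or a hex "<phb>:<pe>" pair. A userspace sketch of the same parsing, with no kernel types involved:

#include <stdio.h>
#include <string.h>

static void parse_force_recover(const char *buf)
{
	unsigned int phbid, pe_no;

	if (!strncmp(buf, "hwcheck", 7)) {
		printf("special event: scan all PHBs for failures\n");
		return;
	}

	if (sscanf(buf, "%x:%x", &phbid, &pe_no) != 2) {
		printf("invalid input: %s\n", buf);
		return;
	}

	printf("recover PHB#%x-PE#%x\n", phbid, pe_no);
}

int main(void)
{
	parse_force_recover("hwcheck");
	parse_force_recover("1:8");	/* e.g. echo 1:8 > .../eeh_force_recover */
	parse_force_recover("nonsense");
	return 0;
}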
 
index 201943d54a6ece9a022023e01e7453521e030fb3..9c68f0837385749cf336c290889e06d96a6c3be9 100644 (file)
@@ -26,6 +26,7 @@
 #include <linux/spinlock.h>
 #include <linux/atomic.h>
 #include <asm/pci-bridge.h>
+#include <asm/debugfs.h>
 #include <asm/ppc-pci.h>
 
 
@@ -113,7 +114,7 @@ static void eeh_addr_cache_print(struct pci_io_addr_cache *cache)
        while (n) {
                struct pci_io_addr_range *piar;
                piar = rb_entry(n, struct pci_io_addr_range, rb_node);
-               pr_debug("PCI: %s addr range %d [%pap-%pap]: %s\n",
+               pr_info("PCI: %s addr range %d [%pap-%pap]: %s\n",
                       (piar->flags & IORESOURCE_IO) ? "i/o" : "mem", cnt,
                       &piar->addr_lo, &piar->addr_hi, pci_name(piar->pcidev));
                cnt++;
@@ -157,10 +158,8 @@ eeh_addr_cache_insert(struct pci_dev *dev, resource_size_t alo,
        piar->pcidev = dev;
        piar->flags = flags;
 
-#ifdef DEBUG
        pr_debug("PIAR: insert range=[%pap:%pap] dev=%s\n",
                 &alo, &ahi, pci_name(dev));
-#endif
 
        rb_link_node(&piar->rb_node, parent, p);
        rb_insert_color(&piar->rb_node, &pci_io_addr_cache_root.rb_root);
@@ -240,6 +239,8 @@ restart:
                piar = rb_entry(n, struct pci_io_addr_range, rb_node);
 
                if (piar->pcidev == dev) {
+                       pr_debug("PIAR: remove range=[%pap:%pap] dev=%s\n",
+                                &piar->addr_lo, &piar->addr_hi, pci_name(dev));
                        rb_erase(n, &pci_io_addr_cache_root.rb_root);
                        kfree(piar);
                        goto restart;
@@ -298,9 +299,30 @@ void eeh_addr_cache_build(void)
                eeh_addr_cache_insert_dev(dev);
                eeh_sysfs_add_device(dev);
        }
+}
 
-#ifdef DEBUG
-       /* Verify tree built up above, echo back the list of addrs. */
-       eeh_addr_cache_print(&pci_io_addr_cache_root);
-#endif
+static int eeh_addr_cache_show(struct seq_file *s, void *v)
+{
+       struct pci_io_addr_range *piar;
+       struct rb_node *n;
+
+       spin_lock(&pci_io_addr_cache_root.piar_lock);
+       for (n = rb_first(&pci_io_addr_cache_root.rb_root); n; n = rb_next(n)) {
+               piar = rb_entry(n, struct pci_io_addr_range, rb_node);
+
+               seq_printf(s, "%s addr range [%pap-%pap]: %s\n",
+                      (piar->flags & IORESOURCE_IO) ? "i/o" : "mem",
+                      &piar->addr_lo, &piar->addr_hi, pci_name(piar->pcidev));
+       }
+       spin_unlock(&pci_io_addr_cache_root.piar_lock);
+
+       return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(eeh_addr_cache);
+
+void eeh_cache_debugfs_init(void)
+{
+       debugfs_create_file_unsafe("eeh_address_cache", 0400,
+                       powerpc_debugfs_root, NULL,
+                       &eeh_addr_cache_fops);
 }
index 99eab7bc7edc64d20e79130ba0176b4d97aedf57..89623962c7275235db369f3bf28581e13ec019cb 100644 (file)
@@ -510,22 +510,11 @@ static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata)
         * support EEH. So we just care about PCI devices for
         * simplicity here.
         */
-       if (!dev || (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE))
-               return NULL;
-
-       /*
-        * We rely on count-based pcibios_release_device() to
-        * detach permanently offlined PEs. Unfortunately, that's
-        * not reliable enough. We might have the permanently
-        * offlined PEs attached, but we needn't take care of
-        * them and their child devices.
-        */
-       if (eeh_dev_removed(edev))
+       if (!eeh_edev_actionable(edev) ||
+           (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE))
                return NULL;
 
        if (rmv_data) {
-               if (eeh_pe_passed(edev->pe))
-                       return NULL;
                driver = eeh_pcid_get(dev);
                if (driver) {
                        if (driver->err_handler &&
@@ -539,8 +528,8 @@ static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata)
        }
 
        /* Remove it from PCI subsystem */
-       pr_debug("EEH: Removing %s without EEH sensitive driver\n",
-                pci_name(dev));
+       pr_info("EEH: Removing %s without EEH sensitive driver\n",
+               pci_name(dev));
        edev->mode |= EEH_DEV_DISCONNECTED;
        if (rmv_data)
                rmv_data->removed_dev_count++;
@@ -591,34 +580,22 @@ static void *eeh_pe_detach_dev(struct eeh_pe *pe, void *userdata)
  * PE reset (for 3 times), we try to clear the frozen state
  * for 3 times as well.
  */
-static void *__eeh_clear_pe_frozen_state(struct eeh_pe *pe, void *flag)
+static int eeh_clear_pe_frozen_state(struct eeh_pe *root, bool include_passed)
 {
-       bool clear_sw_state = *(bool *)flag;
-       int i, rc = 1;
-
-       for (i = 0; rc && i < 3; i++)
-               rc = eeh_unfreeze_pe(pe, clear_sw_state);
+       struct eeh_pe *pe;
+       int i;
 
-       /* Stop immediately on any errors */
-       if (rc) {
-               pr_warn("%s: Failure %d unfreezing PHB#%x-PE#%x\n",
-                       __func__, rc, pe->phb->global_number, pe->addr);
-               return (void *)pe;
+       eeh_for_each_pe(root, pe) {
+               if (include_passed || !eeh_pe_passed(pe)) {
+                       for (i = 0; i < 3; i++)
+                               if (!eeh_unfreeze_pe(pe))
+                                       break;
+                       if (i >= 3)
+                               return -EIO;
+               }
        }
-
-       return NULL;
-}
-
-static int eeh_clear_pe_frozen_state(struct eeh_pe *pe,
-                                    bool clear_sw_state)
-{
-       void *rc;
-
-       rc = eeh_pe_traverse(pe, __eeh_clear_pe_frozen_state, &clear_sw_state);
-       if (!rc)
-               eeh_pe_state_clear(pe, EEH_PE_ISOLATED);
-
-       return rc ? -EIO : 0;
+       eeh_pe_state_clear(root, EEH_PE_ISOLATED, include_passed);
+       return 0;
 }
 
 int eeh_pe_reset_and_recover(struct eeh_pe *pe)
@@ -636,16 +613,16 @@ int eeh_pe_reset_and_recover(struct eeh_pe *pe)
        eeh_pe_dev_traverse(pe, eeh_dev_save_state, NULL);
 
        /* Issue reset */
-       ret = eeh_pe_reset_full(pe);
+       ret = eeh_pe_reset_full(pe, true);
        if (ret) {
-               eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
+               eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true);
                return ret;
        }
 
        /* Unfreeze the PE */
        ret = eeh_clear_pe_frozen_state(pe, true);
        if (ret) {
-               eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
+               eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true);
                return ret;
        }
 
@@ -653,7 +630,7 @@ int eeh_pe_reset_and_recover(struct eeh_pe *pe)
        eeh_pe_dev_traverse(pe, eeh_dev_restore_state, NULL);
 
        /* Clear recovery mode */
-       eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
+       eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true);
 
        return 0;
 }
@@ -676,6 +653,11 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
        time64_t tstamp;
        int cnt, rc;
        struct eeh_dev *edev;
+       struct eeh_pe *tmp_pe;
+       bool any_passed = false;
+
+       eeh_for_each_pe(pe, tmp_pe)
+               any_passed |= eeh_pe_passed(tmp_pe);
 
        /* pcibios will clear the counter; save the value */
        cnt = pe->freeze_count;
@@ -688,7 +670,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
         * into pci_hp_add_devices().
         */
        eeh_pe_state_mark(pe, EEH_PE_KEEP);
-       if (driver_eeh_aware || (pe->type & EEH_PE_VF)) {
+       if (any_passed || driver_eeh_aware || (pe->type & EEH_PE_VF)) {
                eeh_pe_dev_traverse(pe, eeh_rmv_device, rmv_data);
        } else {
                pci_lock_rescan_remove();
@@ -705,7 +687,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
         * config accesses. So we prefer to block them. However, controlled
         * PCI config accesses initiated from EEH itself are allowed.
         */
-       rc = eeh_pe_reset_full(pe);
+       rc = eeh_pe_reset_full(pe, false);
        if (rc)
                return rc;
 
@@ -744,11 +726,11 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
                        eeh_add_virt_device(edev);
                } else {
                        if (!driver_eeh_aware)
-                               eeh_pe_state_clear(pe, EEH_PE_PRI_BUS);
+                               eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true);
                        pci_hp_add_devices(bus);
                }
        }
-       eeh_pe_state_clear(pe, EEH_PE_KEEP);
+       eeh_pe_state_clear(pe, EEH_PE_KEEP, true);
 
        pe->tstamp = tstamp;
        pe->freeze_count = cnt;
@@ -900,7 +882,7 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
                         * is still in frozen state. Clear it before
                         * resuming the PE.
                         */
-                       eeh_pe_state_clear(pe, EEH_PE_ISOLATED);
+                       eeh_pe_state_clear(pe, EEH_PE_ISOLATED, true);
                        result = PCI_ERS_RESULT_RECOVERED;
                }
        }
@@ -977,7 +959,7 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
                        eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL);
                        eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
                } else {
-                       eeh_pe_state_clear(pe, EEH_PE_PRI_BUS);
+                       eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true);
                        eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
 
                        pci_lock_rescan_remove();
@@ -987,7 +969,7 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
                        return;
                }
        }
-       eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
+       eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true);
 }
 
 /**
@@ -1069,7 +1051,7 @@ void eeh_handle_special_event(void)
                                        continue;
 
                                /* Notify all devices to be down */
-                               eeh_pe_state_clear(pe, EEH_PE_PRI_BUS);
+                               eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true);
                                eeh_set_channel_state(pe, pci_channel_io_perm_failure);
                                eeh_pe_report(
                                        "error_detected(permanent failure)", pe,
index 227e57f980df155f789e994d440392ad2d916c1b..539aca055d7025e0cf80b7ebefc53a9a1c2e9b71 100644 (file)
@@ -121,7 +121,7 @@ int eeh_event_init(void)
  * the actual event will be delivered in a normal context
  * (from a workqueue).
  */
-int eeh_send_failure_event(struct eeh_pe *pe)
+int __eeh_send_failure_event(struct eeh_pe *pe)
 {
        unsigned long flags;
        struct eeh_event *event;
@@ -144,6 +144,20 @@ int eeh_send_failure_event(struct eeh_pe *pe)
        return 0;
 }
 
+int eeh_send_failure_event(struct eeh_pe *pe)
+{
+       /*
+        * If we've manually suppressed recovery events via debugfs
+        * then just drop it on the floor.
+        */
+       if (eeh_debugfs_no_recover) {
+               pr_err("EEH: Event dropped due to no_recover setting\n");
+               return 0;
+       }
+
+       return __eeh_send_failure_event(pe);
+}
+
 /**
  * eeh_remove_event - Remove EEH event from the queue
  * @pe: Event binding to the PE
index 6fa2032e05945e1fd5f858e0d37bc3040b0316e7..8b578891f27c2f237c85ad2c0381209203b80a86 100644 (file)
@@ -657,62 +657,52 @@ void eeh_pe_dev_mode_mark(struct eeh_pe *pe, int mode)
 }
 
 /**
- * __eeh_pe_state_clear - Clear state for the PE
+ * eeh_pe_state_clear - Clear state for the PE
  * @data: EEH PE
- * @flag: state
+ * @state: state
+ * @include_passed: include passed-through devices?
  *
  * The function is used to clear the indicated state from the
  * given PE. Besides, we also clear the check count of the PE
  * as well.
  */
-static void *__eeh_pe_state_clear(struct eeh_pe *pe, void *flag)
+void eeh_pe_state_clear(struct eeh_pe *root, int state, bool include_passed)
 {
-       int state = *((int *)flag);
+       struct eeh_pe *pe;
        struct eeh_dev *edev, *tmp;
        struct pci_dev *pdev;
 
-       /* Keep the state of permanently removed PE intact */
-       if (pe->state & EEH_PE_REMOVED)
-               return NULL;
+       eeh_for_each_pe(root, pe) {
+               /* Keep the state of permanently removed PE intact */
+               if (pe->state & EEH_PE_REMOVED)
+                       continue;
 
-       pe->state &= ~state;
+               if (!include_passed && eeh_pe_passed(pe))
+                       continue;
 
-       /*
-        * Special treatment on clearing isolated state. Clear
-        * check count since last isolation and put all affected
-        * devices to normal state.
-        */
-       if (!(state & EEH_PE_ISOLATED))
-               return NULL;
+               pe->state &= ~state;
 
-       pe->check_count = 0;
-       eeh_pe_for_each_dev(pe, edev, tmp) {
-               pdev = eeh_dev_to_pci_dev(edev);
-               if (!pdev)
+               /*
+                * Special treatment on clearing isolated state. Clear
+                * check count since last isolation and put all affected
+                * devices to normal state.
+                */
+               if (!(state & EEH_PE_ISOLATED))
                        continue;
 
-               pdev->error_state = pci_channel_io_normal;
-       }
-
-       /* Unblock PCI config access if required */
-       if (pe->state & EEH_PE_CFG_RESTRICTED)
-               pe->state &= ~EEH_PE_CFG_BLOCKED;
+               pe->check_count = 0;
+               eeh_pe_for_each_dev(pe, edev, tmp) {
+                       pdev = eeh_dev_to_pci_dev(edev);
+                       if (!pdev)
+                               continue;
 
-       return NULL;
-}
+                       pdev->error_state = pci_channel_io_normal;
+               }
 
-/**
- * eeh_pe_state_clear - Clear state for the PE and its children
- * @pe: PE
- * @state: state to be cleared
- *
- * When the PE and its children has been recovered from error,
- * we need clear the error state for that. The function is used
- * for the purpose.
- */
-void eeh_pe_state_clear(struct eeh_pe *pe, int state)
-{
-       eeh_pe_traverse(pe, __eeh_pe_state_clear, &state);
+               /* Unblock PCI config access if required */
+               if (pe->state & EEH_PE_CFG_RESTRICTED)
+                       pe->state &= ~EEH_PE_CFG_BLOCKED;
+       }
 }
 
 /*
index deed906dd8f1200e55e3d3ef65d48b37ec9dfb20..3fa04dda17371ad257164a52b2e951a37e54416f 100644 (file)
@@ -82,8 +82,9 @@ static ssize_t eeh_pe_state_store(struct device *dev,
        if (!(edev->pe->state & EEH_PE_ISOLATED))
                return count;
 
-       if (eeh_unfreeze_pe(edev->pe, true))
+       if (eeh_unfreeze_pe(edev->pe))
                return -EIO;
+       eeh_pe_state_clear(edev->pe, EEH_PE_ISOLATED, true);
 
        return count;
 }
index 0768dfd8a64e2e70940a2b904a9bebd98890f38c..b61cfd29c76f1fef499b145d37859fbaabf2778d 100644 (file)
@@ -97,14 +97,11 @@ crit_transfer_to_handler:
        mfspr   r0,SPRN_SRR1
        stw     r0,_SRR1(r11)
 
-       /* set the stack limit to the current stack
-        * and set the limit to protect the thread_info
-        * struct
-        */
+       /* set the stack limit to the current stack */
        mfspr   r8,SPRN_SPRG_THREAD
        lwz     r0,KSP_LIMIT(r8)
        stw     r0,SAVED_KSP_LIMIT(r11)
-       rlwimi  r0,r1,0,0,(31-THREAD_SHIFT)
+       rlwinm  r0,r1,0,0,(31 - THREAD_SHIFT)
        stw     r0,KSP_LIMIT(r8)
        /* fall through */
 #endif
@@ -121,14 +118,11 @@ crit_transfer_to_handler:
        mfspr   r0,SPRN_SRR1
        stw     r0,crit_srr1@l(0)
 
-       /* set the stack limit to the current stack
-        * and set the limit to protect the thread_info
-        * struct
-        */
+       /* set the stack limit to the current stack */
        mfspr   r8,SPRN_SPRG_THREAD
        lwz     r0,KSP_LIMIT(r8)
        stw     r0,saved_ksp_limit@l(0)
-       rlwimi  r0,r1,0,0,(31-THREAD_SHIFT)
+       rlwinm  r0,r1,0,0,(31 - THREAD_SHIFT)
        stw     r0,KSP_LIMIT(r8)
        /* fall through */
 #endif
@@ -157,7 +151,6 @@ transfer_to_handler:
        stw     r2,_XER(r11)
        mfspr   r12,SPRN_SPRG_THREAD
        addi    r2,r12,-THREAD
-       tovirt(r2,r2)                   /* set r2 to current */
        beq     2f                      /* if from user, fix up THREAD.regs */
        addi    r11,r1,STACK_FRAME_OVERHEAD
        stw     r11,PT_REGS(r12)
@@ -166,6 +159,9 @@ transfer_to_handler:
           internal debug mode bit to do this. */
        lwz     r12,THREAD_DBCR0(r12)
        andis.  r12,r12,DBCR0_IDM@h
+#endif
+       ACCOUNT_CPU_USER_ENTRY(r2, r11, r12)
+#if defined(CONFIG_40x) || defined(CONFIG_BOOKE)
        beq+    3f
        /* From user and task is ptraced - load up global dbcr0 */
        li      r12,-1                  /* clear all pending debug events */
@@ -174,8 +170,7 @@ transfer_to_handler:
        tophys(r11,r11)
        addi    r11,r11,global_dbcr0@l
 #ifdef CONFIG_SMP
-       CURRENT_THREAD_INFO(r9, r1)
-       lwz     r9,TI_CPU(r9)
+       lwz     r9,TASK_CPU(r2)
        slwi    r9,r9,3
        add     r11,r11,r9
 #endif
@@ -185,11 +180,6 @@ transfer_to_handler:
        addi    r12,r12,-1
        stw     r12,4(r11)
 #endif
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-       CURRENT_THREAD_INFO(r9, r1)
-       tophys(r9, r9)
-       ACCOUNT_CPU_USER_ENTRY(r9, r11, r12)
-#endif
 
        b       3f
 
@@ -201,9 +191,7 @@ transfer_to_handler:
        ble-    stack_ovf               /* then the kernel stack overflowed */
 5:
 #if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_E500)
-       CURRENT_THREAD_INFO(r9, r1)
-       tophys(r9,r9)                   /* check local flags */
-       lwz     r12,TI_LOCAL_FLAGS(r9)
+       lwz     r12,TI_LOCAL_FLAGS(r2)
        mtcrf   0x01,r12
        bt-     31-TLF_NAPPING,4f
        bt-     31-TLF_SLEEPING,7f
@@ -212,6 +200,7 @@ transfer_to_handler:
 transfer_to_handler_cont:
 3:
        mflr    r9
+       tovirt(r2, r2)                  /* set r2 to current */
        lwz     r11,0(r9)               /* virtual address of handler */
        lwz     r9,4(r9)                /* where to go when done */
 #if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS)
@@ -275,11 +264,11 @@ reenable_mmu:                             /* re-enable mmu so we can */
 
 #if defined (CONFIG_PPC_BOOK3S_32) || defined(CONFIG_E500)
 4:     rlwinm  r12,r12,0,~_TLF_NAPPING
-       stw     r12,TI_LOCAL_FLAGS(r9)
+       stw     r12,TI_LOCAL_FLAGS(r2)
        b       power_save_ppc32_restore
 
 7:     rlwinm  r12,r12,0,~_TLF_SLEEPING
-       stw     r12,TI_LOCAL_FLAGS(r9)
+       stw     r12,TI_LOCAL_FLAGS(r2)
        lwz     r9,_MSR(r11)            /* if sleeping, clear MSR.EE */
        rlwinm  r9,r9,0,~MSR_EE
        lwz     r12,_LINK(r11)          /* and return to address in LR */
@@ -351,8 +340,7 @@ _GLOBAL(DoSyscall)
        mtmsr   r11
 1:
 #endif /* CONFIG_TRACE_IRQFLAGS */
-       CURRENT_THREAD_INFO(r10, r1)
-       lwz     r11,TI_FLAGS(r10)
+       lwz     r11,TI_FLAGS(r2)
        andi.   r11,r11,_TIF_SYSCALL_DOTRACE
        bne-    syscall_dotrace
 syscall_dotrace_cont:
@@ -385,13 +373,12 @@ ret_from_syscall:
        lwz     r3,GPR3(r1)
 #endif
        mr      r6,r3
-       CURRENT_THREAD_INFO(r12, r1)
        /* disable interrupts so current_thread_info()->flags can't change */
        LOAD_MSR_KERNEL(r10,MSR_KERNEL) /* doesn't include MSR_EE */
        /* Note: We don't bother telling lockdep about it */
        SYNC
        MTMSRD(r10)
-       lwz     r9,TI_FLAGS(r12)
+       lwz     r9,TI_FLAGS(r2)
        li      r8,-MAX_ERRNO
        andi.   r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP|_TIF_USER_WORK_MASK|_TIF_PERSYSCALL_MASK)
        bne-    syscall_exit_work
@@ -438,8 +425,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX)
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
        andi.   r4,r8,MSR_PR
        beq     3f
-       CURRENT_THREAD_INFO(r4, r1)
-       ACCOUNT_CPU_USER_EXIT(r4, r5, r7)
+       ACCOUNT_CPU_USER_EXIT(r2, r5, r7)
 3:
 #endif
        lwz     r4,_LINK(r1)
@@ -532,7 +518,7 @@ syscall_exit_work:
        /* Clear per-syscall TIF flags if any are set.  */
 
        li      r11,_TIF_PERSYSCALL_MASK
-       addi    r12,r12,TI_FLAGS
+       addi    r12,r2,TI_FLAGS
 3:     lwarx   r8,0,r12
        andc    r8,r8,r11
 #ifdef CONFIG_IBM405_ERR77
@@ -540,7 +526,6 @@ syscall_exit_work:
 #endif
        stwcx.  r8,0,r12
        bne-    3b
-       subi    r12,r12,TI_FLAGS
        
 4:     /* Anything which requires enabling interrupts? */
        andi.   r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP)
@@ -745,6 +730,9 @@ fast_exception_return:
        mtcr    r10
        lwz     r10,_LINK(r11)
        mtlr    r10
+       /* Clear the exception_marker on the stack to avoid confusing stacktrace */
+       li      r10, 0
+       stw     r10, 8(r11)
        REST_GPR(10, r11)
 #if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS)
        mtspr   SPRN_NRI, r0
@@ -819,8 +807,7 @@ ret_from_except:
 
 user_exc_return:               /* r10 contains MSR_KERNEL here */
        /* Check current_thread_info()->flags */
-       CURRENT_THREAD_INFO(r9, r1)
-       lwz     r9,TI_FLAGS(r9)
+       lwz     r9,TI_FLAGS(r2)
        andi.   r0,r9,_TIF_USER_WORK_MASK
        bne     do_work
 
@@ -832,18 +819,14 @@ restore_user:
        andis.  r10,r0,DBCR0_IDM@h
        bnel-   load_dbcr0
 #endif
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-       CURRENT_THREAD_INFO(r9, r1)
-       ACCOUNT_CPU_USER_EXIT(r9, r10, r11)
-#endif
+       ACCOUNT_CPU_USER_EXIT(r2, r10, r11)
 
        b       restore
 
 /* N.B. the only way to get here is from the beq following ret_from_except. */
 resume_kernel:
        /* check current_thread_info, _TIF_EMULATE_STACK_STORE */
-       CURRENT_THREAD_INFO(r9, r1)
-       lwz     r8,TI_FLAGS(r9)
+       lwz     r8,TI_FLAGS(r2)
        andis.  r0,r8,_TIF_EMULATE_STACK_STORE@h
        beq+    1f
 
@@ -869,7 +852,7 @@ resume_kernel:
 
        /* Clear _TIF_EMULATE_STACK_STORE flag */
        lis     r11,_TIF_EMULATE_STACK_STORE@h
-       addi    r5,r9,TI_FLAGS
+       addi    r5,r2,TI_FLAGS
 0:     lwarx   r8,0,r5
        andc    r8,r8,r11
 #ifdef CONFIG_IBM405_ERR77
@@ -881,7 +864,7 @@ resume_kernel:
 
 #ifdef CONFIG_PREEMPT
        /* check current_thread_info->preempt_count */
-       lwz     r0,TI_PREEMPT(r9)
+       lwz     r0,TI_PREEMPT(r2)
        cmpwi   0,r0,0          /* if non-zero, just restore regs and return */
        bne     restore
        andi.   r8,r8,_TIF_NEED_RESCHED
@@ -897,8 +880,7 @@ resume_kernel:
        bl      trace_hardirqs_off
 #endif
 1:     bl      preempt_schedule_irq
-       CURRENT_THREAD_INFO(r9, r1)
-       lwz     r3,TI_FLAGS(r9)
+       lwz     r3,TI_FLAGS(r2)
        andi.   r0,r3,_TIF_NEED_RESCHED
        bne-    1b
 #ifdef CONFIG_TRACE_IRQFLAGS
@@ -982,6 +964,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX)
        mtcrf   0xFF,r10
        mtlr    r11
 
+       /* Clear the exception_marker on the stack to avoid confusing stacktrace */
+       li      r10, 0
+       stw     r10, 8(r1)
        /*
         * Once we put values in SRR0 and SRR1, we are in a state
         * where exceptions are not recoverable, since taking an
@@ -997,9 +982,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX)
        .globl exc_exit_restart
 exc_exit_restart:
        lwz     r12,_NIP(r1)
-#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS)
-       mtspr   SPRN_NRI, r0
-#endif
        mtspr   SPRN_SRR0,r12
        mtspr   SPRN_SRR1,r9
        REST_4GPRS(9, r1)
@@ -1021,6 +1003,9 @@ exc_exit_restart_end:
        mtlr    r11
        lwz     r10,_CCR(r1)
        mtcrf   0xff,r10
+       /* Clear the exception_marker on the stack to avoid confusing stack traces */
+       li      r10, 0
+       stw     r10, 8(r1)
        REST_2GPRS(9, r1)
        .globl exc_exit_restart
 exc_exit_restart:
@@ -1166,10 +1151,6 @@ ret_from_debug_exc:
        mfspr   r9,SPRN_SPRG_THREAD
        lwz     r10,SAVED_KSP_LIMIT(r1)
        stw     r10,KSP_LIMIT(r9)
-       lwz     r9,THREAD_INFO-THREAD(r9)
-       CURRENT_THREAD_INFO(r10, r1)
-       lwz     r10,TI_PREEMPT(r10)
-       stw     r10,TI_PREEMPT(r9)
        RESTORE_xSRR(SRR0,SRR1);
        RESTORE_xSRR(CSRR0,CSRR1);
        RESTORE_MMU_REGS;
@@ -1201,8 +1182,7 @@ load_dbcr0:
        lis     r11,global_dbcr0@ha
        addi    r11,r11,global_dbcr0@l
 #ifdef CONFIG_SMP
-       CURRENT_THREAD_INFO(r9, r1)
-       lwz     r9,TI_CPU(r9)
+       lwz     r9,TASK_CPU(r2)
        slwi    r9,r9,3
        add     r11,r11,r9
 #endif
@@ -1242,8 +1222,7 @@ recheck:
        LOAD_MSR_KERNEL(r10,MSR_KERNEL)
        SYNC
        MTMSRD(r10)             /* disable interrupts */
-       CURRENT_THREAD_INFO(r9, r1)
-       lwz     r9,TI_FLAGS(r9)
+       lwz     r9,TI_FLAGS(r2)
        andi.   r0,r9,_TIF_NEED_RESCHED
        bne-    do_resched
        andi.   r0,r9,_TIF_USER_WORK_MASK
@@ -1292,10 +1271,13 @@ BEGIN_FTR_SECTION
 END_FTR_SECTION_IFSET(CPU_FTR_601)
        lwz     r3,_TRAP(r1)
        andi.   r0,r3,1
-       beq     4f
+       beq     5f
        SAVE_NVGPRS(r1)
        rlwinm  r3,r3,0,0,30
        stw     r3,_TRAP(r1)
+5:     mfspr   r2,SPRN_SPRG_THREAD
+       addi    r2,r2,-THREAD
+       tovirt(r2,r2)                   /* set back r2 to current */
 4:     addi    r3,r1,STACK_FRAME_OVERHEAD
        bl      unrecoverable_exception
        /* shouldn't return */
@@ -1335,7 +1317,7 @@ _GLOBAL(enter_rtas)
        MTMSRD(r0)              /* don't get trashed */
        li      r9,MSR_KERNEL & ~(MSR_IR|MSR_DR)
        mtlr    r6
-       mtspr   SPRN_SPRG_RTAS,r7
+       stw     r7, THREAD + RTAS_SP(r2)
        mtspr   SPRN_SRR0,r8
        mtspr   SPRN_SRR1,r9
        RFI
@@ -1344,7 +1326,8 @@ _GLOBAL(enter_rtas)
        lwz     r9,8(r9)        /* original msr value */
        addi    r1,r1,INT_FRAME_SIZE
        li      r0,0
-       mtspr   SPRN_SPRG_RTAS,r0
+       tophys(r7, r2)
+       stw     r0, THREAD + RTAS_SP(r7)
        mtspr   SPRN_SRR0,r8
        mtspr   SPRN_SRR1,r9
        RFI                     /* return to caller */
index 435927f549c438614ae680d31775911b3600b417..15c67d2c053435a64a7443dba4c3a0169953db93 100644 (file)
@@ -166,7 +166,7 @@ system_call:                        /* label this so stack traces look sane */
        li      r10,IRQS_ENABLED
        std     r10,SOFTE(r1)
 
-       CURRENT_THREAD_INFO(r11, r1)
+       ld      r11, PACA_THREAD_INFO(r13)
        ld      r10,TI_FLAGS(r11)
        andi.   r11,r10,_TIF_SYSCALL_DOTRACE
        bne     .Lsyscall_dotrace               /* does not return */
@@ -213,7 +213,7 @@ system_call:                        /* label this so stack traces look sane */
        ld      r3,RESULT(r1)
 #endif
 
-       CURRENT_THREAD_INFO(r12, r1)
+       ld      r12, PACA_THREAD_INFO(r13)
 
        ld      r8,_MSR(r1)
 #ifdef CONFIG_PPC_BOOK3S
@@ -236,18 +236,14 @@ system_call_exit:
        /*
         * Disable interrupts so current_thread_info()->flags can't change,
         * and so that we don't get interrupted after loading SRR0/1.
+        *
+        * Leave MSR_RI enabled for now, because with THREAD_INFO_IN_TASK we
+        * could fault on the load of the TI_FLAGS below.
         */
 #ifdef CONFIG_PPC_BOOK3E
        wrteei  0
 #else
-       /*
-        * For performance reasons we clear RI the same time that we
-        * clear EE. We only need to clear RI just before we restore r13
-        * below, but batching it with EE saves us one expensive mtmsrd call.
-        * We have to be careful to restore RI if we branch anywhere from
-        * here (eg syscall_exit_work).
-        */
-       li      r11,0
+       li      r11,MSR_RI
        mtmsrd  r11,1
 #endif /* CONFIG_PPC_BOOK3E */
 
@@ -263,15 +259,7 @@ system_call_exit:
        bne     3f
 #endif
 2:     addi    r3,r1,STACK_FRAME_OVERHEAD
-#ifdef CONFIG_PPC_BOOK3S
-       li      r10,MSR_RI
-       mtmsrd  r10,1           /* Restore RI */
-#endif
        bl      restore_math
-#ifdef CONFIG_PPC_BOOK3S
-       li      r11,0
-       mtmsrd  r11,1
-#endif
        ld      r8,_MSR(r1)
        ld      r3,RESULT(r1)
        li      r11,-MAX_ERRNO
@@ -287,6 +275,16 @@ END_FTR_SECTION_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS)
        andi.   r6,r8,MSR_PR
        ld      r4,_LINK(r1)
 
+#ifdef CONFIG_PPC_BOOK3S
+       /*
+        * Clear MSR_RI; MSR_EE is already disabled and remains disabled. We
+        * could do this later, but testing shows that doing it here causes
+        * less slowdown than doing it closer to the rfid.
+        */
+       li      r11,0
+       mtmsrd  r11,1
+#endif
+
        beq-    1f
        ACCOUNT_CPU_USER_EXIT(r13, r11, r12)
 
@@ -348,7 +346,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 
        /* Repopulate r9 and r10 for the syscall path */
        addi    r9,r1,STACK_FRAME_OVERHEAD
-       CURRENT_THREAD_INFO(r10, r1)
+       ld      r10, PACA_THREAD_INFO(r13)
        ld      r10,TI_FLAGS(r10)
 
        cmpldi  r0,NR_syscalls
@@ -363,10 +361,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
        b       .Lsyscall_exit
        
 .Lsyscall_exit_work:
-#ifdef CONFIG_PPC_BOOK3S
-       li      r10,MSR_RI
-       mtmsrd  r10,1           /* Restore RI */
-#endif
        /* If TIF_RESTOREALL is set, don't scribble on either r3 or ccr.
         If TIF_NOERROR is set, just save r3 as it is. */
 
@@ -695,7 +689,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
 2:
 #endif /* CONFIG_PPC_BOOK3S_64 */
 
-       CURRENT_THREAD_INFO(r7, r8)  /* base of new stack */
+       clrrdi  r7, r8, THREAD_SHIFT    /* base of new stack */
        /* Note: this uses SWITCH_FRAME_SIZE rather than INT_FRAME_SIZE
           because we don't need to leave the 288-byte ABI gap at the
           top of the kernel stack. */
@@ -746,7 +740,7 @@ _GLOBAL(ret_from_except_lite)
        mtmsrd  r10,1             /* Update machine state */
 #endif /* CONFIG_PPC_BOOK3E */
 
-       CURRENT_THREAD_INFO(r9, r1)
+       ld      r9, PACA_THREAD_INFO(r13)
        ld      r3,_MSR(r1)
 #ifdef CONFIG_PPC_BOOK3E
        ld      r10,PACACURRENT(r13)
@@ -860,7 +854,7 @@ resume_kernel:
 1:     bl      preempt_schedule_irq
 
        /* Re-test flags and eventually loop */
-       CURRENT_THREAD_INFO(r9, r1)
+       ld      r9, PACA_THREAD_INFO(r13)
        ld      r4,TI_FLAGS(r9)
        andi.   r0,r4,_TIF_NEED_RESCHED
        bne     1b
@@ -1002,6 +996,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
        ld      r2,_NIP(r1)
        mtspr   SPRN_SRR0,r2
 
+       /*
+        * Leaving a stale exception_marker on the stack can confuse
+        * the reliable stack unwinder later on. Clear it.
+        */
+       li      r2,0
+       std     r2,STACK_FRAME_OVERHEAD-16(r1)
+
        ld      r0,GPR0(r1)
        ld      r2,GPR2(r1)
        ld      r3,GPR3(r1)
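
For context on why the marker is cleared: the powerpc unwinder treats a stack frame as holding a saved pt_regs when the word at the marker slot equals the "regshere" magic value, so a stale marker left behind on a reused stack makes later backtraces decode garbage. A minimal C sketch of that check follows; the constant names and values are assumptions based on powerpc's 64-bit ptrace.h conventions (STACK_FRAME_OVERHEAD - 16 being the byte offset cleared above), not something this patch defines.

#include <stdbool.h>

/* Assumed 64-bit powerpc values: marker word index 12 within the frame,
 * and the ASCII string "regshere" as the magic value. */
#define STACK_FRAME_MARKER      12
#define STACK_FRAME_REGS_MARKER 0x7265677368657265UL

/* Sketch: how an unwinder decides a frame contains saved registers. */
static bool frame_has_regs(const unsigned long *sp)
{
        return sp[STACK_FRAME_MARKER] == STACK_FRAME_REGS_MARKER;
}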
index 52ca2471ee1a4355b8fa3063bcc5d27ba61a52a1..d252f4663a231eb7bec1cf21073f53cab0674e31 100644 (file)
 #ifndef CONFIG_PPC64
 /* epapr_ev_idle() was derived from e500_idle() */
 _GLOBAL(epapr_ev_idle)
-       CURRENT_THREAD_INFO(r3, r1)
-       PPC_LL  r4, TI_LOCAL_FLAGS(r3)  /* set napping bit */
+       PPC_LL  r4, TI_LOCAL_FLAGS(r2)  /* set napping bit */
        ori     r4, r4,_TLF_NAPPING     /* so when we take an exception */
-       PPC_STL r4, TI_LOCAL_FLAGS(r3)  /* it will return to our caller */
+       PPC_STL r4, TI_LOCAL_FLAGS(r2)  /* it will return to our caller */
 
        wrteei  1
 
index afb638778f443316cf81d07ff2151d8af068ba97..49381f32b37450e181862bdf2f01e818b81ca20b 100644 (file)
@@ -77,17 +77,6 @@ special_reg_save:
        andi.   r3,r3,MSR_PR
        bnelr
 
-       /* Copy info into temporary exception thread info */
-       ld      r11,PACAKSAVE(r13)
-       CURRENT_THREAD_INFO(r11, r11)
-       CURRENT_THREAD_INFO(r12, r1)
-       ld      r10,TI_FLAGS(r11)
-       std     r10,TI_FLAGS(r12)
-       ld      r10,TI_PREEMPT(r11)
-       std     r10,TI_PREEMPT(r12)
-       ld      r10,TI_TASK(r11)
-       std     r10,TI_TASK(r12)
-
        /*
         * Advance to the next TLB exception frame for handler
         * types that don't do it automatically.
@@ -349,6 +338,7 @@ ret_from_mc_except:
 #define GEN_BTB_FLUSH
 #define CRIT_BTB_FLUSH
 #define DBG_BTB_FLUSH
+#define MC_BTB_FLUSH
 #define GDBELL_BTB_FLUSH
 #endif
 
@@ -504,7 +494,7 @@ exc_##n##_bad_stack:                                                            \
  * interrupts happen before the wait instruction.
  */
 #define CHECK_NAPPING()                                                        \
-       CURRENT_THREAD_INFO(r11, r1);                                   \
+       ld      r11, PACA_THREAD_INFO(r13);                             \
        ld      r10,TI_LOCAL_FLAGS(r11);                                \
        andi.   r9,r10,_TLF_NAPPING;                                    \
        beq+    1f;                                                     \
index 9e253ce27e087c1c333b32e90674b19f625848a4..a5b8fbae56a03b491f0982562f3d590cff16ca5f 100644 (file)
@@ -68,6 +68,14 @@ OPEN_FIXED_SECTION(real_vectors,        0x0100, 0x1900)
 OPEN_FIXED_SECTION(real_trampolines,    0x1900, 0x4000)
 OPEN_FIXED_SECTION(virt_vectors,        0x4000, 0x5900)
 OPEN_FIXED_SECTION(virt_trampolines,    0x5900, 0x7000)
+
+#ifdef CONFIG_PPC_POWERNV
+       .globl start_real_trampolines
+       .globl end_real_trampolines
+       .globl start_virt_trampolines
+       .globl end_virt_trampolines
+#endif
+
 #if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV)
 /*
  * Data area reserved for FWNMI option.
@@ -566,8 +574,36 @@ EXC_COMMON_BEGIN(mce_return)
        RFI_TO_KERNEL
        b       .
 
-EXC_REAL(data_access, 0x300, 0x80)
-EXC_VIRT(data_access, 0x4300, 0x80, 0x300)
+EXC_REAL_BEGIN(data_access, 0x300, 0x80)
+SET_SCRATCH0(r13)              /* save r13 */
+EXCEPTION_PROLOG_0(PACA_EXGEN)
+       b       tramp_real_data_access
+EXC_REAL_END(data_access, 0x300, 0x80)
+
+TRAMP_REAL_BEGIN(tramp_real_data_access)
+EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST_PR, 0x300)
+       /*
+        * DAR/DSISR must be read before setting MSR[RI], because
+        * a d-side MCE will clobber those registers, so the exception is not
+        * recoverable if they are live.
+        */
+       mfspr   r10,SPRN_DAR
+       mfspr   r11,SPRN_DSISR
+       std     r10,PACA_EXGEN+EX_DAR(r13)
+       stw     r11,PACA_EXGEN+EX_DSISR(r13)
+EXCEPTION_PROLOG_2(data_access_common, EXC_STD)
+
+EXC_VIRT_BEGIN(data_access, 0x4300, 0x80)
+SET_SCRATCH0(r13)              /* save r13 */
+EXCEPTION_PROLOG_0(PACA_EXGEN)
+EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, 0x300)
+       mfspr   r10,SPRN_DAR
+       mfspr   r11,SPRN_DSISR
+       std     r10,PACA_EXGEN+EX_DAR(r13)
+       stw     r11,PACA_EXGEN+EX_DSISR(r13)
+EXCEPTION_PROLOG_2_RELON(data_access_common, EXC_STD)
+EXC_VIRT_END(data_access, 0x4300, 0x80)
+
 TRAMP_KVM_SKIP(PACA_EXGEN, 0x300)
 
 EXC_COMMON_BEGIN(data_access_common)
@@ -575,11 +611,8 @@ EXC_COMMON_BEGIN(data_access_common)
         * Here r13 points to the paca, r9 contains the saved CR,
         * SRR0 and SRR1 are saved in r11 and r12,
         * r9 - r13 are saved in paca->exgen.
+        * EX_DAR and EX_DSISR have saved DAR/DSISR
         */
-       mfspr   r10,SPRN_DAR
-       std     r10,PACA_EXGEN+EX_DAR(r13)
-       mfspr   r10,SPRN_DSISR
-       stw     r10,PACA_EXGEN+EX_DSISR(r13)
        EXCEPTION_PROLOG_COMMON(0x300, PACA_EXGEN)
        RECONCILE_IRQ_STATE(r10, r11)
        ld      r12,_MSR(r1)
@@ -596,18 +629,29 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
 
 
 EXC_REAL_BEGIN(data_access_slb, 0x380, 0x80)
-EXCEPTION_PROLOG(PACA_EXSLB, data_access_slb_common, EXC_STD, KVMTEST_PR, 0x380);
+SET_SCRATCH0(r13)              /* save r13 */
+EXCEPTION_PROLOG_0(PACA_EXSLB)
+       b       tramp_real_data_access_slb
 EXC_REAL_END(data_access_slb, 0x380, 0x80)
 
+TRAMP_REAL_BEGIN(tramp_real_data_access_slb)
+EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x380)
+       mfspr   r10,SPRN_DAR
+       std     r10,PACA_EXSLB+EX_DAR(r13)
+EXCEPTION_PROLOG_2(data_access_slb_common, EXC_STD)
+
 EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x80)
-EXCEPTION_RELON_PROLOG(PACA_EXSLB, data_access_slb_common, EXC_STD, NOTEST, 0x380);
+SET_SCRATCH0(r13)              /* save r13 */
+EXCEPTION_PROLOG_0(PACA_EXSLB)
+EXCEPTION_PROLOG_1(PACA_EXSLB, NOTEST, 0x380)
+       mfspr   r10,SPRN_DAR
+       std     r10,PACA_EXSLB+EX_DAR(r13)
+EXCEPTION_PROLOG_2_RELON(data_access_slb_common, EXC_STD)
 EXC_VIRT_END(data_access_slb, 0x4380, 0x80)
 
 TRAMP_KVM_SKIP(PACA_EXSLB, 0x380)
 
 EXC_COMMON_BEGIN(data_access_slb_common)
-       mfspr   r10,SPRN_DAR
-       std     r10,PACA_EXSLB+EX_DAR(r13)
        EXCEPTION_PROLOG_COMMON(0x380, PACA_EXSLB)
        ld      r4,PACA_EXSLB+EX_DAR(r13)
        std     r4,_DAR(r1)
@@ -703,14 +747,30 @@ TRAMP_KVM_HV(PACA_EXGEN, 0x500)
 EXC_COMMON_ASYNC(hardware_interrupt_common, 0x500, do_IRQ)
 
 
-EXC_REAL(alignment, 0x600, 0x100)
-EXC_VIRT(alignment, 0x4600, 0x100, 0x600)
-TRAMP_KVM(PACA_EXGEN, 0x600)
-EXC_COMMON_BEGIN(alignment_common)
+EXC_REAL_BEGIN(alignment, 0x600, 0x100)
+SET_SCRATCH0(r13)              /* save r13 */
+EXCEPTION_PROLOG_0(PACA_EXGEN)
+EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST_PR, 0x600)
        mfspr   r10,SPRN_DAR
+       mfspr   r11,SPRN_DSISR
        std     r10,PACA_EXGEN+EX_DAR(r13)
-       mfspr   r10,SPRN_DSISR
-       stw     r10,PACA_EXGEN+EX_DSISR(r13)
+       stw     r11,PACA_EXGEN+EX_DSISR(r13)
+EXCEPTION_PROLOG_2(alignment_common, EXC_STD)
+EXC_REAL_END(alignment, 0x600, 0x100)
+
+EXC_VIRT_BEGIN(alignment, 0x4600, 0x100)
+SET_SCRATCH0(r13)              /* save r13 */
+EXCEPTION_PROLOG_0(PACA_EXGEN)
+EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, 0x600)
+       mfspr   r10,SPRN_DAR
+       mfspr   r11,SPRN_DSISR
+       std     r10,PACA_EXGEN+EX_DAR(r13)
+       stw     r11,PACA_EXGEN+EX_DSISR(r13)
+EXCEPTION_PROLOG_2_RELON(alignment_common, EXC_STD)
+EXC_VIRT_END(alignment, 0x4600, 0x100)
+
+TRAMP_KVM(PACA_EXGEN, 0x600)
+EXC_COMMON_BEGIN(alignment_common)
        EXCEPTION_PROLOG_COMMON(0x600, PACA_EXGEN)
        ld      r3,PACA_EXGEN+EX_DAR(r13)
        lwz     r4,PACA_EXGEN+EX_DSISR(r13)
@@ -1629,7 +1689,7 @@ do_hash_page:
        ori     r0,r0,DSISR_BAD_FAULT_64S@l
        and.    r0,r4,r0                /* weird error? */
        bne-    handle_page_fault       /* if not, try to insert a HPTE */
-       CURRENT_THREAD_INFO(r11, r1)
+       ld      r11, PACA_THREAD_INFO(r13)
        lwz     r0,TI_PREEMPT(r11)      /* If we're in an "NMI" */
        andis.  r0,r0,NMI_MASK@h        /* (i.e. an irq when soft-disabled) */
        bne     77f                     /* then don't call hash_page now */
index 05b08db3901dc9cb58c55310a7dd6d230127f3cf..ce6a972f25849ea87774be809da0faa3d84d96f3 100644 (file)
@@ -261,7 +261,7 @@ __secondary_hold_acknowledge:
        tophys(r11,r1);                 /* use tophys(r1) if kernel */ \
        beq     1f;             \
        mfspr   r11,SPRN_SPRG_THREAD;   \
-       lwz     r11,THREAD_INFO-THREAD(r11);    \
+       lwz     r11,TASK_STACK-THREAD(r11);     \
        addi    r11,r11,THREAD_SIZE;    \
        tophys(r11,r11);        \
 1:     subi    r11,r11,INT_FRAME_SIZE  /* alloc exc. frame */
@@ -352,9 +352,8 @@ i##n:                                                               \
  * registers that might have bad values includes all the GPRs
  * and all the BATs.  We indicate that we are in RTAS by putting
  * a non-zero value, the address of the exception frame to use,
- * in SPRG2.  The machine check handler checks SPRG2 and uses its
- * value if it is non-zero.  If we ever needed to free up SPRG2,
- * we could use a field in the thread_info or thread_struct instead.
+ * in thread.rtas_sp.  The machine check handler checks thread.rtas_sp
+ * and uses its value if it is non-zero.
  * (Other exception handlers assume that r1 is a valid kernel stack
  * pointer when we take an exception from supervisor mode.)
  *     -- paulus.
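
The scheme described above reduces to a single per-thread word that doubles as an "in RTAS" flag and as the exception frame pointer to use. A rough C illustration, with the structure cut down to just that field (the field name mirrors the RTAS_SP offset used in the assembly; everything else is a stand-in, not the real thread_struct layout):

/* Stand-in for the relevant slice of powerpc's thread_struct. */
struct thread_sketch {
        unsigned long rtas_sp;  /* exception frame to use; 0 => not in RTAS */
};

/* The machine check path only trusts the saved frame when it is non-zero. */
static unsigned long rtas_exception_frame(const struct thread_sketch *t)
{
        return t->rtas_sp;
}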
@@ -365,16 +364,15 @@ i##n:                                                             \
        mtspr   SPRN_SPRG_SCRATCH1,r11
        mfcr    r10
 #ifdef CONFIG_PPC_CHRP
-       mfspr   r11,SPRN_SPRG_RTAS
-       cmpwi   0,r11,0
-       bne     7f
+       mfspr   r11, SPRN_SPRG_THREAD
+       lwz     r11, RTAS_SP(r11)
+       cmpwi   cr1, r11, 0
+       bne     cr1, 7f
 #endif /* CONFIG_PPC_CHRP */
        EXCEPTION_PROLOG_1
 7:     EXCEPTION_PROLOG_2
        addi    r3,r1,STACK_FRAME_OVERHEAD
 #ifdef CONFIG_PPC_CHRP
-       mfspr   r4,SPRN_SPRG_RTAS
-       cmpwi   cr1,r4,0
        bne     cr1,1f
 #endif
        EXC_XFER_STD(0x200, machine_check_exception)
@@ -500,18 +498,22 @@ InstructionTLBMiss:
  */
        /* Get PTE (linux-style) and check access */
        mfspr   r3,SPRN_IMISS
+#if defined(CONFIG_MODULES) || defined(CONFIG_DEBUG_PAGEALLOC)
        lis     r1,PAGE_OFFSET@h                /* check if kernel address */
        cmplw   0,r1,r3
-       mfspr   r2,SPRN_SPRG_THREAD
-       li      r1,_PAGE_USER|_PAGE_PRESENT|_PAGE_EXEC /* low addresses tested as user */
-       lwz     r2,PGDIR(r2)
+#endif
+       mfspr   r2, SPRN_SPRG_PGDIR
+#ifdef CONFIG_SWAP
+       li      r1,_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_EXEC
+#else
+       li      r1,_PAGE_PRESENT | _PAGE_EXEC
+#endif
+#if defined(CONFIG_MODULES) || defined(CONFIG_DEBUG_PAGEALLOC)
        bge-    112f
-       mfspr   r2,SPRN_SRR1            /* and MSR_PR bit from SRR1 */
-       rlwimi  r1,r2,32-12,29,29       /* shift MSR_PR to _PAGE_USER posn */
-       lis     r2,swapper_pg_dir@ha    /* if kernel address, use */
-       addi    r2,r2,swapper_pg_dir@l  /* kernel page table */
-112:   tophys(r2,r2)
-       rlwimi  r2,r3,12,20,29          /* insert top 10 bits of address */
+       lis     r2, (swapper_pg_dir - PAGE_OFFSET)@ha   /* if kernel address, use */
+       addi    r2, r2, (swapper_pg_dir - PAGE_OFFSET)@l        /* kernel page table */
+#endif
+112:   rlwimi  r2,r3,12,20,29          /* insert top 10 bits of address */
        lwz     r2,0(r2)                /* get pmd entry */
        rlwinm. r2,r2,0,0,19            /* extract address of pte page */
        beq-    InstructionAddressInvalid       /* return if no mapping */
@@ -519,20 +521,10 @@ InstructionTLBMiss:
        lwz     r0,0(r2)                /* get linux-style pte */
        andc.   r1,r1,r0                /* check access & ~permission */
        bne-    InstructionAddressInvalid /* return if access not permitted */
-       ori     r0,r0,_PAGE_ACCESSED    /* set _PAGE_ACCESSED in pte */
-       /*
-        * NOTE! We are assuming this is not an SMP system, otherwise
-        * we would need to update the pte atomically with lwarx/stwcx.
-        */
-       stw     r0,0(r2)                /* update PTE (accessed bit) */
        /* Convert linux-style PTE to low word of PPC-style PTE */
-       rlwinm  r1,r0,32-10,31,31       /* _PAGE_RW -> PP lsb */
-       rlwinm  r2,r0,32-7,31,31        /* _PAGE_DIRTY -> PP lsb */
-       and     r1,r1,r2                /* writable if _RW and _DIRTY */
        rlwimi  r0,r0,32-1,30,30        /* _PAGE_USER -> PP msb */
-       rlwimi  r0,r0,32-1,31,31        /* _PAGE_USER -> PP lsb */
-       ori     r1,r1,0xe04             /* clear out reserved bits */
-       andc    r1,r0,r1                /* PP = user? (rw&dirty? 2: 3): 0 */
+       ori     r1, r1, 0xe05           /* clear out reserved bits */
+       andc    r1, r0, r1              /* PP = user? 2 : 0 */
 BEGIN_FTR_SECTION
        rlwinm  r1,r1,0,~_PAGE_COHERENT /* clear M (coherence not required) */
 END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT)
@@ -576,16 +568,16 @@ DataLoadTLBMiss:
        mfspr   r3,SPRN_DMISS
        lis     r1,PAGE_OFFSET@h                /* check if kernel address */
        cmplw   0,r1,r3
-       mfspr   r2,SPRN_SPRG_THREAD
-       li      r1,_PAGE_USER|_PAGE_PRESENT /* low addresses tested as user */
-       lwz     r2,PGDIR(r2)
+       mfspr   r2, SPRN_SPRG_PGDIR
+#ifdef CONFIG_SWAP
+       li      r1, _PAGE_PRESENT | _PAGE_ACCESSED
+#else
+       li      r1, _PAGE_PRESENT
+#endif
        bge-    112f
-       mfspr   r2,SPRN_SRR1            /* and MSR_PR bit from SRR1 */
-       rlwimi  r1,r2,32-12,29,29       /* shift MSR_PR to _PAGE_USER posn */
-       lis     r2,swapper_pg_dir@ha    /* if kernel address, use */
-       addi    r2,r2,swapper_pg_dir@l  /* kernel page table */
-112:   tophys(r2,r2)
-       rlwimi  r2,r3,12,20,29          /* insert top 10 bits of address */
+       lis     r2, (swapper_pg_dir - PAGE_OFFSET)@ha   /* if kernel address, use */
+       addi    r2, r2, (swapper_pg_dir - PAGE_OFFSET)@l        /* kernel page table */
+112:   rlwimi  r2,r3,12,20,29          /* insert top 10 bits of address */
        lwz     r2,0(r2)                /* get pmd entry */
        rlwinm. r2,r2,0,0,19            /* extract address of pte page */
        beq-    DataAddressInvalid      /* return if no mapping */
@@ -593,20 +585,16 @@ DataLoadTLBMiss:
        lwz     r0,0(r2)                /* get linux-style pte */
        andc.   r1,r1,r0                /* check access & ~permission */
        bne-    DataAddressInvalid      /* return if access not permitted */
-       ori     r0,r0,_PAGE_ACCESSED    /* set _PAGE_ACCESSED in pte */
        /*
         * NOTE! We are assuming this is not an SMP system, otherwise
         * we would need to update the pte atomically with lwarx/stwcx.
         */
-       stw     r0,0(r2)                /* update PTE (accessed bit) */
        /* Convert linux-style PTE to low word of PPC-style PTE */
        rlwinm  r1,r0,32-10,31,31       /* _PAGE_RW -> PP lsb */
-       rlwinm  r2,r0,32-7,31,31        /* _PAGE_DIRTY -> PP lsb */
-       and     r1,r1,r2                /* writable if _RW and _DIRTY */
        rlwimi  r0,r0,32-1,30,30        /* _PAGE_USER -> PP msb */
        rlwimi  r0,r0,32-1,31,31        /* _PAGE_USER -> PP lsb */
        ori     r1,r1,0xe04             /* clear out reserved bits */
-       andc    r1,r0,r1                /* PP = user? (rw&dirty? 2: 3): 0 */
+       andc    r1,r0,r1                /* PP = user? rw? 2: 3: 0 */
 BEGIN_FTR_SECTION
        rlwinm  r1,r1,0,~_PAGE_COHERENT /* clear M (coherence not required) */
 END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT)
@@ -660,16 +648,16 @@ DataStoreTLBMiss:
        mfspr   r3,SPRN_DMISS
        lis     r1,PAGE_OFFSET@h                /* check if kernel address */
        cmplw   0,r1,r3
-       mfspr   r2,SPRN_SPRG_THREAD
-       li      r1,_PAGE_RW|_PAGE_USER|_PAGE_PRESENT /* access flags */
-       lwz     r2,PGDIR(r2)
+       mfspr   r2, SPRN_SPRG_PGDIR
+#ifdef CONFIG_SWAP
+       li      r1, _PAGE_RW | _PAGE_PRESENT | _PAGE_ACCESSED
+#else
+       li      r1, _PAGE_RW | _PAGE_PRESENT
+#endif
        bge-    112f
-       mfspr   r2,SPRN_SRR1            /* and MSR_PR bit from SRR1 */
-       rlwimi  r1,r2,32-12,29,29       /* shift MSR_PR to _PAGE_USER posn */
-       lis     r2,swapper_pg_dir@ha    /* if kernel address, use */
-       addi    r2,r2,swapper_pg_dir@l  /* kernel page table */
-112:   tophys(r2,r2)
-       rlwimi  r2,r3,12,20,29          /* insert top 10 bits of address */
+       lis     r2, (swapper_pg_dir - PAGE_OFFSET)@ha   /* if kernel address, use */
+       addi    r2, r2, (swapper_pg_dir - PAGE_OFFSET)@l        /* kernel page table */
+112:   rlwimi  r2,r3,12,20,29          /* insert top 10 bits of address */
        lwz     r2,0(r2)                /* get pmd entry */
        rlwinm. r2,r2,0,0,19            /* extract address of pte page */
        beq-    DataAddressInvalid      /* return if no mapping */
@@ -677,12 +665,10 @@ DataStoreTLBMiss:
        lwz     r0,0(r2)                /* get linux-style pte */
        andc.   r1,r1,r0                /* check access & ~permission */
        bne-    DataAddressInvalid      /* return if access not permitted */
-       ori     r0,r0,_PAGE_ACCESSED|_PAGE_DIRTY
        /*
         * NOTE! We are assuming this is not an SMP system, otherwise
         * we would need to update the pte atomically with lwarx/stwcx.
         */
-       stw     r0,0(r2)                /* update PTE (accessed/dirty bits) */
        /* Convert linux-style PTE to low word of PPC-style PTE */
        rlwimi  r0,r0,32-1,30,30        /* _PAGE_USER -> PP msb */
        li      r1,0xe05                /* clear out reserved bits & PP lsb */
@@ -845,12 +831,12 @@ __secondary_start:
        bl      init_idle_6xx
 #endif /* CONFIG_PPC_BOOK3S_32 */
 
-       /* get current_thread_info and current */
-       lis     r1,secondary_ti@ha
-       tophys(r1,r1)
-       lwz     r1,secondary_ti@l(r1)
-       tophys(r2,r1)
-       lwz     r2,TI_TASK(r2)
+       /* get current's stack and current */
+       lis     r2,secondary_current@ha
+       tophys(r2,r2)
+       lwz     r2,secondary_current@l(r2)
+       tophys(r1,r2)
+       lwz     r1,TASK_STACK(r1)
 
        /* stack */
        addi    r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD
@@ -865,8 +851,10 @@ __secondary_start:
        tophys(r4,r2)
        addi    r4,r4,THREAD    /* phys address of our thread_struct */
        mtspr   SPRN_SPRG_THREAD,r4
+#ifdef CONFIG_PPC_RTAS
        li      r3,0
-       mtspr   SPRN_SPRG_RTAS,r3       /* 0 => not in RTAS */
+       stw     r3, RTAS_SP(r4)         /* 0 => not in RTAS */
+#endif
 
        /* enable MMU and jump to start_secondary */
        li      r4,MSR_KERNEL
@@ -950,8 +938,10 @@ start_here:
        tophys(r4,r2)
        addi    r4,r4,THREAD    /* init task's THREAD */
        mtspr   SPRN_SPRG_THREAD,r4
+#ifdef CONFIG_PPC_RTAS
        li      r3,0
-       mtspr   SPRN_SPRG_RTAS,r3       /* 0 => not in RTAS */
+       stw     r3, RTAS_SP(r4)         /* 0 => not in RTAS */
+#endif
 
        /* stack */
        lis     r1,init_thread_union@ha
@@ -1022,15 +1012,16 @@ _ENTRY(switch_mmu_context)
        li      r0,NUM_USER_SEGMENTS
        mtctr   r0
 
+       lwz     r4, MM_PGD(r4)
 #ifdef CONFIG_BDI_SWITCH
        /* Context switch the PTE pointer for the Abatron BDI2000.
         * The PGDIR is passed as second argument.
         */
-       lwz     r4,MM_PGD(r4)
-       lis     r5, KERNELBASE@h
-       lwz     r5, 0xf0(r5)
-       stw     r4, 0x4(r5)
+       lis     r5, abatron_pteptrs@ha
+       stw     r4, abatron_pteptrs@l + 0x4(r5)
 #endif
+       tophys(r4, r4)
+       mtspr   SPRN_SPRG_PGDIR, r4
        li      r4,0
        isync
 3:
@@ -1105,6 +1096,41 @@ BEGIN_MMU_FTR_SECTION
 END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_HIGH_BATS)
        blr
 
+_ENTRY(update_bats)
+       lis     r4, 1f@h
+       ori     r4, r4, 1f@l
+       tophys(r4, r4)
+       mfmsr   r6
+       mflr    r7
+       li      r3, MSR_KERNEL & ~(MSR_IR | MSR_DR)
+       rlwinm  r0, r6, 0, ~MSR_RI
+       rlwinm  r0, r0, 0, ~MSR_EE
+       mtmsr   r0
+       mtspr   SPRN_SRR0, r4
+       mtspr   SPRN_SRR1, r3
+       SYNC
+       RFI
+1:     bl      clear_bats
+       lis     r3, BATS@ha
+       addi    r3, r3, BATS@l
+       tophys(r3, r3)
+       LOAD_BAT(0, r3, r4, r5)
+       LOAD_BAT(1, r3, r4, r5)
+       LOAD_BAT(2, r3, r4, r5)
+       LOAD_BAT(3, r3, r4, r5)
+BEGIN_MMU_FTR_SECTION
+       LOAD_BAT(4, r3, r4, r5)
+       LOAD_BAT(5, r3, r4, r5)
+       LOAD_BAT(6, r3, r4, r5)
+       LOAD_BAT(7, r3, r4, r5)
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_HIGH_BATS)
+       li      r3, MSR_KERNEL & ~(MSR_IR | MSR_DR | MSR_RI)
+       mtmsr   r3
+       mtspr   SPRN_SRR0, r7
+       mtspr   SPRN_SRR1, r6
+       SYNC
+       RFI
+
 flush_tlbs:
        lis     r10, 0x40
 1:     addic.  r10, r10, -0x1000
index b19d784105118dff99297edd3b90d8e3f3120ef2..a9c934f2319b5418dbff6b913a4e4e1635ca5a92 100644 (file)
@@ -115,7 +115,7 @@ _ENTRY(saved_ksp_limit)
        andi.   r11,r11,MSR_PR;                                              \
        beq     1f;                                                          \
        mfspr   r1,SPRN_SPRG_THREAD;    /* if from user, start at top of   */\
-       lwz     r1,THREAD_INFO-THREAD(r1); /* this thread's kernel stack   */\
+       lwz     r1,TASK_STACK-THREAD(r1); /* this thread's kernel stack   */\
        addi    r1,r1,THREAD_SIZE;                                           \
 1:     subi    r1,r1,INT_FRAME_SIZE;   /* Allocate an exception frame     */\
        tophys(r11,r1);                                                      \
@@ -158,7 +158,7 @@ _ENTRY(saved_ksp_limit)
        beq     1f;                                                          \
        /* COMING FROM USER MODE */                                          \
        mfspr   r11,SPRN_SPRG_THREAD;   /* if from user, start at top of   */\
-       lwz     r11,THREAD_INFO-THREAD(r11); /* this thread's kernel stack */\
+       lwz     r11,TASK_STACK-THREAD(r11); /* this thread's kernel stack */\
 1:     addi    r11,r11,THREAD_SIZE-INT_FRAME_SIZE; /* Alloc an excpt frm  */\
        tophys(r11,r11);                                                     \
        stw     r10,_CCR(r11);          /* save various registers          */\
@@ -953,9 +953,8 @@ _GLOBAL(set_context)
        /* Context switch the PTE pointer for the Abatron BDI2000.
         * The PGDIR is the second parameter.
         */
-       lis     r5, KERNELBASE@h
-       lwz     r5, 0xf0(r5)
-       stw     r4, 0x4(r5)
+       lis     r5, abatron_pteptrs@ha
+       stw     r4, abatron_pteptrs@l + 0x4(r5)
 #endif
        sync
        mtspr   SPRN_PID,r3
index bf23c19c92d6afecc289317b4dcfcb9b08d752a1..37117ab11584c7a06a9c2ccc86b2fd3a70f278c4 100644 (file)
@@ -1019,10 +1019,10 @@ _GLOBAL(start_secondary_47x)
 
        /* Now we can get our task struct and real stack pointer */
 
-       /* Get current_thread_info and current */
-       lis     r1,secondary_ti@ha
-       lwz     r1,secondary_ti@l(r1)
-       lwz     r2,TI_TASK(r1)
+       /* Get current's stack and current */
+       lis     r2,secondary_current@ha
+       lwz     r2,secondary_current@l(r2)
+       lwz     r1,TASK_STACK(r2)
 
        /* Current stack pointer */
        addi    r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD
index 4898e9491a1cd3ab5b90b2e50c75758b7ebb9d00..3fad8d499767ceece336fae6e80313c736cefd35 100644 (file)
@@ -801,21 +801,19 @@ __secondary_start:
        /* Set thread priority to MEDIUM */
        HMT_MEDIUM
 
-       /* Initialize the kernel stack */
-       LOAD_REG_ADDR(r3, current_set)
-       sldi    r28,r24,3               /* get current_set[cpu#]         */
-       ldx     r14,r3,r28
-       addi    r14,r14,THREAD_SIZE-STACK_FRAME_OVERHEAD
-       std     r14,PACAKSAVE(r13)
-
-       /* Do early setup for that CPU (SLB and hash table pointer) */
+       /*
+        * Do early setup for this CPU, in particular initialising the MMU so we
+        * can turn it on below. This is a call to C, which is OK, we're still
+        * running on the emergency stack.
+        */
        bl      early_setup_secondary
 
        /*
-        * setup the new stack pointer, but *don't* use this until
-        * translation is on.
+        * The primary has initialized our kernel stack for us in the paca, grab
+        * it and put it in r1. We must *not* use it until we turn on the MMU
+        * below, because it may not be inside the RMO.
         */
-       mr      r1, r14
+       ld      r1, PACAKSAVE(r13)
 
        /* Clear backchain so we get nice backtraces */
        li      r7,0
index 20cc816b3508d3f689a11ebefebaf9fc2a711ae2..03c73b4c6435974278f0f3a07ec20981704403ce 100644 (file)
@@ -142,7 +142,7 @@ instruction_counter:
        tophys(r11,r1);                 /* use tophys(r1) if kernel */ \
        beq     1f;             \
        mfspr   r11,SPRN_SPRG_THREAD;   \
-       lwz     r11,THREAD_INFO-THREAD(r11);    \
+       lwz     r11,TASK_STACK-THREAD(r11);     \
        addi    r11,r11,THREAD_SIZE;    \
        tophys(r11,r11);        \
 1:     subi    r11,r11,INT_FRAME_SIZE  /* alloc exc. frame */
@@ -292,6 +292,17 @@ SystemCall:
  */
        EXCEPTION(0x1000, SoftEmu, program_check_exception, EXC_XFER_STD)
 
+/* Called from DataStoreTLBMiss when perf TLB miss events are activated */
+#ifdef CONFIG_PERF_EVENTS
+       patch_site      0f, patch__dtlbmiss_perf
+0:     lwz     r10, (dtlb_miss_counter - PAGE_OFFSET)@l(0)
+       addi    r10, r10, 1
+       stw     r10, (dtlb_miss_counter - PAGE_OFFSET)@l(0)
+       mfspr   r10, SPRN_SPRG_SCRATCH0
+       mfspr   r11, SPRN_SPRG_SCRATCH1
+       rfi
+#endif
+
        . = 0x1100
 /*
  * For the MPC8xx, this is a software tablewalk to load the instruction
@@ -337,8 +348,8 @@ InstructionTLBMiss:
        rlwinm  r10, r10, 16, 0xfff8
        cmpli   cr0, r10, PAGE_OFFSET@h
 #ifndef CONFIG_PIN_TLB_TEXT
-       /* It is assumed that kernel code fits into the first 8M page */
-0:     cmpli   cr7, r10, (PAGE_OFFSET + 0x0800000)@h
+       /* It is assumed that kernel code fits into the first 32M */
+0:     cmpli   cr7, r10, (PAGE_OFFSET + 0x2000000)@h
        patch_site      0b, patch__itlbmiss_linmem_top
 #endif
 #endif
@@ -405,10 +416,20 @@ InstructionTLBMiss:
 #ifndef CONFIG_PIN_TLB_TEXT
 ITLBMissLinear:
        mtcr    r11
+#if defined(CONFIG_STRICT_KERNEL_RWX) && CONFIG_ETEXT_SHIFT < 23
+       patch_site      0f, patch__itlbmiss_linmem_top8
+
+       mfspr   r10, SPRN_SRR0
+0:     subis   r11, r10, (PAGE_OFFSET - 0x80000000)@ha
+       rlwinm  r11, r11, 4, MI_PS8MEG ^ MI_PS512K
+       ori     r11, r11, MI_PS512K | MI_SVALID
+       rlwinm  r10, r10, 0, 0x0ff80000 /* 8xx supports max 256Mb RAM */
+#else
        /* Set 8M byte page and mark it valid */
        li      r11, MI_PS8MEG | MI_SVALID
-       mtspr   SPRN_MI_TWC, r11
        rlwinm  r10, r10, 20, 0x0f800000        /* 8xx supports max 256Mb RAM */
+#endif
+       mtspr   SPRN_MI_TWC, r11
        ori     r10, r10, 0xf0 | MI_SPS16K | _PAGE_SH | _PAGE_DIRTY | \
                          _PAGE_PRESENT
        mtspr   SPRN_MI_RPN, r10        /* Update TLB entry */
@@ -434,7 +455,7 @@ DataStoreTLBMiss:
 #ifndef CONFIG_PIN_TLB_IMMR
        cmpli   cr6, r10, VIRT_IMMR_BASE@h
 #endif
-0:     cmpli   cr7, r10, (PAGE_OFFSET + 0x1800000)@h
+0:     cmpli   cr7, r10, (PAGE_OFFSET + 0x2000000)@h
        patch_site      0b, patch__dtlbmiss_linmem_top
 
        mfspr   r10, SPRN_M_TWB /* Get level 1 table */
@@ -494,16 +515,6 @@ DataStoreTLBMiss:
        rfi
        patch_site      0b, patch__dtlbmiss_exit_1
 
-#ifdef CONFIG_PERF_EVENTS
-       patch_site      0f, patch__dtlbmiss_perf
-0:     lwz     r10, (dtlb_miss_counter - PAGE_OFFSET)@l(0)
-       addi    r10, r10, 1
-       stw     r10, (dtlb_miss_counter - PAGE_OFFSET)@l(0)
-       mfspr   r10, SPRN_SPRG_SCRATCH0
-       mfspr   r11, SPRN_SPRG_SCRATCH1
-       rfi
-#endif
-
 DTLBMissIMMR:
        mtcr    r11
        /* Set 512k byte guarded page and mark it valid */
@@ -525,10 +536,29 @@ DTLBMissIMMR:
 
 DTLBMissLinear:
        mtcr    r11
+       rlwinm  r10, r10, 20, 0x0f800000        /* 8xx supports max 256Mb RAM */
+#if defined(CONFIG_STRICT_KERNEL_RWX) && CONFIG_DATA_SHIFT < 23
+       patch_site      0f, patch__dtlbmiss_romem_top8
+
+0:     subis   r11, r10, (PAGE_OFFSET - 0x80000000)@ha
+       rlwinm  r11, r11, 0, 0xff800000
+       neg     r10, r11
+       or      r11, r11, r10
+       rlwinm  r11, r11, 4, MI_PS8MEG ^ MI_PS512K
+       ori     r11, r11, MI_PS512K | MI_SVALID
+       mfspr   r10, SPRN_MD_EPN
+       rlwinm  r10, r10, 0, 0x0ff80000 /* 8xx supports max 256Mb RAM */
+#else
        /* Set 8M byte page and mark it valid */
        li      r11, MD_PS8MEG | MD_SVALID
+#endif
        mtspr   SPRN_MD_TWC, r11
-       rlwinm  r10, r10, 20, 0x0f800000        /* 8xx supports max 256Mb RAM */
+#ifdef CONFIG_STRICT_KERNEL_RWX
+       patch_site      0f, patch__dtlbmiss_romem_top
+
+0:     subis   r11, r10, 0
+       rlwimi  r10, r11, 11, _PAGE_RO
+#endif
        ori     r10, r10, 0xf0 | MD_SPS16K | _PAGE_SH | _PAGE_DIRTY | \
                          _PAGE_PRESENT
        mtspr   SPRN_MD_RPN, r10        /* Update TLB entry */
@@ -551,11 +581,11 @@ InstructionTLBError:
        mr      r4,r12
        andis.  r5,r9,DSISR_SRR1_MATCH_32S@h /* Filter relevant SRR1 bits */
        andis.  r10,r9,SRR1_ISI_NOPT@h
-       beq+    1f
+       beq+    .Litlbie
        tlbie   r4
-itlbie:
        /* 0x400 is InstructionAccess exception, needed by bad_page_fault() */
-1:     EXC_XFER_LITE(0x400, handle_page_fault)
+.Litlbie:
+       EXC_XFER_LITE(0x400, handle_page_fault)
 
 /* This is the data TLB error on the MPC8xx.  This could be due to
  * many reasons, including a dirty update to a pte.  We bail out to
@@ -577,10 +607,10 @@ DARFixed:/* Return from dcbx instruction bug workaround */
        stw     r5,_DSISR(r11)
        mfspr   r4,SPRN_DAR
        andis.  r10,r5,DSISR_NOHPTE@h
-       beq+    1f
+       beq+    .Ldtlbie
        tlbie   r4
-dtlbie:
-1:     li      r10,RPN_PATTERN
+.Ldtlbie:
+       li      r10,RPN_PATTERN
        mtspr   SPRN_DAR,r10    /* Tag DAR, to be used in DTLB Error */
        /* 0x300 is DataAccess exception, needed by bad_page_fault() */
        EXC_XFER_LITE(0x300, handle_page_fault)
@@ -603,8 +633,8 @@ DataBreakpoint:
        mtspr   SPRN_SPRG_SCRATCH1, r11
        mfcr    r10
        mfspr   r11, SPRN_SRR0
-       cmplwi  cr0, r11, (dtlbie - PAGE_OFFSET)@l
-       cmplwi  cr7, r11, (itlbie - PAGE_OFFSET)@l
+       cmplwi  cr0, r11, (.Ldtlbie - PAGE_OFFSET)@l
+       cmplwi  cr7, r11, (.Litlbie - PAGE_OFFSET)@l
        beq-    cr0, 11f
        beq-    cr7, 11f
        EXCEPTION_PROLOG_1
@@ -886,28 +916,11 @@ initial_mmu:
        mtspr   SPRN_MD_CTR, r10        /* remove PINNED DTLB entries */
 
        tlbia                   /* Invalidate all TLB entries */
-#ifdef CONFIG_PIN_TLB_TEXT
-       lis     r8, MI_RSV4I@h
-       ori     r8, r8, 0x1c00
-
-       mtspr   SPRN_MI_CTR, r8 /* Set instruction MMU control */
-#endif
-
 #ifdef CONFIG_PIN_TLB_DATA
        oris    r10, r10, MD_RSV4I@h
        mtspr   SPRN_MD_CTR, r10        /* Set data TLB control */
 #endif
 
-       /* Now map the lower 8 Meg into the ITLB. */
-       lis     r8, KERNELBASE@h        /* Create vaddr for TLB */
-       ori     r8, r8, MI_EVALID       /* Mark it valid */
-       mtspr   SPRN_MI_EPN, r8
-       li      r8, MI_PS8MEG /* Set 8M byte page */
-       ori     r8, r8, MI_SVALID       /* Make it valid */
-       mtspr   SPRN_MI_TWC, r8
-       li      r8, MI_BOOTINIT         /* Create RPN for address 0 */
-       mtspr   SPRN_MI_RPN, r8         /* Store TLB entry */
-
        lis     r8, MI_APG_INIT@h       /* Set protection modes */
        ori     r8, r8, MI_APG_INIT@l
        mtspr   SPRN_MI_AP, r8
@@ -937,6 +950,34 @@ initial_mmu:
        mtspr   SPRN_MD_RPN, r8
 #endif
 
+       /* Now map the lower RAM (up to 32 Mbytes) into the ITLB. */
+#ifdef CONFIG_PIN_TLB_TEXT
+       lis     r8, MI_RSV4I@h
+       ori     r8, r8, 0x1c00
+#endif
+       li      r9, 4                           /* up to 4 pages of 8M */
+       mtctr   r9
+       lis     r9, KERNELBASE@h                /* Create vaddr for TLB */
+       li      r10, MI_PS8MEG | MI_SVALID      /* Set 8M byte page */
+       li      r11, MI_BOOTINIT                /* Create RPN for address 0 */
+       lis     r12, _einittext@h
+       ori     r12, r12, _einittext@l
+1:
+#ifdef CONFIG_PIN_TLB_TEXT
+       mtspr   SPRN_MI_CTR, r8 /* Set instruction MMU control */
+       addi    r8, r8, 0x100
+#endif
+
+       ori     r0, r9, MI_EVALID               /* Mark it valid */
+       mtspr   SPRN_MI_EPN, r0
+       mtspr   SPRN_MI_TWC, r10
+       mtspr   SPRN_MI_RPN, r11                /* Store TLB entry */
+       addis   r9, r9, 0x80
+       addis   r11, r11, 0x80
+
+       cmpl    cr0, r9, r12
+       bdnzf   gt, 1b
+
        /* Since the cache is enabled according to the information we
         * just loaded into the TLB, invalidate and enable the caches here.
         * We should probably check/set other modes....later.
@@ -989,5 +1030,6 @@ swapper_pg_dir:
 /* Room for two PTE table pointers, usually the kernel and current user
  * pointer to their respective root page table (pgdir).
  */
+       .globl  abatron_pteptrs
 abatron_pteptrs:
        .space  8
index 306e26c073a043b91d86f42c25075a2771760bf0..1b22a8dea399687b6692b94186d377212e544aea 100644 (file)
@@ -55,7 +55,7 @@ END_BTB_FLUSH_SECTION
        beq     1f;                                                          \
        BOOKE_CLEAR_BTB(r11)                                            \
        /* if from user, start at top of this thread's kernel stack */       \
-       lwz     r11, THREAD_INFO-THREAD(r10);                                \
+       lwz     r11, TASK_STACK - THREAD(r10);                               \
        ALLOC_STACK_FRAME(r11, THREAD_SIZE);                                 \
 1 :    subi    r11, r11, INT_FRAME_SIZE; /* Allocate exception frame */     \
        stw     r13, _CCR(r11);         /* save various registers */         \
@@ -142,7 +142,7 @@ END_BTB_FLUSH_SECTION
        BOOKE_CLEAR_BTB(r10)                                            \
        andi.   r11,r11,MSR_PR;                                              \
        mfspr   r11,SPRN_SPRG_THREAD;   /* if from user, start at top of   */\
-       lwz     r11,THREAD_INFO-THREAD(r11); /* this thread's kernel stack */\
+       lwz     r11, TASK_STACK - THREAD(r11); /* this thread's kernel stack */\
        addi    r11,r11,EXC_LVL_FRAME_OVERHEAD; /* allocate stack frame    */\
        beq     1f;                                                          \
        /* COMING FROM USER MODE */                                          \
@@ -155,13 +155,7 @@ END_BTB_FLUSH_SECTION
        stw     r10,GPR11(r11);                                              \
        b       2f;                                                          \
        /* COMING FROM PRIV MODE */                                          \
-1:     lwz     r9,TI_FLAGS-EXC_LVL_FRAME_OVERHEAD(r11);                     \
-       lwz     r10,TI_PREEMPT-EXC_LVL_FRAME_OVERHEAD(r11);                  \
-       stw     r9,TI_FLAGS-EXC_LVL_FRAME_OVERHEAD(r8);                      \
-       stw     r10,TI_PREEMPT-EXC_LVL_FRAME_OVERHEAD(r8);                   \
-       lwz     r9,TI_TASK-EXC_LVL_FRAME_OVERHEAD(r11);                      \
-       stw     r9,TI_TASK-EXC_LVL_FRAME_OVERHEAD(r8);                       \
-       mr      r11,r8;                                                      \
+1:     mr      r11, r8;                                                             \
 2:     mfspr   r8,SPRN_SPRG_RSCRATCH_##exc_level;                           \
        stw     r12,GPR12(r11);         /* save various registers          */\
        mflr    r10;                                                         \
index 2386ce2a9c6e4604ecc5a8f245858d3f0c547b6b..1881127682e995f5c6c94eb3d90d00e2559f1582 100644 (file)
@@ -243,8 +243,9 @@ set_ivor:
        li      r0,0
        stwu    r0,THREAD_SIZE-STACK_FRAME_OVERHEAD(r1)
 
-       CURRENT_THREAD_INFO(r22, r1)
-       stw     r24, TI_CPU(r22)
+#ifdef CONFIG_SMP
+       stw     r24, TASK_CPU(r2)
+#endif
 
        bl      early_init
 
@@ -717,8 +718,7 @@ finish_tlb_load:
 
        /* Get the next_tlbcam_idx percpu var */
 #ifdef CONFIG_SMP
-       lwz     r12, THREAD_INFO-THREAD(r12)
-       lwz     r15, TI_CPU(r12)
+       lwz     r15, TASK_CPU-THREAD(r12)
        lis     r14, __per_cpu_offset@h
        ori     r14, r14, __per_cpu_offset@l
        rlwinm  r15, r15, 2, 0, 29
@@ -1089,10 +1089,10 @@ __secondary_start:
        mr      r4,r24          /* Why? */
        bl      call_setup_cpu
 
-       /* get current_thread_info and current */
-       lis     r1,secondary_ti@ha
-       lwz     r1,secondary_ti@l(r1)
-       lwz     r2,TI_TASK(r1)
+       /* get current's stack and current */
+       lis     r2,secondary_current@ha
+       lwz     r2,secondary_current@l(r2)
+       lwz     r1,TASK_STACK(r2)
 
        /* stack */
        addi    r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD
index ff026c9d3cab42c3812d33fdc7d23cfbc5d1406b..c5e7f5bb2e662bbf431294b537e65e85f36ac45a 100644 (file)
@@ -136,10 +136,9 @@ BEGIN_FTR_SECTION
        DSSALL
        sync
 END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
-       CURRENT_THREAD_INFO(r9, r1)
-       lwz     r8,TI_LOCAL_FLAGS(r9)   /* set napping bit */
+       lwz     r8,TI_LOCAL_FLAGS(r2)   /* set napping bit */
        ori     r8,r8,_TLF_NAPPING      /* so when we take an exception */
-       stw     r8,TI_LOCAL_FLAGS(r9)   /* it will return to our caller */
+       stw     r8,TI_LOCAL_FLAGS(r2)   /* it will return to our caller */
        mfmsr   r7
        ori     r7,r7,MSR_EE
        oris    r7,r7,MSR_POW@h
@@ -159,8 +158,7 @@ _GLOBAL(power_save_ppc32_restore)
        stw     r9,_NIP(r11)            /* make it do a blr */
 
 #ifdef CONFIG_SMP
-       CURRENT_THREAD_INFO(r12, r11)
-       lwz     r11,TI_CPU(r12)         /* get cpu number * 4 */
+       lwz     r11,TASK_CPU(r2)        /* get cpu number * 4 */
        slwi    r11,r11,2
 #else
        li      r11,0
index 4e0d94d02030afb2a59da4898a6d90da9e54c457..31e732c378ad292cb717d94910a7225867b1fb57 100644 (file)
@@ -63,7 +63,7 @@ _GLOBAL(\name)
 1:     /* Let's set the _TLF_NAPPING flag so interrupts make us return
         * to the right spot
        */
-       CURRENT_THREAD_INFO(r11, r1)
+       ld      r11, PACACURRENT(r13)
        ld      r10,TI_LOCAL_FLAGS(r11)
        ori     r10,r10,_TLF_NAPPING
        std     r10,TI_LOCAL_FLAGS(r11)
index 583e55ac7d26319ab38205d801211625a5809dbd..69dfcd2ca0118d84e68a5218322b6575ea21a203 100644 (file)
        .text
 
 _GLOBAL(e500_idle)
-       CURRENT_THREAD_INFO(r3, r1)
-       lwz     r4,TI_LOCAL_FLAGS(r3)   /* set napping bit */
+       lwz     r4,TI_LOCAL_FLAGS(r2)   /* set napping bit */
        ori     r4,r4,_TLF_NAPPING      /* so when we take an exception */
-       stw     r4,TI_LOCAL_FLAGS(r3)   /* it will return to our caller */
+       stw     r4,TI_LOCAL_FLAGS(r2)   /* it will return to our caller */
 
 #ifdef CONFIG_PPC_E500MC
        wrteei  1
@@ -88,8 +87,7 @@ _GLOBAL(power_save_ppc32_restore)
        stw     r9,_NIP(r11)            /* make it do a blr */
 
 #ifdef CONFIG_SMP
-       CURRENT_THREAD_INFO(r12, r1)
-       lwz     r11,TI_CPU(r12)         /* get cpu number * 4 */
+       lwz     r11,TASK_CPU(r2)                /* get cpu number * 4 */
        slwi    r11,r11,2
 #else
        li      r11,0
index a09b3c7ca176e1d52b3f0c8c538cee8be3abcf29..a2fdb0a34b750d419f61af6e71e1560d6a659822 100644 (file)
@@ -68,7 +68,7 @@ BEGIN_FTR_SECTION
        DSSALL
        sync
 END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
-       CURRENT_THREAD_INFO(r9, r1)
+       ld      r9, PACA_THREAD_INFO(r13)
        ld      r8,TI_LOCAL_FLAGS(r9)   /* set napping bit */
        ori     r8,r8,_TLF_NAPPING      /* so when we take an exception */
        std     r8,TI_LOCAL_FLAGS(r9)   /* it will return to our caller */
index 916ddc4aac443985da55fc38997d75a146401182..8a936723c791b6f32af4c47fd7bd86f50041674c 100644 (file)
@@ -618,9 +618,8 @@ static inline void check_stack_overflow(void)
        sp = current_stack_pointer() & (THREAD_SIZE-1);
 
        /* check for stack overflow: is there less than 2KB free? */
-       if (unlikely(sp < (sizeof(struct thread_info) + 2048))) {
-               pr_err("do_IRQ: stack overflow: %ld\n",
-                       sp - sizeof(struct thread_info));
+       if (unlikely(sp < 2048)) {
+               pr_err("do_IRQ: stack overflow: %ld\n", sp);
                dump_stack();
        }
 #endif
@@ -660,36 +659,21 @@ void __do_irq(struct pt_regs *regs)
 void do_IRQ(struct pt_regs *regs)
 {
        struct pt_regs *old_regs = set_irq_regs(regs);
-       struct thread_info *curtp, *irqtp, *sirqtp;
+       void *cursp, *irqsp, *sirqsp;
 
        /* Switch to the irq stack to handle this */
-       curtp = current_thread_info();
-       irqtp = hardirq_ctx[raw_smp_processor_id()];
-       sirqtp = softirq_ctx[raw_smp_processor_id()];
+       cursp = (void *)(current_stack_pointer() & ~(THREAD_SIZE - 1));
+       irqsp = hardirq_ctx[raw_smp_processor_id()];
+       sirqsp = softirq_ctx[raw_smp_processor_id()];
 
        /* Already there ? */
-       if (unlikely(curtp == irqtp || curtp == sirqtp)) {
+       if (unlikely(cursp == irqsp || cursp == sirqsp)) {
                __do_irq(regs);
                set_irq_regs(old_regs);
                return;
        }
-
-       /* Prepare the thread_info in the irq stack */
-       irqtp->task = curtp->task;
-       irqtp->flags = 0;
-
-       /* Copy the preempt_count so that the [soft]irq checks work. */
-       irqtp->preempt_count = curtp->preempt_count;
-
        /* Switch stack and call */
-       call_do_irq(regs, irqtp);
-
-       /* Restore stack limit */
-       irqtp->task = NULL;
-
-       /* Copy back updates to the thread_info */
-       if (irqtp->flags)
-               set_bits(irqtp->flags, &curtp->flags);
+       call_do_irq(regs, irqsp);
 
        set_irq_regs(old_regs);
 }
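
With thread_info folded into task_struct, the irq stacks carry no bookkeeping any more, so "are we already on the irq stack?" reduces to comparing THREAD_SIZE-aligned stack bases, exactly as the cursp/irqsp lines above do. A small standalone sketch of that comparison, assuming a power-of-two THREAD_SIZE as on powerpc (the 16 KiB value is only an example):

/* Example value; the real THREAD_SIZE is chosen per configuration. */
#define THREAD_SIZE (16UL * 1024)

/* Base of the stack that contains the given stack pointer. */
static void *stack_base(unsigned long sp)
{
        return (void *)(sp & ~(THREAD_SIZE - 1));
}

/* Already on the given stack if the bases match. */
static int on_stack(unsigned long sp, void *stack)
{
        return stack_base(sp) == stack;
}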
@@ -698,90 +682,20 @@ void __init init_IRQ(void)
 {
        if (ppc_md.init_IRQ)
                ppc_md.init_IRQ();
-
-       exc_lvl_ctx_init();
-
-       irq_ctx_init();
 }
 
 #if defined(CONFIG_BOOKE) || defined(CONFIG_40x)
-struct thread_info   *critirq_ctx[NR_CPUS] __read_mostly;
-struct thread_info    *dbgirq_ctx[NR_CPUS] __read_mostly;
-struct thread_info *mcheckirq_ctx[NR_CPUS] __read_mostly;
-
-void exc_lvl_ctx_init(void)
-{
-       struct thread_info *tp;
-       int i, cpu_nr;
-
-       for_each_possible_cpu(i) {
-#ifdef CONFIG_PPC64
-               cpu_nr = i;
-#else
-#ifdef CONFIG_SMP
-               cpu_nr = get_hard_smp_processor_id(i);
-#else
-               cpu_nr = 0;
-#endif
+void   *critirq_ctx[NR_CPUS] __read_mostly;
+void    *dbgirq_ctx[NR_CPUS] __read_mostly;
+void *mcheckirq_ctx[NR_CPUS] __read_mostly;
 #endif
 
-               memset((void *)critirq_ctx[cpu_nr], 0, THREAD_SIZE);
-               tp = critirq_ctx[cpu_nr];
-               tp->cpu = cpu_nr;
-               tp->preempt_count = 0;
-
-#ifdef CONFIG_BOOKE
-               memset((void *)dbgirq_ctx[cpu_nr], 0, THREAD_SIZE);
-               tp = dbgirq_ctx[cpu_nr];
-               tp->cpu = cpu_nr;
-               tp->preempt_count = 0;
-
-               memset((void *)mcheckirq_ctx[cpu_nr], 0, THREAD_SIZE);
-               tp = mcheckirq_ctx[cpu_nr];
-               tp->cpu = cpu_nr;
-               tp->preempt_count = HARDIRQ_OFFSET;
-#endif
-       }
-}
-#endif
-
-struct thread_info *softirq_ctx[NR_CPUS] __read_mostly;
-struct thread_info *hardirq_ctx[NR_CPUS] __read_mostly;
-
-void irq_ctx_init(void)
-{
-       struct thread_info *tp;
-       int i;
-
-       for_each_possible_cpu(i) {
-               memset((void *)softirq_ctx[i], 0, THREAD_SIZE);
-               tp = softirq_ctx[i];
-               tp->cpu = i;
-               klp_init_thread_info(tp);
-
-               memset((void *)hardirq_ctx[i], 0, THREAD_SIZE);
-               tp = hardirq_ctx[i];
-               tp->cpu = i;
-               klp_init_thread_info(tp);
-       }
-}
+void *softirq_ctx[NR_CPUS] __read_mostly;
+void *hardirq_ctx[NR_CPUS] __read_mostly;
 
 void do_softirq_own_stack(void)
 {
-       struct thread_info *curtp, *irqtp;
-
-       curtp = current_thread_info();
-       irqtp = softirq_ctx[smp_processor_id()];
-       irqtp->task = curtp->task;
-       irqtp->flags = 0;
-       call_do_softirq(irqtp);
-       irqtp->task = NULL;
-
-       /* Set any flag that may have been set on the
-        * alternate stack
-        */
-       if (irqtp->flags)
-               set_bits(irqtp->flags, &curtp->flags);
+       call_do_softirq(softirq_ctx[smp_processor_id()]);
 }
 
 irq_hw_number_t virq_to_hw(unsigned int virq)
@@ -827,11 +741,6 @@ int irq_choose_cpu(const struct cpumask *mask)
 }
 #endif
 
-int arch_early_irq_init(void)
-{
-       return 0;
-}
-
 #ifdef CONFIG_PPC64
 static int __init setup_noirqdistrib(char *str)
 {
index e1865565f0aeead6284fa396614faf56143f9f9b..7dd55eb1259dc15f9d1204fb848d9a395f0b3d91 100644 (file)
@@ -151,41 +151,13 @@ static int kgdb_handle_breakpoint(struct pt_regs *regs)
        return 1;
 }
 
-static DEFINE_PER_CPU(struct thread_info, kgdb_thread_info);
 static int kgdb_singlestep(struct pt_regs *regs)
 {
-       struct thread_info *thread_info, *exception_thread_info;
-       struct thread_info *backup_current_thread_info =
-               this_cpu_ptr(&kgdb_thread_info);
-
        if (user_mode(regs))
                return 0;
 
-       /*
-        * On Book E and perhaps other processors, singlestep is handled on
-        * the critical exception stack.  This causes current_thread_info()
-        * to fail, since it locates the thread_info by masking off
-        * the low bits of the current stack pointer.  We work around
-        * this issue by copying the thread_info from the kernel stack
-        * before calling kgdb_handle_exception, and copying it back
-        * afterwards.  On most processors the copy is avoided since
-        * exception_thread_info == thread_info.
-        */
-       thread_info = (struct thread_info *)(regs->gpr[1] & ~(THREAD_SIZE-1));
-       exception_thread_info = current_thread_info();
-
-       if (thread_info != exception_thread_info) {
-               /* Save the original current_thread_info. */
-               memcpy(backup_current_thread_info, exception_thread_info, sizeof *thread_info);
-               memcpy(exception_thread_info, thread_info, sizeof *thread_info);
-       }
-
        kgdb_handle_exception(0, SIGTRAP, 0, regs);
 
-       if (thread_info != exception_thread_info)
-               /* Restore current_thread_info lastly. */
-               memcpy(exception_thread_info, backup_current_thread_info, sizeof *thread_info);
-
        return 1;
 }
 
index a0f6f45005bd42c9dfa94e608bd2dabbab2f99bd..75692c327ba0932bcceb5aa6e1e7912360982355 100644 (file)
@@ -317,10 +317,8 @@ void default_machine_kexec(struct kimage *image)
         * We setup preempt_count to avoid using VMX in memcpy.
         * XXX: the task struct will likely be invalid once we do the copy!
         */
-       kexec_stack.thread_info.task = current_thread_info()->task;
-       kexec_stack.thread_info.flags = 0;
-       kexec_stack.thread_info.preempt_count = HARDIRQ_OFFSET;
-       kexec_stack.thread_info.cpu = current_thread_info()->cpu;
+       current_thread_info()->flags = 0;
+       current_thread_info()->preempt_count = HARDIRQ_OFFSET;
 
        /* We need a static PACA, too; copy this CPU's PACA over and switch to
         * it. Also poison per_cpu_offset and NULL lppaca to catch anyone using
index bd933a75f0bcbe6dda0ede7abefe69038ada36e4..b5fec1f9751a13a04d2b280c6e84c3eaba8dbd47 100644 (file)
@@ -31,6 +31,7 @@
 
 #include <asm/machdep.h>
 #include <asm/mce.h>
+#include <asm/nmi.h>
 
 static DEFINE_PER_CPU(int, mce_nest_count);
 static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event);
@@ -301,13 +302,13 @@ static void machine_check_process_queued_event(struct irq_work *work)
        while (__this_cpu_read(mce_queue_count) > 0) {
                index = __this_cpu_read(mce_queue_count) - 1;
                evt = this_cpu_ptr(&mce_event_queue[index]);
-               machine_check_print_event_info(evt, false);
+               machine_check_print_event_info(evt, false, false);
                __this_cpu_dec(mce_queue_count);
        }
 }
 
 void machine_check_print_event_info(struct machine_check_event *evt,
-                                   bool user_mode)
+                                   bool user_mode, bool in_guest)
 {
        const char *level, *sevstr, *subtype;
        static const char *mc_ue_types[] = {
@@ -387,7 +388,9 @@ void machine_check_print_event_info(struct machine_check_event *evt,
               evt->disposition == MCE_DISPOSITION_RECOVERED ?
               "Recovered" : "Not recovered");
 
-       if (user_mode) {
+       if (in_guest) {
+               printk("%s  Guest NIP: %016llx\n", level, evt->srr0);
+       } else if (user_mode) {
                printk("%s  NIP: [%016llx] PID: %d Comm: %s\n", level,
                        evt->srr0, current->pid, current->comm);
        } else {
@@ -488,6 +491,8 @@ long machine_check_early(struct pt_regs *regs)
 {
        long handled = 0;
 
+       hv_nmi_check_nonrecoverable(regs);
+
        /*
         * See if platform is capable of handling machine check.
         */
index 57d2ffb2d45c35196584edbd9a221fb38cd3d2f9..0dda4f8e3d7a2c5d9017f28e2cdc269e25ba876b 100644 (file)
@@ -46,11 +46,10 @@ _GLOBAL(call_do_softirq)
        mflr    r0
        stw     r0,4(r1)
        lwz     r10,THREAD+KSP_LIMIT(r2)
-       addi    r11,r3,THREAD_INFO_GAP
+       stw     r3, THREAD+KSP_LIMIT(r2)
        stwu    r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r3)
        mr      r1,r3
        stw     r10,8(r1)
-       stw     r11,THREAD+KSP_LIMIT(r2)
        bl      __do_softirq
        lwz     r10,8(r1)
        lwz     r1,0(r1)
@@ -60,17 +59,16 @@ _GLOBAL(call_do_softirq)
        blr
 
 /*
- * void call_do_irq(struct pt_regs *regs, struct thread_info *irqtp);
+ * void call_do_irq(struct pt_regs *regs, void *sp);
  */
 _GLOBAL(call_do_irq)
        mflr    r0
        stw     r0,4(r1)
        lwz     r10,THREAD+KSP_LIMIT(r2)
-       addi    r11,r4,THREAD_INFO_GAP
+       stw     r4, THREAD+KSP_LIMIT(r2)
        stwu    r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r4)
        mr      r1,r4
        stw     r10,8(r1)
-       stw     r11,THREAD+KSP_LIMIT(r2)
        bl      __do_irq
        lwz     r10,8(r1)
        lwz     r1,0(r1)
@@ -183,10 +181,13 @@ _GLOBAL(low_choose_750fx_pll)
        or      r4,r4,r5
        mtspr   SPRN_HID1,r4
 
+#ifdef CONFIG_SMP
        /* Store new HID1 image */
-       CURRENT_THREAD_INFO(r6, r1)
-       lwz     r6,TI_CPU(r6)
+       lwz     r6,TASK_CPU(r2)
        slwi    r6,r6,2
+#else
+       li      r6, 0
+#endif
        addis   r6,r6,nap_save_hid1@ha
        stw     r4,nap_save_hid1@l(r6)
 
@@ -599,7 +600,7 @@ EXPORT_SYMBOL(__bswapdi2)
 #ifdef CONFIG_SMP
 _GLOBAL(start_secondary_resume)
        /* Reset stack */
-       CURRENT_THREAD_INFO(r1, r1)
+       rlwinm  r1, r1, 0, 0, 31 - THREAD_SHIFT
        addi    r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD
        li      r3,0
        stw     r3,0(r1)                /* Zero the stack frame pointer */
index 4538e8ddde807fc11e2392e4263ff13287147b1e..ff4b7539cbdfdee06ef409780f908c62fa3b2ee4 100644 (file)
@@ -63,19 +63,13 @@ resource_size_t isa_mem_base;
 EXPORT_SYMBOL(isa_mem_base);
 
 
-static const struct dma_map_ops *pci_dma_ops = &dma_nommu_ops;
+static const struct dma_map_ops *pci_dma_ops;
 
 void set_pci_dma_ops(const struct dma_map_ops *dma_ops)
 {
        pci_dma_ops = dma_ops;
 }
 
-const struct dma_map_ops *get_pci_dma_ops(void)
-{
-       return pci_dma_ops;
-}
-EXPORT_SYMBOL(get_pci_dma_ops);
-
 /*
  * This function should run under locking protection, specifically
  * hose_spinlock.
@@ -358,6 +352,17 @@ struct pci_controller* pci_find_hose_for_OF_device(struct device_node* node)
        return NULL;
 }
 
+struct pci_controller *pci_find_controller_for_domain(int domain_nr)
+{
+       struct pci_controller *hose;
+
+       list_for_each_entry(hose, &hose_list, list_node)
+               if (hose->global_number == domain_nr)
+                       return hose;
+
+       return NULL;
+}
+
 /*
  * Reads the interrupt pin to determine if the interrupt is used by the card.
  * If the interrupt is used, then gets the interrupt line from the
@@ -973,7 +978,7 @@ static void pcibios_setup_device(struct pci_dev *dev)
 
        /* Hook up default DMA ops */
        set_dma_ops(&dev->dev, pci_dma_ops);
-       set_dma_offset(&dev->dev, PCI_DRAM_OFFSET);
+       dev->dev.archdata.dma_offset = PCI_DRAM_OFFSET;
 
        /* Additional platform DMA/iommu setup */
        phb = pci_bus_to_host(dev->bus);
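The new pci_find_controller_for_domain() resolves a PCI domain number back to its host bridge by walking hose_list. A minimal, hypothetical caller (variable names assumed for illustration):

	struct pci_controller *hose;

	hose = pci_find_controller_for_domain(pci_domain_nr(pdev->bus));
	if (!hose)
		return -ENODEV;	/* no host bridge registered for this domain */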
index ce393df243aa3a642935c78e37a943e58cddf1fe..dd9e0d5386ee7030fe4539ef67e0b3a4d6e3eb46 100644 (file)
@@ -176,7 +176,7 @@ static void __giveup_fpu(struct task_struct *tsk)
 
        save_fpu(tsk);
        msr = tsk->thread.regs->msr;
-       msr &= ~MSR_FP;
+       msr &= ~(MSR_FP|MSR_FE0|MSR_FE1);
 #ifdef CONFIG_VSX
        if (cpu_has_feature(CPU_FTR_VSX))
                msr &= ~MSR_VSX;
@@ -1231,8 +1231,8 @@ struct task_struct *__switch_to(struct task_struct *prev,
                batch->active = 1;
        }
 
-       if (current_thread_info()->task->thread.regs) {
-               restore_math(current_thread_info()->task->thread.regs);
+       if (current->thread.regs) {
+               restore_math(current->thread.regs);
 
                /*
                 * The copy-paste buffer can only store into foreign real
@@ -1242,7 +1242,7 @@ struct task_struct *__switch_to(struct task_struct *prev,
                 * mappings, we must issue a cp_abort to clear any state and
                 * prevent snooping, corruption or a covert channel.
                 */
-               if (current_thread_info()->task->thread.used_vas)
+               if (current->thread.used_vas)
                        asm volatile(PPC_CP_ABORT);
        }
 #endif /* CONFIG_PPC_BOOK3S_64 */
@@ -1634,7 +1634,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp,
        unsigned long sp = (unsigned long)task_stack_page(p) + THREAD_SIZE;
        struct thread_info *ti = task_thread_info(p);
 
-       klp_init_thread_info(ti);
+       klp_init_thread_info(p);
 
        /* Copy registers */
        sp -= sizeof(struct pt_regs);
@@ -1691,8 +1691,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp,
        sp -= STACK_FRAME_OVERHEAD;
        p->thread.ksp = sp;
 #ifdef CONFIG_PPC32
-       p->thread.ksp_limit = (unsigned long)task_stack_page(p) +
-                               _ALIGN_UP(sizeof(struct thread_info), 16);
+       p->thread.ksp_limit = (unsigned long)end_of_stack(p);
 #endif
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
        p->thread.ptrace_bps[0] = NULL;
@@ -1995,21 +1994,14 @@ static inline int valid_irq_stack(unsigned long sp, struct task_struct *p,
        unsigned long stack_page;
        unsigned long cpu = task_cpu(p);
 
-       /*
-        * Avoid crashing if the stack has overflowed and corrupted
-        * task_cpu(p), which is in the thread_info struct.
-        */
-       if (cpu < NR_CPUS && cpu_possible(cpu)) {
-               stack_page = (unsigned long) hardirq_ctx[cpu];
-               if (sp >= stack_page + sizeof(struct thread_struct)
-                   && sp <= stack_page + THREAD_SIZE - nbytes)
-                       return 1;
-
-               stack_page = (unsigned long) softirq_ctx[cpu];
-               if (sp >= stack_page + sizeof(struct thread_struct)
-                   && sp <= stack_page + THREAD_SIZE - nbytes)
-                       return 1;
-       }
+       stack_page = (unsigned long)hardirq_ctx[cpu];
+       if (sp >= stack_page && sp <= stack_page + THREAD_SIZE - nbytes)
+               return 1;
+
+       stack_page = (unsigned long)softirq_ctx[cpu];
+       if (sp >= stack_page && sp <= stack_page + THREAD_SIZE - nbytes)
+               return 1;
+
        return 0;
 }
 
@@ -2018,8 +2010,10 @@ int validate_sp(unsigned long sp, struct task_struct *p,
 {
        unsigned long stack_page = (unsigned long)task_stack_page(p);
 
-       if (sp >= stack_page + sizeof(struct thread_struct)
-           && sp <= stack_page + THREAD_SIZE - nbytes)
+       if (sp < THREAD_SIZE)
+               return 0;
+
+       if (sp >= stack_page && sp <= stack_page + THREAD_SIZE - nbytes)
                return 1;
 
        return valid_irq_stack(sp, p, nbytes);
@@ -2027,7 +2021,7 @@ int validate_sp(unsigned long sp, struct task_struct *p,
 
 EXPORT_SYMBOL(validate_sp);
 
-unsigned long get_wchan(struct task_struct *p)
+static unsigned long __get_wchan(struct task_struct *p)
 {
        unsigned long ip, sp;
        int count = 0;
@@ -2053,6 +2047,20 @@ unsigned long get_wchan(struct task_struct *p)
        return 0;
 }
 
+unsigned long get_wchan(struct task_struct *p)
+{
+       unsigned long ret;
+
+       if (!try_get_task_stack(p))
+               return 0;
+
+       ret = __get_wchan(p);
+
+       put_task_stack(p);
+
+       return ret;
+}
+
 static int kstack_depth_to_print = CONFIG_PRINT_STACK_DEPTH;
 
 void show_stack(struct task_struct *tsk, unsigned long *stack)
@@ -2067,9 +2075,13 @@ void show_stack(struct task_struct *tsk, unsigned long *stack)
        int curr_frame = 0;
 #endif
 
-       sp = (unsigned long) stack;
        if (tsk == NULL)
                tsk = current;
+
+       if (!try_get_task_stack(tsk))
+               return;
+
+       sp = (unsigned long) stack;
        if (sp == 0) {
                if (tsk == current)
                        sp = current_stack_pointer();
@@ -2081,7 +2093,7 @@ void show_stack(struct task_struct *tsk, unsigned long *stack)
        printk("Call Trace:\n");
        do {
                if (!validate_sp(sp, tsk, STACK_FRAME_OVERHEAD))
-                       return;
+                       break;
 
                stack = (unsigned long *) sp;
                newsp = stack[0];
@@ -2121,6 +2133,8 @@ void show_stack(struct task_struct *tsk, unsigned long *stack)
 
                sp = newsp;
        } while (count++ < kstack_depth_to_print);
+
+       put_task_stack(tsk);
 }
 
 #ifdef CONFIG_PPC64
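Both get_wchan() and show_stack() now pin the target task's stack before walking it, so the stack cannot be freed underneath them by a task that is exiting. The guard pattern on its own, as a sketch:

	if (!try_get_task_stack(p))
		return 0;		/* stack already gone (e.g. zombie task) */

	/* ... walk p's stack frames here ... */

	put_task_stack(p);		/* drop the reference taken above */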
index cdd5d1d3ae412114045704aed483497590063e8e..d9ac7d94656ee1a6c39dc3b4be7edbb5e5594b2c 100644 (file)
@@ -33,6 +33,7 @@
 #include <linux/hw_breakpoint.h>
 #include <linux/perf_event.h>
 #include <linux/context_tracking.h>
+#include <linux/nospec.h>
 
 #include <linux/uaccess.h>
 #include <linux/pkeys.h>
@@ -274,6 +275,8 @@ static int set_user_trap(struct task_struct *task, unsigned long trap)
  */
 int ptrace_get_reg(struct task_struct *task, int regno, unsigned long *data)
 {
+       unsigned int regs_max;
+
        if ((task->thread.regs == NULL) || !data)
                return -EIO;
 
@@ -297,7 +300,9 @@ int ptrace_get_reg(struct task_struct *task, int regno, unsigned long *data)
        }
 #endif
 
-       if (regno < (sizeof(struct user_pt_regs) / sizeof(unsigned long))) {
+       regs_max = sizeof(struct user_pt_regs) / sizeof(unsigned long);
+       if (regno < regs_max) {
+               regno = array_index_nospec(regno, regs_max);
                *data = ((unsigned long *)task->thread.regs)[regno];
                return 0;
        }
@@ -321,6 +326,7 @@ int ptrace_put_reg(struct task_struct *task, int regno, unsigned long data)
                return set_user_dscr(task, data);
 
        if (regno <= PT_MAX_PUT_REG) {
+               regno = array_index_nospec(regno, PT_MAX_PUT_REG + 1);
                ((unsigned long *)task->thread.regs)[regno] = data;
                return 0;
        }
@@ -561,6 +567,7 @@ static int vr_get(struct task_struct *target, const struct user_regset *regset,
                /*
                 * Copy out only the low-order word of vrsave.
                 */
+               int start, end;
                union {
                        elf_vrreg_t reg;
                        u32 word;
@@ -569,8 +576,10 @@ static int vr_get(struct task_struct *target, const struct user_regset *regset,
 
                vrsave.word = target->thread.vrsave;
 
+               start = 33 * sizeof(vector128);
+               end = start + sizeof(vrsave);
                ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &vrsave,
-                                         33 * sizeof(vector128), -1);
+                                         start, end);
        }
 
        return ret;
@@ -608,6 +617,7 @@ static int vr_set(struct task_struct *target, const struct user_regset *regset,
                /*
                 * We use only the first word of vrsave.
                 */
+               int start, end;
                union {
                        elf_vrreg_t reg;
                        u32 word;
@@ -616,8 +626,10 @@ static int vr_set(struct task_struct *target, const struct user_regset *regset,
 
                vrsave.word = target->thread.vrsave;
 
+               start = 33 * sizeof(vector128);
+               end = start + sizeof(vrsave);
                ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &vrsave,
-                                        33 * sizeof(vector128), -1);
+                                        start, end);
                if (!ret)
                        target->thread.vrsave = vrsave.word;
        }
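The ptrace changes clamp the user-controlled register index with array_index_nospec() before it is used, so a mispredicted bounds check cannot be exploited to leak memory (Spectre v1). The pattern in isolation, as a sketch with assumed names:

	if (index < size) {
		/* force index into [0, size) even under speculation */
		index = array_index_nospec(index, size);
		value = array[index];
	}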
index 82be48c123cf9bdad80cace9a69823936c76593b..f17868e19e2c508166df2eeabea94a56bbc0a13f 100644 (file)
@@ -634,7 +634,7 @@ void probe_machine(void)
        }
        /* What can we do if we didn't find ? */
        if (machine_id >= &__machine_desc_end) {
-               DBG("No suitable machine found !\n");
+               pr_err("No suitable machine description found !\n");
                for (;;);
        }
 
@@ -791,7 +791,6 @@ void arch_setup_pdev_archdata(struct platform_device *pdev)
 {
        pdev->archdata.dma_mask = DMA_BIT_MASK(32);
        pdev->dev.dma_mask = &pdev->archdata.dma_mask;
-       set_dma_ops(&pdev->dev, &dma_nommu_ops);
 }
 
 static __init void print_system_info(void)
@@ -938,7 +937,7 @@ void __init setup_arch(char **cmdline_p)
        /* Reserve large chunks of memory for use by CMA for KVM. */
        kvm_cma_reserve();
 
-       klp_init_thread_info(&init_thread_info);
+       klp_init_thread_info(&init_task);
 
        init_mm.start_code = (unsigned long)_stext;
        init_mm.end_code = (unsigned long) _etext;
index c31082233a25dcfc70a988acf79d6d68bc8a6173..4a65e08a6042f7b63347a54bc35caab26277729d 100644 (file)
@@ -162,6 +162,17 @@ static int __init ppc_init(void)
 }
 arch_initcall(ppc_init);
 
+static void *__init alloc_stack(void)
+{
+       void *ptr = memblock_alloc(THREAD_SIZE, THREAD_SIZE);
+
+       if (!ptr)
+               panic("cannot allocate %d bytes for stack at %pS\n",
+                     THREAD_SIZE, (void *)_RET_IP_);
+
+       return ptr;
+}
+
 void __init irqstack_early_init(void)
 {
        unsigned int i;
@@ -169,10 +180,8 @@ void __init irqstack_early_init(void)
        /* interrupt stacks must be in lowmem, we get that for free on ppc32
         * as the memblock is limited to lowmem by default */
        for_each_possible_cpu(i) {
-               softirq_ctx[i] = (struct thread_info *)
-                       __va(memblock_phys_alloc(THREAD_SIZE, THREAD_SIZE));
-               hardirq_ctx[i] = (struct thread_info *)
-                       __va(memblock_phys_alloc(THREAD_SIZE, THREAD_SIZE));
+               softirq_ctx[i] = alloc_stack();
+               hardirq_ctx[i] = alloc_stack();
        }
 }
 
@@ -190,13 +199,10 @@ void __init exc_lvl_early_init(void)
                hw_cpu = 0;
 #endif
 
-               critirq_ctx[hw_cpu] = (struct thread_info *)
-                       __va(memblock_phys_alloc(THREAD_SIZE, THREAD_SIZE));
+               critirq_ctx[hw_cpu] = alloc_stack();
 #ifdef CONFIG_BOOKE
-               dbgirq_ctx[hw_cpu] = (struct thread_info *)
-                       __va(memblock_phys_alloc(THREAD_SIZE, THREAD_SIZE));
-               mcheckirq_ctx[hw_cpu] = (struct thread_info *)
-                       __va(memblock_phys_alloc(THREAD_SIZE, THREAD_SIZE));
+               dbgirq_ctx[hw_cpu] = alloc_stack();
+               mcheckirq_ctx[hw_cpu] = alloc_stack();
 #endif
        }
 }
index 5de413ae3cd67034d1377732116e4f0ab6198fd6..ff0aac42bb33d6c20e90ee1c0d26d2b759d37870 100644 (file)
@@ -634,19 +634,17 @@ __init u64 ppc64_bolted_size(void)
 
 static void *__init alloc_stack(unsigned long limit, int cpu)
 {
-       unsigned long pa;
+       void *ptr;
 
        BUILD_BUG_ON(STACK_INT_FRAME_SIZE % 16);
 
-       pa = memblock_alloc_base_nid(THREAD_SIZE, THREAD_SIZE, limit,
-                                       early_cpu_to_node(cpu), MEMBLOCK_NONE);
-       if (!pa) {
-               pa = memblock_alloc_base(THREAD_SIZE, THREAD_SIZE, limit);
-               if (!pa)
-                       panic("cannot allocate stacks");
-       }
+       ptr = memblock_alloc_try_nid(THREAD_SIZE, THREAD_SIZE,
+                                    MEMBLOCK_LOW_LIMIT, limit,
+                                    early_cpu_to_node(cpu));
+       if (!ptr)
+               panic("cannot allocate stacks");
 
-       return __va(pa);
+       return ptr;
 }
 
 void __init irqstack_early_init(void)
@@ -691,24 +689,6 @@ void __init exc_lvl_early_init(void)
 }
 #endif
 
-/*
- * Emergency stacks are used for a range of things, from asynchronous
- * NMIs (system reset, machine check) to synchronous, process context.
- * We set preempt_count to zero, even though that isn't necessarily correct. To
- * get the right value we'd need to copy it from the previous thread_info, but
- * doing that might fault causing more problems.
- * TODO: what to do with accounting?
- */
-static void emerg_stack_init_thread_info(struct thread_info *ti, int cpu)
-{
-       ti->task = NULL;
-       ti->cpu = cpu;
-       ti->preempt_count = 0;
-       ti->local_flags = 0;
-       ti->flags = 0;
-       klp_init_thread_info(ti);
-}
-
 /*
  * Stack space used when we detect a bad kernel stack pointer, and
  * early in SMP boots before relocation is enabled. Exclusive emergency
@@ -736,25 +716,14 @@ void __init emergency_stack_init(void)
        limit = min(ppc64_bolted_size(), ppc64_rma_size);
 
        for_each_possible_cpu(i) {
-               struct thread_info *ti;
-
-               ti = alloc_stack(limit, i);
-               memset(ti, 0, THREAD_SIZE);
-               emerg_stack_init_thread_info(ti, i);
-               paca_ptrs[i]->emergency_sp = (void *)ti + THREAD_SIZE;
+               paca_ptrs[i]->emergency_sp = alloc_stack(limit, i) + THREAD_SIZE;
 
 #ifdef CONFIG_PPC_BOOK3S_64
                /* emergency stack for NMI exception handling. */
-               ti = alloc_stack(limit, i);
-               memset(ti, 0, THREAD_SIZE);
-               emerg_stack_init_thread_info(ti, i);
-               paca_ptrs[i]->nmi_emergency_sp = (void *)ti + THREAD_SIZE;
+               paca_ptrs[i]->nmi_emergency_sp = alloc_stack(limit, i) + THREAD_SIZE;
 
                /* emergency stack for machine check exception handling. */
-               ti = alloc_stack(limit, i);
-               memset(ti, 0, THREAD_SIZE);
-               emerg_stack_init_thread_info(ti, i);
-               paca_ptrs[i]->mc_emergency_sp = (void *)ti + THREAD_SIZE;
+               paca_ptrs[i]->mc_emergency_sp = alloc_stack(limit, i) + THREAD_SIZE;
 #endif
        }
 }
index 3f15edf25a0d69baaf6c0479fa6ce21ec9da590e..e784342bdaa1f8d35a741e6bf62e74c2ed88f2e3 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/kernel.h>
 #include <linux/export.h>
 #include <linux/sched/mm.h>
+#include <linux/sched/task_stack.h>
 #include <linux/sched/topology.h>
 #include <linux/smp.h>
 #include <linux/interrupt.h>
@@ -75,7 +76,7 @@
 static DEFINE_PER_CPU(int, cpu_state) = { 0 };
 #endif
 
-struct thread_info *secondary_ti;
+struct task_struct *secondary_current;
 bool has_big_cores;
 
 DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map);
@@ -358,13 +359,12 @@ void arch_send_call_function_ipi_mask(const struct cpumask *mask)
  * NMI IPIs may not be recoverable, so should not be used as ongoing part of
  * a running system. They can be used for crash, debug, halt/reboot, etc.
  *
- * NMI IPIs are globally single threaded. No more than one in progress at
- * any time.
- *
  * The IPI call waits with interrupts disabled until all targets enter the
- * NMI handler, then the call returns.
+ * NMI handler, then returns. Subsequent IPIs can be issued before targets
+ * have returned from their handlers, so there is no guarantee about
+ * concurrency or re-entrancy.
  *
- * No new NMI can be initiated until targets exit the handler.
+ * A new NMI can be issued before all targets exit the handler.
  *
  * The IPI call may time out without all targets entering the NMI handler.
  * In that case, there is some logic to recover (and ignore subsequent
@@ -375,7 +375,7 @@ void arch_send_call_function_ipi_mask(const struct cpumask *mask)
 
 static atomic_t __nmi_ipi_lock = ATOMIC_INIT(0);
 static struct cpumask nmi_ipi_pending_mask;
-static int nmi_ipi_busy_count = 0;
+static bool nmi_ipi_busy = false;
 static void (*nmi_ipi_function)(struct pt_regs *) = NULL;
 
 static void nmi_ipi_lock_start(unsigned long *flags)
@@ -414,7 +414,7 @@ static void nmi_ipi_unlock_end(unsigned long *flags)
  */
 int smp_handle_nmi_ipi(struct pt_regs *regs)
 {
-       void (*fn)(struct pt_regs *);
+       void (*fn)(struct pt_regs *) = NULL;
        unsigned long flags;
        int me = raw_smp_processor_id();
        int ret = 0;
@@ -425,29 +425,17 @@ int smp_handle_nmi_ipi(struct pt_regs *regs)
         * because the caller may have timed out.
         */
        nmi_ipi_lock_start(&flags);
-       if (!nmi_ipi_busy_count)
-               goto out;
-       if (!cpumask_test_cpu(me, &nmi_ipi_pending_mask))
-               goto out;
-
-       fn = nmi_ipi_function;
-       if (!fn)
-               goto out;
-
-       cpumask_clear_cpu(me, &nmi_ipi_pending_mask);
-       nmi_ipi_busy_count++;
-       nmi_ipi_unlock();
-
-       ret = 1;
-
-       fn(regs);
-
-       nmi_ipi_lock();
-       if (nmi_ipi_busy_count > 1) /* Can race with caller time-out */
-               nmi_ipi_busy_count--;
-out:
+       if (cpumask_test_cpu(me, &nmi_ipi_pending_mask)) {
+               cpumask_clear_cpu(me, &nmi_ipi_pending_mask);
+               fn = READ_ONCE(nmi_ipi_function);
+               WARN_ON_ONCE(!fn);
+               ret = 1;
+       }
        nmi_ipi_unlock_end(&flags);
 
+       if (fn)
+               fn(regs);
+
        return ret;
 }
 
@@ -473,9 +461,10 @@ static void do_smp_send_nmi_ipi(int cpu, bool safe)
  * - cpu is the target CPU (must not be this CPU), or NMI_IPI_ALL_OTHERS.
  * - fn is the target callback function.
  * - delay_us > 0 is the delay before giving up waiting for targets to
- *   complete executing the handler, == 0 specifies indefinite delay.
+ *   begin executing the handler, == 0 specifies indefinite delay.
  */
-int __smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us, bool safe)
+static int __smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *),
+                               u64 delay_us, bool safe)
 {
        unsigned long flags;
        int me = raw_smp_processor_id();
@@ -487,31 +476,33 @@ int __smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us, bool
        if (unlikely(!smp_ops))
                return 0;
 
-       /* Take the nmi_ipi_busy count/lock with interrupts hard disabled */
        nmi_ipi_lock_start(&flags);
-       while (nmi_ipi_busy_count) {
+       while (nmi_ipi_busy) {
                nmi_ipi_unlock_end(&flags);
-               spin_until_cond(nmi_ipi_busy_count == 0);
+               spin_until_cond(!nmi_ipi_busy);
                nmi_ipi_lock_start(&flags);
        }
-
+       nmi_ipi_busy = true;
        nmi_ipi_function = fn;
 
+       WARN_ON_ONCE(!cpumask_empty(&nmi_ipi_pending_mask));
+
        if (cpu < 0) {
                /* ALL_OTHERS */
                cpumask_copy(&nmi_ipi_pending_mask, cpu_online_mask);
                cpumask_clear_cpu(me, &nmi_ipi_pending_mask);
        } else {
-               /* cpumask starts clear */
                cpumask_set_cpu(cpu, &nmi_ipi_pending_mask);
        }
-       nmi_ipi_busy_count++;
+
        nmi_ipi_unlock();
 
+       /* Interrupts remain hard disabled */
+
        do_smp_send_nmi_ipi(cpu, safe);
 
        nmi_ipi_lock();
-       /* nmi_ipi_busy_count is held here, so unlock/lock is okay */
+       /* nmi_ipi_busy is set here, so unlock/lock is okay */
        while (!cpumask_empty(&nmi_ipi_pending_mask)) {
                nmi_ipi_unlock();
                udelay(1);
@@ -523,29 +514,15 @@ int __smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us, bool
                }
        }
 
-       while (nmi_ipi_busy_count > 1) {
-               nmi_ipi_unlock();
-               udelay(1);
-               nmi_ipi_lock();
-               if (delay_us) {
-                       delay_us--;
-                       if (!delay_us)
-                               break;
-               }
-       }
-
        if (!cpumask_empty(&nmi_ipi_pending_mask)) {
                /* Timeout waiting for CPUs to call smp_handle_nmi_ipi */
                ret = 0;
                cpumask_clear(&nmi_ipi_pending_mask);
        }
-       if (nmi_ipi_busy_count > 1) {
-               /* Timeout waiting for CPUs to execute fn */
-               ret = 0;
-               nmi_ipi_busy_count = 1;
-       }
 
-       nmi_ipi_busy_count--;
+       nmi_ipi_function = NULL;
+       nmi_ipi_busy = false;
+
        nmi_ipi_unlock_end(&flags);
 
        return ret;
@@ -613,17 +590,8 @@ void crash_send_ipi(void (*crash_ipi_callback)(struct pt_regs *))
 static void nmi_stop_this_cpu(struct pt_regs *regs)
 {
        /*
-        * This is a special case because it never returns, so the NMI IPI
-        * handling would never mark it as done, which makes any later
-        * smp_send_nmi_ipi() call spin forever. Mark it done now.
-        *
         * IRQs are already hard disabled by the smp_handle_nmi_ipi.
         */
-       nmi_ipi_lock();
-       if (nmi_ipi_busy_count > 1)
-               nmi_ipi_busy_count--;
-       nmi_ipi_unlock();
-
        spin_begin();
        while (1)
                spin_cpu_relax();
@@ -663,7 +631,7 @@ void smp_send_stop(void)
 }
 #endif /* CONFIG_NMI_IPI */
 
-struct thread_info *current_set[NR_CPUS];
+struct task_struct *current_set[NR_CPUS];
 
 static void smp_store_cpu_info(int id)
 {
@@ -928,7 +896,7 @@ void smp_prepare_boot_cpu(void)
        paca_ptrs[boot_cpuid]->__current = current;
 #endif
        set_numa_node(numa_cpu_lookup_table[boot_cpuid]);
-       current_set[boot_cpuid] = task_thread_info(current);
+       current_set[boot_cpuid] = current;
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -1013,14 +981,13 @@ static bool secondaries_inhibited(void)
 
 static void cpu_idle_thread_init(unsigned int cpu, struct task_struct *idle)
 {
-       struct thread_info *ti = task_thread_info(idle);
-
 #ifdef CONFIG_PPC64
        paca_ptrs[cpu]->__current = idle;
-       paca_ptrs[cpu]->kstack = (unsigned long)ti + THREAD_SIZE - STACK_FRAME_OVERHEAD;
+       paca_ptrs[cpu]->kstack = (unsigned long)task_stack_page(idle) +
+                                THREAD_SIZE - STACK_FRAME_OVERHEAD;
 #endif
-       ti->cpu = cpu;
-       secondary_ti = current_set[cpu] = ti;
+       idle->cpu = cpu;
+       secondary_current = current_set[cpu] = idle;
 }
 
 int __cpu_up(unsigned int cpu, struct task_struct *tidle)
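The rework replaces the nmi_ipi_busy_count reference count with a single nmi_ipi_busy flag owned by the sender; handlers only clear themselves from the pending mask and run the callback outside the lock. The sender-side sequence, condensed as a sketch:

	nmi_ipi_lock_start(&flags);
	while (nmi_ipi_busy) {			/* serialise against a previous NMI IPI */
		nmi_ipi_unlock_end(&flags);
		spin_until_cond(!nmi_ipi_busy);
		nmi_ipi_lock_start(&flags);
	}
	nmi_ipi_busy = true;
	nmi_ipi_function = fn;
	/* ... fill nmi_ipi_pending_mask, send IPIs, wait or time out ... */
	nmi_ipi_function = NULL;
	nmi_ipi_busy = false;
	nmi_ipi_unlock_end(&flags);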
index e2c50b55138f8ab52eecace4c6aad72c382e6bcd..1e2276963f6d324741184af7fbaa88690b137b2e 100644 (file)
@@ -67,12 +67,17 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
 {
        unsigned long sp;
 
+       if (!try_get_task_stack(tsk))
+               return;
+
        if (tsk == current)
                sp = current_stack_pointer();
        else
                sp = tsk->thread.ksp;
 
        save_context_stack(trace, sp, tsk, 0);
+
+       put_task_stack(tsk);
 }
 EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
 
@@ -84,25 +89,21 @@ save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace)
 EXPORT_SYMBOL_GPL(save_stack_trace_regs);
 
 #ifdef CONFIG_HAVE_RELIABLE_STACKTRACE
-int
-save_stack_trace_tsk_reliable(struct task_struct *tsk,
-                               struct stack_trace *trace)
+/*
+ * This function returns an error if it detects any unreliable features of the
+ * stack.  Otherwise it guarantees that the stack trace is reliable.
+ *
+ * If the task is not 'current', the caller *must* ensure the task is inactive.
+ */
+static int __save_stack_trace_tsk_reliable(struct task_struct *tsk,
+                                          struct stack_trace *trace)
 {
        unsigned long sp;
+       unsigned long newsp;
        unsigned long stack_page = (unsigned long)task_stack_page(tsk);
        unsigned long stack_end;
        int graph_idx = 0;
-
-       /*
-        * The last frame (unwinding first) may not yet have saved
-        * its LR onto the stack.
-        */
-       int firstframe = 1;
-
-       if (tsk == current)
-               sp = current_stack_pointer();
-       else
-               sp = tsk->thread.ksp;
+       bool firstframe;
 
        stack_end = stack_page + THREAD_SIZE;
        if (!is_idle_task(tsk)) {
@@ -129,40 +130,53 @@ save_stack_trace_tsk_reliable(struct task_struct *tsk,
                stack_end -= STACK_FRAME_OVERHEAD;
        }
 
+       if (tsk == current)
+               sp = current_stack_pointer();
+       else
+               sp = tsk->thread.ksp;
+
        if (sp < stack_page + sizeof(struct thread_struct) ||
            sp > stack_end - STACK_FRAME_MIN_SIZE) {
-               return 1;
+               return -EINVAL;
        }
 
-       for (;;) {
+       for (firstframe = true; sp != stack_end;
+            firstframe = false, sp = newsp) {
                unsigned long *stack = (unsigned long *) sp;
-               unsigned long newsp, ip;
+               unsigned long ip;
 
                /* sanity check: ABI requires SP to be aligned 16 bytes. */
                if (sp & 0xF)
-                       return 1;
-
-               /* Mark stacktraces with exception frames as unreliable. */
-               if (sp <= stack_end - STACK_INT_FRAME_SIZE &&
-                   stack[STACK_FRAME_MARKER] == STACK_FRAME_REGS_MARKER) {
-                       return 1;
-               }
+                       return -EINVAL;
 
                newsp = stack[0];
                /* Stack grows downwards; unwinder may only go up. */
                if (newsp <= sp)
-                       return 1;
+                       return -EINVAL;
 
                if (newsp != stack_end &&
                    newsp > stack_end - STACK_FRAME_MIN_SIZE) {
-                       return 1; /* invalid backlink, too far up. */
+                       return -EINVAL; /* invalid backlink, too far up. */
+               }
+
+               /*
+                * We can only trust the bottom frame's backlink, the
+                * rest of the frame may be uninitialized, continue to
+                * the next.
+                */
+               if (firstframe)
+                       continue;
+
+               /* Mark stacktraces with exception frames as unreliable. */
+               if (sp <= stack_end - STACK_INT_FRAME_SIZE &&
+                   stack[STACK_FRAME_MARKER] == STACK_FRAME_REGS_MARKER) {
+                       return -EINVAL;
                }
 
                /* Examine the saved LR: it must point into kernel code. */
                ip = stack[STACK_FRAME_LR_SAVE];
-               if (!firstframe && !__kernel_text_address(ip))
-                       return 1;
-               firstframe = 0;
+               if (!__kernel_text_address(ip))
+                       return -EINVAL;
 
                /*
                 * FIXME: IMHO these tests do not belong in
@@ -175,25 +189,37 @@ save_stack_trace_tsk_reliable(struct task_struct *tsk,
                 * as unreliable.
                 */
                if (ip == (unsigned long)kretprobe_trampoline)
-                       return 1;
+                       return -EINVAL;
 #endif
 
+               if (trace->nr_entries >= trace->max_entries)
+                       return -E2BIG;
                if (!trace->skip)
                        trace->entries[trace->nr_entries++] = ip;
                else
                        trace->skip--;
+       }
+       return 0;
+}
 
-               if (newsp == stack_end)
-                       break;
+int save_stack_trace_tsk_reliable(struct task_struct *tsk,
+                                 struct stack_trace *trace)
+{
+       int ret;
 
-               if (trace->nr_entries >= trace->max_entries)
-                       return -E2BIG;
+       /*
+        * If the task doesn't have a stack (e.g., a zombie), the stack is
+        * "reliably" empty.
+        */
+       if (!try_get_task_stack(tsk))
+               return 0;
 
-               sp = newsp;
-       }
-       return 0;
+       ret = __save_stack_trace_tsk_reliable(tsk, trace);
+
+       put_task_stack(tsk);
+
+       return ret;
 }
-EXPORT_SYMBOL_GPL(save_stack_trace_tsk_reliable);
 #endif /* CONFIG_HAVE_RELIABLE_STACKTRACE */
 
 #if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_NMI_IPI)
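save_stack_trace_tsk_reliable() now returns 0 only when every frame was verified and a negative errno otherwise, so callers (livepatch being the intended user) can treat any non-zero value as "do not trust this trace". A hypothetical caller, sketched with assumed names:

	unsigned long entries[64];
	struct stack_trace trace = {
		.entries     = entries,
		.max_entries = ARRAY_SIZE(entries),
	};

	if (save_stack_trace_tsk_reliable(task, &trace))
		return -EBUSY;	/* unreliable or truncated: retry later */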
index e6982ab2181663037b210b24f697d8a1bb52e269..e52a8878c2fb020f4b46b32068d797cf14fe6db4 100644 (file)
@@ -123,7 +123,7 @@ long ppc_fadvise64_64(int fd, int advice, u32 offset_high, u32 offset_low,
                                 (u64)len_high << 32 | len_low, advice);
 }
 
-long sys_switch_endian(void)
+SYSCALL_DEFINE0(switch_endian)
 {
        struct thread_info *ti;
 
index fd620490a542916a867b2bdc05244cb8451caa9e..f7393a7b18aa0cd9ddd58ac176db622378584003 100644 (file)
@@ -13,10 +13,10 @@ emit() {
        t_entry="$3"
 
        while [ $t_nxt -lt $t_nr ]; do
-               printf "__SYSCALL(%s,sys_ni_syscall)\n" "${t_nxt}"
+               printf "__SYSCALL(%s,sys_ni_syscall)\n" "${t_nxt}"
                t_nxt=$((t_nxt+1))
        done
-       printf "__SYSCALL(%s,%s)\n" "${t_nxt}" "${t_entry}"
+       printf "__SYSCALL(%s,%s)\n" "${t_nxt}" "${t_entry}"
 }
 
 grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (
index 23265a28740bbcab40c56cc1f5637a13964830dc..02f28faba125d7a7cd080096ab7233f6209ef8c8 100644 (file)
 .globl sys_call_table
 sys_call_table:
 #ifdef CONFIG_PPC64
-#define __SYSCALL(nr, entry, nargs) .8byte DOTSYM(entry)
+#define __SYSCALL(nr, entry)   .8byte DOTSYM(entry)
 #include <asm/syscall_table_64.h>
 #undef __SYSCALL
 #else
-#define __SYSCALL(nr, entry, nargs) .long entry
+#define __SYSCALL(nr, entry)   .long entry
 #include <asm/syscall_table_32.h>
 #undef __SYSCALL
 #endif
@@ -38,7 +38,7 @@ sys_call_table:
 .globl compat_sys_call_table
 compat_sys_call_table:
 #define compat_sys_sigsuspend  sys_sigsuspend
-#define __SYSCALL(nr, entry, nargs) .8byte DOTSYM(entry)
+#define __SYSCALL(nr, entry)   .8byte DOTSYM(entry)
 #include <asm/syscall_table_c32.h>
 #undef __SYSCALL
 #endif
index 3646affae963fec23ae64a26350adce79c5a4879..bc0503ef9c9ceb74d0dd38a6c00650f87bdec420 100644 (file)
@@ -57,7 +57,6 @@
 #include <linux/irq_work.h>
 #include <linux/clk-provider.h>
 #include <linux/suspend.h>
-#include <linux/rtc.h>
 #include <linux/sched/cputime.h>
 #include <linux/processor.h>
 #include <asm/trace.h>
index b1725ad3e13d4978f3b0317a2b762d49b4dd08a8..858503775c58338f799215caa1e8099ddc526ec5 100644 (file)
@@ -23,6 +23,7 @@ obj-$(CONFIG_TRACING)                 += trace_clock.o
 obj-$(CONFIG_PPC64)                    += $(obj64-y)
 obj-$(CONFIG_PPC32)                    += $(obj32-y)
 
-# Disable GCOV & sanitizers in odd or sensitive code
+# Disable GCOV, KCOV & sanitizers in odd or sensitive code
 GCOV_PROFILE_ftrace.o := n
+KCOV_INSTRUMENT_ftrace.o := n
 UBSAN_SANITIZE_ftrace.o := n
index 32476a6e4e9cea8e8a09bb8da4b80cbd8af5fef8..01b1224add4987b7707adfcbe8de53ce93911403 100644 (file)
@@ -229,7 +229,7 @@ ftrace_call:
         *  - r0, r11 & r12 are free
         */
 livepatch_handler:
-       CURRENT_THREAD_INFO(r12, r1)
+       ld      r12, PACA_THREAD_INFO(r13)
 
        /* Allocate 3 x 8 bytes */
        ld      r11, TI_livepatch_sp(r12)
@@ -256,7 +256,7 @@ livepatch_handler:
         * restore it.
         */
 
-       CURRENT_THREAD_INFO(r12, r1)
+       ld      r12, PACA_THREAD_INFO(r13)
 
        ld      r11, TI_livepatch_sp(r12)
 
@@ -273,7 +273,7 @@ livepatch_handler:
        ld      r2,  -24(r11)
 
        /* Pop livepatch stack frame */
-       CURRENT_THREAD_INFO(r12, r1)
+       ld      r12, PACA_THREAD_INFO(r13)
        subi    r11, r11, 24
        std     r11, TI_livepatch_sp(r12)
 
index 64936b60d5216e185670ebbb3424be395fe740d9..a21200c6aaeaaf99dfe6ee1d4190354189f46fb2 100644 (file)
@@ -257,24 +257,17 @@ static int __die(const char *str, struct pt_regs *regs, long err)
 {
        printk("Oops: %s, sig: %ld [#%d]\n", str, err, ++die_counter);
 
-       if (IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN))
-               printk("LE ");
-       else
-               printk("BE ");
-
-       if (IS_ENABLED(CONFIG_PREEMPT))
-               pr_cont("PREEMPT ");
-
-       if (IS_ENABLED(CONFIG_SMP))
-               pr_cont("SMP NR_CPUS=%d ", NR_CPUS);
-
-       if (debug_pagealloc_enabled())
-               pr_cont("DEBUG_PAGEALLOC ");
-
-       if (IS_ENABLED(CONFIG_NUMA))
-               pr_cont("NUMA ");
-
-       pr_cont("%s\n", ppc_md.name ? ppc_md.name : "");
+       printk("%s PAGE_SIZE=%luK%s%s%s%s%s%s%s %s\n",
+              IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN) ? "LE" : "BE",
+              PAGE_SIZE / 1024,
+              early_radix_enabled() ? " MMU=Radix" : "",
+              early_mmu_has_feature(MMU_FTR_HPTE_TABLE) ? " MMU=Hash" : "",
+              IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "",
+              IS_ENABLED(CONFIG_SMP) ? " SMP" : "",
+              IS_ENABLED(CONFIG_SMP) ? (" NR_CPUS=" __stringify(NR_CPUS)) : "",
+              debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "",
+              IS_ENABLED(CONFIG_NUMA) ? " NUMA" : "",
+              ppc_md.name ? ppc_md.name : "");
 
        if (notify_die(DIE_OOPS, str, regs, err, 255, SIGSEGV) == NOTIFY_STOP)
                return 1;
@@ -376,16 +369,101 @@ void _exception(int signr, struct pt_regs *regs, int code, unsigned long addr)
        force_sig_fault(signr, code, (void __user *)addr, current);
 }
 
+/*
+ * The interrupt architecture has a quirk in that the HV interrupts excluding
+ * the NMIs (0x100 and 0x200) do not clear MSR[RI] at entry. The first thing
+ * that an interrupt handler must do is save off a GPR into a scratch register,
+ * and all interrupts on POWERNV (HV=1) use the HSPRG1 register as scratch.
+ * Therefore an NMI can clobber an HV interrupt's live HSPRG1 without noticing
+ * that it is non-reentrant, which leads to random data corruption.
+ *
+ * The solution is for NMI interrupts in HV mode to check if they originated
+ * from these critical HV interrupt regions. If so, then mark them not
+ * recoverable.
+ *
+ * An alternative would be for HV NMIs to use SPRG for scratch to avoid the
+ * HSPRG1 clobber; however, this would cause the guest's SPRG to be clobbered. Linux
+ * guests should always have MSR[RI]=0 when their scratch SPRG is in use, so
+ * that would work. However any other guest OS that may have the SPRG live
+ * and MSR[RI]=1 could encounter silent corruption.
+ *
+ * Builds that do not support KVM could take this second option to increase
+ * the recoverability of NMIs.
+ */
+void hv_nmi_check_nonrecoverable(struct pt_regs *regs)
+{
+#ifdef CONFIG_PPC_POWERNV
+       unsigned long kbase = (unsigned long)_stext;
+       unsigned long nip = regs->nip;
+
+       if (!(regs->msr & MSR_RI))
+               return;
+       if (!(regs->msr & MSR_HV))
+               return;
+       if (regs->msr & MSR_PR)
+               return;
+
+       /*
+        * Now test if the interrupt has hit a range that may be using
+        * HSPRG1 without having RI=0 (i.e., an HSRR interrupt). The
+        * problem ranges all run un-relocated. Test real and virt modes
+        * at the same time by dropping the high bit of the nip (virt mode
+        * entry points still have the +0x4000 offset).
+        */
+       nip &= ~0xc000000000000000ULL;
+       if ((nip >= 0x500 && nip < 0x600) || (nip >= 0x4500 && nip < 0x4600))
+               goto nonrecoverable;
+       if ((nip >= 0x980 && nip < 0xa00) || (nip >= 0x4980 && nip < 0x4a00))
+               goto nonrecoverable;
+       if ((nip >= 0xe00 && nip < 0xec0) || (nip >= 0x4e00 && nip < 0x4ec0))
+               goto nonrecoverable;
+       if ((nip >= 0xf80 && nip < 0xfa0) || (nip >= 0x4f80 && nip < 0x4fa0))
+               goto nonrecoverable;
+
+       /* Trampoline code runs un-relocated so subtract kbase. */
+       if (nip >= (unsigned long)(start_real_trampolines - kbase) &&
+                       nip < (unsigned long)(end_real_trampolines - kbase))
+               goto nonrecoverable;
+       if (nip >= (unsigned long)(start_virt_trampolines - kbase) &&
+                       nip < (unsigned long)(end_virt_trampolines - kbase))
+               goto nonrecoverable;
+       return;
+
+nonrecoverable:
+       regs->msr &= ~MSR_RI;
+#endif
+}
+
 void system_reset_exception(struct pt_regs *regs)
 {
+       unsigned long hsrr0, hsrr1;
+       bool nested = in_nmi();
+       bool saved_hsrrs = false;
+
        /*
         * Avoid crashes in case of nested NMI exceptions. Recoverability
         * is determined by RI and in_nmi
         */
-       bool nested = in_nmi();
        if (!nested)
                nmi_enter();
 
+       /*
+        * System reset can interrupt code where HSRRs are live and MSR[RI]=1.
+        * The system reset interrupt itself may clobber HSRRs (e.g., to call
+        * OPAL), so save them here and restore them before returning.
+        *
+        * Machine checks don't need to save HSRRs, as the real mode handler
+        * is careful to avoid them, and the regular handler is not delivered
+        * as an NMI.
+        */
+       if (cpu_has_feature(CPU_FTR_HVMODE)) {
+               hsrr0 = mfspr(SPRN_HSRR0);
+               hsrr1 = mfspr(SPRN_HSRR1);
+               saved_hsrrs = true;
+       }
+
+       hv_nmi_check_nonrecoverable(regs);
+
        __this_cpu_inc(irq_stat.sreset_irqs);
 
        /* See if any machine dependent calls */
@@ -433,6 +511,11 @@ out:
        if (!(regs->msr & MSR_RI))
                nmi_panic(regs, "Unrecoverable System Reset");
 
+       if (saved_hsrrs) {
+               mtspr(SPRN_HSRR0, hsrr0);
+               mtspr(SPRN_HSRR1, hsrr1);
+       }
+
        if (!nested)
                nmi_exit();
 
@@ -763,15 +846,15 @@ void machine_check_exception(struct pt_regs *regs)
        if (check_io_access(regs))
                goto bail;
 
-       /* Must die if the interrupt is not recoverable */
-       if (!(regs->msr & MSR_RI))
-               nmi_panic(regs, "Unrecoverable Machine check");
-
        if (!nested)
                nmi_exit();
 
        die("Machine check", regs, SIGBUS);
 
+       /* Must die if the interrupt is not recoverable */
+       if (!(regs->msr & MSR_RI))
+               nmi_panic(regs, "Unrecoverable Machine check");
+
        return;
 
 bail:
@@ -1542,8 +1625,8 @@ bail:
 
 void StackOverflow(struct pt_regs *regs)
 {
-       printk(KERN_CRIT "Kernel stack overflow in process %p, r1=%lx\n",
-              current, regs->gpr[1]);
+       pr_crit("Kernel stack overflow in process %s[%d], r1=%lx\n",
+               current->comm, task_pid_nr(current), regs->gpr[1]);
        debugger(regs);
        show_regs(regs);
        panic("kernel stack overflow");
index 7cc38b5b58bce600d578a4e500a400b510572589..8db4891acdafebcd2b8f85c57f53c73627850017 100644 (file)
@@ -74,7 +74,7 @@ void __init udbg_early_init(void)
 #endif
 
 #ifdef CONFIG_PPC_EARLY_DEBUG
-       console_loglevel = 10;
+       console_loglevel = CONSOLE_LOGLEVEL_DEBUG;
 
        register_early_udbg_console();
 #endif
index 50112d4473bb9a9c3c6f1302f4a400d55f79db74..ce199f6e4256d2593d54d43e1652a1e6bb051d98 100644 (file)
@@ -23,6 +23,7 @@ targets := $(obj-vdso32) vdso32.so vdso32.so.dbg
 obj-vdso32 := $(addprefix $(obj)/, $(obj-vdso32))
 
 GCOV_PROFILE := n
+KCOV_INSTRUMENT := n
 UBSAN_SANITIZE := n
 
 ccflags-y := -shared -fno-common -fno-builtin
index 69cecb3462697a88107d8e966e2775fafe81628a..28e7d112aa2fc4a7ab268109df01a0b416d25a9b 100644 (file)
@@ -9,6 +9,7 @@ targets := $(obj-vdso64) vdso64.so vdso64.so.dbg
 obj-vdso64 := $(addprefix $(obj)/, $(obj-vdso64))
 
 GCOV_PROFILE := n
+KCOV_INSTRUMENT := n
 UBSAN_SANITIZE := n
 
 ccflags-y := -shared -fno-common -fno-builtin
index ad1c77f71f5474e91219c36fa827b5079b60be62..060a1acd7c6d77de65de007ede891d4486bdf521 100644 (file)
 #include <asm/cache.h>
 #include <asm/thread_info.h>
 
-#if defined(CONFIG_STRICT_KERNEL_RWX) && !defined(CONFIG_PPC32)
-#define STRICT_ALIGN_SIZE      (1 << 24)
-#else
-#define STRICT_ALIGN_SIZE      PAGE_SIZE
-#endif
+#define STRICT_ALIGN_SIZE      (1 << CONFIG_DATA_SHIFT)
+#define ETEXT_ALIGN_SIZE       (1 << CONFIG_ETEXT_SHIFT)
 
 ENTRY(_stext)
 
@@ -86,11 +83,11 @@ SECTIONS
 
 #ifdef CONFIG_PPC64
        /*
-        * BLOCK(0) overrides the default output section alignment because
+        * ALIGN(0) overrides the default output section alignment because
         * this needs to start right after .head.text in order for fixed
         * section placement to work.
         */
-       .text BLOCK(0) : AT(ADDR(.text) - LOAD_OFFSET) {
+       .text ALIGN(0) : AT(ADDR(.text) - LOAD_OFFSET) {
 #ifdef CONFIG_LD_HEAD_STUB_CATCH
                KEEP(*(.linker_stub_catch));
                . = . ;
@@ -131,7 +128,7 @@ SECTIONS
 
        } :kernel
 
-       . = ALIGN(PAGE_SIZE);
+       . = ALIGN(ETEXT_ALIGN_SIZE);
        _etext = .;
        PROVIDE32 (etext = .);
 
@@ -319,6 +316,7 @@ SECTIONS
                *(.sdata2)
                *(.got.plt) *(.got)
                *(.plt)
+               *(.branch_lt)
        }
 #else
        .data : AT(ADDR(.data) - LOAD_OFFSET) {
index 64f1135e77323b180cfdb34e3d42a01cde13803c..3223aec88b2cc314c0f0b0485c1b9049f278d3d7 100644 (file)
@@ -10,11 +10,6 @@ common-objs-y = $(KVM)/kvm_main.o $(KVM)/eventfd.o
 common-objs-$(CONFIG_KVM_VFIO) += $(KVM)/vfio.o
 common-objs-$(CONFIG_KVM_MMIO) += $(KVM)/coalesced_mmio.o
 
-CFLAGS_e500_mmu.o := -I.
-CFLAGS_e500_mmu_host.o := -I.
-CFLAGS_emulate.o  := -I.
-CFLAGS_emulate_loadstore.o  := -I.
-
 common-objs-y += powerpc.o emulate_loadstore.o
 obj-$(CONFIG_KVM_EXIT_TIMING) += timing.o
 obj-$(CONFIG_KVM_BOOK3S_HANDLER) += book3s_exports.o
index bd1a677dd9e4dacf9f25ee6b17bb23416836d4dc..9a7dadbe1f1733a8f7cf60a08363b447bea1bf51 100644 (file)
@@ -192,6 +192,13 @@ void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec)
 }
 EXPORT_SYMBOL_GPL(kvmppc_book3s_queue_irqprio);
 
+void kvmppc_core_queue_machine_check(struct kvm_vcpu *vcpu, ulong flags)
+{
+       /* might as well deliver this straight away */
+       kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_MACHINE_CHECK, flags);
+}
+EXPORT_SYMBOL_GPL(kvmppc_core_queue_machine_check);
+
 void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags)
 {
        /* might as well deliver this straight away */
index 5a066fc299e17189bd587ad20ddba86b956c6614..a3d5318f5d1e9a9e2654525cfa65059a232982b8 100644 (file)
@@ -1215,6 +1215,22 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
                r = RESUME_GUEST;
                break;
        case BOOK3S_INTERRUPT_MACHINE_CHECK:
+               /* Print the MCE event to host console. */
+               machine_check_print_event_info(&vcpu->arch.mce_evt, false, true);
+
+               /*
+                * If the guest can do FWNMI, exit to userspace so it can
+                * deliver a FWNMI to the guest.
+                * Otherwise we synthesize a machine check for the guest
+                * so that it knows that the machine check occurred.
+                */
+               if (!vcpu->kvm->arch.fwnmi_enabled) {
+                       ulong flags = vcpu->arch.shregs.msr & 0x083c0000;
+                       kvmppc_core_queue_machine_check(vcpu, flags);
+                       r = RESUME_GUEST;
+                       break;
+               }
+
                /* Exit to guest with KVM_EXIT_NMI as exit reason */
                run->exit_reason = KVM_EXIT_NMI;
                run->hw.hardware_exit_reason = vcpu->arch.trap;
@@ -1227,8 +1243,6 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
                        run->flags |= KVM_RUN_PPC_NMI_DISP_NOT_RECOV;
 
                r = RESUME_HOST;
-               /* Print the MCE event to host console. */
-               machine_check_print_event_info(&vcpu->arch.mce_evt, false);
                break;
        case BOOK3S_INTERRUPT_PROGRAM:
        {
@@ -1392,7 +1406,7 @@ static int kvmppc_handle_nested_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
                /* Pass the machine check to the L1 guest */
                r = RESUME_HOST;
                /* Print the MCE event to host console. */
-               machine_check_print_event_info(&vcpu->arch.mce_evt, false);
+               machine_check_print_event_info(&vcpu->arch.mce_evt, false, true);
                break;
        /*
         * We get these next two if the guest accesses a page which it thinks
@@ -3455,6 +3469,7 @@ int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
        unsigned long host_dscr = mfspr(SPRN_DSCR);
        unsigned long host_tidr = mfspr(SPRN_TIDR);
        unsigned long host_iamr = mfspr(SPRN_IAMR);
+       unsigned long host_amr = mfspr(SPRN_AMR);
        s64 dec;
        u64 tb;
        int trap, save_pmu;
@@ -3571,13 +3586,15 @@ int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
 
        mtspr(SPRN_PSPB, 0);
        mtspr(SPRN_WORT, 0);
-       mtspr(SPRN_AMR, 0);
        mtspr(SPRN_UAMOR, 0);
        mtspr(SPRN_DSCR, host_dscr);
        mtspr(SPRN_TIDR, host_tidr);
        mtspr(SPRN_IAMR, host_iamr);
        mtspr(SPRN_PSPB, 0);
 
+       if (host_amr != vcpu->arch.amr)
+               mtspr(SPRN_AMR, host_amr);
+
        msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX);
        store_fp_state(&vcpu->arch.fp);
 #ifdef CONFIG_ALTIVEC
index e3f738eb1cacc73d97ccaf4d10c770716ae469b8..64b5011475c78a1fa6146a697b189040e3b88b29 100644 (file)
@@ -24,6 +24,7 @@
 #include <linux/compiler.h>
 #include <asm/paca.h>
 #include <asm/hmi.h>
+#include <asm/processor.h>
 
 void wait_for_subcore_guest_exit(void)
 {
index 0787f12c1a1bef3e22e10ed3aafe25f842a2b838..8c24c3bea0bfcd5f0e2041542d042c84fc62afdf 100644 (file)
@@ -66,10 +66,8 @@ static void reload_slb(struct kvm_vcpu *vcpu)
 /*
  * On POWER7, see if we can handle a machine check that occurred inside
  * the guest in real mode, without switching to the host partition.
- *
- * Returns: 0 => exit guest, 1 => deliver machine check to guest
  */
-static long kvmppc_realmode_mc_power7(struct kvm_vcpu *vcpu)
+static void kvmppc_realmode_mc_power7(struct kvm_vcpu *vcpu)
 {
        unsigned long srr1 = vcpu->arch.shregs.msr;
        struct machine_check_event mce_evt;
@@ -111,52 +109,24 @@ static long kvmppc_realmode_mc_power7(struct kvm_vcpu *vcpu)
        }
 
        /*
-        * See if we have already handled the condition in the linux host.
-        * We assume that if the condition is recovered then linux host
-        * will have generated an error log event that we will pick
-        * up and log later.
-        * Don't release mce event now. We will queue up the event so that
-        * we can log the MCE event info on host console.
+        * Now get the event and stash it in the vcpu struct so it can
+        * be handled by the primary thread in virtual mode.  We can't
+        * call machine_check_queue_event() here if we are running on
+        * an offline secondary thread.
         */
-       if (!get_mce_event(&mce_evt, MCE_EVENT_DONTRELEASE))
-               goto out;
-
-       if (mce_evt.version == MCE_V1 &&
-           (mce_evt.severity == MCE_SEV_NO_ERROR ||
-            mce_evt.disposition == MCE_DISPOSITION_RECOVERED))
-               handled = 1;
-
-out:
-       /*
-        * For guest that supports FWNMI capability, hook the MCE event into
-        * vcpu structure. We are going to exit the guest with KVM_EXIT_NMI
-        * exit reason. On our way to exit we will pull this event from vcpu
-        * structure and print it from thread 0 of the core/subcore.
-        *
-        * For guest that does not support FWNMI capability (old QEMU):
-        * We are now going to enter the guest either through machine check
-        * interrupt (for unhandled errors) or will continue from
-        * current HSRR0 (for handled errors) in guest. Hence
-        * queue up the event so that we can log it from host console later.
-        */
-       if (vcpu->kvm->arch.fwnmi_enabled) {
-               /*
-                * Hook up the mce event on to vcpu structure.
-                * First clear the old event.
-                */
-               memset(&vcpu->arch.mce_evt, 0, sizeof(vcpu->arch.mce_evt));
-               if (get_mce_event(&mce_evt, MCE_EVENT_RELEASE)) {
-                       vcpu->arch.mce_evt = mce_evt;
-               }
-       } else
-               machine_check_queue_event();
+       if (get_mce_event(&mce_evt, MCE_EVENT_RELEASE)) {
+               if (handled && mce_evt.version == MCE_V1)
+                       mce_evt.disposition = MCE_DISPOSITION_RECOVERED;
+       } else {
+               memset(&mce_evt, 0, sizeof(mce_evt));
+       }
 
-       return handled;
+       vcpu->arch.mce_evt = mce_evt;
 }
 
-long kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu)
+void kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu)
 {
-       return kvmppc_realmode_mc_power7(vcpu);
+       kvmppc_realmode_mc_power7(vcpu);
 }
 
 /* Check if dynamic split is in force and return subcore size accordingly. */
index 9b8d50a7cbaf6428a654e811c31754d3770d9f66..25043b50cb30a4b7d5dcde8e45ba61bc3b3e547f 100644 (file)
@@ -58,6 +58,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
 #define STACK_SLOT_DAWR                (SFS-56)
 #define STACK_SLOT_DAWRX       (SFS-64)
 #define STACK_SLOT_HFSCR       (SFS-72)
+#define STACK_SLOT_AMR         (SFS-80)
+#define STACK_SLOT_UAMOR       (SFS-88)
 /* the following is used by the P9 short path */
 #define STACK_SLOT_NVGPRS      (SFS-152)       /* 18 gprs */
 
@@ -726,11 +728,9 @@ BEGIN_FTR_SECTION
        mfspr   r5, SPRN_TIDR
        mfspr   r6, SPRN_PSSCR
        mfspr   r7, SPRN_PID
-       mfspr   r8, SPRN_IAMR
        std     r5, STACK_SLOT_TID(r1)
        std     r6, STACK_SLOT_PSSCR(r1)
        std     r7, STACK_SLOT_PID(r1)
-       std     r8, STACK_SLOT_IAMR(r1)
        mfspr   r5, SPRN_HFSCR
        std     r5, STACK_SLOT_HFSCR(r1)
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
@@ -738,11 +738,18 @@ BEGIN_FTR_SECTION
        mfspr   r5, SPRN_CIABR
        mfspr   r6, SPRN_DAWR
        mfspr   r7, SPRN_DAWRX
+       mfspr   r8, SPRN_IAMR
        std     r5, STACK_SLOT_CIABR(r1)
        std     r6, STACK_SLOT_DAWR(r1)
        std     r7, STACK_SLOT_DAWRX(r1)
+       std     r8, STACK_SLOT_IAMR(r1)
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 
+       mfspr   r5, SPRN_AMR
+       std     r5, STACK_SLOT_AMR(r1)
+       mfspr   r6, SPRN_UAMOR
+       std     r6, STACK_SLOT_UAMOR(r1)
+
 BEGIN_FTR_SECTION
        /* Set partition DABR */
        /* Do this before re-enabling PMU to avoid P7 DABR corruption bug */
@@ -1631,22 +1638,25 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
        mtspr   SPRN_PSPB, r0
        mtspr   SPRN_WORT, r0
 BEGIN_FTR_SECTION
-       mtspr   SPRN_IAMR, r0
        mtspr   SPRN_TCSCR, r0
        /* Set MMCRS to 1<<31 to freeze and disable the SPMC counters */
        li      r0, 1
        sldi    r0, r0, 31
        mtspr   SPRN_MMCRS, r0
 END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
-8:
 
-       /* Save and reset AMR and UAMOR before turning on the MMU */
+       /* Save and restore AMR, IAMR and UAMOR before turning on the MMU */
+       ld      r8, STACK_SLOT_IAMR(r1)
+       mtspr   SPRN_IAMR, r8
+
+8:     /* Power7 jumps back in here */
        mfspr   r5,SPRN_AMR
        mfspr   r6,SPRN_UAMOR
        std     r5,VCPU_AMR(r9)
        std     r6,VCPU_UAMOR(r9)
-       li      r6,0
-       mtspr   SPRN_AMR,r6
+       ld      r5,STACK_SLOT_AMR(r1)
+       ld      r6,STACK_SLOT_UAMOR(r1)
+       mtspr   SPRN_AMR, r5
        mtspr   SPRN_UAMOR, r6
 
        /* Switch DSCR back to host value */
@@ -1746,11 +1756,9 @@ BEGIN_FTR_SECTION
        ld      r5, STACK_SLOT_TID(r1)
        ld      r6, STACK_SLOT_PSSCR(r1)
        ld      r7, STACK_SLOT_PID(r1)
-       ld      r8, STACK_SLOT_IAMR(r1)
        mtspr   SPRN_TIDR, r5
        mtspr   SPRN_PSSCR, r6
        mtspr   SPRN_PID, r7
-       mtspr   SPRN_IAMR, r8
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
 
 #ifdef CONFIG_PPC_RADIX_MMU
@@ -2826,49 +2834,15 @@ kvm_cede_exit:
 #endif /* CONFIG_KVM_XICS */
 3:     b       guest_exit_cont
 
-       /* Try to handle a machine check in real mode */
+       /* Try to do machine check recovery in real mode */
 machine_check_realmode:
        mr      r3, r9          /* get vcpu pointer */
        bl      kvmppc_realmode_machine_check
        nop
+       /* all machine checks go to virtual mode for further handling */
        ld      r9, HSTATE_KVM_VCPU(r13)
        li      r12, BOOK3S_INTERRUPT_MACHINE_CHECK
-       /*
-        * For the guest that is FWNMI capable, deliver all the MCE errors
-        * (handled/unhandled) by exiting the guest with KVM_EXIT_NMI exit
-        * reason. This new approach injects machine check errors in guest
-        * address space to guest with additional information in the form
-        * of RTAS event, thus enabling guest kernel to suitably handle
-        * such errors.
-        *
-        * For the guest that is not FWNMI capable (old QEMU) fallback
-        * to old behaviour for backward compatibility:
-        * Deliver unhandled/fatal (e.g. UE) MCE errors to guest either
-        * through machine check interrupt (set HSRR0 to 0x200).
-        * For handled errors (no-fatal), just go back to guest execution
-        * with current HSRR0.
-        * if we receive machine check with MSR(RI=0) then deliver it to
-        * guest as machine check causing guest to crash.
-        */
-       ld      r11, VCPU_MSR(r9)
-       rldicl. r0, r11, 64-MSR_HV_LG, 63 /* check if it happened in HV mode */
-       bne     guest_exit_cont         /* if so, exit to host */
-       /* Check if guest is capable of handling NMI exit */
-       ld      r10, VCPU_KVM(r9)
-       lbz     r10, KVM_FWNMI(r10)
-       cmpdi   r10, 1                  /* FWNMI capable? */
-       beq     guest_exit_cont         /* if so, exit with KVM_EXIT_NMI. */
-
-       /* if not, fall through for backward compatibility. */
-       andi.   r10, r11, MSR_RI        /* check for unrecoverable exception */
-       beq     1f                      /* Deliver a machine check to guest */
-       ld      r10, VCPU_PC(r9)
-       cmpdi   r3, 0           /* Did we handle MCE ? */
-       bne     2f      /* Continue guest execution. */
-       /* If not, deliver a machine check.  SRR0/1 are already set */
-1:     li      r10, BOOK3S_INTERRUPT_MACHINE_CHECK
-       bl      kvmppc_msr_interrupt
-2:     b       fast_interrupt_c_return
+       b       guest_exit_cont
 
 /*
  * Call C code to handle a HMI in real mode.
index 3bf9fc6fd36c30ebeae45997f69b50a250aec8aa..79396e184bcaaac71f120e0a88058bc7a3d2c182 100644 (file)
@@ -30,7 +30,8 @@ obj64-y       += copypage_64.o copyuser_64.o mem_64.o hweight_64.o \
 
 obj64-$(CONFIG_SMP)    += locks.o
 obj64-$(CONFIG_ALTIVEC)        += vmx-helper.o
-obj64-$(CONFIG_KPROBES_SANITY_TEST) += test_emulate_step.o
+obj64-$(CONFIG_KPROBES_SANITY_TEST)    += test_emulate_step.o \
+                                          test_emulate_step_exec_instr.o
 
 obj-y                  += checksum_$(BITS).o checksum_wrappers.o \
                           string_$(BITS).o memcmp_$(BITS).o
index d81568f783e5c7fe400719a84d6de082d4c21855..3d33fb509ef44e880e97d27a685708de133809f5 100644 (file)
@@ -1169,7 +1169,7 @@ static nokprobe_inline int trap_compare(long v1, long v2)
 int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
                  unsigned int instr)
 {
-       unsigned int opcode, ra, rb, rd, spr, u;
+       unsigned int opcode, ra, rb, rc, rd, spr, u;
        unsigned long int imm;
        unsigned long int val, val2;
        unsigned int mb, me, sh;
@@ -1292,6 +1292,7 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
        rd = (instr >> 21) & 0x1f;
        ra = (instr >> 16) & 0x1f;
        rb = (instr >> 11) & 0x1f;
+       rc = (instr >> 6) & 0x1f;
 
        switch (opcode) {
 #ifdef __powerpc64__
@@ -1305,6 +1306,38 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
                        goto trap;
                return 1;
 
+#ifdef __powerpc64__
+       case 4:
+               if (!cpu_has_feature(CPU_FTR_ARCH_300))
+                       return -1;
+
+               switch (instr & 0x3f) {
+               case 48:        /* maddhd */
+                       asm volatile(PPC_MADDHD(%0, %1, %2, %3) :
+                                    "=r" (op->val) : "r" (regs->gpr[ra]),
+                                    "r" (regs->gpr[rb]), "r" (regs->gpr[rc]));
+                       goto compute_done;
+
+               case 49:        /* maddhdu */
+                       asm volatile(PPC_MADDHDU(%0, %1, %2, %3) :
+                                    "=r" (op->val) : "r" (regs->gpr[ra]),
+                                    "r" (regs->gpr[rb]), "r" (regs->gpr[rc]));
+                       goto compute_done;
+
+               case 51:        /* maddld */
+                       asm volatile(PPC_MADDLD(%0, %1, %2, %3) :
+                                    "=r" (op->val) : "r" (regs->gpr[ra]),
+                                    "r" (regs->gpr[rb]), "r" (regs->gpr[rc]));
+                       goto compute_done;
+               }
+
+               /*
+                * There are other instructions from ISA 3.0 with the same
+                * primary opcode which do not have emulation support yet.
+                */
+               return -1;
+#endif
+
        case 7:         /* mulli */
                op->val = regs->gpr[ra] * (short) instr;
                goto compute_done;
@@ -1671,10 +1704,23 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
                                (int) regs->gpr[rb];
 
                        goto arith_done;
-
+#ifdef __powerpc64__
+               case 265:       /* modud */
+                       if (!cpu_has_feature(CPU_FTR_ARCH_300))
+                               return -1;
+                       op->val = regs->gpr[ra] % regs->gpr[rb];
+                       goto compute_done;
+#endif
                case 266:       /* add */
                        op->val = regs->gpr[ra] + regs->gpr[rb];
                        goto arith_done;
+
+               case 267:       /* moduw */
+                       if (!cpu_has_feature(CPU_FTR_ARCH_300))
+                               return -1;
+                       op->val = (unsigned int) regs->gpr[ra] %
+                               (unsigned int) regs->gpr[rb];
+                       goto compute_done;
 #ifdef __powerpc64__
                case 457:       /* divdu */
                        op->val = regs->gpr[ra] / regs->gpr[rb];
@@ -1695,6 +1741,42 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
                                (int) regs->gpr[rb];
                        goto arith_done;
 
+               case 755:       /* darn */
+                       if (!cpu_has_feature(CPU_FTR_ARCH_300))
+                               return -1;
+                       switch (ra & 0x3) {
+                       case 0:
+                               /* 32-bit conditioned */
+                               asm volatile(PPC_DARN(%0, 0) : "=r" (op->val));
+                               goto compute_done;
+
+                       case 1:
+                               /* 64-bit conditioned */
+                               asm volatile(PPC_DARN(%0, 1) : "=r" (op->val));
+                               goto compute_done;
+
+                       case 2:
+                               /* 64-bit raw */
+                               asm volatile(PPC_DARN(%0, 2) : "=r" (op->val));
+                               goto compute_done;
+                       }
+
+                       return -1;
+#ifdef __powerpc64__
+               case 777:       /* modsd */
+                       if (!cpu_has_feature(CPU_FTR_ARCH_300))
+                               return -1;
+                       op->val = (long int) regs->gpr[ra] %
+                               (long int) regs->gpr[rb];
+                       goto compute_done;
+#endif
+               case 779:       /* modsw */
+                       if (!cpu_has_feature(CPU_FTR_ARCH_300))
+                               return -1;
+                       op->val = (int) regs->gpr[ra] %
+                               (int) regs->gpr[rb];
+                       goto compute_done;
+
 
 /*
  * Logical instructions
@@ -1764,6 +1846,20 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
                case 506:       /* popcntd */
                        do_popcnt(regs, op, regs->gpr[rd], 64);
                        goto logical_done_nocc;
+#endif
+               case 538:       /* cnttzw */
+                       if (!cpu_has_feature(CPU_FTR_ARCH_300))
+                               return -1;
+                       val = (unsigned int) regs->gpr[rd];
+                       op->val = (val ? __builtin_ctz(val) : 32);
+                       goto logical_done;
+#ifdef __powerpc64__
+               case 570:       /* cnttzd */
+                       if (!cpu_has_feature(CPU_FTR_ARCH_300))
+                               return -1;
+                       val = regs->gpr[rd];
+                       op->val = (val ? __builtin_ctzl(val) : 64);
+                       goto logical_done;
 #endif
                case 922:       /* extsh */
                        op->val = (signed short) regs->gpr[rd];
@@ -1866,6 +1962,20 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
                                op->xerval &= ~XER_CA;
                        set_ca32(op, op->xerval & XER_CA);
                        goto logical_done;
+
+               case 890:       /* extswsli with sh_5 = 0 */
+               case 891:       /* extswsli with sh_5 = 1 */
+                       if (!cpu_has_feature(CPU_FTR_ARCH_300))
+                               return -1;
+                       op->type = COMPUTE + SETREG;
+                       sh = rb | ((instr & 2) << 4);
+                       val = (signed int) regs->gpr[rd];
+                       if (sh)
+                               op->val = ROTATE(val, sh) & MASK64(0, 63 - sh);
+                       else
+                               op->val = val;
+                       goto logical_done;
+
 #endif /* __powerpc64__ */
 
 /*
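
Note on the new ISA 3.0 cases above: a couple of them follow architected corner-case
behaviour rather than plain C semantics. A minimal user-space sketch of the cnttzw and
modsw semantics being emulated (hypothetical helper names, not part of the patch):

	#include <stdint.h>

	/* cnttzw: count trailing zeros of the low 32 bits; an all-zero input yields 32 */
	static inline uint64_t cnttzw(uint64_t rs)
	{
		uint32_t val = (uint32_t)rs;

		return val ? (uint64_t)__builtin_ctz(val) : 32;
	}

	/* modsw: signed 32-bit remainder, matching the (int) casts in analyse_instr();
	 * a zero divisor is left undefined here, as the hardware result is also undefined */
	static inline int32_t modsw(uint64_t ra, uint64_t rb)
	{
		return (int32_t)ra % (int32_t)rb;
	}
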
index 6c47daa616149f745bf96702157ed9a38405437a..9992c1ea7a1d6e2c1595ffbdd7b7ea69bd484a5b 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Simple sanity test for emulate_step load/store instructions.
+ * Simple sanity tests for instruction emulation infrastructure.
  *
  * Copyright IBM Corp. 2016
  *
@@ -14,6 +14,7 @@
 #include <linux/ptrace.h>
 #include <asm/sstep.h>
 #include <asm/ppc-opcode.h>
+#include <asm/code-patching.h>
 
 #define IMM_L(i)               ((uintptr_t)(i) & 0xffff)
 
                                        ___PPC_RA(a) | ___PPC_RB(b))
 #define TEST_LXVD2X(s, a, b)   (PPC_INST_LXVD2X | VSX_XX1((s), R##a, R##b))
 #define TEST_STXVD2X(s, a, b)  (PPC_INST_STXVD2X | VSX_XX1((s), R##a, R##b))
+#define TEST_ADD(t, a, b)      (PPC_INST_ADD | ___PPC_RT(t) |          \
+                                       ___PPC_RA(a) | ___PPC_RB(b))
+#define TEST_ADD_DOT(t, a, b)  (PPC_INST_ADD | ___PPC_RT(t) |          \
+                                       ___PPC_RA(a) | ___PPC_RB(b) | 0x1)
+#define TEST_ADDC(t, a, b)     (PPC_INST_ADDC | ___PPC_RT(t) |         \
+                                       ___PPC_RA(a) | ___PPC_RB(b))
+#define TEST_ADDC_DOT(t, a, b) (PPC_INST_ADDC | ___PPC_RT(t) |         \
+                                       ___PPC_RA(a) | ___PPC_RB(b) | 0x1)
+
+#define MAX_SUBTESTS   16
 
+#define IGNORE_GPR(n)  (0x1UL << (n))
+#define IGNORE_XER     (0x1UL << 32)
+#define IGNORE_CCR     (0x1UL << 33)
 
 static void __init init_pt_regs(struct pt_regs *regs)
 {
@@ -72,9 +86,15 @@ static void __init init_pt_regs(struct pt_regs *regs)
        msr_cached = true;
 }
 
-static void __init show_result(char *ins, char *result)
+static void __init show_result(char *mnemonic, char *result)
 {
-       pr_info("%-14s : %s\n", ins, result);
+       pr_info("%-14s : %s\n", mnemonic, result);
+}
+
+static void __init show_result_with_descr(char *mnemonic, char *descr,
+                                         char *result)
+{
+       pr_info("%-14s : %-50s %s\n", mnemonic, descr, result);
 }
 
 static void __init test_ld(void)
@@ -426,7 +446,7 @@ static void __init test_lxvd2x_stxvd2x(void)
 }
 #endif /* CONFIG_VSX */
 
-static int __init test_emulate_step(void)
+static void __init run_tests_load_store(void)
 {
        test_ld();
        test_lwz();
@@ -437,6 +457,513 @@ static int __init test_emulate_step(void)
        test_lfdx_stfdx();
        test_lvx_stvx();
        test_lxvd2x_stxvd2x();
+}
+
+struct compute_test {
+       char *mnemonic;
+       struct {
+               char *descr;
+               unsigned long flags;
+               unsigned int instr;
+               struct pt_regs regs;
+       } subtests[MAX_SUBTESTS + 1];
+};
+
+static struct compute_test compute_tests[] = {
+       {
+               .mnemonic = "nop",
+               .subtests = {
+                       {
+                               .descr = "R0 = LONG_MAX",
+                               .instr = PPC_INST_NOP,
+                               .regs = {
+                                       .gpr[0] = LONG_MAX,
+                               }
+                       }
+               }
+       },
+       {
+               .mnemonic = "add",
+               .subtests = {
+                       {
+                               .descr = "RA = LONG_MIN, RB = LONG_MIN",
+                               .instr = TEST_ADD(20, 21, 22),
+                               .regs = {
+                                       .gpr[21] = LONG_MIN,
+                                       .gpr[22] = LONG_MIN,
+                               }
+                       },
+                       {
+                               .descr = "RA = LONG_MIN, RB = LONG_MAX",
+                               .instr = TEST_ADD(20, 21, 22),
+                               .regs = {
+                                       .gpr[21] = LONG_MIN,
+                                       .gpr[22] = LONG_MAX,
+                               }
+                       },
+                       {
+                               .descr = "RA = LONG_MAX, RB = LONG_MAX",
+                               .instr = TEST_ADD(20, 21, 22),
+                               .regs = {
+                                       .gpr[21] = LONG_MAX,
+                                       .gpr[22] = LONG_MAX,
+                               }
+                       },
+                       {
+                               .descr = "RA = ULONG_MAX, RB = ULONG_MAX",
+                               .instr = TEST_ADD(20, 21, 22),
+                               .regs = {
+                                       .gpr[21] = ULONG_MAX,
+                                       .gpr[22] = ULONG_MAX,
+                               }
+                       },
+                       {
+                               .descr = "RA = ULONG_MAX, RB = 0x1",
+                               .instr = TEST_ADD(20, 21, 22),
+                               .regs = {
+                                       .gpr[21] = ULONG_MAX,
+                                       .gpr[22] = 0x1,
+                               }
+                       },
+                       {
+                               .descr = "RA = INT_MIN, RB = INT_MIN",
+                               .instr = TEST_ADD(20, 21, 22),
+                               .regs = {
+                                       .gpr[21] = INT_MIN,
+                                       .gpr[22] = INT_MIN,
+                               }
+                       },
+                       {
+                               .descr = "RA = INT_MIN, RB = INT_MAX",
+                               .instr = TEST_ADD(20, 21, 22),
+                               .regs = {
+                                       .gpr[21] = INT_MIN,
+                                       .gpr[22] = INT_MAX,
+                               }
+                       },
+                       {
+                               .descr = "RA = INT_MAX, RB = INT_MAX",
+                               .instr = TEST_ADD(20, 21, 22),
+                               .regs = {
+                                       .gpr[21] = INT_MAX,
+                                       .gpr[22] = INT_MAX,
+                               }
+                       },
+                       {
+                               .descr = "RA = UINT_MAX, RB = UINT_MAX",
+                               .instr = TEST_ADD(20, 21, 22),
+                               .regs = {
+                                       .gpr[21] = UINT_MAX,
+                                       .gpr[22] = UINT_MAX,
+                               }
+                       },
+                       {
+                               .descr = "RA = UINT_MAX, RB = 0x1",
+                               .instr = TEST_ADD(20, 21, 22),
+                               .regs = {
+                                       .gpr[21] = UINT_MAX,
+                                       .gpr[22] = 0x1,
+                               }
+                       }
+               }
+       },
+       {
+               .mnemonic = "add.",
+               .subtests = {
+                       {
+                               .descr = "RA = LONG_MIN, RB = LONG_MIN",
+                               .flags = IGNORE_CCR,
+                               .instr = TEST_ADD_DOT(20, 21, 22),
+                               .regs = {
+                                       .gpr[21] = LONG_MIN,
+                                       .gpr[22] = LONG_MIN,
+                               }
+                       },
+                       {
+                               .descr = "RA = LONG_MIN, RB = LONG_MAX",
+                               .instr = TEST_ADD_DOT(20, 21, 22),
+                               .regs = {
+                                       .gpr[21] = LONG_MIN,
+                                       .gpr[22] = LONG_MAX,
+                               }
+                       },
+                       {
+                               .descr = "RA = LONG_MAX, RB = LONG_MAX",
+                               .flags = IGNORE_CCR,
+                               .instr = TEST_ADD_DOT(20, 21, 22),
+                               .regs = {
+                                       .gpr[21] = LONG_MAX,
+                                       .gpr[22] = LONG_MAX,
+                               }
+                       },
+                       {
+                               .descr = "RA = ULONG_MAX, RB = ULONG_MAX",
+                               .instr = TEST_ADD_DOT(20, 21, 22),
+                               .regs = {
+                                       .gpr[21] = ULONG_MAX,
+                                       .gpr[22] = ULONG_MAX,
+                               }
+                       },
+                       {
+                               .descr = "RA = ULONG_MAX, RB = 0x1",
+                               .instr = TEST_ADD_DOT(20, 21, 22),
+                               .regs = {
+                                       .gpr[21] = ULONG_MAX,
+                                       .gpr[22] = 0x1,
+                               }
+                       },
+                       {
+                               .descr = "RA = INT_MIN, RB = INT_MIN",
+                               .instr = TEST_ADD_DOT(20, 21, 22),
+                               .regs = {
+                                       .gpr[21] = INT_MIN,
+                                       .gpr[22] = INT_MIN,
+                               }
+                       },
+                       {
+                               .descr = "RA = INT_MIN, RB = INT_MAX",
+                               .instr = TEST_ADD_DOT(20, 21, 22),
+                               .regs = {
+                                       .gpr[21] = INT_MIN,
+                                       .gpr[22] = INT_MAX,
+                               }
+                       },
+                       {
+                               .descr = "RA = INT_MAX, RB = INT_MAX",
+                               .instr = TEST_ADD_DOT(20, 21, 22),
+                               .regs = {
+                                       .gpr[21] = INT_MAX,
+                                       .gpr[22] = INT_MAX,
+                               }
+                       },
+                       {
+                               .descr = "RA = UINT_MAX, RB = UINT_MAX",
+                               .instr = TEST_ADD_DOT(20, 21, 22),
+                               .regs = {
+                                       .gpr[21] = UINT_MAX,
+                                       .gpr[22] = UINT_MAX,
+                               }
+                       },
+                       {
+                               .descr = "RA = UINT_MAX, RB = 0x1",
+                               .instr = TEST_ADD_DOT(20, 21, 22),
+                               .regs = {
+                                       .gpr[21] = UINT_MAX,
+                                       .gpr[22] = 0x1,
+                               }
+                       }
+               }
+       },
+       {
+               .mnemonic = "addc",
+               .subtests = {
+                       {
+                               .descr = "RA = LONG_MIN, RB = LONG_MIN",
+                               .instr = TEST_ADDC(20, 21, 22),
+                               .regs = {
+                                       .gpr[21] = LONG_MIN,
+                                       .gpr[22] = LONG_MIN,
+                               }
+                       },
+                       {
+                               .descr = "RA = LONG_MIN, RB = LONG_MAX",
+                               .instr = TEST_ADDC(20, 21, 22),
+                               .regs = {
+                                       .gpr[21] = LONG_MIN,
+                                       .gpr[22] = LONG_MAX,
+                               }
+                       },
+                       {
+                               .descr = "RA = LONG_MAX, RB = LONG_MAX",
+                               .instr = TEST_ADDC(20, 21, 22),
+                               .regs = {
+                                       .gpr[21] = LONG_MAX,
+                                       .gpr[22] = LONG_MAX,
+                               }
+                       },
+                       {
+                               .descr = "RA = ULONG_MAX, RB = ULONG_MAX",
+                               .instr = TEST_ADDC(20, 21, 22),
+                               .regs = {
+                                       .gpr[21] = ULONG_MAX,
+                                       .gpr[22] = ULONG_MAX,
+                               }
+                       },
+                       {
+                               .descr = "RA = ULONG_MAX, RB = 0x1",
+                               .instr = TEST_ADDC(20, 21, 22),
+                               .regs = {
+                                       .gpr[21] = ULONG_MAX,
+                                       .gpr[22] = 0x1,
+                               }
+                       },
+                       {
+                               .descr = "RA = INT_MIN, RB = INT_MIN",
+                               .instr = TEST_ADDC(20, 21, 22),
+                               .regs = {
+                                       .gpr[21] = INT_MIN,
+                                       .gpr[22] = INT_MIN,
+                               }
+                       },
+                       {
+                               .descr = "RA = INT_MIN, RB = INT_MAX",
+                               .instr = TEST_ADDC(20, 21, 22),
+                               .regs = {
+                                       .gpr[21] = INT_MIN,
+                                       .gpr[22] = INT_MAX,
+                               }
+                       },
+                       {
+                               .descr = "RA = INT_MAX, RB = INT_MAX",
+                               .instr = TEST_ADDC(20, 21, 22),
+                               .regs = {
+                                       .gpr[21] = INT_MAX,
+                                       .gpr[22] = INT_MAX,
+                               }
+                       },
+                       {
+                               .descr = "RA = UINT_MAX, RB = UINT_MAX",
+                               .instr = TEST_ADDC(20, 21, 22),
+                               .regs = {
+                                       .gpr[21] = UINT_MAX,
+                                       .gpr[22] = UINT_MAX,
+                               }
+                       },
+                       {
+                               .descr = "RA = UINT_MAX, RB = 0x1",
+                               .instr = TEST_ADDC(20, 21, 22),
+                               .regs = {
+                                       .gpr[21] = UINT_MAX,
+                                       .gpr[22] = 0x1,
+                               }
+                       },
+                       {
+                               .descr = "RA = LONG_MIN | INT_MIN, RB = LONG_MIN | INT_MIN",
+                               .instr = TEST_ADDC(20, 21, 22),
+                               .regs = {
+                                       .gpr[21] = LONG_MIN | (uint)INT_MIN,
+                                       .gpr[22] = LONG_MIN | (uint)INT_MIN,
+                               }
+                       }
+               }
+       },
+       {
+               .mnemonic = "addc.",
+               .subtests = {
+                       {
+                               .descr = "RA = LONG_MIN, RB = LONG_MIN",
+                               .flags = IGNORE_CCR,
+                               .instr = TEST_ADDC_DOT(20, 21, 22),
+                               .regs = {
+                                       .gpr[21] = LONG_MIN,
+                                       .gpr[22] = LONG_MIN,
+                               }
+                       },
+                       {
+                               .descr = "RA = LONG_MIN, RB = LONG_MAX",
+                               .instr = TEST_ADDC_DOT(20, 21, 22),
+                               .regs = {
+                                       .gpr[21] = LONG_MIN,
+                                       .gpr[22] = LONG_MAX,
+                               }
+                       },
+                       {
+                               .descr = "RA = LONG_MAX, RB = LONG_MAX",
+                               .flags = IGNORE_CCR,
+                               .instr = TEST_ADDC_DOT(20, 21, 22),
+                               .regs = {
+                                       .gpr[21] = LONG_MAX,
+                                       .gpr[22] = LONG_MAX,
+                               }
+                       },
+                       {
+                               .descr = "RA = ULONG_MAX, RB = ULONG_MAX",
+                               .instr = TEST_ADDC_DOT(20, 21, 22),
+                               .regs = {
+                                       .gpr[21] = ULONG_MAX,
+                                       .gpr[22] = ULONG_MAX,
+                               }
+                       },
+                       {
+                               .descr = "RA = ULONG_MAX, RB = 0x1",
+                               .instr = TEST_ADDC_DOT(20, 21, 22),
+                               .regs = {
+                                       .gpr[21] = ULONG_MAX,
+                                       .gpr[22] = 0x1,
+                               }
+                       },
+                       {
+                               .descr = "RA = INT_MIN, RB = INT_MIN",
+                               .instr = TEST_ADDC_DOT(20, 21, 22),
+                               .regs = {
+                                       .gpr[21] = INT_MIN,
+                                       .gpr[22] = INT_MIN,
+                               }
+                       },
+                       {
+                               .descr = "RA = INT_MIN, RB = INT_MAX",
+                               .instr = TEST_ADDC_DOT(20, 21, 22),
+                               .regs = {
+                                       .gpr[21] = INT_MIN,
+                                       .gpr[22] = INT_MAX,
+                               }
+                       },
+                       {
+                               .descr = "RA = INT_MAX, RB = INT_MAX",
+                               .instr = TEST_ADDC_DOT(20, 21, 22),
+                               .regs = {
+                                       .gpr[21] = INT_MAX,
+                                       .gpr[22] = INT_MAX,
+                               }
+                       },
+                       {
+                               .descr = "RA = UINT_MAX, RB = UINT_MAX",
+                               .instr = TEST_ADDC_DOT(20, 21, 22),
+                               .regs = {
+                                       .gpr[21] = UINT_MAX,
+                                       .gpr[22] = UINT_MAX,
+                               }
+                       },
+                       {
+                               .descr = "RA = UINT_MAX, RB = 0x1",
+                               .instr = TEST_ADDC_DOT(20, 21, 22),
+                               .regs = {
+                                       .gpr[21] = UINT_MAX,
+                                       .gpr[22] = 0x1,
+                               }
+                       },
+                       {
+                               .descr = "RA = LONG_MIN | INT_MIN, RB = LONG_MIN | INT_MIN",
+                               .instr = TEST_ADDC_DOT(20, 21, 22),
+                               .regs = {
+                                       .gpr[21] = LONG_MIN | (uint)INT_MIN,
+                                       .gpr[22] = LONG_MIN | (uint)INT_MIN,
+                               }
+                       }
+               }
+       }
+};
+
+static int __init emulate_compute_instr(struct pt_regs *regs,
+                                       unsigned int instr)
+{
+       struct instruction_op op;
+
+       if (!regs || !instr)
+               return -EINVAL;
+
+       if (analyse_instr(&op, regs, instr) != 1 ||
+           GETTYPE(op.type) != COMPUTE) {
+               pr_info("emulation failed, instruction = 0x%08x\n", instr);
+               return -EFAULT;
+       }
+
+       emulate_update_regs(regs, &op);
+       return 0;
+}
+
+static int __init execute_compute_instr(struct pt_regs *regs,
+                                       unsigned int instr)
+{
+       extern int exec_instr(struct pt_regs *regs);
+       extern s32 patch__exec_instr;
+
+       if (!regs || !instr)
+               return -EINVAL;
+
+       /* Patch the NOP with the actual instruction */
+       patch_instruction_site(&patch__exec_instr, instr);
+       if (exec_instr(regs)) {
+               pr_info("execution failed, instruction = 0x%08x\n", instr);
+               return -EFAULT;
+       }
+
+       return 0;
+}
+
+#define gpr_mismatch(gprn, exp, got)   \
+       pr_info("GPR%u mismatch, exp = 0x%016lx, got = 0x%016lx\n",     \
+               gprn, exp, got)
+
+#define reg_mismatch(name, exp, got)   \
+       pr_info("%s mismatch, exp = 0x%016lx, got = 0x%016lx\n",        \
+               name, exp, got)
+
+static void __init run_tests_compute(void)
+{
+       unsigned long flags;
+       struct compute_test *test;
+       struct pt_regs *regs, exp, got;
+       unsigned int i, j, k, instr;
+       bool ignore_gpr, ignore_xer, ignore_ccr, passed;
+
+       for (i = 0; i < ARRAY_SIZE(compute_tests); i++) {
+               test = &compute_tests[i];
+
+               for (j = 0; j < MAX_SUBTESTS && test->subtests[j].descr; j++) {
+                       instr = test->subtests[j].instr;
+                       flags = test->subtests[j].flags;
+                       regs = &test->subtests[j].regs;
+                       ignore_xer = flags & IGNORE_XER;
+                       ignore_ccr = flags & IGNORE_CCR;
+                       passed = true;
+
+                       memcpy(&exp, regs, sizeof(struct pt_regs));
+                       memcpy(&got, regs, sizeof(struct pt_regs));
+
+                       /*
+                        * Set a compatible MSR value explicitly to ensure
+                        * that XER and CR bits are updated appropriately
+                        */
+                       exp.msr = MSR_KERNEL;
+                       got.msr = MSR_KERNEL;
+
+                       if (emulate_compute_instr(&got, instr) ||
+                           execute_compute_instr(&exp, instr)) {
+                               passed = false;
+                               goto print;
+                       }
+
+                       /* Verify GPR values */
+                       for (k = 0; k < 32; k++) {
+                               ignore_gpr = flags & IGNORE_GPR(k);
+                               if (!ignore_gpr && exp.gpr[k] != got.gpr[k]) {
+                                       passed = false;
+                                       gpr_mismatch(k, exp.gpr[k], got.gpr[k]);
+                               }
+                       }
+
+                       /* Verify LR value */
+                       if (exp.link != got.link) {
+                               passed = false;
+                               reg_mismatch("LR", exp.link, got.link);
+                       }
+
+                       /* Verify XER value */
+                       if (!ignore_xer && exp.xer != got.xer) {
+                               passed = false;
+                               reg_mismatch("XER", exp.xer, got.xer);
+                       }
+
+                       /* Verify CR value */
+                       if (!ignore_ccr && exp.ccr != got.ccr) {
+                               passed = false;
+                               reg_mismatch("CR", exp.ccr, got.ccr);
+                       }
+
+print:
+                       show_result_with_descr(test->mnemonic,
+                                              test->subtests[j].descr,
+                                              passed ? "PASS" : "FAIL");
+               }
+       }
+}
+
+static int __init test_emulate_step(void)
+{
+       printk(KERN_INFO "Running instruction emulation self-tests ...\n");
+       run_tests_load_store();
+       run_tests_compute();
 
        return 0;
 }
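
For orientation, every compute subtest above is run twice, once through the emulator and
once natively, and the resulting register files are compared. A condensed sketch of one
iteration of run_tests_compute(), assuming 'sub' points at a prepared subtest entry:

	struct pt_regs exp = sub->regs, got = sub->regs;

	/* use a known MSR so XER/CR updates behave the same in both paths */
	exp.msr = MSR_KERNEL;
	got.msr = MSR_KERNEL;

	if (emulate_compute_instr(&got, sub->instr) ||	/* analyse_instr() + emulate_update_regs() */
	    execute_compute_instr(&exp, sub->instr))	/* patched-in native execution */
		return;	/* failure already logged */

	/* otherwise compare GPRs, LR, XER and CR, honouring the IGNORE_* flags */
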
diff --git a/arch/powerpc/lib/test_emulate_step_exec_instr.S b/arch/powerpc/lib/test_emulate_step_exec_instr.S
new file mode 100644 (file)
index 0000000..1580f34
--- /dev/null
@@ -0,0 +1,150 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Non-emulated single-stepping support (currently limited to basic integer
+ * computations) used to validate the instruction emulation infrastructure.
+ *
+ * Copyright (C) 2019 IBM Corporation
+ */
+
+#include <asm/asm-offsets.h>
+#include <asm/ppc_asm.h>
+#include <asm/code-patching-asm.h>
+#include <linux/errno.h>
+
+/* int exec_instr(struct pt_regs *regs) */
+_GLOBAL(exec_instr)
+
+       /*
+        * Stack frame layout (INT_FRAME_SIZE bytes)
+        *   In-memory pt_regs  (SP + STACK_FRAME_OVERHEAD)
+        *   Scratch space      (SP + 8)
+        *   Back chain         (SP + 0)
+        */
+
+       /*
+        * Allocate a new stack frame with enough space to hold the register
+        * states in an in-memory pt_regs and also create the back chain to
+        * the caller's stack frame.
+        */
+       stdu    r1, -INT_FRAME_SIZE(r1)
+
+       /*
+        * Save non-volatile GPRs on stack. This includes TOC pointer (GPR2)
+        * and local variables (GPR14 to GPR31). The register for the pt_regs
+        * parameter (GPR3) is saved additionally to ensure that the resulting
+        * register state can still be saved even if GPR3 gets overwritten
+        * when loading the initial register state for the test instruction.
+        * The stack pointer (GPR1) and the thread pointer (GPR13) are not
+        * saved as these should not be modified anyway.
+        */
+       SAVE_2GPRS(2, r1)
+       SAVE_NVGPRS(r1)
+
+       /*
+        * Save LR on stack to ensure that the return address is available
+        * even if it gets overwritten by the test instruction.
+        */
+       mflr    r0
+       std     r0, _LINK(r1)
+
+       /*
+        * Save CR on stack. For simplicity, the entire register is saved
+        * even though only fields 2 to 4 are non-volatile.
+        */
+       mfcr    r0
+       std     r0, _CCR(r1)
+
+       /*
+        * Load register state for the test instruction without touching the
+        * critical non-volatile registers. The register state is passed as a
+        * pointer to a pt_regs instance.
+        */
+       subi    r31, r3, GPR0
+
+       /* Load LR from pt_regs */
+       ld      r0, _LINK(r31)
+       mtlr    r0
+
+       /* Load CR from pt_regs */
+       ld      r0, _CCR(r31)
+       mtcr    r0
+
+       /* Load XER from pt_regs */
+       ld      r0, _XER(r31)
+       mtxer   r0
+
+       /* Load GPRs from pt_regs */
+       REST_GPR(0, r31)
+       REST_10GPRS(2, r31)
+       REST_GPR(12, r31)
+       REST_NVGPRS(r31)
+
+       /* Placeholder for the test instruction */
+1:     nop
+       patch_site 1b patch__exec_instr
+
+       /*
+        * Since GPR3 is overwritten, temporarily restore it back to its
+        * original state, i.e. the pointer to pt_regs, to ensure that the
+        * resulting register state can be saved. Before doing this, a copy
+        * of it is created in the scratch space which is used later on to
+        * save it to pt_regs.
+        */
+       std     r3, 8(r1)
+       REST_GPR(3, r1)
+
+       /* Save resulting GPR state to pt_regs */
+       subi    r3, r3, GPR0
+       SAVE_GPR(0, r3)
+       SAVE_GPR(2, r3)
+       SAVE_8GPRS(4, r3)
+       SAVE_GPR(12, r3)
+       SAVE_NVGPRS(r3)
+
+       /* Save resulting LR to pt_regs */
+       mflr    r0
+       std     r0, _LINK(r3)
+
+       /* Save resulting CR to pt_regs */
+       mfcr    r0
+       std     r0, _CCR(r3)
+
+       /* Save resulting XER to pt_regs */
+       mfxer   r0
+       std     r0, _XER(r3)
+
+       /* Restore resulting GPR3 from scratch space and save it to pt_regs */
+       ld      r0, 8(r1)
+       std     r0, GPR3(r3)
+
+       /* Set return value to denote execution success */
+       li      r3, 0
+
+       /* Continue */
+       b       3f
+
+       /* Set return value to denote execution failure */
+2:     li      r3, -EFAULT
+
+       /* Restore the non-volatile GPRs from stack */
+3:     REST_GPR(2, r1)
+       REST_NVGPRS(r1)
+
+       /* Restore LR from stack to be able to return */
+       ld      r0, _LINK(r1)
+       mtlr    r0
+
+       /* Restore CR from stack */
+       ld      r0, _CCR(r1)
+       mtcr    r0
+
+       /* Tear down stack frame */
+       addi    r1, r1, INT_FRAME_SIZE
+
+       /* Return */
+       blr
+
+       /* Setup exception table */
+       EX_TABLE(1b, 2b)
+
+_ASM_NOKPROBE_SYMBOL(exec_instr)
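
One detail worth noting in the new file above: the EX_TABLE(1b, 2b) entry makes a fault
taken on the patched-in test instruction resume at label 2, so exec_instr() returns
-EFAULT rather than taking the kernel down. The C caller (execute_compute_instr() shown
earlier) simply treats a non-zero return as a subtest failure:

	if (exec_instr(regs)) {
		pr_info("execution failed, instruction = 0x%08x\n", instr);
		return -EFAULT;
	}
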
index 494df26c59885b97fd648ccc2e26a921570d6b69..a8794032f15ff35419b2ca25bd94009d18667662 100644 (file)
@@ -17,4 +17,4 @@ obj-$(CONFIG_SPE)             += math_efp.o
 CFLAGS_fabs.o = -fno-builtin-fabs
 CFLAGS_math.o = -fno-builtin-fabs
 
-ccflags-y = -I. -Iinclude/math-emu -w
+ccflags-y = -w
index 61ac468c87c66643df2db4c40e4da18d9098e61e..b9cf6f8764b0075ee5250b0aeb52ae9367f060f2 100644 (file)
@@ -93,7 +93,7 @@ void __init MMU_init_hw(void)
 #define LARGE_PAGE_SIZE_16M    (1<<24)
 #define LARGE_PAGE_SIZE_4M     (1<<22)
 
-unsigned long __init mmu_mapin_ram(unsigned long top)
+unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top)
 {
        unsigned long v, s, mapped;
        phys_addr_t p;
index ea2b9af08a48e924794c058321e2e83c72aecc3d..aad127acdbaaa662be29222a2f48d94290bc9883 100644 (file)
@@ -170,7 +170,7 @@ void __init MMU_init_hw(void)
        flush_instruction_cache();
 }
 
-unsigned long __init mmu_mapin_ram(unsigned long top)
+unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top)
 {
        unsigned long addr;
        unsigned long memstart = memstart_addr & ~(PPC_PIN_SIZE - 1);
index bfa503cff35114b72bcee40c32365c17af935696..fe1f6443d57f23888a329febb778b13f82e1699d 100644 (file)
@@ -66,26 +66,22 @@ unsigned long p_block_mapped(phys_addr_t pa)
 void __init MMU_init_hw(void)
 {
        /* PIN up to the 3 first 8Mb after IMMR in DTLB table */
-#ifdef CONFIG_PIN_TLB_DATA
-       unsigned long ctr = mfspr(SPRN_MD_CTR) & 0xfe000000;
-       unsigned long flags = 0xf0 | MD_SPS16K | _PAGE_SH | _PAGE_DIRTY;
-#ifdef CONFIG_PIN_TLB_IMMR
-       int i = 29;
-#else
-       int i = 28;
-#endif
-       unsigned long addr = 0;
-       unsigned long mem = total_lowmem;
-
-       for (; i < 32 && mem >= LARGE_PAGE_SIZE_8M; i++) {
-               mtspr(SPRN_MD_CTR, ctr | (i << 8));
-               mtspr(SPRN_MD_EPN, (unsigned long)__va(addr) | MD_EVALID);
-               mtspr(SPRN_MD_TWC, MD_PS8MEG | MD_SVALID);
-               mtspr(SPRN_MD_RPN, addr | flags | _PAGE_PRESENT);
-               addr += LARGE_PAGE_SIZE_8M;
-               mem -= LARGE_PAGE_SIZE_8M;
+       if (IS_ENABLED(CONFIG_PIN_TLB_DATA)) {
+               unsigned long ctr = mfspr(SPRN_MD_CTR) & 0xfe000000;
+               unsigned long flags = 0xf0 | MD_SPS16K | _PAGE_SH | _PAGE_DIRTY;
+               int i = IS_ENABLED(CONFIG_PIN_TLB_IMMR) ? 29 : 28;
+               unsigned long addr = 0;
+               unsigned long mem = total_lowmem;
+
+               for (; i < 32 && mem >= LARGE_PAGE_SIZE_8M; i++) {
+                       mtspr(SPRN_MD_CTR, ctr | (i << 8));
+                       mtspr(SPRN_MD_EPN, (unsigned long)__va(addr) | MD_EVALID);
+                       mtspr(SPRN_MD_TWC, MD_PS8MEG | MD_SVALID);
+                       mtspr(SPRN_MD_RPN, addr | flags | _PAGE_PRESENT);
+                       addr += LARGE_PAGE_SIZE_8M;
+                       mem -= LARGE_PAGE_SIZE_8M;
+               }
        }
-#endif
 }
 
 static void __init mmu_mapin_immr(void)
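
Several hunks in this file (and the set_context() change further down) convert #ifdef
blocks to IS_ENABLED() tests. The pattern, shown here with a hypothetical helper purely
for illustration, keeps both branches visible to the compiler so the disabled path is
still type-checked while being optimised away:

	/* Before: the call is only compiled when the option is set */
	#ifdef CONFIG_PIN_TLB_DATA
		pin_data_tlbs();	/* hypothetical helper */
	#endif

	/* After: always compiled, folded away when the option is off */
	if (IS_ENABLED(CONFIG_PIN_TLB_DATA))
		pin_data_tlbs();
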
@@ -98,26 +94,36 @@ static void __init mmu_mapin_immr(void)
                map_kernel_page(v + offset, p + offset, PAGE_KERNEL_NCG);
 }
 
-static void __init mmu_patch_cmp_limit(s32 *site, unsigned long mapped)
+static void mmu_patch_cmp_limit(s32 *site, unsigned long mapped)
 {
        modify_instruction_site(site, 0xffff, (unsigned long)__va(mapped) >> 16);
 }
 
-unsigned long __init mmu_mapin_ram(unsigned long top)
+static void mmu_patch_addis(s32 *site, long simm)
+{
+       unsigned int instr = *(unsigned int *)patch_site_addr(site);
+
+       instr &= 0xffff0000;
+       instr |= ((unsigned long)simm) >> 16;
+       patch_instruction_site(site, instr);
+}
+
+unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top)
 {
        unsigned long mapped;
 
        if (__map_without_ltlbs) {
                mapped = 0;
                mmu_mapin_immr();
-#ifndef CONFIG_PIN_TLB_IMMR
-               patch_instruction_site(&patch__dtlbmiss_immr_jmp, PPC_INST_NOP);
-#endif
-#ifndef CONFIG_PIN_TLB_TEXT
-               mmu_patch_cmp_limit(&patch__itlbmiss_linmem_top, 0);
-#endif
+               if (!IS_ENABLED(CONFIG_PIN_TLB_IMMR))
+                       patch_instruction_site(&patch__dtlbmiss_immr_jmp, PPC_INST_NOP);
+               if (!IS_ENABLED(CONFIG_PIN_TLB_TEXT))
+                       mmu_patch_cmp_limit(&patch__itlbmiss_linmem_top, 0);
        } else {
                mapped = top & ~(LARGE_PAGE_SIZE_8M - 1);
+               if (!IS_ENABLED(CONFIG_PIN_TLB_TEXT))
+                       mmu_patch_cmp_limit(&patch__itlbmiss_linmem_top,
+                                           _ALIGN(__pa(_einittext), 8 << 20));
        }
 
        mmu_patch_cmp_limit(&patch__dtlbmiss_linmem_top, mapped);
@@ -138,6 +144,26 @@ unsigned long __init mmu_mapin_ram(unsigned long top)
        return mapped;
 }
 
+void mmu_mark_initmem_nx(void)
+{
+       if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX) && CONFIG_ETEXT_SHIFT < 23)
+               mmu_patch_addis(&patch__itlbmiss_linmem_top8,
+                               -((long)_etext & ~(LARGE_PAGE_SIZE_8M - 1)));
+       if (!IS_ENABLED(CONFIG_PIN_TLB_TEXT))
+               mmu_patch_cmp_limit(&patch__itlbmiss_linmem_top, __pa(_etext));
+}
+
+#ifdef CONFIG_STRICT_KERNEL_RWX
+void mmu_mark_rodata_ro(void)
+{
+       if (CONFIG_DATA_SHIFT < 23)
+               mmu_patch_addis(&patch__dtlbmiss_romem_top8,
+                               -__pa(((unsigned long)_sinittext) &
+                                     ~(LARGE_PAGE_SIZE_8M - 1)));
+       mmu_patch_addis(&patch__dtlbmiss_romem_top, -__pa(_sinittext));
+}
+#endif
+
 void __init setup_initial_memory_limit(phys_addr_t first_memblock_base,
                                       phys_addr_t first_memblock_size)
 {
@@ -146,8 +172,8 @@ void __init setup_initial_memory_limit(phys_addr_t first_memblock_base,
         */
        BUG_ON(first_memblock_base != 0);
 
-       /* 8xx can only access 24MB at the moment */
-       memblock_set_current_limit(min_t(u64, first_memblock_size, 0x01800000));
+       /* 8xx can only access 32MB at the moment */
+       memblock_set_current_limit(min_t(u64, first_memblock_size, 0x02000000));
 }
 
 /*
@@ -162,14 +188,11 @@ void set_context(unsigned long id, pgd_t *pgd)
 {
        s16 offset = (s16)(__pa(swapper_pg_dir));
 
-#ifdef CONFIG_BDI_SWITCH
-       pgd_t   **ptr = *(pgd_t ***)(KERNELBASE + 0xf0);
-
        /* Context switch the PTE pointer for the Abatron BDI2000.
         * The PGDIR is passed as second argument.
         */
-       *(ptr + 1) = pgd;
-#endif
+       if (IS_ENABLED(CONFIG_BDI_SWITCH))
+               abatron_pteptrs[1] = pgd;
 
        /* Register M_TWB will contain base address of level 1 table minus the
         * lower part of the kernel PGDIR base address, so that all accesses to
index f965fc33a8b77ce47714c3a57fc8ccd12deae11e..d52ec118e09db842283a1bd9607f727711a92f35 100644 (file)
@@ -45,13 +45,10 @@ obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
 obj-$(CONFIG_HIGHMEM)          += highmem.o
 obj-$(CONFIG_PPC_COPRO_BASE)   += copro_fault.o
 obj-$(CONFIG_SPAPR_TCE_IOMMU)  += mmu_context_iommu.o
-obj-$(CONFIG_PPC_PTDUMP)       += dump_linuxpagetables.o
-ifdef CONFIG_PPC_PTDUMP
-obj-$(CONFIG_4xx)              += dump_linuxpagetables-generic.o
-obj-$(CONFIG_PPC_8xx)          += dump_linuxpagetables-8xx.o
-obj-$(CONFIG_PPC_BOOK3E_MMU)   += dump_linuxpagetables-generic.o
-obj-$(CONFIG_PPC_BOOK3S_32)    += dump_linuxpagetables-generic.o dump_bats.o dump_sr.o
-obj-$(CONFIG_PPC_BOOK3S_64)    += dump_linuxpagetables-book3s64.o
-endif
-obj-$(CONFIG_PPC_HTDUMP)       += dump_hashpagetable.o
+obj-$(CONFIG_PPC_PTDUMP)       += ptdump/
 obj-$(CONFIG_PPC_MEM_KEYS)     += pkeys.o
+
+# Disable kcov instrumentation on sensitive code
+# This is necessary for booting with kcov enabled on book3e machines
+KCOV_INSTRUMENT_tlb_nohash.o := n
+KCOV_INSTRUMENT_fsl_booke_mmu.o := n
index e955539686a41e0fa5c89cc79671c170c35f6d66..b5d2658c26afb783d9e2900f8f09d7be0d36f500 100644 (file)
@@ -30,6 +30,7 @@
 #include <linux/types.h>
 #include <linux/highmem.h>
 #include <linux/dma-direct.h>
+#include <linux/dma-noncoherent.h>
 #include <linux/export.h>
 
 #include <asm/tlbflush.h>
@@ -151,8 +152,8 @@ static struct ppc_vm_region *ppc_vm_region_find(struct ppc_vm_region *head, unsi
  * Allocate DMA-coherent memory space and return both the kernel remapped
  * virtual and bus address for that space.
  */
-void *__dma_nommu_alloc_coherent(struct device *dev, size_t size,
-               dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
+void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
+               gfp_t gfp, unsigned long attrs)
 {
        struct page *page;
        struct ppc_vm_region *c;
@@ -253,7 +254,7 @@ void *__dma_nommu_alloc_coherent(struct device *dev, size_t size,
 /*
  * free a page as defined by the above mapping.
  */
-void __dma_nommu_free_coherent(struct device *dev, size_t size, void *vaddr,
+void arch_dma_free(struct device *dev, size_t size, void *vaddr,
                dma_addr_t dma_handle, unsigned long attrs)
 {
        struct ppc_vm_region *c;
@@ -313,7 +314,7 @@ void __dma_nommu_free_coherent(struct device *dev, size_t size, void *vaddr,
 /*
  * make an area consistent.
  */
-void __dma_sync(void *vaddr, size_t size, int direction)
+static void __dma_sync(void *vaddr, size_t size, int direction)
 {
        unsigned long start = (unsigned long)vaddr;
        unsigned long end   = start + size;
@@ -339,7 +340,6 @@ void __dma_sync(void *vaddr, size_t size, int direction)
                break;
        }
 }
-EXPORT_SYMBOL(__dma_sync);
 
 #ifdef CONFIG_HIGHMEM
 /*
@@ -386,28 +386,42 @@ static inline void __dma_sync_page_highmem(struct page *page,
  * __dma_sync_page makes memory consistent. identical to __dma_sync, but
  * takes a struct page instead of a virtual address
  */
-void __dma_sync_page(struct page *page, unsigned long offset,
-       size_t size, int direction)
+static void __dma_sync_page(phys_addr_t paddr, size_t size, int dir)
 {
+       struct page *page = pfn_to_page(paddr >> PAGE_SHIFT);
+       unsigned offset = paddr & ~PAGE_MASK;
+
 #ifdef CONFIG_HIGHMEM
-       __dma_sync_page_highmem(page, offset, size, direction);
+       __dma_sync_page_highmem(page, offset, size, dir);
 #else
        unsigned long start = (unsigned long)page_address(page) + offset;
-       __dma_sync((void *)start, size, direction);
+       __dma_sync((void *)start, size, dir);
 #endif
 }
-EXPORT_SYMBOL(__dma_sync_page);
+
+void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr,
+               size_t size, enum dma_data_direction dir)
+{
+       __dma_sync_page(paddr, size, dir);
+}
+
+void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr,
+               size_t size, enum dma_data_direction dir)
+{
+       __dma_sync_page(paddr, size, dir);
+}
 
 /*
- * Return the PFN for a given cpu virtual address returned by
- * __dma_nommu_alloc_coherent. This is used by dma_mmap_coherent()
+ * Return the PFN for a given cpu virtual address returned by arch_dma_alloc.
  */
-unsigned long __dma_get_coherent_pfn(unsigned long cpu_addr)
+long arch_dma_coherent_to_pfn(struct device *dev, void *vaddr,
+               dma_addr_t dma_addr)
 {
        /* This should always be populated, so we don't test every
         * level. If that fails, we'll have a nice crash which
         * will be as good as a BUG_ON()
         */
+       unsigned long cpu_addr = (unsigned long)vaddr;
        pgd_t *pgd = pgd_offset_k(cpu_addr);
        pud_t *pud = pud_offset(pgd, cpu_addr);
        pmd_t *pmd = pmd_offset(pud, cpu_addr);
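
The hunks above retire the powerpc-private __dma_nommu_* entry points in favour of the
generic dma-noncoherent hooks. For orientation, these are the callbacks the file now
implements (prototypes as declared by linux/dma-noncoherent.h and linux/dma-mapping.h at
the time of this series):

	void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
			     gfp_t gfp, unsigned long attrs);
	void arch_dma_free(struct device *dev, size_t size, void *vaddr,
			   dma_addr_t dma_handle, unsigned long attrs);
	void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr,
				      size_t size, enum dma_data_direction dir);
	void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr,
				   size_t size, enum dma_data_direction dir);
	long arch_dma_coherent_to_pfn(struct device *dev, void *vaddr, dma_addr_t dma_addr);
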
index 080d49b26c3a7d3b06326482f4f7b2d4402de5ad..210cbc1faf6389c50b8480c050cb0c40a474f70a 100644 (file)
@@ -221,7 +221,7 @@ unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx, bool dryrun)
 #error "LOWMEM_CAM_NUM must be less than NUM_TLBCAMS"
 #endif
 
-unsigned long __init mmu_mapin_ram(unsigned long top)
+unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top)
 {
        return tlbcam_addrs[tlbcam_index - 1].limit - PAGE_OFFSET + 1;
 }
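
The same one-line signature change appears in each of the 32-bit MMU backends touched
above (the two earlier hunks, the 8xx code, and the FSL BookE variant here). The shared
prototype they implement is, in effect:

	/* 'base' is new in this series; it is unused in the hunks shown above */
	unsigned long mmu_mapin_ram(unsigned long base, unsigned long top);
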
index 1e2df3e9f9ea0f466ce25a43fec05f9dfa82a35d..1f13494efb2bfa9b50996ce6fda0711b9e933008 100644 (file)
@@ -47,14 +47,13 @@ mmu_hash_lock:
  * Returns to the caller if the access is illegal or there is no
  * mapping for the address.  Otherwise it places an appropriate PTE
  * in the hash table and returns from the exception.
- * Uses r0, r3 - r8, r10, ctr, lr.
+ * Uses r0, r3 - r6, r8, r10, ctr, lr.
  */
        .text
 _GLOBAL(hash_page)
-       tophys(r7,0)                    /* gets -KERNELBASE into r7 */
 #ifdef CONFIG_SMP
-       addis   r8,r7,mmu_hash_lock@h
-       ori     r8,r8,mmu_hash_lock@l
+       lis     r8, (mmu_hash_lock - PAGE_OFFSET)@h
+       ori     r8, r8, (mmu_hash_lock - PAGE_OFFSET)@l
        lis     r0,0x0fff
        b       10f
 11:    lwz     r6,0(r8)
@@ -70,14 +69,13 @@ _GLOBAL(hash_page)
        /* Get PTE (linux-style) and check access */
        lis     r0,KERNELBASE@h         /* check if kernel address */
        cmplw   0,r4,r0
-       mfspr   r8,SPRN_SPRG_THREAD     /* current task's THREAD (phys) */
        ori     r3,r3,_PAGE_USER|_PAGE_PRESENT /* test low addresses as user */
-       lwz     r5,PGDIR(r8)            /* virt page-table root */
+       mfspr   r5, SPRN_SPRG_PGDIR     /* virt page-table root */
        blt+    112f                    /* assume user more likely */
        lis     r5,swapper_pg_dir@ha    /* if kernel address, use */
        addi    r5,r5,swapper_pg_dir@l  /* kernel page table */
        rlwimi  r3,r9,32-12,29,29       /* MSR_PR -> _PAGE_USER */
-112:   add     r5,r5,r7                /* convert to phys addr */
+112:   tophys(r5, r5)
 #ifndef CONFIG_PTE_64BIT
        rlwimi  r5,r4,12,20,29          /* insert top 10 bits of address */
        lwz     r8,0(r5)                /* get pmd entry */
@@ -144,25 +142,24 @@ retry:
 
 #ifdef CONFIG_SMP
        eieio
-       addis   r8,r7,mmu_hash_lock@ha
+       lis     r8, (mmu_hash_lock - PAGE_OFFSET)@ha
        li      r0,0
-       stw     r0,mmu_hash_lock@l(r8)
+       stw     r0, (mmu_hash_lock - PAGE_OFFSET)@l(r8)
 #endif
 
        /* Return from the exception */
        lwz     r5,_CTR(r11)
        mtctr   r5
        lwz     r0,GPR0(r11)
-       lwz     r7,GPR7(r11)
        lwz     r8,GPR8(r11)
        b       fast_exception_return
 
 #ifdef CONFIG_SMP
 hash_page_out:
        eieio
-       addis   r8,r7,mmu_hash_lock@ha
+       lis     r8, (mmu_hash_lock - PAGE_OFFSET)@ha
        li      r0,0
-       stw     r0,mmu_hash_lock@l(r8)
+       stw     r0, (mmu_hash_lock - PAGE_OFFSET)@l(r8)
        blr
 #endif /* CONFIG_SMP */
 
@@ -186,8 +183,7 @@ _GLOBAL(add_hash_page)
        add     r3,r3,r0                /* note create_hpte trims to 24 bits */
 
 #ifdef CONFIG_SMP
-       CURRENT_THREAD_INFO(r8, r1)     /* use cpu number to make tag */
-       lwz     r8,TI_CPU(r8)           /* to go in mmu_hash_lock */
+       lwz     r8,TASK_CPU(r2)         /* to go in mmu_hash_lock */
        oris    r8,r8,12
 #endif /* CONFIG_SMP */
 
@@ -208,11 +204,9 @@ _GLOBAL(add_hash_page)
        SYNC_601
        isync
 
-       tophys(r7,0)
-
 #ifdef CONFIG_SMP
-       addis   r6,r7,mmu_hash_lock@ha
-       addi    r6,r6,mmu_hash_lock@l
+       lis     r6, (mmu_hash_lock - PAGE_OFFSET)@ha
+       addi    r6, r6, (mmu_hash_lock - PAGE_OFFSET)@l
 10:    lwarx   r0,0,r6                 /* take the mmu_hash_lock */
        cmpi    0,r0,0
        bne-    11f
@@ -257,8 +251,8 @@ _GLOBAL(add_hash_page)
 
 9:
 #ifdef CONFIG_SMP
-       addis   r6,r7,mmu_hash_lock@ha
-       addi    r6,r6,mmu_hash_lock@l
+       lis     r6, (mmu_hash_lock - PAGE_OFFSET)@ha
+       addi    r6, r6, (mmu_hash_lock - PAGE_OFFSET)@l
        eieio
        li      r0,0
        stw     r0,0(r6)                /* clear mmu_hash_lock */
@@ -278,10 +272,8 @@ _GLOBAL(add_hash_page)
  * It is designed to be called with the MMU either on or off.
  * r3 contains the VSID, r4 contains the virtual address,
  * r5 contains the linux PTE, r6 contains the old value of the
- * linux PTE (before setting _PAGE_HASHPTE) and r7 contains the
- * offset to be added to addresses (0 if the MMU is on,
- * -KERNELBASE if it is off).  r10 contains the upper half of
- * the PTE if CONFIG_PTE_64BIT.
+ * linux PTE (before setting _PAGE_HASHPTE). r10 contains the
+ * upper half of the PTE if CONFIG_PTE_64BIT.
  * On SMP, the caller should have the mmu_hash_lock held.
  * We assume that the caller has (or will) set the _PAGE_HASHPTE
  * bit in the linux PTE in memory.  The value passed in r6 should
@@ -342,7 +334,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT)
        patch_site      1f, patch__hash_page_A1
        patch_site      2f, patch__hash_page_A2
        /* Get the address of the primary PTE group in the hash table (r3) */
-0:     addis   r0,r7,Hash_base@h       /* base address of hash table */
+0:     lis     r0, (Hash_base - PAGE_OFFSET)@h /* base address of hash table */
 1:     rlwimi  r0,r3,LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT    /* VSID -> hash */
 2:     rlwinm  r3,r4,20+LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* PI -> hash */
        xor     r3,r3,r0                /* make primary hash */
@@ -356,10 +348,10 @@ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT)
        beq+    10f                     /* no PTE: go look for an empty slot */
        tlbie   r4
 
-       addis   r4,r7,htab_hash_searches@ha
-       lwz     r6,htab_hash_searches@l(r4)
+       lis     r4, (htab_hash_searches - PAGE_OFFSET)@ha
+       lwz     r6, (htab_hash_searches - PAGE_OFFSET)@l(r4)
        addi    r6,r6,1                 /* count how many searches we do */
-       stw     r6,htab_hash_searches@l(r4)
+       stw     r6, (htab_hash_searches - PAGE_OFFSET)@l(r4)
 
        /* Search the primary PTEG for a PTE whose 1st (d)word matches r5 */
        mtctr   r0
@@ -391,10 +383,10 @@ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT)
        beq+    found_empty
 
        /* update counter of times that the primary PTEG is full */
-       addis   r4,r7,primary_pteg_full@ha
-       lwz     r6,primary_pteg_full@l(r4)
+       lis     r4, (primary_pteg_full - PAGE_OFFSET)@ha
+       lwz     r6, (primary_pteg_full - PAGE_OFFSET)@l(r4)
        addi    r6,r6,1
-       stw     r6,primary_pteg_full@l(r4)
+       stw     r6, (primary_pteg_full - PAGE_OFFSET)@l(r4)
 
        patch_site      0f, patch__hash_page_C
        /* Search the secondary PTEG for an empty slot */
@@ -428,8 +420,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT)
         * lockup here but that shouldn't happen
         */
 
-1:     addis   r4,r7,next_slot@ha              /* get next evict slot */
-       lwz     r6,next_slot@l(r4)
+1:     lis     r4, (next_slot - PAGE_OFFSET)@ha        /* get next evict slot */
+       lwz     r6, (next_slot - PAGE_OFFSET)@l(r4)
        addi    r6,r6,HPTE_SIZE                 /* search for candidate */
        andi.   r6,r6,7*HPTE_SIZE
        stw     r6,next_slot@l(r4)
@@ -501,8 +493,6 @@ htab_hash_searches:
  * We assume that there is a hash table in use (Hash != 0).
  */
 _GLOBAL(flush_hash_pages)
-       tophys(r7,0)
-
        /*
         * We disable interrupts here, even on UP, because we want
         * the _PAGE_HASHPTE bit to be a reliable indication of
@@ -547,11 +537,9 @@ _GLOBAL(flush_hash_pages)
        SET_V(r11)                      /* set V (valid) bit */
 
 #ifdef CONFIG_SMP
-       addis   r9,r7,mmu_hash_lock@ha
-       addi    r9,r9,mmu_hash_lock@l
-       CURRENT_THREAD_INFO(r8, r1)
-       add     r8,r8,r7
-       lwz     r8,TI_CPU(r8)
+       lis     r9, (mmu_hash_lock - PAGE_OFFSET)@ha
+       addi    r9, r9, (mmu_hash_lock - PAGE_OFFSET)@l
+       lwz     r8,TASK_CPU(r2)
        oris    r8,r8,9
 10:    lwarx   r0,0,r9
        cmpi    0,r0,0
@@ -584,7 +572,7 @@ _GLOBAL(flush_hash_pages)
        patch_site      1f, patch__flush_hash_A1
        patch_site      2f, patch__flush_hash_A2
        /* Get the address of the primary PTE group in the hash table (r3) */
-0:     addis   r8,r7,Hash_base@h       /* base address of hash table */
+0:     lis     r8, (Hash_base - PAGE_OFFSET)@h /* base address of hash table */
 1:     rlwimi  r8,r3,LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT    /* VSID -> hash */
 2:     rlwinm  r0,r4,20+LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* PI -> hash */
        xor     r8,r0,r8                /* make primary hash */
@@ -646,8 +634,7 @@ EXPORT_SYMBOL(flush_hash_pages)
  */
 _GLOBAL(_tlbie)
 #ifdef CONFIG_SMP
-       CURRENT_THREAD_INFO(r8, r1)
-       lwz     r8,TI_CPU(r8)
+       lwz     r8,TASK_CPU(r2)
        oris    r8,r8,11
        mfmsr   r10
        SYNC
@@ -684,8 +671,7 @@ _GLOBAL(_tlbie)
  */
 _GLOBAL(_tlbia)
 #if defined(CONFIG_SMP)
-       CURRENT_THREAD_INFO(r8, r1)
-       lwz     r8,TI_CPU(r8)
+       lwz     r8,TASK_CPU(r2)
        oris    r8,r8,10
        mfmsr   r10
        SYNC
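
The assembly hunks above all follow one pattern: the old code kept the virtual-to-physical offset in r7 (loaded with tophys(r7,0)) and formed physical addresses as addis/addi relative to r7; with that register dropped, physical addresses are now materialised directly as (symbol - PAGE_OFFSET) split into an @ha/@l pair. A minimal user-space-style sketch of that split (the helper names are made up for illustration, they are not kernel API):

        /* Illustrative only: how an @ha/@l pair recombines into the original
         * 32-bit address.  @l is used as a sign-extended 16-bit displacement,
         * so @ha pre-adds the carry that the sign extension will remove. */
        static unsigned int ha_part(unsigned int addr)
        {
                return (addr + 0x8000) >> 16;           /* what lis loads */
        }

        static int l_part(unsigned int addr)
        {
                return (short)(addr & 0xffff);          /* what addi/lwz adds */
        }

        /* (ha_part(x) << 16) + l_part(x) == x, e.g. for x = 0xc0018000:
         * ha_part = 0xc002, l_part = -0x8000, 0xc0020000 - 0x8000 = 0xc0018000. */
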
index bc6be44913d44959ff16a868f9050524389548bb..3d4b2399192f89359c2ca343d97b3e07a11e7eca 100644 (file)
@@ -1889,12 +1889,12 @@ static int hpt_order_set(void *data, u64 val)
        return mmu_hash_ops.resize_hpt(val);
 }
 
-DEFINE_SIMPLE_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set, "%llu\n");
 
 static int __init hash64_debugfs(void)
 {
-       if (!debugfs_create_file("hpt_order", 0600, powerpc_debugfs_root,
-                                NULL, &fops_hpt_order)) {
+       if (!debugfs_create_file_unsafe("hpt_order", 0600, powerpc_debugfs_root,
+                                       NULL, &fops_hpt_order)) {
                pr_err("lpar: unable to create hpt_order debugsfs file\n");
        }
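
For context on the conversion above: DEFINE_DEBUGFS_ATTRIBUTE() generates file operations whose open handler takes the debugfs removal protection itself, so the file is registered with debugfs_create_file_unsafe() instead of going through the full proxy of debugfs_create_file(). A minimal sketch of the same pattern with a made-up "foo" knob (foo_val, foo_get, foo_set and the file name are illustrative, not part of this patch):

        #include <linux/debugfs.h>

        static u64 foo_val;     /* hypothetical tunable */

        static int foo_get(void *data, u64 *val)
        {
                *val = foo_val;
                return 0;
        }

        static int foo_set(void *data, u64 val)
        {
                foo_val = val;
                return 0;
        }

        DEFINE_DEBUGFS_ATTRIBUTE(fops_foo, foo_get, foo_set, "%llu\n");

        static int __init foo_debugfs_init(void)
        {
                debugfs_create_file_unsafe("foo", 0600, NULL, NULL, &fops_foo);
                return 0;
        }
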
 
index 367ce3a4a50395faf636ce72fe78b2a3260b7b84..b0d9209d9a86fabb963c12c10447e59234499dbe 100644 (file)
@@ -26,7 +26,7 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
        real_pte_t rpte;
        unsigned long vpn;
        unsigned long old_pte, new_pte;
-       unsigned long rflags, pa, sz;
+       unsigned long rflags, pa;
        long slot, offset;
 
        BUG_ON(shift != mmu_psize_defs[mmu_psize].shift);
@@ -73,7 +73,6 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
                offset = PTRS_PER_PMD;
        rpte = __real_pte(__pte(old_pte), ptep, offset);
 
-       sz = ((1UL) << shift);
        if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
                /* No CPU has hugepages but lacks no execute, so we
                 * don't need to worry about that case */
index 11d9ea28a816ac2fcf88bf19910ba77298bd004d..cab06331c0c09478fbba25beac0ebb29cba5d3b1 100644 (file)
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/mm.h>
 #include <linux/hugetlb.h>
+#include <linux/security.h>
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 #include <asm/cacheflush.h>
@@ -73,7 +74,7 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
        if (addr) {
                addr = ALIGN(addr, huge_page_size(h));
                vma = find_vma(mm, addr);
-               if (high_limit - len >= addr &&
+               if (high_limit - len >= addr && addr >= mmap_min_addr &&
                    (!vma || addr + len <= vm_start_gap(vma)))
                        return addr;
        }
@@ -83,7 +84,7 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
         */
        info.flags = VM_UNMAPPED_AREA_TOPDOWN;
        info.length = len;
-       info.low_limit = PAGE_SIZE;
+       info.low_limit = max(PAGE_SIZE, mmap_min_addr);
        info.high_limit = mm->mmap_base + (high_limit - DEFAULT_MAP_WINDOW);
        info.align_mask = PAGE_MASK & ~huge_page_mask(h);
        info.align_offset = 0;
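
The two radix__hugetlb_get_unmapped_area hunks above make the topdown search respect mmap_min_addr: a caller-supplied hint below it is no longer honoured, and the bottom of the search window is clamped to it, so the search no longer produces addresses that the mmap path would then reject. A compressed sketch of the hint check being applied (the helper is illustrative, not from the patch):

        /* Illustrative only: a hint is usable iff the whole range stays
         * inside [mmap_min_addr, high_limit). */
        static bool hint_usable(unsigned long addr, unsigned long len,
                                unsigned long high_limit)
        {
                return addr >= mmap_min_addr && high_limit - len >= addr;
        }
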
index 3e59e5d64b014d11b2d542e6eddc48b6a93e1b3b..41a3513cadc907a48cb27e4d1e97638421510377 100644 (file)
@@ -108,12 +108,8 @@ static void __init MMU_setup(void)
                __map_without_bats = 1;
                __map_without_ltlbs = 1;
        }
-#ifdef CONFIG_STRICT_KERNEL_RWX
-       if (rodata_enabled) {
-               __map_without_bats = 1;
+       if (strict_kernel_rwx_enabled() && !IS_ENABLED(CONFIG_PPC_8xx))
                __map_without_ltlbs = 1;
-       }
-#endif
 }
 
 /*
index a5091c03474753111f77df8de2910152ee38abb8..a4c155af159756b85cd2432d22e5949126de0621 100644 (file)
@@ -274,7 +274,6 @@ void __ref vmemmap_free(unsigned long start, unsigned long end,
 
        for (; start < end; start += page_size) {
                unsigned long nr_pages, addr;
-               struct page *section_base;
                struct page *page;
 
                /*
@@ -290,7 +289,6 @@ void __ref vmemmap_free(unsigned long start, unsigned long end,
                        continue;
 
                page = pfn_to_page(addr >> PAGE_SHIFT);
-               section_base = pfn_to_page(vmemmap_section_start(start));
                nr_pages = 1 << page_order;
                base_pfn = PHYS_PFN(addr);
 
index 33cc6f676fa6224b76fe0f37399a30b2d852b449..f6787f90e1585bb735e38ee85a869e865486f9af 100644 (file)
@@ -69,22 +69,14 @@ pte_t *kmap_pte;
 EXPORT_SYMBOL(kmap_pte);
 pgprot_t kmap_prot;
 EXPORT_SYMBOL(kmap_prot);
-#define TOP_ZONE ZONE_HIGHMEM
 
 static inline pte_t *virt_to_kpte(unsigned long vaddr)
 {
        return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr),
                        vaddr), vaddr), vaddr);
 }
-#else
-#define TOP_ZONE ZONE_NORMAL
 #endif
 
-int page_is_ram(unsigned long pfn)
-{
-       return memblock_is_memory(__pfn_to_phys(pfn));
-}
-
 pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
                              unsigned long size, pgprot_t vma_prot)
 {
@@ -176,34 +168,6 @@ int __meminit arch_remove_memory(int nid, u64 start, u64 size,
 #endif
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
-/*
- * walk_memory_resource() needs to make sure there is no holes in a given
- * memory range.  PPC64 does not maintain the memory layout in /proc/iomem.
- * Instead it maintains it in memblock.memory structures.  Walk through the
- * memory regions, find holes and callback for contiguous regions.
- */
-int
-walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
-               void *arg, int (*func)(unsigned long, unsigned long, void *))
-{
-       struct memblock_region *reg;
-       unsigned long end_pfn = start_pfn + nr_pages;
-       unsigned long tstart, tend;
-       int ret = -1;
-
-       for_each_memblock(memory, reg) {
-               tstart = max(start_pfn, memblock_region_memory_base_pfn(reg));
-               tend = min(end_pfn, memblock_region_memory_end_pfn(reg));
-               if (tstart >= tend)
-                       continue;
-               ret = (*func)(tstart, tend - tstart, arg);
-               if (ret)
-                       break;
-       }
-       return ret;
-}
-EXPORT_SYMBOL_GPL(walk_system_ram_range);
-
 #ifndef CONFIG_NEED_MULTIPLE_NODES
 void __init mem_topology_setup(void)
 {
@@ -261,25 +225,6 @@ static int __init mark_nonram_nosave(void)
  */
 static unsigned long max_zone_pfns[MAX_NR_ZONES];
 
-/*
- * Find the least restrictive zone that is entirely below the
- * specified pfn limit.  Returns < 0 if no suitable zone is found.
- *
- * pfn_limit must be u64 because it can exceed 32 bits even on 32-bit
- * systems -- the DMA limit can be higher than any possible real pfn.
- */
-int dma_pfn_limit_to_zone(u64 pfn_limit)
-{
-       int i;
-
-       for (i = TOP_ZONE; i >= 0; i--) {
-               if (max_zone_pfns[i] <= pfn_limit)
-                       return i;
-       }
-
-       return -EPERM;
-}
-
 /*
  * paging_init() sets up the page tables - in fact we've already done this.
  */
@@ -585,3 +530,9 @@ int devmem_is_allowed(unsigned long pfn)
        return 0;
 }
 #endif /* CONFIG_STRICT_DEVMEM */
+
+/*
+ * This is defined in kernel/resource.c but only powerpc needs to export it, for
+ * the EHEA driver. Drop this when drivers/net/ethernet/ibm/ehea is removed.
+ */
+EXPORT_SYMBOL_GPL(walk_system_ram_range);
index c4a717da65eb8d6b0c2419cdf06f1a647985c6e0..74ff61dabcb1dddb1088cff3a1f6e14561b9ead5 100644 (file)
@@ -130,7 +130,7 @@ extern void wii_memory_fixups(void);
  */
 #ifdef CONFIG_PPC32
 extern void MMU_init_hw(void);
-extern unsigned long mmu_mapin_ram(unsigned long top);
+unsigned long mmu_mapin_ram(unsigned long base, unsigned long top);
 #endif
 
 #ifdef CONFIG_PPC_FSL_BOOK3E
@@ -165,3 +165,11 @@ unsigned long p_block_mapped(phys_addr_t pa);
 static inline phys_addr_t v_block_mapped(unsigned long va) { return 0; }
 static inline unsigned long p_block_mapped(phys_addr_t pa) { return 0; }
 #endif
+
+#if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_PPC_8xx)
+void mmu_mark_initmem_nx(void);
+void mmu_mark_rodata_ro(void);
+#else
+static inline void mmu_mark_initmem_nx(void) { }
+static inline void mmu_mark_rodata_ro(void) { }
+#endif
index df1e11ebbabbfb9647a7f81bfd41d98259212f03..ac49e4158e50880ee5bdc86af956f728b533f250 100644 (file)
@@ -1460,13 +1460,6 @@ static void reset_topology_timer(void)
 
 #ifdef CONFIG_SMP
 
-static void stage_topology_update(int core_id)
-{
-       cpumask_or(&cpu_associativity_changes_mask,
-               &cpu_associativity_changes_mask, cpu_sibling_mask(core_id));
-       reset_topology_timer();
-}
-
 static int dt_update_callback(struct notifier_block *nb,
                                unsigned long action, void *data)
 {
@@ -1479,7 +1472,7 @@ static int dt_update_callback(struct notifier_block *nb,
                    !of_prop_cmp(update->prop->name, "ibm,associativity")) {
                        u32 core_id;
                        of_property_read_u32(update->dn, "reg", &core_id);
-                       stage_topology_update(core_id);
+                       rc = dlpar_cpu_readd(core_id);
                        rc = NOTIFY_OK;
                }
                break;
index ded71126ce4c6784d1e8aad82fd51081e88ec44c..6e56a6240bfa4da1ee1815bb01835c596fad5e1a 100644 (file)
@@ -254,26 +254,20 @@ static void __init __mapin_ram_chunk(unsigned long offset, unsigned long top)
 
 void __init mapin_ram(void)
 {
-       unsigned long s, top;
-
-#ifndef CONFIG_WII
-       top = total_lowmem;
-       s = mmu_mapin_ram(top);
-       __mapin_ram_chunk(s, top);
-#else
-       if (!wii_hole_size) {
-               s = mmu_mapin_ram(total_lowmem);
-               __mapin_ram_chunk(s, total_lowmem);
-       } else {
-               top = wii_hole_start;
-               s = mmu_mapin_ram(top);
-               __mapin_ram_chunk(s, top);
-
-               top = memblock_end_of_DRAM();
-               s = wii_mmu_mapin_mem2(top);
-               __mapin_ram_chunk(s, top);
+       struct memblock_region *reg;
+
+       for_each_memblock(memory, reg) {
+               phys_addr_t base = reg->base;
+               phys_addr_t top = min(base + reg->size, total_lowmem);
+
+               if (base >= top)
+                       continue;
+               base = mmu_mapin_ram(base, top);
+               if (IS_ENABLED(CONFIG_BDI_SWITCH))
+                       __mapin_ram_chunk(reg->base, top);
+               else
+                       __mapin_ram_chunk(base, top);
        }
-#endif
 }
 
 /* Scan the real Linux page tables and return a PTE pointer for
@@ -359,7 +353,10 @@ void mark_initmem_nx(void)
        unsigned long numpages = PFN_UP((unsigned long)_einittext) -
                                 PFN_DOWN((unsigned long)_sinittext);
 
-       change_page_attr(page, numpages, PAGE_KERNEL);
+       if (v_block_mapped((unsigned long)_stext + 1))
+               mmu_mark_initmem_nx();
+       else
+               change_page_attr(page, numpages, PAGE_KERNEL);
 }
 
 #ifdef CONFIG_STRICT_KERNEL_RWX
@@ -368,6 +365,11 @@ void mark_rodata_ro(void)
        struct page *page;
        unsigned long numpages;
 
+       if (v_block_mapped((unsigned long)_sinittext)) {
+               mmu_mark_rodata_ro();
+               return;
+       }
+
        page = virt_to_page(_stext);
        numpages = PFN_UP((unsigned long)_etext) -
                   PFN_DOWN((unsigned long)_stext);
index 36a664f06c655f0840791291019575ad9aa2c0a9..6c8a60b1e31dd4c2dab22e49d82fcbda3f5d6d12 100644 (file)
@@ -32,6 +32,7 @@
 #include <asm/mmu.h>
 #include <asm/machdep.h>
 #include <asm/code-patching.h>
+#include <asm/sections.h>
 
 #include "mmu_decl.h"
 
@@ -73,45 +74,171 @@ unsigned long p_block_mapped(phys_addr_t pa)
        return 0;
 }
 
-unsigned long __init mmu_mapin_ram(unsigned long top)
+static int find_free_bat(void)
 {
-       unsigned long tot, bl, done;
-       unsigned long max_size = (256<<20);
+       int b;
+
+       if (cpu_has_feature(CPU_FTR_601)) {
+               for (b = 0; b < 4; b++) {
+                       struct ppc_bat *bat = BATS[b];
+
+                       if (!(bat[0].batl & 0x40))
+                               return b;
+               }
+       } else {
+               int n = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4;
+
+               for (b = 0; b < n; b++) {
+                       struct ppc_bat *bat = BATS[b];
+
+                       if (!(bat[1].batu & 3))
+                               return b;
+               }
+       }
+       return -1;
+}
+
+static unsigned int block_size(unsigned long base, unsigned long top)
+{
+       unsigned int max_size = (cpu_has_feature(CPU_FTR_601) ? 8 : 256) << 20;
+       unsigned int base_shift = (fls(base) - 1) & 31;
+       unsigned int block_shift = (fls(top - base) - 1) & 31;
+
+       return min3(max_size, 1U << base_shift, 1U << block_shift);
+}
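
block_size() above picks the largest BAT-legal block for the current position: a power of two no larger than 256 MB (8 MB on the 601), no larger than the alignment of the base address, and no larger than the range that remains. A worked example with illustrative numbers:

        /* Illustrative numbers: base = 0x01800000, top = 0x10000000
         *   base_shift  = fls(0x01800000) - 1 = 24  ->  1 << 24 = 16 MB  (alignment limit)
         *   block_shift = fls(0x0e800000) - 1 = 27  ->  1 << 27 = 128 MB (remaining-size limit)
         *   block_size() = min3(256 MB, 16 MB, 128 MB) = 16 MB
         * For base = 0, fls(0) - 1 would be -1; the "& 31" turns it into 31,
         * so alignment never constrains the very first block. */
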
+
+/*
+ * Set up one of the IBAT (block address translation) register pairs.
+ * The parameters are not checked; in particular size must be a power
+ * of 2 between 128k and 256M.
+ * Only for 603+ ...
+ */
+static void setibat(int index, unsigned long virt, phys_addr_t phys,
+                   unsigned int size, pgprot_t prot)
+{
+       unsigned int bl = (size >> 17) - 1;
+       int wimgxpp;
+       struct ppc_bat *bat = BATS[index];
+       unsigned long flags = pgprot_val(prot);
+
+       if (!cpu_has_feature(CPU_FTR_NEED_COHERENT))
+               flags &= ~_PAGE_COHERENT;
+
+       wimgxpp = (flags & _PAGE_COHERENT) | (_PAGE_EXEC ? BPP_RX : BPP_XX);
+       bat[0].batu = virt | (bl << 2) | 2; /* Vs=1, Vp=0 */
+       bat[0].batl = BAT_PHYS_ADDR(phys) | wimgxpp;
+       if (flags & _PAGE_USER)
+               bat[0].batu |= 1;       /* Vp = 1 */
+}
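
A quick note on the BL encoding used by setibat() above (and by setbat() further down): the block length field is (size >> 17) - 1, a run of ones whose length selects the block size, stored from bit 2 of the upper BAT word. A few illustrative values:

        /* bl = (size >> 17) - 1:
         *   128 KB -> 0x000,  8 MB -> 0x03f,  256 MB -> 0x7ff
         * BATU is then virt | (bl << 2) | 2, i.e. Vs = 1 and Vp = 0 unless
         * _PAGE_USER later sets the low bit (Vp). */
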
+
+static void clearibat(int index)
+{
+       struct ppc_bat *bat = BATS[index];
+
+       bat[0].batu = 0;
+       bat[0].batl = 0;
+}
+
+static unsigned long __init __mmu_mapin_ram(unsigned long base, unsigned long top)
+{
+       int idx;
+
+       while ((idx = find_free_bat()) != -1 && base != top) {
+               unsigned int size = block_size(base, top);
+
+               if (size < 128 << 10)
+                       break;
+               setbat(idx, PAGE_OFFSET + base, base, size, PAGE_KERNEL_X);
+               base += size;
+       }
+
+       return base;
+}
+
+unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top)
+{
+       int done;
+       unsigned long border = (unsigned long)__init_begin - PAGE_OFFSET;
 
        if (__map_without_bats) {
-               printk(KERN_DEBUG "RAM mapped without BATs\n");
-               return 0;
+               pr_debug("RAM mapped without BATs\n");
+               return base;
+       }
+
+       if (!strict_kernel_rwx_enabled() || base >= border || top <= border)
+               return __mmu_mapin_ram(base, top);
+
+       done = __mmu_mapin_ram(base, border);
+       if (done != border - base)
+               return done;
+
+       return done + __mmu_mapin_ram(border, top);
+}
+
+void mmu_mark_initmem_nx(void)
+{
+       int nb = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4;
+       int i;
+       unsigned long base = (unsigned long)_stext - PAGE_OFFSET;
+       unsigned long top = (unsigned long)_etext - PAGE_OFFSET;
+       unsigned long size;
+
+       if (cpu_has_feature(CPU_FTR_601))
+               return;
+
+       for (i = 0; i < nb - 1 && base < top && top - base > (128 << 10);) {
+               size = block_size(base, top);
+               setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_TEXT);
+               base += size;
        }
+       if (base < top) {
+               size = block_size(base, top);
+               size = max(size, 128UL << 10);
+               if ((top - base) > size) {
+                       if (strict_kernel_rwx_enabled())
+                               pr_warn("Kernel _etext not properly aligned\n");
+                       size <<= 1;
+               }
+               setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_TEXT);
+               base += size;
+       }
+       for (; i < nb; i++)
+               clearibat(i);
 
-       /* Set up BAT2 and if necessary BAT3 to cover RAM. */
+       update_bats();
 
-       /* Make sure we don't map a block larger than the
-          smallest alignment of the physical address. */
-       tot = top;
-       for (bl = 128<<10; bl < max_size; bl <<= 1) {
-               if (bl * 2 > tot)
+       for (i = TASK_SIZE >> 28; i < 16; i++) {
+               /* Do not set NX on VM space for modules */
+               if (IS_ENABLED(CONFIG_MODULES) &&
+                   (VMALLOC_START & 0xf0000000) == i << 28)
                        break;
+               mtsrin(mfsrin(i << 28) | 0x10000000, i << 28);
        }
+}
+
+void mmu_mark_rodata_ro(void)
+{
+       int nb = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4;
+       int i;
+
+       if (cpu_has_feature(CPU_FTR_601))
+               return;
+
+       for (i = 0; i < nb; i++) {
+               struct ppc_bat *bat = BATS[i];
 
-       setbat(2, PAGE_OFFSET, 0, bl, PAGE_KERNEL_X);
-       done = (unsigned long)bat_addrs[2].limit - PAGE_OFFSET + 1;
-       if ((done < tot) && !bat_addrs[3].limit) {
-               /* use BAT3 to cover a bit more */
-               tot -= done;
-               for (bl = 128<<10; bl < max_size; bl <<= 1)
-                       if (bl * 2 > tot)
-                               break;
-               setbat(3, PAGE_OFFSET+done, done, bl, PAGE_KERNEL_X);
-               done = (unsigned long)bat_addrs[3].limit - PAGE_OFFSET + 1;
+               if (bat_addrs[i].start < (unsigned long)__init_begin)
+                       bat[1].batl = (bat[1].batl & ~BPP_RW) | BPP_RX;
        }
 
-       return done;
+       update_bats();
 }
 
 /*
  * Set up one of the I/D BAT (block address translation) register pairs.
  * The parameters are not checked; in particular size must be a power
  * of 2 between 128k and 256M.
+ * On 603+, only set IBAT when _PAGE_EXEC is set
  */
 void __init setbat(int index, unsigned long virt, phys_addr_t phys,
                   unsigned int size, pgprot_t prot)
@@ -138,11 +265,12 @@ void __init setbat(int index, unsigned long virt, phys_addr_t phys,
                        bat[1].batu |= 1;       /* Vp = 1 */
                if (flags & _PAGE_GUARDED) {
                        /* G bit must be zero in IBATs */
-                       bat[0].batu = bat[0].batl = 0;
-               } else {
-                       /* make IBAT same as DBAT */
-                       bat[0] = bat[1];
+                       flags &= ~_PAGE_EXEC;
                }
+               if (flags & _PAGE_EXEC)
+                       bat[0] = bat[1];
+               else
+                       bat[0].batu = bat[0].batl = 0;
        } else {
                /* 601 cpu */
                if (bl > BL_8M)
@@ -230,7 +358,8 @@ void __init MMU_init_hw(void)
        if (lg_n_hpteg > 16)
                mb2 = 16 - LG_HPTEG_SIZE;
 
-       modify_instruction_site(&patch__hash_page_A0, 0xffff, (unsigned int)Hash >> 16);
+       modify_instruction_site(&patch__hash_page_A0, 0xffff,
+                               ((unsigned int)Hash - PAGE_OFFSET) >> 16);
        modify_instruction_site(&patch__hash_page_A1, 0x7c0, mb << 6);
        modify_instruction_site(&patch__hash_page_A2, 0x7c0, mb2 << 6);
        modify_instruction_site(&patch__hash_page_B, 0xffff, hmask);
@@ -239,7 +368,8 @@ void __init MMU_init_hw(void)
        /*
         * Patch up the instructions in hashtable.S:flush_hash_page
         */
-       modify_instruction_site(&patch__flush_hash_A0, 0xffff, (unsigned int)Hash >> 16);
+       modify_instruction_site(&patch__flush_hash_A0, 0xffff,
+                               ((unsigned int)Hash - PAGE_OFFSET) >> 16);
        modify_instruction_site(&patch__flush_hash_A1, 0x7c0, mb << 6);
        modify_instruction_site(&patch__flush_hash_A2, 0x7c0, mb2 << 6);
        modify_instruction_site(&patch__flush_hash_B, 0xffff, hmask);
similarity index 97%
rename from arch/powerpc/mm/dump_linuxpagetables-8xx.c
rename to arch/powerpc/mm/ptdump/8xx.c
index ab9e3f24db2f419b9615c78b0762fed0063121c6..9e2d8e847d6e874a9b8d3d16fb5220ff326afd07 100644 (file)
@@ -7,7 +7,7 @@
 #include <linux/kernel.h>
 #include <asm/pgtable.h>
 
-#include "dump_linuxpagetables.h"
+#include "ptdump.h"
 
 static const struct flag_info flag_array[] = {
        {
diff --git a/arch/powerpc/mm/ptdump/Makefile b/arch/powerpc/mm/ptdump/Makefile
new file mode 100644 (file)
index 0000000..712762b
--- /dev/null
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-y  += ptdump.o
+
+obj-$(CONFIG_4xx)              += shared.o
+obj-$(CONFIG_PPC_8xx)          += 8xx.o
+obj-$(CONFIG_PPC_BOOK3E_MMU)   += shared.o
+obj-$(CONFIG_PPC_BOOK3S_32)    += shared.o bats.o segment_regs.o
+obj-$(CONFIG_PPC_BOOK3S_64)    += book3s64.o hashpagetable.o
similarity index 98%
rename from arch/powerpc/mm/dump_linuxpagetables-book3s64.c
rename to arch/powerpc/mm/ptdump/book3s64.c
index ed6fcf78256eb5fa9e5a9933a8a086a005055437..0dfca72cb9bd5d4a858a5445ba6d9c9b94881711 100644 (file)
@@ -7,7 +7,7 @@
 #include <linux/kernel.h>
 #include <asm/pgtable.h>
 
-#include "dump_linuxpagetables.h"
+#include "ptdump.h"
 
 static const struct flag_info flag_array[] = {
        {
similarity index 99%
rename from arch/powerpc/mm/dump_hashpagetable.c
rename to arch/powerpc/mm/ptdump/hashpagetable.c
index 86929469504852998d030a4b2de06fe21bf6b151..b430e4e08af69d435f446fadcd2dc2242f51fa8c 100644 (file)
@@ -342,7 +342,7 @@ static unsigned long hpte_find(struct pg_state *st, unsigned long ea, int psize)
 
        /* Look in secondary table */
        if (slot == -1)
-               slot = base_hpte_find(ea, psize, true, &v, &r);
+               slot = base_hpte_find(ea, psize, false, &v, &r);
 
        /* No entry found */
        if (slot == -1)
similarity index 94%
rename from arch/powerpc/mm/dump_linuxpagetables.c
rename to arch/powerpc/mm/ptdump/ptdump.c
index 6aa41669ac1aec6b0db81da31d45b47bbade30c1..37138428ab5585d42a82a2d78e6e74705431619d 100644 (file)
@@ -28,7 +28,7 @@
 #include <asm/page.h>
 #include <asm/pgalloc.h>
 
-#include "dump_linuxpagetables.h"
+#include "ptdump.h"
 
 #ifdef CONFIG_PPC32
 #define KERN_VIRT_START        0
@@ -143,14 +143,19 @@ static void dump_addr(struct pg_state *st, unsigned long addr)
        unsigned long delta;
 
 #ifdef CONFIG_PPC64
-       seq_printf(st->seq, "0x%016lx-0x%016lx ", st->start_address, addr-1);
-       seq_printf(st->seq, "0x%016lx ", st->start_pa);
+#define REG            "0x%016lx"
 #else
-       seq_printf(st->seq, "0x%08lx-0x%08lx ", st->start_address, addr - 1);
-       seq_printf(st->seq, "0x%08lx ", st->start_pa);
+#define REG            "0x%08lx"
 #endif
 
-       delta = (addr - st->start_address) >> 10;
+       seq_printf(st->seq, REG "-" REG " ", st->start_address, addr - 1);
+       if (st->start_pa == st->last_pa && st->start_address + PAGE_SIZE != addr) {
+               seq_printf(st->seq, "[" REG "]", st->start_pa);
+               delta = PAGE_SIZE >> 10;
+       } else {
+               seq_printf(st->seq, " " REG " ", st->start_pa);
+               delta = (addr - st->start_address) >> 10;
+       }
        /* Work out what appropriate unit to use */
        while (!(delta & 1023) && unit[1]) {
                delta >>= 10;
@@ -184,7 +189,8 @@ static void note_page(struct pg_state *st, unsigned long addr,
         */
        } else if (flag != st->current_flags || level != st->level ||
                   addr >= st->marker[1].start_address ||
-                  pa != st->last_pa + PAGE_SIZE) {
+                  (pa != st->last_pa + PAGE_SIZE &&
+                   (pa != st->start_pa || st->start_pa != st->last_pa))) {
 
                /* Check the PTE flags */
                if (st->current_flags) {
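
Taken together, the two ptdump hunks above let note_page() keep extending an entry when every virtual page in it maps the same physical page (start_pa == last_pa), and dump_addr() then prints such an entry with its physical address in square brackets and a size of one page rather than a start PA plus the range size, which keeps large ranges backed by a single repeated page readable.
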
similarity index 97%
rename from arch/powerpc/mm/dump_linuxpagetables-generic.c
rename to arch/powerpc/mm/ptdump/shared.c
index 3fe98a0974c62065720bc91de2adf57b2de0834c..f7ed2f187cb01acb307020cdd515f957ff76f6bc 100644 (file)
@@ -7,7 +7,7 @@
 #include <linux/kernel.h>
 #include <asm/pgtable.h>
 
-#include "dump_linuxpagetables.h"
+#include "ptdump.h"
 
 static const struct flag_info flag_array[] = {
        {
index bc3914d54e26ef8c400c65c92b8c359c171f8207..5986df48359b0a9e6cd08b3926ae0b5b72a69d89 100644 (file)
@@ -69,6 +69,11 @@ static void assert_slb_presence(bool present, unsigned long ea)
        if (!cpu_has_feature(CPU_FTR_ARCH_206))
                return;
 
+       /*
+        * slbfee. requires bit 24 (PPC bit 39) be clear in RB. Hardware
+        * ignores all other bits from 0-27, so just clear them all.
+        */
+       ea &= ~((1UL << 28) - 1);
        asm volatile(__PPC_SLBFEE_DOT(%0, %1) : "=r"(tmp) : "r"(ea) : "cr0");
 
        WARN_ON(present == (tmp == 0));
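
The comment added above explains the masking on the line before the slbfee.: RB must have bits 0-27 clear, and since the hardware ignores those bits anyway the code simply rounds the effective address down to its 256 MB segment boundary:

        /* ~((1UL << 28) - 1) == 0xfffffffff0000000UL, so for example
         * ea = 0xc000000001234567 becomes 0xc000000000000000 before the
         * slbfee. is issued. */
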
index 06898c13901de85cda6b50a3677b38325f685dd8..aec91dbcdc0b47bef604eeb802cd40e999a48f7c 100644 (file)
@@ -32,6 +32,7 @@
 #include <linux/export.h>
 #include <linux/hugetlb.h>
 #include <linux/sched/mm.h>
+#include <linux/security.h>
 #include <asm/mman.h>
 #include <asm/mmu.h>
 #include <asm/copro.h>
@@ -377,6 +378,7 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm,
        int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
        unsigned long addr, found, prev;
        struct vm_unmapped_area_info info;
+       unsigned long min_addr = max(PAGE_SIZE, mmap_min_addr);
 
        info.flags = VM_UNMAPPED_AREA_TOPDOWN;
        info.length = len;
@@ -393,7 +395,7 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm,
        if (high_limit > DEFAULT_MAP_WINDOW)
                addr += mm->context.slb_addr_limit - DEFAULT_MAP_WINDOW;
 
-       while (addr > PAGE_SIZE) {
+       while (addr > min_addr) {
                info.high_limit = addr;
                if (!slice_scan_available(addr - 1, available, 0, &addr))
                        continue;
@@ -405,8 +407,8 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm,
                 * Check if we need to reduce the range, or if we can
                 * extend it to cover the previous available slice.
                 */
-               if (addr < PAGE_SIZE)
-                       addr = PAGE_SIZE;
+               if (addr < min_addr)
+                       addr = min_addr;
                else if (slice_scan_available(addr - 1, available, 0, &prev)) {
                        addr = prev;
                        goto prev_slice;
@@ -528,7 +530,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
                addr = _ALIGN_UP(addr, page_size);
                slice_dbg(" aligned addr=%lx\n", addr);
                /* Ignore hint if it's too large or overlaps a VMA */
-               if (addr > high_limit - len ||
+               if (addr > high_limit - len || addr < mmap_min_addr ||
                    !slice_area_is_free(mm, addr, len))
                        addr = 0;
        }
index ae5d568e267f681d43367b19e9b21f7307debfe3..ac23dc1c653520d3e152e9807919f5080c408819 100644 (file)
@@ -302,7 +302,7 @@ void __flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr,
         * This function as well as __local_flush_tlb_page() must only be called
         * for user contexts.
         */
-       if (unlikely(WARN_ON(!mm)))
+       if (WARN_ON(!mm))
                return;
 
        preempt_disable();
index 6f4daacad296240c1892e30af7456391541f2659..dc50a8d4b3b972a479aa2b00b1ea1c46db2977e2 100644 (file)
@@ -106,9 +106,8 @@ DECLARE_LOAD_FUNC(sk_load_byte_msh);
        } while (0)
 #else
 #define PPC_BPF_LOAD_CPU(r)     \
-       do { BUILD_BUG_ON(FIELD_SIZEOF(struct thread_info, cpu) != 4);          \
-               PPC_LHZ_OFFS(r, (1 & ~(THREAD_SIZE - 1)),                       \
-                               offsetof(struct thread_info, cpu));             \
+       do { BUILD_BUG_ON(FIELD_SIZEOF(struct task_struct, cpu) != 4);          \
+               PPC_LHZ_OFFS(r, 2, offsetof(struct task_struct, cpu));          \
        } while(0)
 #endif
 #else
index 7de344b7d9cc262472db1c3b3d068ad37e5d556d..063c9d9f25162c025c7d727ee9492511848b6b5c 100644 (file)
@@ -97,3 +97,27 @@ EVENT(PM_MRK_DTLB_MISS_64K,                  0x3d156)
 EVENT(PM_DTLB_MISS_16M,                                0x4c056)
 EVENT(PM_DTLB_MISS_1G,                         0x4c05a)
 EVENT(PM_MRK_DTLB_MISS_16M,                    0x4c15e)
+
+/*
+ * Memory Access Events
+ *
+ * Primary PMU event used here is PM_MRK_INST_CMPL (0x401e0)
+ * To enable capturing of memory profiling, these MMCRA bits
+ * need to be programmed along with the corresponding raw event
+ * format encoding.
+ *
+ * MMCRA bits encoding needed are
+ *     SM (Sampling Mode)
+ *     EM (Eligibility for Random Sampling)
+ *     TECE (Threshold Event Counter Event)
+ *     TS (Threshold Start Event)
+ *     TE (Threshold End Event)
+ *
+ * Corresponding Raw Encoding bits:
+ *     sample [EM,SM]
+ *     thresh_sel (TECE)
+ *     thresh start (TS)
+ *     thresh end (TE)
+ */
+EVENT(MEM_LOADS,                               0x34340401e0)
+EVENT(MEM_STORES,                              0x343c0401e0)
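
Once the following hunk wires these into the Power9 PMU's generic event attributes, they appear under the core PMU's events directory in sysfs and can be requested by name; something like `perf record -e cpu/mem-loads/ <workload>` (an illustrative invocation, not taken from this patch) should then program the MMCRA sampling fields described in the comment above.
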
index 0ff9c43733e97179bacbe9ca406e834c37e9cc7a..030544e35959fe494536617df31d92a4dd0f79bc 100644 (file)
@@ -160,6 +160,8 @@ GENERIC_EVENT_ATTR(branch-instructions,             PM_BR_CMPL);
 GENERIC_EVENT_ATTR(branch-misses,              PM_BR_MPRED_CMPL);
 GENERIC_EVENT_ATTR(cache-references,           PM_LD_REF_L1);
 GENERIC_EVENT_ATTR(cache-misses,               PM_LD_MISS_L1_FIN);
+GENERIC_EVENT_ATTR(mem-loads,                  MEM_LOADS);
+GENERIC_EVENT_ATTR(mem-stores,                 MEM_STORES);
 
 CACHE_EVENT_ATTR(L1-dcache-load-misses,                PM_LD_MISS_L1_FIN);
 CACHE_EVENT_ATTR(L1-dcache-loads,              PM_LD_REF_L1);
@@ -185,6 +187,8 @@ static struct attribute *power9_events_attr[] = {
        GENERIC_EVENT_PTR(PM_BR_MPRED_CMPL),
        GENERIC_EVENT_PTR(PM_LD_REF_L1),
        GENERIC_EVENT_PTR(PM_LD_MISS_L1_FIN),
+       GENERIC_EVENT_PTR(MEM_LOADS),
+       GENERIC_EVENT_PTR(MEM_STORES),
        CACHE_EVENT_PTR(PM_LD_MISS_L1_FIN),
        CACHE_EVENT_PTR(PM_LD_REF_L1),
        CACHE_EVENT_PTR(PM_L1_PREF),
index 4a9a72d01c3c6cc275269c1ef2b1b4ee50044f18..35be81fd2dc23ae31e81650ea8dc47744de72e63 100644 (file)
@@ -180,6 +180,7 @@ config CURRITUCK
        depends on PPC_47x
        select SWIOTLB
        select 476FPE
+       select FORCE_PCI
        select PPC4xx_PCI_EXPRESS
        help
          This option enables support for the IBM Currituck (476fpe) evaluation board
index e55933f9cd55f9e6576cf6ff8c838bb949a47145..a5e61e5c16e27db4b9633c4058b0858f22f91821 100644 (file)
@@ -34,6 +34,7 @@
 #include <asm/ppc4xx.h>
 #include <asm/mpic.h>
 #include <asm/mmu.h>
+#include <asm/swiotlb.h>
 
 #include <linux/pci.h>
 #include <linux/i2c.h>
index f467247fd1c45033b49a673caffb3917e1b19a1e..18422dbd061adcec2247a1a7ac468f154546db3c 100644 (file)
@@ -47,7 +47,7 @@ static int __init warp_probe(void)
        if (!of_machine_is_compatible("pika,warp"))
                return 0;
 
-       /* For __dma_nommu_alloc_coherent */
+       /* For arch_dma_alloc */
        ISA_DMA_THRESHOLD = ~0L;
 
        return 1;
index 3d1ecd2117769906c1d69eab8c39776012cff1b3..8137f77abad577503397a56f051ae9f97a71ab5e 100644 (file)
 #define SS_MSR         0x74
 #define SS_SDR1                0x78
 #define SS_LR          0x7c
-#define SS_SPRG                0x80 /* 4 SPRGs */
-#define SS_DBAT                0x90 /* 8 DBATs */
-#define SS_IBAT                0xd0 /* 8 IBATs */
-#define SS_TB          0x110
-#define SS_CR          0x118
-#define SS_GPREG       0x11c /* r12-r31 */
-#define STATE_SAVE_SIZE 0x16c
+#define SS_SPRG                0x80 /* 8 SPRGs */
+#define SS_DBAT                0xa0 /* 8 DBATs */
+#define SS_IBAT                0xe0 /* 8 IBATs */
+#define SS_TB          0x120
+#define SS_CR          0x128
+#define SS_GPREG       0x12c /* r12-r31 */
+#define STATE_SAVE_SIZE 0x17c
 
        .section .data
        .align  5
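
The low-power state save area above grows by 0x10 bytes because it now holds all eight SPRGs instead of four; the shifted offsets are self-consistent, as a quick check shows (SPRGs, CR and GPRs are 4 bytes each, every BAT is an upper/lower word pair, and the timebase is 8 bytes):

        /* Offset check (illustrative arithmetic only):
         *   SS_SPRG  0x80  + 8 * 4     = 0xa0  = SS_DBAT
         *   SS_DBAT  0xa0  + 8 * 2 * 4 = 0xe0  = SS_IBAT
         *   SS_IBAT  0xe0  + 8 * 2 * 4 = 0x120 = SS_TB
         *   SS_TB    0x120 + 8         = 0x128 = SS_CR
         *   SS_CR    0x128 + 4         = 0x12c = SS_GPREG
         *   SS_GPREG 0x12c + 20 * 4    = 0x17c = STATE_SAVE_SIZE
         */
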
@@ -103,6 +103,16 @@ _GLOBAL(mpc83xx_enter_deep_sleep)
        stw     r7, SS_SPRG+12(r3)
        stw     r8, SS_SDR1(r3)
 
+       mfspr   r4, SPRN_SPRG4
+       mfspr   r5, SPRN_SPRG5
+       mfspr   r6, SPRN_SPRG6
+       mfspr   r7, SPRN_SPRG7
+
+       stw     r4, SS_SPRG+16(r3)
+       stw     r5, SS_SPRG+20(r3)
+       stw     r6, SS_SPRG+24(r3)
+       stw     r7, SS_SPRG+28(r3)
+
        mfspr   r4, SPRN_DBAT0U
        mfspr   r5, SPRN_DBAT0L
        mfspr   r6, SPRN_DBAT1U
@@ -493,6 +503,16 @@ mpc83xx_deep_resume:
        mtspr   SPRN_IBAT7U, r6
        mtspr   SPRN_IBAT7L, r7
 
+       lwz     r4, SS_SPRG+16(r3)
+       lwz     r5, SS_SPRG+20(r3)
+       lwz     r6, SS_SPRG+24(r3)
+       lwz     r7, SS_SPRG+28(r3)
+
+       mtspr   SPRN_SPRG4, r4
+       mtspr   SPRN_SPRG5, r5
+       mtspr   SPRN_SPRG6, r6
+       mtspr   SPRN_SPRG7, r7
+
        lwz     r4, SS_SPRG+0(r3)
        lwz     r5, SS_SPRG+4(r3)
        lwz     r6, SS_SPRG+8(r3)
index b0dac307bebf9cb96d30bfd38c84a38113498548..785e9641220da1dce6ca2fcec5a6fcf9343f65c7 100644 (file)
@@ -27,6 +27,7 @@
 #include <asm/udbg.h>
 #include <asm/mpic.h>
 #include <asm/ehv_pic.h>
+#include <asm/swiotlb.h>
 #include <soc/fsl/qe/qe_ic.h>
 
 #include <linux/of_platform.h>
@@ -223,7 +224,3 @@ define_machine(corenet_generic) {
 };
 
 machine_arch_initcall(corenet_generic, corenet_gen_publish_devices);
-
-#ifdef CONFIG_SWIOTLB
-machine_arch_initcall(corenet_generic, swiotlb_setup_bus_notifier);
-#endif
index f29c6f0909f354b089b31f3f4773bbc3fa411e10..c64fa2483ea973ec98ddbcfdeec11d28581ea84b 100644 (file)
@@ -202,8 +202,6 @@ static int __init ge_imp3a_probe(void)
 
 machine_arch_initcall(ge_imp3a, mpc85xx_common_publish_devices);
 
-machine_arch_initcall(ge_imp3a, swiotlb_setup_bus_notifier);
-
 define_machine(ge_imp3a) {
        .name                   = "GE_IMP3A",
        .probe                  = ge_imp3a_probe,
index 94a7f92c858ffee2c41903f16f82a5cd492522fe..94194bad4954e1636af2322d096d4b5f63053b39 100644 (file)
@@ -57,8 +57,6 @@ static void __init mpc8536_ds_setup_arch(void)
 
 machine_arch_initcall(mpc8536_ds, mpc85xx_common_publish_devices);
 
-machine_arch_initcall(mpc8536_ds, swiotlb_setup_bus_notifier);
-
 /*
  * Called very early, device-tree isn't unflattened
  */
index dc9e035cc637a749061a2b25e1a44d93998f8289..b7e29ce1f266f58a120ef247366ff3ddd1d9a6d0 100644 (file)
@@ -174,10 +174,6 @@ machine_arch_initcall(mpc8544_ds, mpc85xx_common_publish_devices);
 machine_arch_initcall(mpc8572_ds, mpc85xx_common_publish_devices);
 machine_arch_initcall(p2020_ds, mpc85xx_common_publish_devices);
 
-machine_arch_initcall(mpc8544_ds, swiotlb_setup_bus_notifier);
-machine_arch_initcall(mpc8572_ds, swiotlb_setup_bus_notifier);
-machine_arch_initcall(p2020_ds, swiotlb_setup_bus_notifier);
-
 /*
  * Called very early, device-tree isn't unflattened
  */
index d7e440e6dba3d3d45615e26f26894bf09bb1615c..80939a425de594d9be79fd9f52efaeb2c9b3282a 100644 (file)
@@ -367,10 +367,6 @@ machine_arch_initcall(mpc8568_mds, mpc85xx_publish_devices);
 machine_arch_initcall(mpc8569_mds, mpc85xx_publish_devices);
 machine_arch_initcall(p1021_mds, mpc85xx_common_publish_devices);
 
-machine_arch_initcall(mpc8568_mds, swiotlb_setup_bus_notifier);
-machine_arch_initcall(mpc8569_mds, swiotlb_setup_bus_notifier);
-machine_arch_initcall(p1021_mds, swiotlb_setup_bus_notifier);
-
 static void __init mpc85xx_mds_pic_init(void)
 {
        struct mpic *mpic = mpic_alloc(NULL, 0, MPIC_BIG_ENDIAN |
index 78d13b364cd631e6c4b2863171fc8382796c4490..33ca373322e18f66a9a252011516e36425d41dea 100644 (file)
@@ -55,7 +55,6 @@ static void __init p1010_rdb_setup_arch(void)
 }
 
 machine_arch_initcall(p1010_rdb, mpc85xx_common_publish_devices);
-machine_arch_initcall(p1010_rdb, swiotlb_setup_bus_notifier);
 
 /*
  * Called very early, device-tree isn't unflattened
index 9fb57f78cdbe37f08a3d5f2ddef7b06696e04aa2..1f1af0557470feb924448adc98653552d1066209 100644 (file)
@@ -548,8 +548,6 @@ static void __init p1022_ds_setup_arch(void)
 
 machine_arch_initcall(p1022_ds, mpc85xx_common_publish_devices);
 
-machine_arch_initcall(p1022_ds, swiotlb_setup_bus_notifier);
-
 /*
  * Called very early, device-tree isn't unflattened
  */
index 276e00ab3dde97a766b6652d7f9dd714a6ea302f..fd9e3e7ef23475f7c9887a4338c981f546f52301 100644 (file)
@@ -128,8 +128,6 @@ static void __init p1022_rdk_setup_arch(void)
 
 machine_arch_initcall(p1022_rdk, mpc85xx_common_publish_devices);
 
-machine_arch_initcall(p1022_rdk, swiotlb_setup_bus_notifier);
-
 /*
  * Called very early, device-tree isn't unflattened
  */
index 27631c607f3d9a48308593ecfd4fceec873e57b6..c52c8f9e83852bf1c2ed55d3bb017daf0a71b64f 100644 (file)
@@ -22,6 +22,7 @@
 #include <asm/time.h>
 #include <asm/udbg.h>
 #include <asm/mpic.h>
+#include <asm/swiotlb.h>
 #include <sysdev/fsl_soc.h>
 #include <sysdev/fsl_pci.h>
 #include "smp.h"
index 17c6cd3d02e6727bb13abb419ef13d72661d749d..775a92353c8329e2f6377937db6037283d99150e 100644 (file)
@@ -121,7 +121,6 @@ static int __init declare_of_platform_devices(void)
        return 0;
 }
 machine_arch_initcall(mpc86xx_hpcn, declare_of_platform_devices);
-machine_arch_initcall(mpc86xx_hpcn, swiotlb_setup_bus_notifier);
 
 define_machine(mpc86xx_hpcn) {
        .name                   = "MPC86xx HPCN",
index 8c7464c3f27fee422df6790b4c7cff2504361333..842b2c7e156aba4cb2a04d8897fb7aa6128c3b4d 100644 (file)
@@ -153,6 +153,11 @@ config E300C3_CPU
        bool "e300c3 (831x)"
        depends on PPC_BOOK3S_32
 
+config G4_CPU
+       bool "G4 (74xx)"
+       depends on PPC_BOOK3S_32
+       select ALTIVEC
+
 endchoice
 
 config TARGET_CPU_BOOL
@@ -171,6 +176,7 @@ config TARGET_CPU
        default "860" if 860_CPU
        default "e300c2" if E300C2_CPU
        default "e300c3" if E300C3_CPU
+       default "G4" if G4_CPU
 
 config PPC_BOOK3S
        def_bool y
@@ -402,6 +408,9 @@ config NOT_COHERENT_CACHE
        bool
        depends on 4xx || PPC_8xx || E200 || PPC_MPC512x || \
                GAMECUBE_COMMON || AMIGAONE
+       select ARCH_HAS_DMA_COHERENT_TO_PFN
+       select ARCH_HAS_SYNC_DMA_FOR_DEVICE
+       select ARCH_HAS_SYNC_DMA_FOR_CPU
        default n if PPC_47x
        default y
 
index af2a3c15e0ecc2cf7e6eedafd9a828738b0f353b..54e012e1f720e8d8b2208d81a18d433ccb99a74d 100644 (file)
@@ -544,9 +544,10 @@ static struct cbe_iommu *cell_iommu_for_node(int nid)
 static unsigned long cell_dma_nommu_offset;
 
 static unsigned long dma_iommu_fixed_base;
+static bool cell_iommu_enabled;
 
 /* iommu_fixed_is_weak is set if booted with iommu_fixed=weak */
-static int iommu_fixed_is_weak;
+bool iommu_fixed_is_weak;
 
 static struct iommu_table *cell_get_iommu_table(struct device *dev)
 {
@@ -568,102 +569,19 @@ static struct iommu_table *cell_get_iommu_table(struct device *dev)
        return &window->table;
 }
 
-/* A coherent allocation implies strong ordering */
-
-static void *dma_fixed_alloc_coherent(struct device *dev, size_t size,
-                                     dma_addr_t *dma_handle, gfp_t flag,
-                                     unsigned long attrs)
-{
-       if (iommu_fixed_is_weak)
-               return iommu_alloc_coherent(dev, cell_get_iommu_table(dev),
-                                           size, dma_handle,
-                                           device_to_mask(dev), flag,
-                                           dev_to_node(dev));
-       else
-               return dma_nommu_ops.alloc(dev, size, dma_handle, flag,
-                                           attrs);
-}
-
-static void dma_fixed_free_coherent(struct device *dev, size_t size,
-                                   void *vaddr, dma_addr_t dma_handle,
-                                   unsigned long attrs)
-{
-       if (iommu_fixed_is_weak)
-               iommu_free_coherent(cell_get_iommu_table(dev), size, vaddr,
-                                   dma_handle);
-       else
-               dma_nommu_ops.free(dev, size, vaddr, dma_handle, attrs);
-}
-
-static dma_addr_t dma_fixed_map_page(struct device *dev, struct page *page,
-                                    unsigned long offset, size_t size,
-                                    enum dma_data_direction direction,
-                                    unsigned long attrs)
-{
-       if (iommu_fixed_is_weak == (attrs & DMA_ATTR_WEAK_ORDERING))
-               return dma_nommu_ops.map_page(dev, page, offset, size,
-                                              direction, attrs);
-       else
-               return iommu_map_page(dev, cell_get_iommu_table(dev), page,
-                                     offset, size, device_to_mask(dev),
-                                     direction, attrs);
-}
-
-static void dma_fixed_unmap_page(struct device *dev, dma_addr_t dma_addr,
-                                size_t size, enum dma_data_direction direction,
-                                unsigned long attrs)
-{
-       if (iommu_fixed_is_weak == (attrs & DMA_ATTR_WEAK_ORDERING))
-               dma_nommu_ops.unmap_page(dev, dma_addr, size, direction,
-                                         attrs);
-       else
-               iommu_unmap_page(cell_get_iommu_table(dev), dma_addr, size,
-                                direction, attrs);
-}
-
-static int dma_fixed_map_sg(struct device *dev, struct scatterlist *sg,
-                          int nents, enum dma_data_direction direction,
-                          unsigned long attrs)
-{
-       if (iommu_fixed_is_weak == (attrs & DMA_ATTR_WEAK_ORDERING))
-               return dma_nommu_ops.map_sg(dev, sg, nents, direction, attrs);
-       else
-               return ppc_iommu_map_sg(dev, cell_get_iommu_table(dev), sg,
-                                       nents, device_to_mask(dev),
-                                       direction, attrs);
-}
-
-static void dma_fixed_unmap_sg(struct device *dev, struct scatterlist *sg,
-                              int nents, enum dma_data_direction direction,
-                              unsigned long attrs)
-{
-       if (iommu_fixed_is_weak == (attrs & DMA_ATTR_WEAK_ORDERING))
-               dma_nommu_ops.unmap_sg(dev, sg, nents, direction, attrs);
-       else
-               ppc_iommu_unmap_sg(cell_get_iommu_table(dev), sg, nents,
-                                  direction, attrs);
-}
-
-static int dma_suported_and_switch(struct device *dev, u64 dma_mask);
-
-static const struct dma_map_ops dma_iommu_fixed_ops = {
-       .alloc          = dma_fixed_alloc_coherent,
-       .free           = dma_fixed_free_coherent,
-       .map_sg         = dma_fixed_map_sg,
-       .unmap_sg       = dma_fixed_unmap_sg,
-       .dma_supported  = dma_suported_and_switch,
-       .map_page       = dma_fixed_map_page,
-       .unmap_page     = dma_fixed_unmap_page,
-};
+static u64 cell_iommu_get_fixed_address(struct device *dev);
 
 static void cell_dma_dev_setup(struct device *dev)
 {
-       if (get_pci_dma_ops() == &dma_iommu_ops)
+       if (cell_iommu_enabled) {
+               u64 addr = cell_iommu_get_fixed_address(dev);
+
+               if (addr != OF_BAD_ADDR)
+                       dev->archdata.dma_offset = addr + dma_iommu_fixed_base;
                set_iommu_table_base(dev, cell_get_iommu_table(dev));
-       else if (get_pci_dma_ops() == &dma_nommu_ops)
-               set_dma_offset(dev, cell_dma_nommu_offset);
-       else
-               BUG();
+       } else {
+               dev->archdata.dma_offset = cell_dma_nommu_offset;
+       }
 }
 
 static void cell_pci_dma_dev_setup(struct pci_dev *dev)
@@ -680,11 +598,9 @@ static int cell_of_bus_notify(struct notifier_block *nb, unsigned long action,
        if (action != BUS_NOTIFY_ADD_DEVICE)
                return 0;
 
-       /* We use the PCI DMA ops */
-       dev->dma_ops = get_pci_dma_ops();
-
+       if (cell_iommu_enabled)
+               dev->dma_ops = &dma_iommu_ops;
        cell_dma_dev_setup(dev);
-
        return 0;
 }
 
@@ -809,7 +725,6 @@ static int __init cell_iommu_init_disabled(void)
        unsigned long base = 0, size;
 
        /* When no iommu is present, we use direct DMA ops */
-       set_pci_dma_ops(&dma_nommu_ops);
 
        /* First make sure all IOC translation is turned off */
        cell_disable_iommus();
@@ -894,7 +809,11 @@ static u64 cell_iommu_get_fixed_address(struct device *dev)
        const u32 *ranges = NULL;
        int i, len, best, naddr, nsize, pna, range_size;
 
+       /* We can be called for platform devices that have no of_node */
        np = of_node_get(dev->of_node);
+       if (!np)
+               goto out;
+
        while (1) {
                naddr = of_n_addr_cells(np);
                nsize = of_n_size_cells(np);
@@ -945,27 +864,10 @@ out:
        return dev_addr;
 }
 
-static int dma_suported_and_switch(struct device *dev, u64 dma_mask)
+static bool cell_pci_iommu_bypass_supported(struct pci_dev *pdev, u64 mask)
 {
-       if (dma_mask == DMA_BIT_MASK(64) &&
-           cell_iommu_get_fixed_address(dev) != OF_BAD_ADDR) {
-               u64 addr = cell_iommu_get_fixed_address(dev) +
-                       dma_iommu_fixed_base;
-               dev_dbg(dev, "iommu: 64-bit OK, using fixed ops\n");
-               dev_dbg(dev, "iommu: fixed addr = %llx\n", addr);
-               set_dma_ops(dev, &dma_iommu_fixed_ops);
-               set_dma_offset(dev, addr);
-               return 1;
-       }
-
-       if (dma_iommu_dma_supported(dev, dma_mask)) {
-               dev_dbg(dev, "iommu: not 64-bit, using default ops\n");
-               set_dma_ops(dev, get_pci_dma_ops());
-               cell_dma_dev_setup(dev);
-               return 1;
-       }
-
-       return 0;
+       return mask == DMA_BIT_MASK(64) &&
+               cell_iommu_get_fixed_address(&pdev->dev) != OF_BAD_ADDR;
 }
 
 static void insert_16M_pte(unsigned long addr, unsigned long *ptab,
@@ -1119,9 +1021,8 @@ static int __init cell_iommu_fixed_mapping_init(void)
                cell_iommu_setup_window(iommu, np, dbase, dsize, 0);
        }
 
-       dma_iommu_ops.dma_supported = dma_suported_and_switch;
-       set_pci_dma_ops(&dma_iommu_ops);
-
+       cell_pci_controller_ops.iommu_bypass_supported =
+               cell_pci_iommu_bypass_supported;
        return 0;
 }
 
@@ -1142,7 +1043,7 @@ static int __init setup_iommu_fixed(char *str)
        pciep = of_find_node_by_type(NULL, "pcie-endpoint");
 
        if (strcmp(str, "weak") == 0 || (pciep && strcmp(str, "strong") != 0))
-               iommu_fixed_is_weak = DMA_ATTR_WEAK_ORDERING;
+               iommu_fixed_is_weak = true;
 
        of_node_put(pciep);
 
@@ -1150,26 +1051,6 @@ static int __init setup_iommu_fixed(char *str)
 }
 __setup("iommu_fixed=", setup_iommu_fixed);
 
-static u64 cell_dma_get_required_mask(struct device *dev)
-{
-       const struct dma_map_ops *dma_ops;
-
-       if (!dev->dma_mask)
-               return 0;
-
-       if (!iommu_fixed_disabled &&
-                       cell_iommu_get_fixed_address(dev) != OF_BAD_ADDR)
-               return DMA_BIT_MASK(64);
-
-       dma_ops = get_dma_ops(dev);
-       if (dma_ops->get_required_mask)
-               return dma_ops->get_required_mask(dev);
-
-       WARN_ONCE(1, "no get_required_mask in %p ops", dma_ops);
-
-       return DMA_BIT_MASK(64);
-}
-
 static int __init cell_iommu_init(void)
 {
        struct device_node *np;
@@ -1186,10 +1067,9 @@ static int __init cell_iommu_init(void)
 
        /* Setup various callbacks */
        cell_pci_controller_ops.dma_dev_setup = cell_pci_dma_dev_setup;
-       ppc_md.dma_get_required_mask = cell_dma_get_required_mask;
 
        if (!iommu_fixed_disabled && cell_iommu_fixed_mapping_init() == 0)
-               goto bail;
+               goto done;
 
        /* Create an iommu for each /axon node.  */
        for_each_node_by_name(np, "axon") {
@@ -1206,10 +1086,10 @@ static int __init cell_iommu_init(void)
                        continue;
                cell_iommu_init_one(np, SPIDER_DMA_OFFSET);
        }
-
+ done:
        /* Setup default PCI iommu ops */
        set_pci_dma_ops(&dma_iommu_ops);
-
+       cell_iommu_enabled = true;
  bail:
        /* Register callbacks on OF platform device addition/removal
         * to handle linking them to the right DMA operations
index 125f2a5f02de2e9614fea522b2b73de9bd238b91..b5f35cbe9e2178cb07526dc386659b8e5e7f1f63 100644 (file)
@@ -34,7 +34,7 @@
  */
 
 static void *spu_syscall_table[] = {
-#define __SYSCALL(nr, entry, nargs) entry,
+#define __SYSCALL(nr, entry)   entry,
 #include <asm/syscall_table_spu.h>
 #undef __SYSCALL
 };
index 263413a3482300e0116d8c68f49298408725ba56..b95d6afc39b579eae115718ea3dab4413f5b2544 100644 (file)
@@ -26,7 +26,6 @@
 #include <linux/syscalls.h>
 #include <linux/rcupdate.h>
 #include <linux/binfmts.h>
-#include <linux/syscalls.h>
 
 #include <asm/spu.h>
 
index ae8123edddc670ed2b413129ee8a97aade121200..48c2477e7e2a2c462bafc022a6ad53ade37cdde3 100644 (file)
@@ -2338,9 +2338,8 @@ static int spufs_switch_log_open(struct inode *inode, struct file *file)
                goto out;
        }
 
-       ctx->switch_log = kmalloc(sizeof(struct switch_log) +
-               SWITCH_LOG_BUFSIZE * sizeof(struct switch_log_entry),
-               GFP_KERNEL);
+       ctx->switch_log = kmalloc(struct_size(ctx->switch_log, log,
+                                 SWITCH_LOG_BUFSIZE), GFP_KERNEL);
 
        if (!ctx->switch_log) {
                rc = -ENOMEM;
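
The switch_log allocation above uses struct_size() from <linux/overflow.h>, which computes the size of a structure with a trailing flexible array and saturates to SIZE_MAX if the multiplication or addition would overflow, so an absurd element count makes the allocation fail instead of wrapping. A minimal sketch of the idiom with a made-up structure (demo_log and demo_log_alloc are illustrative, not from this patch):

        #include <linux/overflow.h>
        #include <linux/slab.h>
        #include <linux/types.h>

        struct demo_log {
                unsigned int head, tail;
                struct demo_entry {
                        u64 timestamp;
                        u32 value;
                } entries[];            /* flexible array member */
        };

        static struct demo_log *demo_log_alloc(size_t n)
        {
                struct demo_log *log;

                /* struct_size(log, entries, n) ==
                 *   sizeof(*log) + n * sizeof(log->entries[0]), overflow-checked */
                log = kmalloc(struct_size(log, entries, n), GFP_KERNEL);
                return log;
        }
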
index ecf703ee3a76d1cdc52113a59dde2fc95133e900..235fe81aa2b1afe5ad6204a70ecd036a8ac662ed 100644 (file)
 static void __iomem *hw_ctrl;
 static void __iomem *hw_gpio;
 
-unsigned long wii_hole_start;
-unsigned long wii_hole_size;
-
-
 static int __init page_aligned(unsigned long x)
 {
        return !(x & (PAGE_SIZE-1));
@@ -69,26 +65,6 @@ void __init wii_memory_fixups(void)
 
        BUG_ON(memblock.memory.cnt != 2);
        BUG_ON(!page_aligned(p[0].base) || !page_aligned(p[1].base));
-
-       /* determine hole */
-       wii_hole_start = ALIGN(p[0].base + p[0].size, PAGE_SIZE);
-       wii_hole_size = p[1].base - wii_hole_start;
-}
-
-unsigned long __init wii_mmu_mapin_mem2(unsigned long top)
-{
-       unsigned long delta, size, bl;
-       unsigned long max_size = (256<<20);
-
-       /* MEM2 64MB@0x10000000 */
-       delta = wii_hole_start + wii_hole_size;
-       size = top - delta;
-       for (bl = 128<<10; bl < max_size; bl <<= 1) {
-               if (bl * 2 > size)
-                       break;
-       }
-       setbat(4, PAGE_OFFSET+delta, delta, bl, PAGE_KERNEL_X);
-       return delta + bl;
 }
 
 static void __noreturn wii_spin(void)
index f62930f839cad3d7d57a4abf3a545af9696ae483..86368e238f6e6398501aaf6f94e3810afa691db5 100644 (file)
@@ -186,7 +186,7 @@ static void pci_dma_dev_setup_pasemi(struct pci_dev *dev)
         */
        if (dev->vendor == 0x1959 && dev->device == 0xa007 &&
            !firmware_has_feature(FW_FEATURE_LPAR)) {
-               dev->dev.dma_ops = &dma_nommu_ops;
+               dev->dev.dma_ops = NULL;
                /*
                 * Set the coherent DMA mask to prevent the iommu
                 * being used unnecessarily
index c0532999f8540806b695f730d3583b1e2c45bf46..46dd463faaa78ad2c7c4d498eca3a864dfa43b95 100644 (file)
@@ -411,55 +411,6 @@ out:
        return !!(srr1 & 0x2);
 }
 
-#ifdef CONFIG_PCMCIA
-static int pcmcia_notify(struct notifier_block *nb, unsigned long action,
-                        void *data)
-{
-       struct device *dev = data;
-       struct device *parent;
-       struct pcmcia_device *pdev = to_pcmcia_dev(dev);
-
-       /* We are only intereted in device addition */
-       if (action != BUS_NOTIFY_ADD_DEVICE)
-               return 0;
-
-       parent = pdev->socket->dev.parent;
-
-       /* We know electra_cf devices will always have of_node set, since
-        * electra_cf is an of_platform driver.
-        */
-       if (!parent->of_node)
-               return 0;
-
-       if (!of_device_is_compatible(parent->of_node, "electra-cf"))
-               return 0;
-
-       /* We use the direct ops for localbus */
-       dev->dma_ops = &dma_nommu_ops;
-
-       return 0;
-}
-
-static struct notifier_block pcmcia_notifier = {
-       .notifier_call = pcmcia_notify,
-};
-
-static inline void pasemi_pcmcia_init(void)
-{
-       extern struct bus_type pcmcia_bus_type;
-
-       bus_register_notifier(&pcmcia_bus_type, &pcmcia_notifier);
-}
-
-#else
-
-static inline void pasemi_pcmcia_init(void)
-{
-}
-
-#endif
-
-
 static const struct of_device_id pasemi_bus_ids[] = {
        /* Unfortunately needed for legacy firmwares */
        { .type = "localbus", },
@@ -472,8 +423,6 @@ static const struct of_device_id pasemi_bus_ids[] = {
 
 static int __init pasemi_publish_devices(void)
 {
-       pasemi_pcmcia_init();
-
        /* Publish OF platform devices for SDC and other non-PCI devices */
        of_platform_bus_probe(NULL, pasemi_bus_ids, NULL);
 
index b540ce8eec556649addadbba0b4d58749e132333..da2e99efbd04e623f6761b49c16bcd9da06a88d8 100644 (file)
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
-obj-y                  += setup.o opal-wrappers.o opal.o opal-async.o idle.o
-obj-y                  += opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o
+obj-y                  += setup.o opal-call.o opal-wrappers.o opal.o opal-async.o
+obj-y                  += idle.o opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o
 obj-y                  += rng.o opal-elog.o opal-dump.o opal-sysparam.o opal-sensor.o
 obj-y                  += opal-msglog.o opal-hmi.o opal-power.o opal-irqchip.o
 obj-y                  += opal-kmsg.o opal-powercap.o opal-psr.o opal-sensor-groups.o
@@ -11,7 +11,6 @@ obj-$(CONFIG_CXL_BASE)        += pci-cxl.o
 obj-$(CONFIG_EEH)      += eeh-powernv.o
 obj-$(CONFIG_PPC_SCOM) += opal-xscom.o
 obj-$(CONFIG_MEMORY_FAILURE)   += opal-memory-errors.o
-obj-$(CONFIG_TRACEPOINTS)      += opal-tracepoints.o
 obj-$(CONFIG_OPAL_PRD) += opal-prd.o
 obj-$(CONFIG_PERF_EVENTS) += opal-imc.o
 obj-$(CONFIG_PPC_MEMTRACE)     += memtrace.o
index 35f699ebb662189fd574ca23f43c089cf7b92a17..e52f9b06dd9c31fb2c68bf65e43214ee28037624 100644 (file)
@@ -458,7 +458,8 @@ EXPORT_SYMBOL_GPL(pnv_power9_force_smt4_release);
 #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
 
 #ifdef CONFIG_HOTPLUG_CPU
-static void pnv_program_cpu_hotplug_lpcr(unsigned int cpu, u64 lpcr_val)
+
+void pnv_program_cpu_hotplug_lpcr(unsigned int cpu, u64 lpcr_val)
 {
        u64 pir = get_hard_smp_processor_id(cpu);
 
@@ -481,20 +482,6 @@ unsigned long pnv_cpu_offline(unsigned int cpu)
 {
        unsigned long srr1;
        u32 idle_states = pnv_get_supported_cpuidle_states();
-       u64 lpcr_val;
-
-       /*
-        * We don't want to take decrementer interrupts while we are
-        * offline, so clear LPCR:PECE1. We keep PECE2 (and
-        * LPCR_PECE_HVEE on P9) enabled as to let IPIs in.
-        *
-        * If the CPU gets woken up by a special wakeup, ensure that
-        * the SLW engine sets LPCR with decrementer bit cleared, else
-        * the CPU will come back to the kernel due to a spurious
-        * wakeup.
-        */
-       lpcr_val = mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1;
-       pnv_program_cpu_hotplug_lpcr(cpu, lpcr_val);
 
        __ppc64_runlatch_off();
 
@@ -526,16 +513,6 @@ unsigned long pnv_cpu_offline(unsigned int cpu)
 
        __ppc64_runlatch_on();
 
-       /*
-        * Re-enable decrementer interrupts in LPCR.
-        *
-        * Further, we want stop states to be woken up by decrementer
-        * for non-hotplug cases. So program the LPCR via stop api as
-        * well.
-        */
-       lpcr_val = mfspr(SPRN_LPCR) | (u64)LPCR_PECE1;
-       pnv_program_cpu_hotplug_lpcr(cpu, lpcr_val);
-
        return srr1;
 }
 #endif
index 3f58c7dbd581e1f018c7e941fc2ea1b649436c07..dc23d9d2a7d9ae60c9c956dfeadd7e480e243891 100644 (file)
  */
 static DEFINE_SPINLOCK(npu_context_lock);
 
-/*
- * Other types of TCE cache invalidation are not functional in the
- * hardware.
- */
 static struct pci_dev *get_pci_dev(struct device_node *dn)
 {
        struct pci_dn *pdn = PCI_DN(dn);
@@ -220,7 +216,7 @@ static void pnv_npu_dma_set_32(struct pnv_ioda_pe *npe)
         * their parent device so drivers shouldn't be doing DMA
         * operations directly on these devices.
         */
-       set_dma_ops(&npe->pdev->dev, NULL);
+       set_dma_ops(&npe->pdev->dev, &dma_dummy_ops);
 }
 
 /*
@@ -917,15 +913,6 @@ static void pnv_npu2_mn_release(struct mmu_notifier *mn,
        mmio_invalidate(npu_context, 0, ~0UL);
 }
 
-static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn,
-                               struct mm_struct *mm,
-                               unsigned long address,
-                               pte_t pte)
-{
-       struct npu_context *npu_context = mn_to_npu_context(mn);
-       mmio_invalidate(npu_context, address, PAGE_SIZE);
-}
-
 static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn,
                                        struct mm_struct *mm,
                                        unsigned long start, unsigned long end)
@@ -936,7 +923,6 @@ static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn,
 
 static const struct mmu_notifier_ops nv_nmmu_notifier_ops = {
        .release = pnv_npu2_mn_release,
-       .change_pte = pnv_npu2_mn_change_pte,
        .invalidate_range = pnv_npu2_mn_invalidate_range,
 };
 
diff --git a/arch/powerpc/platforms/powernv/opal-call.c b/arch/powerpc/platforms/powernv/opal-call.c
new file mode 100644 (file)
index 0000000..578757d
--- /dev/null
@@ -0,0 +1,283 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/percpu.h>
+#include <linux/jump_label.h>
+#include <asm/opal-api.h>
+#include <asm/trace.h>
+#include <asm/asm-prototypes.h>
+
+#ifdef CONFIG_TRACEPOINTS
+/*
+ * Since the tracing code might execute OPAL calls we need to guard against
+ * recursion.
+ */
+static DEFINE_PER_CPU(unsigned int, opal_trace_depth);
+
+static void __trace_opal_entry(s64 a0, s64 a1, s64 a2, s64 a3,
+                              s64 a4, s64 a5, s64 a6, s64 a7,
+                              unsigned long opcode)
+{
+       unsigned int *depth;
+       unsigned long args[8];
+
+       depth = this_cpu_ptr(&opal_trace_depth);
+
+       if (*depth)
+               return;
+
+       args[0] = a0;
+       args[1] = a1;
+       args[2] = a2;
+       args[3] = a3;
+       args[4] = a4;
+       args[5] = a5;
+       args[6] = a6;
+       args[7] = a7;
+
+       (*depth)++;
+       trace_opal_entry(opcode, &args[0]);
+       (*depth)--;
+}
+
+static void __trace_opal_exit(unsigned long opcode, unsigned long retval)
+{
+       unsigned int *depth;
+
+       depth = this_cpu_ptr(&opal_trace_depth);
+
+       if (*depth)
+               return;
+
+       (*depth)++;
+       trace_opal_exit(opcode, retval);
+       (*depth)--;
+}
+
+static DEFINE_STATIC_KEY_FALSE(opal_tracepoint_key);
+
+int opal_tracepoint_regfunc(void)
+{
+       static_branch_inc(&opal_tracepoint_key);
+       return 0;
+}
+
+void opal_tracepoint_unregfunc(void)
+{
+       static_branch_dec(&opal_tracepoint_key);
+}
+
+static s64 __opal_call_trace(s64 a0, s64 a1, s64 a2, s64 a3,
+                            s64 a4, s64 a5, s64 a6, s64 a7,
+                             unsigned long opcode, unsigned long msr)
+{
+       s64 ret;
+
+       __trace_opal_entry(a0, a1, a2, a3, a4, a5, a6, a7, opcode);
+       ret = __opal_call(a0, a1, a2, a3, a4, a5, a6, a7, opcode, msr);
+       __trace_opal_exit(opcode, ret);
+
+       return ret;
+}
+
+#define DO_TRACE (static_branch_unlikely(&opal_tracepoint_key))
+
+#else /* CONFIG_TRACEPOINTS */
+
+static s64 __opal_call_trace(s64 a0, s64 a1, s64 a2, s64 a3,
+                            s64 a4, s64 a5, s64 a6, s64 a7,
+                             unsigned long opcode, unsigned long msr)
+{
+       /* unreachable: DO_TRACE is false without CONFIG_TRACEPOINTS */
+       return 0;
+}
+
+#define DO_TRACE false
+#endif /* CONFIG_TRACEPOINTS */
+
+static int64_t opal_call(int64_t a0, int64_t a1, int64_t a2, int64_t a3,
+            int64_t a4, int64_t a5, int64_t a6, int64_t a7, int64_t opcode)
+{
+       unsigned long flags;
+       unsigned long msr = mfmsr();
+       bool mmu = (msr & (MSR_IR|MSR_DR));
+       int64_t ret;
+
+       msr &= ~MSR_EE;
+
+       if (unlikely(!mmu))
+               return __opal_call(a0, a1, a2, a3, a4, a5, a6, a7, opcode, msr);
+
+       local_save_flags(flags);
+       hard_irq_disable();
+
+       if (DO_TRACE) {
+               ret = __opal_call_trace(a0, a1, a2, a3, a4, a5, a6, a7, opcode, msr);
+       } else {
+               ret = __opal_call(a0, a1, a2, a3, a4, a5, a6, a7, opcode, msr);
+       }
+
+       local_irq_restore(flags);
+
+       return ret;
+}
+
+#define OPAL_CALL(name, opcode)                                        \
+int64_t name(int64_t a0, int64_t a1, int64_t a2, int64_t a3,   \
+            int64_t a4, int64_t a5, int64_t a6, int64_t a7)    \
+{                                                              \
+       return opal_call(a0, a1, a2, a3, a4, a5, a6, a7, opcode); \
+}
+
+OPAL_CALL(opal_invalid_call,                   OPAL_INVALID_CALL);
+OPAL_CALL(opal_console_write,                  OPAL_CONSOLE_WRITE);
+OPAL_CALL(opal_console_read,                   OPAL_CONSOLE_READ);
+OPAL_CALL(opal_console_write_buffer_space,     OPAL_CONSOLE_WRITE_BUFFER_SPACE);
+OPAL_CALL(opal_rtc_read,                       OPAL_RTC_READ);
+OPAL_CALL(opal_rtc_write,                      OPAL_RTC_WRITE);
+OPAL_CALL(opal_cec_power_down,                 OPAL_CEC_POWER_DOWN);
+OPAL_CALL(opal_cec_reboot,                     OPAL_CEC_REBOOT);
+OPAL_CALL(opal_cec_reboot2,                    OPAL_CEC_REBOOT2);
+OPAL_CALL(opal_read_nvram,                     OPAL_READ_NVRAM);
+OPAL_CALL(opal_write_nvram,                    OPAL_WRITE_NVRAM);
+OPAL_CALL(opal_handle_interrupt,               OPAL_HANDLE_INTERRUPT);
+OPAL_CALL(opal_poll_events,                    OPAL_POLL_EVENTS);
+OPAL_CALL(opal_pci_set_hub_tce_memory,         OPAL_PCI_SET_HUB_TCE_MEMORY);
+OPAL_CALL(opal_pci_set_phb_tce_memory,         OPAL_PCI_SET_PHB_TCE_MEMORY);
+OPAL_CALL(opal_pci_config_read_byte,           OPAL_PCI_CONFIG_READ_BYTE);
+OPAL_CALL(opal_pci_config_read_half_word,      OPAL_PCI_CONFIG_READ_HALF_WORD);
+OPAL_CALL(opal_pci_config_read_word,           OPAL_PCI_CONFIG_READ_WORD);
+OPAL_CALL(opal_pci_config_write_byte,          OPAL_PCI_CONFIG_WRITE_BYTE);
+OPAL_CALL(opal_pci_config_write_half_word,     OPAL_PCI_CONFIG_WRITE_HALF_WORD);
+OPAL_CALL(opal_pci_config_write_word,          OPAL_PCI_CONFIG_WRITE_WORD);
+OPAL_CALL(opal_set_xive,                       OPAL_SET_XIVE);
+OPAL_CALL(opal_get_xive,                       OPAL_GET_XIVE);
+OPAL_CALL(opal_register_exception_handler,     OPAL_REGISTER_OPAL_EXCEPTION_HANDLER);
+OPAL_CALL(opal_pci_eeh_freeze_status,          OPAL_PCI_EEH_FREEZE_STATUS);
+OPAL_CALL(opal_pci_eeh_freeze_clear,           OPAL_PCI_EEH_FREEZE_CLEAR);
+OPAL_CALL(opal_pci_eeh_freeze_set,             OPAL_PCI_EEH_FREEZE_SET);
+OPAL_CALL(opal_pci_err_inject,                 OPAL_PCI_ERR_INJECT);
+OPAL_CALL(opal_pci_shpc,                       OPAL_PCI_SHPC);
+OPAL_CALL(opal_pci_phb_mmio_enable,            OPAL_PCI_PHB_MMIO_ENABLE);
+OPAL_CALL(opal_pci_set_phb_mem_window,         OPAL_PCI_SET_PHB_MEM_WINDOW);
+OPAL_CALL(opal_pci_map_pe_mmio_window,         OPAL_PCI_MAP_PE_MMIO_WINDOW);
+OPAL_CALL(opal_pci_set_phb_table_memory,       OPAL_PCI_SET_PHB_TABLE_MEMORY);
+OPAL_CALL(opal_pci_set_pe,                     OPAL_PCI_SET_PE);
+OPAL_CALL(opal_pci_set_peltv,                  OPAL_PCI_SET_PELTV);
+OPAL_CALL(opal_pci_set_mve,                    OPAL_PCI_SET_MVE);
+OPAL_CALL(opal_pci_set_mve_enable,             OPAL_PCI_SET_MVE_ENABLE);
+OPAL_CALL(opal_pci_get_xive_reissue,           OPAL_PCI_GET_XIVE_REISSUE);
+OPAL_CALL(opal_pci_set_xive_reissue,           OPAL_PCI_SET_XIVE_REISSUE);
+OPAL_CALL(opal_pci_set_xive_pe,                        OPAL_PCI_SET_XIVE_PE);
+OPAL_CALL(opal_get_xive_source,                        OPAL_GET_XIVE_SOURCE);
+OPAL_CALL(opal_get_msi_32,                     OPAL_GET_MSI_32);
+OPAL_CALL(opal_get_msi_64,                     OPAL_GET_MSI_64);
+OPAL_CALL(opal_start_cpu,                      OPAL_START_CPU);
+OPAL_CALL(opal_query_cpu_status,               OPAL_QUERY_CPU_STATUS);
+OPAL_CALL(opal_write_oppanel,                  OPAL_WRITE_OPPANEL);
+OPAL_CALL(opal_pci_map_pe_dma_window,          OPAL_PCI_MAP_PE_DMA_WINDOW);
+OPAL_CALL(opal_pci_map_pe_dma_window_real,     OPAL_PCI_MAP_PE_DMA_WINDOW_REAL);
+OPAL_CALL(opal_pci_reset,                      OPAL_PCI_RESET);
+OPAL_CALL(opal_pci_get_hub_diag_data,          OPAL_PCI_GET_HUB_DIAG_DATA);
+OPAL_CALL(opal_pci_get_phb_diag_data,          OPAL_PCI_GET_PHB_DIAG_DATA);
+OPAL_CALL(opal_pci_fence_phb,                  OPAL_PCI_FENCE_PHB);
+OPAL_CALL(opal_pci_reinit,                     OPAL_PCI_REINIT);
+OPAL_CALL(opal_pci_mask_pe_error,              OPAL_PCI_MASK_PE_ERROR);
+OPAL_CALL(opal_set_slot_led_status,            OPAL_SET_SLOT_LED_STATUS);
+OPAL_CALL(opal_get_epow_status,                        OPAL_GET_EPOW_STATUS);
+OPAL_CALL(opal_get_dpo_status,                 OPAL_GET_DPO_STATUS);
+OPAL_CALL(opal_set_system_attention_led,       OPAL_SET_SYSTEM_ATTENTION_LED);
+OPAL_CALL(opal_pci_next_error,                 OPAL_PCI_NEXT_ERROR);
+OPAL_CALL(opal_pci_poll,                       OPAL_PCI_POLL);
+OPAL_CALL(opal_pci_msi_eoi,                    OPAL_PCI_MSI_EOI);
+OPAL_CALL(opal_pci_get_phb_diag_data2,         OPAL_PCI_GET_PHB_DIAG_DATA2);
+OPAL_CALL(opal_xscom_read,                     OPAL_XSCOM_READ);
+OPAL_CALL(opal_xscom_write,                    OPAL_XSCOM_WRITE);
+OPAL_CALL(opal_lpc_read,                       OPAL_LPC_READ);
+OPAL_CALL(opal_lpc_write,                      OPAL_LPC_WRITE);
+OPAL_CALL(opal_return_cpu,                     OPAL_RETURN_CPU);
+OPAL_CALL(opal_reinit_cpus,                    OPAL_REINIT_CPUS);
+OPAL_CALL(opal_read_elog,                      OPAL_ELOG_READ);
+OPAL_CALL(opal_send_ack_elog,                  OPAL_ELOG_ACK);
+OPAL_CALL(opal_get_elog_size,                  OPAL_ELOG_SIZE);
+OPAL_CALL(opal_resend_pending_logs,            OPAL_ELOG_RESEND);
+OPAL_CALL(opal_write_elog,                     OPAL_ELOG_WRITE);
+OPAL_CALL(opal_validate_flash,                 OPAL_FLASH_VALIDATE);
+OPAL_CALL(opal_manage_flash,                   OPAL_FLASH_MANAGE);
+OPAL_CALL(opal_update_flash,                   OPAL_FLASH_UPDATE);
+OPAL_CALL(opal_resync_timebase,                        OPAL_RESYNC_TIMEBASE);
+OPAL_CALL(opal_check_token,                    OPAL_CHECK_TOKEN);
+OPAL_CALL(opal_dump_init,                      OPAL_DUMP_INIT);
+OPAL_CALL(opal_dump_info,                      OPAL_DUMP_INFO);
+OPAL_CALL(opal_dump_info2,                     OPAL_DUMP_INFO2);
+OPAL_CALL(opal_dump_read,                      OPAL_DUMP_READ);
+OPAL_CALL(opal_dump_ack,                       OPAL_DUMP_ACK);
+OPAL_CALL(opal_get_msg,                                OPAL_GET_MSG);
+OPAL_CALL(opal_write_oppanel_async,            OPAL_WRITE_OPPANEL_ASYNC);
+OPAL_CALL(opal_check_completion,               OPAL_CHECK_ASYNC_COMPLETION);
+OPAL_CALL(opal_dump_resend_notification,       OPAL_DUMP_RESEND);
+OPAL_CALL(opal_sync_host_reboot,               OPAL_SYNC_HOST_REBOOT);
+OPAL_CALL(opal_sensor_read,                    OPAL_SENSOR_READ);
+OPAL_CALL(opal_get_param,                      OPAL_GET_PARAM);
+OPAL_CALL(opal_set_param,                      OPAL_SET_PARAM);
+OPAL_CALL(opal_handle_hmi,                     OPAL_HANDLE_HMI);
+OPAL_CALL(opal_config_cpu_idle_state,          OPAL_CONFIG_CPU_IDLE_STATE);
+OPAL_CALL(opal_slw_set_reg,                    OPAL_SLW_SET_REG);
+OPAL_CALL(opal_register_dump_region,           OPAL_REGISTER_DUMP_REGION);
+OPAL_CALL(opal_unregister_dump_region,         OPAL_UNREGISTER_DUMP_REGION);
+OPAL_CALL(opal_pci_set_phb_cxl_mode,           OPAL_PCI_SET_PHB_CAPI_MODE);
+OPAL_CALL(opal_tpo_write,                      OPAL_WRITE_TPO);
+OPAL_CALL(opal_tpo_read,                       OPAL_READ_TPO);
+OPAL_CALL(opal_ipmi_send,                      OPAL_IPMI_SEND);
+OPAL_CALL(opal_ipmi_recv,                      OPAL_IPMI_RECV);
+OPAL_CALL(opal_i2c_request,                    OPAL_I2C_REQUEST);
+OPAL_CALL(opal_flash_read,                     OPAL_FLASH_READ);
+OPAL_CALL(opal_flash_write,                    OPAL_FLASH_WRITE);
+OPAL_CALL(opal_flash_erase,                    OPAL_FLASH_ERASE);
+OPAL_CALL(opal_prd_msg,                                OPAL_PRD_MSG);
+OPAL_CALL(opal_leds_get_ind,                   OPAL_LEDS_GET_INDICATOR);
+OPAL_CALL(opal_leds_set_ind,                   OPAL_LEDS_SET_INDICATOR);
+OPAL_CALL(opal_console_flush,                  OPAL_CONSOLE_FLUSH);
+OPAL_CALL(opal_get_device_tree,                        OPAL_GET_DEVICE_TREE);
+OPAL_CALL(opal_pci_get_presence_state,         OPAL_PCI_GET_PRESENCE_STATE);
+OPAL_CALL(opal_pci_get_power_state,            OPAL_PCI_GET_POWER_STATE);
+OPAL_CALL(opal_pci_set_power_state,            OPAL_PCI_SET_POWER_STATE);
+OPAL_CALL(opal_int_get_xirr,                   OPAL_INT_GET_XIRR);
+OPAL_CALL(opal_int_set_cppr,                   OPAL_INT_SET_CPPR);
+OPAL_CALL(opal_int_eoi,                                OPAL_INT_EOI);
+OPAL_CALL(opal_int_set_mfrr,                   OPAL_INT_SET_MFRR);
+OPAL_CALL(opal_pci_tce_kill,                   OPAL_PCI_TCE_KILL);
+OPAL_CALL(opal_nmmu_set_ptcr,                  OPAL_NMMU_SET_PTCR);
+OPAL_CALL(opal_xive_reset,                     OPAL_XIVE_RESET);
+OPAL_CALL(opal_xive_get_irq_info,              OPAL_XIVE_GET_IRQ_INFO);
+OPAL_CALL(opal_xive_get_irq_config,            OPAL_XIVE_GET_IRQ_CONFIG);
+OPAL_CALL(opal_xive_set_irq_config,            OPAL_XIVE_SET_IRQ_CONFIG);
+OPAL_CALL(opal_xive_get_queue_info,            OPAL_XIVE_GET_QUEUE_INFO);
+OPAL_CALL(opal_xive_set_queue_info,            OPAL_XIVE_SET_QUEUE_INFO);
+OPAL_CALL(opal_xive_donate_page,               OPAL_XIVE_DONATE_PAGE);
+OPAL_CALL(opal_xive_alloc_vp_block,            OPAL_XIVE_ALLOCATE_VP_BLOCK);
+OPAL_CALL(opal_xive_free_vp_block,             OPAL_XIVE_FREE_VP_BLOCK);
+OPAL_CALL(opal_xive_allocate_irq,              OPAL_XIVE_ALLOCATE_IRQ);
+OPAL_CALL(opal_xive_free_irq,                  OPAL_XIVE_FREE_IRQ);
+OPAL_CALL(opal_xive_get_vp_info,               OPAL_XIVE_GET_VP_INFO);
+OPAL_CALL(opal_xive_set_vp_info,               OPAL_XIVE_SET_VP_INFO);
+OPAL_CALL(opal_xive_sync,                      OPAL_XIVE_SYNC);
+OPAL_CALL(opal_xive_dump,                      OPAL_XIVE_DUMP);
+OPAL_CALL(opal_signal_system_reset,            OPAL_SIGNAL_SYSTEM_RESET);
+OPAL_CALL(opal_npu_init_context,               OPAL_NPU_INIT_CONTEXT);
+OPAL_CALL(opal_npu_destroy_context,            OPAL_NPU_DESTROY_CONTEXT);
+OPAL_CALL(opal_npu_map_lpar,                   OPAL_NPU_MAP_LPAR);
+OPAL_CALL(opal_imc_counters_init,              OPAL_IMC_COUNTERS_INIT);
+OPAL_CALL(opal_imc_counters_start,             OPAL_IMC_COUNTERS_START);
+OPAL_CALL(opal_imc_counters_stop,              OPAL_IMC_COUNTERS_STOP);
+OPAL_CALL(opal_pci_set_p2p,                    OPAL_PCI_SET_P2P);
+OPAL_CALL(opal_get_powercap,                   OPAL_GET_POWERCAP);
+OPAL_CALL(opal_set_powercap,                   OPAL_SET_POWERCAP);
+OPAL_CALL(opal_get_power_shift_ratio,          OPAL_GET_POWER_SHIFT_RATIO);
+OPAL_CALL(opal_set_power_shift_ratio,          OPAL_SET_POWER_SHIFT_RATIO);
+OPAL_CALL(opal_sensor_group_clear,             OPAL_SENSOR_GROUP_CLEAR);
+OPAL_CALL(opal_quiesce,                                OPAL_QUIESCE);
+OPAL_CALL(opal_npu_spa_setup,                  OPAL_NPU_SPA_SETUP);
+OPAL_CALL(opal_npu_spa_clear_cache,            OPAL_NPU_SPA_CLEAR_CACHE);
+OPAL_CALL(opal_npu_tl_set,                     OPAL_NPU_TL_SET);
+OPAL_CALL(opal_pci_get_pbcq_tunnel_bar,                OPAL_PCI_GET_PBCQ_TUNNEL_BAR);
+OPAL_CALL(opal_pci_set_pbcq_tunnel_bar,                OPAL_PCI_SET_PBCQ_TUNNEL_BAR);
+OPAL_CALL(opal_sensor_read_u64,                        OPAL_SENSOR_READ_U64);
+OPAL_CALL(opal_sensor_group_enable,            OPAL_SENSOR_GROUP_ENABLE);
+OPAL_CALL(opal_nx_coproc_init,                 OPAL_NX_COPROC_INIT);
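The new opal-call.c moves the OPAL call glue to C: opal_call() clears MSR_EE, branches straight to __opal_call() when the MMU is already off (real mode), and otherwise hard-disables interrupts and, when the static key is enabled, routes through the tracepoint wrappers, which use a per-CPU depth counter so that OPAL calls issued by the tracing code itself are not traced recursively. Each exported entry point is generated by the OPAL_CALL() macro; for illustration, OPAL_CALL(opal_rtc_read, OPAL_RTC_READ) expands to roughly:

    int64_t opal_rtc_read(int64_t a0, int64_t a1, int64_t a2, int64_t a3,
                          int64_t a4, int64_t a5, int64_t a6, int64_t a7)
    {
            /* the token is appended as a 9th argument; MSR and IRQ
             * handling happen once, inside opal_call() */
            return opal_call(a0, a1, a2, a3, a4, a5, a6, a7, OPAL_RTC_READ);
    }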
index acd3206dfae3477452f11c4a96dfc1638cafae00..06628c71cef6996119b90a59fb68d129072a61f5 100644 (file)
@@ -98,7 +98,7 @@ static ssize_t opal_msglog_read(struct file *file, struct kobject *kobj,
 }
 
 static struct bin_attribute opal_msglog_attr = {
-       .attr = {.name = "msglog", .mode = 0444},
+       .attr = {.name = "msglog", .mode = 0400},
        .read = opal_msglog_read
 };
 
index f4875fe3f8ff288827ca142a0a1b47c5da9bfa2c..7d2052d8af9d772eb649ececc395c7a23ff5e61f 100644 (file)
 #include <asm/asm-compat.h>
 #include <asm/feature-fixups.h>
 
-       .section        ".text"
-
-#ifdef CONFIG_TRACEPOINTS
-#ifdef CONFIG_JUMP_LABEL
-#define OPAL_BRANCH(LABEL)                                     \
-       ARCH_STATIC_BRANCH(LABEL, opal_tracepoint_key)
-#else
-
-       .section        ".toc","aw"
-
-       .globl opal_tracepoint_refcount
-opal_tracepoint_refcount:
-       .8byte  0
-
-       .section        ".text"
-
-/*
- * We branch around this in early init by using an unconditional cpu
- * feature.
- */
-#define OPAL_BRANCH(LABEL)                                     \
-BEGIN_FTR_SECTION;                                             \
-       b       1f;                                             \
-END_FTR_SECTION(0, 1);                                         \
-       ld      r11,opal_tracepoint_refcount@toc(r2);           \
-       cmpdi   r11,0;                                          \
-       bne-    LABEL;                                          \
-1:
-
-#endif
-
-#else
-#define OPAL_BRANCH(LABEL)
-#endif
+       .section ".text"
 
 /*
- * DO_OPAL_CALL assumes:
- * r0  = opal call token
- * r12 = msr
- * LR has been saved
+ * r3-r10              - OPAL call arguments
+ * STK_PARAM(R11)      - OPAL opcode
+ * STK_PARAM(R12)      - MSR to restore
  */
-#define DO_OPAL_CALL()                 \
-       mfcr    r11;                    \
-       stw     r11,8(r1);              \
-       li      r11,0;                  \
-       ori     r11,r11,MSR_EE;         \
-       std     r12,PACASAVEDMSR(r13);  \
-       andc    r12,r12,r11;            \
-       mtmsrd  r12,1;                  \
-       LOAD_REG_ADDR(r11,opal_return); \
-       mtlr    r11;                    \
-       li      r11,MSR_DR|MSR_IR|MSR_LE;\
-       andc    r12,r12,r11;            \
-       mtspr   SPRN_HSRR1,r12;         \
-       LOAD_REG_ADDR(r11,opal);        \
-       ld      r12,8(r11);             \
-       ld      r2,0(r11);              \
-       mtspr   SPRN_HSRR0,r12;         \
+_GLOBAL_TOC(__opal_call)
+       mflr    r0
+       std     r0,PPC_LR_STKOFF(r1)
+       ld      r12,STK_PARAM(R12)(r1)
+       li      r0,MSR_IR|MSR_DR|MSR_LE
+       andc    r12,r12,r0
+       LOAD_REG_ADDR(r11, opal_return)
+       mtlr    r11
+       LOAD_REG_ADDR(r11, opal)
+       ld      r2,0(r11)
+       ld      r11,8(r11)
+       mtspr   SPRN_HSRR0,r11
+       mtspr   SPRN_HSRR1,r12
+       /* set token to r0 */
+       ld      r0,STK_PARAM(R11)(r1)
        hrfid
-
-#define OPAL_CALL(name, token)         \
- _GLOBAL_TOC(name);                    \
-       mfmsr   r12;                    \
-       mflr    r0;                     \
-       andi.   r11,r12,MSR_IR|MSR_DR;  \
-       std     r0,PPC_LR_STKOFF(r1);   \
-       li      r0,token;               \
-       beq     opal_real_call;         \
-       OPAL_BRANCH(opal_tracepoint_entry) \
-       DO_OPAL_CALL()
-
-
 opal_return:
        /*
-        * Fixup endian on OPAL return... we should be able to simplify
-        * this by instead converting the below trampoline to a set of
-        * bytes (always BE) since MSR:LE will end up fixed up as a side
-        * effect of the rfid.
+        * Restore MSR on OPAL return. The MSR is set to big-endian.
         */
-       FIXUP_ENDIAN_HV
-       ld      r2,PACATOC(r13);
-       lwz     r4,8(r1);
-       ld      r5,PPC_LR_STKOFF(r1);
-       ld      r6,PACASAVEDMSR(r13);
-       mtcr    r4;
-       mtspr   SPRN_HSRR0,r5;
-       mtspr   SPRN_HSRR1,r6;
-       hrfid
-
-opal_real_call:
-       mfcr    r11
-       stw     r11,8(r1)
-       /* Set opal return address */
-       LOAD_REG_ADDR(r11, opal_return_realmode)
-       mtlr    r11
-       li      r11,MSR_LE
-       andc    r12,r12,r11
-       mtspr   SPRN_HSRR1,r12
-       LOAD_REG_ADDR(r11,opal)
-       ld      r12,8(r11)
-       ld      r2,0(r11)
-       mtspr   SPRN_HSRR0,r12
-       hrfid
-
-opal_return_realmode:
-       FIXUP_ENDIAN_HV
-       ld      r2,PACATOC(r13);
-       lwz     r11,8(r1);
-       ld      r12,PPC_LR_STKOFF(r1)
-       mtcr    r11;
-       mtlr    r12
-       blr
-
-#ifdef CONFIG_TRACEPOINTS
-opal_tracepoint_entry:
-       stdu    r1,-STACKFRAMESIZE(r1)
-       std     r0,STK_REG(R23)(r1)
-       std     r3,STK_REG(R24)(r1)
-       std     r4,STK_REG(R25)(r1)
-       std     r5,STK_REG(R26)(r1)
-       std     r6,STK_REG(R27)(r1)
-       std     r7,STK_REG(R28)(r1)
-       std     r8,STK_REG(R29)(r1)
-       std     r9,STK_REG(R30)(r1)
-       std     r10,STK_REG(R31)(r1)
-       mr      r3,r0
-       addi    r4,r1,STK_REG(R24)
-       bl      __trace_opal_entry
-       ld      r0,STK_REG(R23)(r1)
-       ld      r3,STK_REG(R24)(r1)
-       ld      r4,STK_REG(R25)(r1)
-       ld      r5,STK_REG(R26)(r1)
-       ld      r6,STK_REG(R27)(r1)
-       ld      r7,STK_REG(R28)(r1)
-       ld      r8,STK_REG(R29)(r1)
-       ld      r9,STK_REG(R30)(r1)
-       ld      r10,STK_REG(R31)(r1)
-
-       /* setup LR so we return via tracepoint_return */
-       LOAD_REG_ADDR(r11,opal_tracepoint_return)
-       std     r11,16(r1)
-
-       mfmsr   r12
-       DO_OPAL_CALL()
-
-opal_tracepoint_return:
-       std     r3,STK_REG(R31)(r1)
-       mr      r4,r3
-       ld      r3,STK_REG(R23)(r1)
-       bl      __trace_opal_exit
-       ld      r3,STK_REG(R31)(r1)
-       addi    r1,r1,STACKFRAMESIZE
-       ld      r0,16(r1)
+#ifdef __BIG_ENDIAN__
+       ld      r11,STK_PARAM(R12)(r1)
+       mtmsrd  r11
+#else
+       /* Endian can only be switched with rfi, must byte reverse MSR load */
+       .short 0x4039    /* li r10,STK_PARAM(R12)               */
+       .byte (STK_PARAM(R12) >> 8) & 0xff
+       .byte STK_PARAM(R12) & 0xff
+
+       .long 0x280c6a7d /* ldbrx r11,r10,r1                    */
+       .long 0x05009f42 /* bcl 20,31,$+4                       */
+       .long 0xa602487d /* mflr r10                            */
+       .long 0x14004a39 /* addi r10,r10,20                     */
+       .long 0xa64b5a7d /* mthsrr0 r10                         */
+       .long 0xa64b7b7d /* mthsrr1 r11                         */
+       .long 0x2402004c /* hrfid                               */
+#endif
+       ld      r2,PACATOC(r13)
+       ld      r0,PPC_LR_STKOFF(r1)
        mtlr    r0
        blr
-#endif
-
-
-OPAL_CALL(opal_invalid_call,                   OPAL_INVALID_CALL);
-OPAL_CALL(opal_console_write,                  OPAL_CONSOLE_WRITE);
-OPAL_CALL(opal_console_read,                   OPAL_CONSOLE_READ);
-OPAL_CALL(opal_console_write_buffer_space,     OPAL_CONSOLE_WRITE_BUFFER_SPACE);
-OPAL_CALL(opal_rtc_read,                       OPAL_RTC_READ);
-OPAL_CALL(opal_rtc_write,                      OPAL_RTC_WRITE);
-OPAL_CALL(opal_cec_power_down,                 OPAL_CEC_POWER_DOWN);
-OPAL_CALL(opal_cec_reboot,                     OPAL_CEC_REBOOT);
-OPAL_CALL(opal_cec_reboot2,                    OPAL_CEC_REBOOT2);
-OPAL_CALL(opal_read_nvram,                     OPAL_READ_NVRAM);
-OPAL_CALL(opal_write_nvram,                    OPAL_WRITE_NVRAM);
-OPAL_CALL(opal_handle_interrupt,               OPAL_HANDLE_INTERRUPT);
-OPAL_CALL(opal_poll_events,                    OPAL_POLL_EVENTS);
-OPAL_CALL(opal_pci_set_hub_tce_memory,         OPAL_PCI_SET_HUB_TCE_MEMORY);
-OPAL_CALL(opal_pci_set_phb_tce_memory,         OPAL_PCI_SET_PHB_TCE_MEMORY);
-OPAL_CALL(opal_pci_config_read_byte,           OPAL_PCI_CONFIG_READ_BYTE);
-OPAL_CALL(opal_pci_config_read_half_word,      OPAL_PCI_CONFIG_READ_HALF_WORD);
-OPAL_CALL(opal_pci_config_read_word,           OPAL_PCI_CONFIG_READ_WORD);
-OPAL_CALL(opal_pci_config_write_byte,          OPAL_PCI_CONFIG_WRITE_BYTE);
-OPAL_CALL(opal_pci_config_write_half_word,     OPAL_PCI_CONFIG_WRITE_HALF_WORD);
-OPAL_CALL(opal_pci_config_write_word,          OPAL_PCI_CONFIG_WRITE_WORD);
-OPAL_CALL(opal_set_xive,                       OPAL_SET_XIVE);
-OPAL_CALL(opal_get_xive,                       OPAL_GET_XIVE);
-OPAL_CALL(opal_register_exception_handler,     OPAL_REGISTER_OPAL_EXCEPTION_HANDLER);
-OPAL_CALL(opal_pci_eeh_freeze_status,          OPAL_PCI_EEH_FREEZE_STATUS);
-OPAL_CALL(opal_pci_eeh_freeze_clear,           OPAL_PCI_EEH_FREEZE_CLEAR);
-OPAL_CALL(opal_pci_eeh_freeze_set,             OPAL_PCI_EEH_FREEZE_SET);
-OPAL_CALL(opal_pci_err_inject,                 OPAL_PCI_ERR_INJECT);
-OPAL_CALL(opal_pci_shpc,                       OPAL_PCI_SHPC);
-OPAL_CALL(opal_pci_phb_mmio_enable,            OPAL_PCI_PHB_MMIO_ENABLE);
-OPAL_CALL(opal_pci_set_phb_mem_window,         OPAL_PCI_SET_PHB_MEM_WINDOW);
-OPAL_CALL(opal_pci_map_pe_mmio_window,         OPAL_PCI_MAP_PE_MMIO_WINDOW);
-OPAL_CALL(opal_pci_set_phb_table_memory,       OPAL_PCI_SET_PHB_TABLE_MEMORY);
-OPAL_CALL(opal_pci_set_pe,                     OPAL_PCI_SET_PE);
-OPAL_CALL(opal_pci_set_peltv,                  OPAL_PCI_SET_PELTV);
-OPAL_CALL(opal_pci_set_mve,                    OPAL_PCI_SET_MVE);
-OPAL_CALL(opal_pci_set_mve_enable,             OPAL_PCI_SET_MVE_ENABLE);
-OPAL_CALL(opal_pci_get_xive_reissue,           OPAL_PCI_GET_XIVE_REISSUE);
-OPAL_CALL(opal_pci_set_xive_reissue,           OPAL_PCI_SET_XIVE_REISSUE);
-OPAL_CALL(opal_pci_set_xive_pe,                        OPAL_PCI_SET_XIVE_PE);
-OPAL_CALL(opal_get_xive_source,                        OPAL_GET_XIVE_SOURCE);
-OPAL_CALL(opal_get_msi_32,                     OPAL_GET_MSI_32);
-OPAL_CALL(opal_get_msi_64,                     OPAL_GET_MSI_64);
-OPAL_CALL(opal_start_cpu,                      OPAL_START_CPU);
-OPAL_CALL(opal_query_cpu_status,               OPAL_QUERY_CPU_STATUS);
-OPAL_CALL(opal_write_oppanel,                  OPAL_WRITE_OPPANEL);
-OPAL_CALL(opal_pci_map_pe_dma_window,          OPAL_PCI_MAP_PE_DMA_WINDOW);
-OPAL_CALL(opal_pci_map_pe_dma_window_real,     OPAL_PCI_MAP_PE_DMA_WINDOW_REAL);
-OPAL_CALL(opal_pci_reset,                      OPAL_PCI_RESET);
-OPAL_CALL(opal_pci_get_hub_diag_data,          OPAL_PCI_GET_HUB_DIAG_DATA);
-OPAL_CALL(opal_pci_get_phb_diag_data,          OPAL_PCI_GET_PHB_DIAG_DATA);
-OPAL_CALL(opal_pci_fence_phb,                  OPAL_PCI_FENCE_PHB);
-OPAL_CALL(opal_pci_reinit,                     OPAL_PCI_REINIT);
-OPAL_CALL(opal_pci_mask_pe_error,              OPAL_PCI_MASK_PE_ERROR);
-OPAL_CALL(opal_set_slot_led_status,            OPAL_SET_SLOT_LED_STATUS);
-OPAL_CALL(opal_get_epow_status,                        OPAL_GET_EPOW_STATUS);
-OPAL_CALL(opal_get_dpo_status,                 OPAL_GET_DPO_STATUS);
-OPAL_CALL(opal_set_system_attention_led,       OPAL_SET_SYSTEM_ATTENTION_LED);
-OPAL_CALL(opal_pci_next_error,                 OPAL_PCI_NEXT_ERROR);
-OPAL_CALL(opal_pci_poll,                       OPAL_PCI_POLL);
-OPAL_CALL(opal_pci_msi_eoi,                    OPAL_PCI_MSI_EOI);
-OPAL_CALL(opal_pci_get_phb_diag_data2,         OPAL_PCI_GET_PHB_DIAG_DATA2);
-OPAL_CALL(opal_xscom_read,                     OPAL_XSCOM_READ);
-OPAL_CALL(opal_xscom_write,                    OPAL_XSCOM_WRITE);
-OPAL_CALL(opal_lpc_read,                       OPAL_LPC_READ);
-OPAL_CALL(opal_lpc_write,                      OPAL_LPC_WRITE);
-OPAL_CALL(opal_return_cpu,                     OPAL_RETURN_CPU);
-OPAL_CALL(opal_reinit_cpus,                    OPAL_REINIT_CPUS);
-OPAL_CALL(opal_read_elog,                      OPAL_ELOG_READ);
-OPAL_CALL(opal_send_ack_elog,                  OPAL_ELOG_ACK);
-OPAL_CALL(opal_get_elog_size,                  OPAL_ELOG_SIZE);
-OPAL_CALL(opal_resend_pending_logs,            OPAL_ELOG_RESEND);
-OPAL_CALL(opal_write_elog,                     OPAL_ELOG_WRITE);
-OPAL_CALL(opal_validate_flash,                 OPAL_FLASH_VALIDATE);
-OPAL_CALL(opal_manage_flash,                   OPAL_FLASH_MANAGE);
-OPAL_CALL(opal_update_flash,                   OPAL_FLASH_UPDATE);
-OPAL_CALL(opal_resync_timebase,                        OPAL_RESYNC_TIMEBASE);
-OPAL_CALL(opal_check_token,                    OPAL_CHECK_TOKEN);
-OPAL_CALL(opal_dump_init,                      OPAL_DUMP_INIT);
-OPAL_CALL(opal_dump_info,                      OPAL_DUMP_INFO);
-OPAL_CALL(opal_dump_info2,                     OPAL_DUMP_INFO2);
-OPAL_CALL(opal_dump_read,                      OPAL_DUMP_READ);
-OPAL_CALL(opal_dump_ack,                       OPAL_DUMP_ACK);
-OPAL_CALL(opal_get_msg,                                OPAL_GET_MSG);
-OPAL_CALL(opal_write_oppanel_async,            OPAL_WRITE_OPPANEL_ASYNC);
-OPAL_CALL(opal_check_completion,               OPAL_CHECK_ASYNC_COMPLETION);
-OPAL_CALL(opal_dump_resend_notification,       OPAL_DUMP_RESEND);
-OPAL_CALL(opal_sync_host_reboot,               OPAL_SYNC_HOST_REBOOT);
-OPAL_CALL(opal_sensor_read,                    OPAL_SENSOR_READ);
-OPAL_CALL(opal_get_param,                      OPAL_GET_PARAM);
-OPAL_CALL(opal_set_param,                      OPAL_SET_PARAM);
-OPAL_CALL(opal_handle_hmi,                     OPAL_HANDLE_HMI);
-OPAL_CALL(opal_config_cpu_idle_state,          OPAL_CONFIG_CPU_IDLE_STATE);
-OPAL_CALL(opal_slw_set_reg,                    OPAL_SLW_SET_REG);
-OPAL_CALL(opal_register_dump_region,           OPAL_REGISTER_DUMP_REGION);
-OPAL_CALL(opal_unregister_dump_region,         OPAL_UNREGISTER_DUMP_REGION);
-OPAL_CALL(opal_pci_set_phb_cxl_mode,           OPAL_PCI_SET_PHB_CAPI_MODE);
-OPAL_CALL(opal_tpo_write,                      OPAL_WRITE_TPO);
-OPAL_CALL(opal_tpo_read,                       OPAL_READ_TPO);
-OPAL_CALL(opal_ipmi_send,                      OPAL_IPMI_SEND);
-OPAL_CALL(opal_ipmi_recv,                      OPAL_IPMI_RECV);
-OPAL_CALL(opal_i2c_request,                    OPAL_I2C_REQUEST);
-OPAL_CALL(opal_flash_read,                     OPAL_FLASH_READ);
-OPAL_CALL(opal_flash_write,                    OPAL_FLASH_WRITE);
-OPAL_CALL(opal_flash_erase,                    OPAL_FLASH_ERASE);
-OPAL_CALL(opal_prd_msg,                                OPAL_PRD_MSG);
-OPAL_CALL(opal_leds_get_ind,                   OPAL_LEDS_GET_INDICATOR);
-OPAL_CALL(opal_leds_set_ind,                   OPAL_LEDS_SET_INDICATOR);
-OPAL_CALL(opal_console_flush,                  OPAL_CONSOLE_FLUSH);
-OPAL_CALL(opal_get_device_tree,                        OPAL_GET_DEVICE_TREE);
-OPAL_CALL(opal_pci_get_presence_state,         OPAL_PCI_GET_PRESENCE_STATE);
-OPAL_CALL(opal_pci_get_power_state,            OPAL_PCI_GET_POWER_STATE);
-OPAL_CALL(opal_pci_set_power_state,            OPAL_PCI_SET_POWER_STATE);
-OPAL_CALL(opal_int_get_xirr,                   OPAL_INT_GET_XIRR);
-OPAL_CALL(opal_int_set_cppr,                   OPAL_INT_SET_CPPR);
-OPAL_CALL(opal_int_eoi,                                OPAL_INT_EOI);
-OPAL_CALL(opal_int_set_mfrr,                   OPAL_INT_SET_MFRR);
-OPAL_CALL(opal_pci_tce_kill,                   OPAL_PCI_TCE_KILL);
-OPAL_CALL(opal_nmmu_set_ptcr,                  OPAL_NMMU_SET_PTCR);
-OPAL_CALL(opal_xive_reset,                     OPAL_XIVE_RESET);
-OPAL_CALL(opal_xive_get_irq_info,              OPAL_XIVE_GET_IRQ_INFO);
-OPAL_CALL(opal_xive_get_irq_config,            OPAL_XIVE_GET_IRQ_CONFIG);
-OPAL_CALL(opal_xive_set_irq_config,            OPAL_XIVE_SET_IRQ_CONFIG);
-OPAL_CALL(opal_xive_get_queue_info,            OPAL_XIVE_GET_QUEUE_INFO);
-OPAL_CALL(opal_xive_set_queue_info,            OPAL_XIVE_SET_QUEUE_INFO);
-OPAL_CALL(opal_xive_donate_page,               OPAL_XIVE_DONATE_PAGE);
-OPAL_CALL(opal_xive_alloc_vp_block,            OPAL_XIVE_ALLOCATE_VP_BLOCK);
-OPAL_CALL(opal_xive_free_vp_block,             OPAL_XIVE_FREE_VP_BLOCK);
-OPAL_CALL(opal_xive_allocate_irq,              OPAL_XIVE_ALLOCATE_IRQ);
-OPAL_CALL(opal_xive_free_irq,                  OPAL_XIVE_FREE_IRQ);
-OPAL_CALL(opal_xive_get_vp_info,               OPAL_XIVE_GET_VP_INFO);
-OPAL_CALL(opal_xive_set_vp_info,               OPAL_XIVE_SET_VP_INFO);
-OPAL_CALL(opal_xive_sync,                      OPAL_XIVE_SYNC);
-OPAL_CALL(opal_xive_dump,                      OPAL_XIVE_DUMP);
-OPAL_CALL(opal_signal_system_reset,            OPAL_SIGNAL_SYSTEM_RESET);
-OPAL_CALL(opal_npu_init_context,               OPAL_NPU_INIT_CONTEXT);
-OPAL_CALL(opal_npu_destroy_context,            OPAL_NPU_DESTROY_CONTEXT);
-OPAL_CALL(opal_npu_map_lpar,                   OPAL_NPU_MAP_LPAR);
-OPAL_CALL(opal_imc_counters_init,              OPAL_IMC_COUNTERS_INIT);
-OPAL_CALL(opal_imc_counters_start,             OPAL_IMC_COUNTERS_START);
-OPAL_CALL(opal_imc_counters_stop,              OPAL_IMC_COUNTERS_STOP);
-OPAL_CALL(opal_pci_set_p2p,                    OPAL_PCI_SET_P2P);
-OPAL_CALL(opal_get_powercap,                   OPAL_GET_POWERCAP);
-OPAL_CALL(opal_set_powercap,                   OPAL_SET_POWERCAP);
-OPAL_CALL(opal_get_power_shift_ratio,          OPAL_GET_POWER_SHIFT_RATIO);
-OPAL_CALL(opal_set_power_shift_ratio,          OPAL_SET_POWER_SHIFT_RATIO);
-OPAL_CALL(opal_sensor_group_clear,             OPAL_SENSOR_GROUP_CLEAR);
-OPAL_CALL(opal_quiesce,                                OPAL_QUIESCE);
-OPAL_CALL(opal_npu_spa_setup,                  OPAL_NPU_SPA_SETUP);
-OPAL_CALL(opal_npu_spa_clear_cache,            OPAL_NPU_SPA_CLEAR_CACHE);
-OPAL_CALL(opal_npu_tl_set,                     OPAL_NPU_TL_SET);
-OPAL_CALL(opal_pci_get_pbcq_tunnel_bar,                OPAL_PCI_GET_PBCQ_TUNNEL_BAR);
-OPAL_CALL(opal_pci_set_pbcq_tunnel_bar,                OPAL_PCI_SET_PBCQ_TUNNEL_BAR);
-OPAL_CALL(opal_sensor_read_u64,                        OPAL_SENSOR_READ_U64);
-OPAL_CALL(opal_sensor_group_enable,            OPAL_SENSOR_GROUP_ENABLE);
-OPAL_CALL(opal_nx_coproc_init,                 OPAL_NX_COPROC_INIT);
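With the wrappers generated in C, opal-wrappers.S shrinks to the single __opal_call entry point. The token and the MSR to restore arrive as the 9th and 10th C arguments, which the ELFv2 ABI places in the caller's stack parameter save area, and the stub reads them back via STK_PARAM(R11) and STK_PARAM(R12). On little-endian kernels the return path reloads the saved MSR with a byte-reversed load and switches endian through hrfid, since, as the comment above notes, the endian bit cannot simply be flipped with mtmsrd. A sketch of the C-side prototype this assumes (opal-call.c includes asm/asm-prototypes.h and calls it with exactly these arguments):

    s64 __opal_call(s64 a0, s64 a1, s64 a2, s64 a3,
                    s64 a4, s64 a5, s64 a6, s64 a7,
                    unsigned long opcode, unsigned long msr);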
index 8e157f9f1ff24999f54e5111fa5da6da926adc82..727a7de086351a53738f4dd16779fecb7db64006 100644 (file)
@@ -26,7 +26,6 @@
 #include <linux/memblock.h>
 #include <linux/kthread.h>
 #include <linux/freezer.h>
-#include <linux/printk.h>
 #include <linux/kmsg_dump.h>
 #include <linux/console.h>
 #include <linux/sched/debug.h>
@@ -586,7 +585,7 @@ int opal_machine_check(struct pt_regs *regs)
                       evt.version);
                return 0;
        }
-       machine_check_print_event_info(&evt, user_mode(regs));
+       machine_check_print_event_info(&evt, user_mode(regs), false);
 
        if (opal_recover_mce(regs, &evt))
                return 1;
index 697449afb3f77f905c7c9b2155dd0b70650cf2b4..e28f03e1eb5eb52cd565c78b125537b99efad50c 100644 (file)
@@ -313,7 +313,6 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
                        page_shift);
        tbl->it_level_size = 1ULL << (level_shift - 3);
        tbl->it_indirect_levels = levels - 1;
-       tbl->it_allocated_size = total_allocated;
        tbl->it_userspace = uas;
        tbl->it_nid = nid;
 
index 145373f0e5dc082ecd34f062143b88c82c22b24c..fa6af52b5219f309a3451589a341fecb3c83edf5 100644 (file)
@@ -1748,7 +1748,7 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev
 
        pe = &phb->ioda.pe_array[pdn->pe_number];
        WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops);
-       set_dma_offset(&pdev->dev, pe->tce_bypass_base);
+       pdev->dev.archdata.dma_offset = pe->tce_bypass_base;
        set_iommu_table_base(&pdev->dev, pe->table_group.tables[0]);
        /*
         * Note: iommu_add_device() will fail here as
@@ -1758,31 +1758,6 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev
         */
 }
 
-static bool pnv_pci_ioda_pe_single_vendor(struct pnv_ioda_pe *pe)
-{
-       unsigned short vendor = 0;
-       struct pci_dev *pdev;
-
-       if (pe->device_count == 1)
-               return true;
-
-       /* pe->pdev should be set if it's a single device, pe->pbus if not */
-       if (!pe->pbus)
-               return true;
-
-       list_for_each_entry(pdev, &pe->pbus->devices, bus_list) {
-               if (!vendor) {
-                       vendor = pdev->vendor;
-                       continue;
-               }
-
-               if (pdev->vendor != vendor)
-                       return false;
-       }
-
-       return true;
-}
-
 /*
  * Reconfigure TVE#0 to be usable as 64-bit DMA space.
  *
@@ -1852,88 +1827,45 @@ err:
        return -EIO;
 }
 
-static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
+static bool pnv_pci_ioda_iommu_bypass_supported(struct pci_dev *pdev,
+               u64 dma_mask)
 {
        struct pci_controller *hose = pci_bus_to_host(pdev->bus);
        struct pnv_phb *phb = hose->private_data;
        struct pci_dn *pdn = pci_get_pdn(pdev);
        struct pnv_ioda_pe *pe;
-       uint64_t top;
-       bool bypass = false;
-       s64 rc;
 
        if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
                return -ENODEV;
 
        pe = &phb->ioda.pe_array[pdn->pe_number];
        if (pe->tce_bypass_enabled) {
-               top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1;
-               bypass = (dma_mask >= top);
+               u64 top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1;
+               if (dma_mask >= top)
+                       return true;
        }
 
-       if (bypass) {
-               dev_info(&pdev->dev, "Using 64-bit DMA iommu bypass\n");
-               set_dma_ops(&pdev->dev, &dma_nommu_ops);
-       } else {
-               /*
-                * If the device can't set the TCE bypass bit but still wants
-                * to access 4GB or more, on PHB3 we can reconfigure TVE#0 to
-                * bypass the 32-bit region and be usable for 64-bit DMAs.
-                * The device needs to be able to address all of this space.
-                */
-               if (dma_mask >> 32 &&
-                   dma_mask > (memory_hotplug_max() + (1ULL << 32)) &&
-                   pnv_pci_ioda_pe_single_vendor(pe) &&
-                   phb->model == PNV_PHB_MODEL_PHB3) {
-                       /* Configure the bypass mode */
-                       rc = pnv_pci_ioda_dma_64bit_bypass(pe);
-                       if (rc)
-                               return rc;
-                       /* 4GB offset bypasses 32-bit space */
-                       set_dma_offset(&pdev->dev, (1ULL << 32));
-                       set_dma_ops(&pdev->dev, &dma_nommu_ops);
-               } else if (dma_mask >> 32 && dma_mask != DMA_BIT_MASK(64)) {
-                       /*
-                        * Fail the request if a DMA mask between 32 and 64 bits
-                        * was requested but couldn't be fulfilled. Ideally we
-                        * would do this for 64-bits but historically we have
-                        * always fallen back to 32-bits.
-                        */
-                       return -ENOMEM;
-               } else {
-                       dev_info(&pdev->dev, "Using 32-bit DMA via iommu\n");
-                       set_dma_ops(&pdev->dev, &dma_iommu_ops);
-               }
+       /*
+        * If the device can't set the TCE bypass bit but still wants
+        * to access 4GB or more, on PHB3 we can reconfigure TVE#0 to
+        * bypass the 32-bit region and be usable for 64-bit DMAs.
+        * The device needs to be able to address all of this space.
+        */
+       if (dma_mask >> 32 &&
+           dma_mask > (memory_hotplug_max() + (1ULL << 32)) &&
+           /* pe->pdev should be set if it's a single device, pe->pbus if not */
+           (pe->device_count == 1 || !pe->pbus) &&
+           phb->model == PNV_PHB_MODEL_PHB3) {
+               /* Configure the bypass mode */
+               s64 rc = pnv_pci_ioda_dma_64bit_bypass(pe);
+               if (rc)
+                       return rc;
+               /* 4GB offset bypasses 32-bit space */
+               pdev->dev.archdata.dma_offset = (1ULL << 32);
+               return true;
        }
-       *pdev->dev.dma_mask = dma_mask;
 
-       /* Update peer npu devices */
-       pnv_npu_try_dma_set_bypass(pdev, bypass);
-
-       return 0;
-}
-
-static u64 pnv_pci_ioda_dma_get_required_mask(struct pci_dev *pdev)
-{
-       struct pci_controller *hose = pci_bus_to_host(pdev->bus);
-       struct pnv_phb *phb = hose->private_data;
-       struct pci_dn *pdn = pci_get_pdn(pdev);
-       struct pnv_ioda_pe *pe;
-       u64 end, mask;
-
-       if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
-               return 0;
-
-       pe = &phb->ioda.pe_array[pdn->pe_number];
-       if (!pe->tce_bypass_enabled)
-               return __dma_get_required_mask(&pdev->dev);
-
-
-       end = pe->tce_bypass_base + memblock_end_of_DRAM();
-       mask = 1ULL << (fls64(end) - 1);
-       mask += mask - 1;
-
-       return mask;
+       return false;
 }
 
 static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus)
@@ -1942,7 +1874,7 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus)
 
        list_for_each_entry(dev, &bus->devices, bus_list) {
                set_iommu_table_base(&dev->dev, pe->table_group.tables[0]);
-               set_dma_offset(&dev->dev, pe->tce_bypass_base);
+               dev->dev.archdata.dma_offset = pe->tce_bypass_base;
 
                if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
                        pnv_ioda_setup_bus_dma(pe, dev->subordinate);
@@ -2594,8 +2526,13 @@ static long pnv_pci_ioda2_create_table_userspace(
                int num, __u32 page_shift, __u64 window_size, __u32 levels,
                struct iommu_table **ptbl)
 {
-       return pnv_pci_ioda2_create_table(table_group,
+       long ret = pnv_pci_ioda2_create_table(table_group,
                        num, page_shift, window_size, levels, true, ptbl);
+
+       if (!ret)
+               (*ptbl)->it_allocated_size = pnv_pci_ioda2_get_table_size(
+                               page_shift, window_size, levels);
+       return ret;
 }
 
 static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group)
@@ -3661,6 +3598,7 @@ static void pnv_pci_ioda_shutdown(struct pci_controller *hose)
 static const struct pci_controller_ops pnv_pci_ioda_controller_ops = {
        .dma_dev_setup          = pnv_pci_dma_dev_setup,
        .dma_bus_setup          = pnv_pci_dma_bus_setup,
+       .iommu_bypass_supported = pnv_pci_ioda_iommu_bypass_supported,
        .setup_msi_irqs         = pnv_setup_msi_irqs,
        .teardown_msi_irqs      = pnv_teardown_msi_irqs,
        .enable_device_hook     = pnv_pci_enable_device_hook,
@@ -3668,19 +3606,9 @@ static const struct pci_controller_ops pnv_pci_ioda_controller_ops = {
        .window_alignment       = pnv_pci_window_alignment,
        .setup_bridge           = pnv_pci_setup_bridge,
        .reset_secondary_bus    = pnv_pci_reset_secondary_bus,
-       .dma_set_mask           = pnv_pci_ioda_dma_set_mask,
-       .dma_get_required_mask  = pnv_pci_ioda_dma_get_required_mask,
        .shutdown               = pnv_pci_ioda_shutdown,
 };
 
-static int pnv_npu_dma_set_mask(struct pci_dev *npdev, u64 dma_mask)
-{
-       dev_err_once(&npdev->dev,
-                       "%s operation unsupported for NVLink devices\n",
-                       __func__);
-       return -EPERM;
-}
-
 static const struct pci_controller_ops pnv_npu_ioda_controller_ops = {
        .dma_dev_setup          = pnv_pci_dma_dev_setup,
        .setup_msi_irqs         = pnv_setup_msi_irqs,
@@ -3688,7 +3616,6 @@ static const struct pci_controller_ops pnv_npu_ioda_controller_ops = {
        .enable_device_hook     = pnv_pci_enable_device_hook,
        .window_alignment       = pnv_pci_window_alignment,
        .reset_secondary_bus    = pnv_pci_reset_secondary_bus,
-       .dma_set_mask           = pnv_npu_dma_set_mask,
        .shutdown               = pnv_pci_ioda_shutdown,
        .disable_device         = pnv_npu_disable_device,
 };
@@ -3946,9 +3873,12 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
         * shutdown PCI devices correctly. We already got IODA table
         * cleaned out. So we have to issue PHB reset to stop all PCI
         * transactions from previous kernel. The ppc_pci_reset_phbs
-        * kernel parameter will force this reset too.
+        * kernel parameter will force this reset too. Additionally,
+        * if the IODA reset above failed then use a bigger hammer.
+        * This can happen if we get a PHB fatal error in very early
+        * boot.
         */
-       if (is_kdump_kernel() || pci_reset_phbs) {
+       if (is_kdump_kernel() || pci_reset_phbs || rc) {
                pr_info("  Issue PHB reset ...\n");
                pnv_eeh_phb_reset(hose, EEH_RESET_FUNDAMENTAL);
                pnv_eeh_phb_reset(hose, EEH_RESET_DEACTIVATE);
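pci-ioda.c no longer implements its own dma_set_mask()/dma_get_required_mask() hooks. The controller now only reports, via iommu_bypass_supported, whether a device's mask allows the IOMMU to be bypassed, recording any fixed offset in dev.archdata.dma_offset, and the decision between direct and iommu mapping is left to common code. A minimal sketch of the bypass test, using the same names as the hunk above:

    /* assumes pe->tce_bypass_base and memblock_end_of_DRAM() as above */
    static bool tce_bypass_ok(u64 dma_mask, u64 tce_bypass_base)
    {
            /* the mask must cover every RAM address as seen through
             * the bypass window */
            u64 top = tce_bypass_base + memblock_end_of_DRAM() - 1;

            return dma_mask >= top;
    }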
index 0d354e19ef926e7cc9b6ca6b1170403b4c952fdd..db09c7022635bca8fce8fb6225554a3d76d8d200 100644 (file)
@@ -39,6 +39,7 @@
 #include <asm/cpuidle.h>
 #include <asm/kexec.h>
 #include <asm/reg.h>
+#include <asm/powernv.h>
 
 #include "powernv.h"
 
@@ -153,6 +154,7 @@ static void pnv_smp_cpu_kill_self(void)
 {
        unsigned int cpu;
        unsigned long srr1, wmask;
+       u64 lpcr_val;
 
        /* Standard hot unplug procedure */
        /*
@@ -174,6 +176,19 @@ static void pnv_smp_cpu_kill_self(void)
        if (cpu_has_feature(CPU_FTR_ARCH_207S))
                wmask = SRR1_WAKEMASK_P8;
 
+       /*
+        * We don't want to take decrementer interrupts while we are
+        * offline, so clear LPCR:PECE1. We keep PECE2 (and
+        * LPCR_PECE_HVEE on P9) enabled so as to let IPIs in.
+        *
+        * If the CPU gets woken up by a special wakeup, ensure that
+        * the SLW engine sets LPCR with decrementer bit cleared, else
+        * the CPU will come back to the kernel due to a spurious
+        * wakeup.
+        */
+       lpcr_val = mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1;
+       pnv_program_cpu_hotplug_lpcr(cpu, lpcr_val);
+
        while (!generic_check_cpu_restart(cpu)) {
                /*
                 * Clear IPI flag, since we don't handle IPIs while
@@ -246,6 +261,16 @@ static void pnv_smp_cpu_kill_self(void)
 
        }
 
+       /*
+        * Re-enable decrementer interrupts in LPCR.
+        *
+        * Further, we want stop states to be woken up by decrementer
+        * for non-hotplug cases. So program the LPCR via stop api as
+        * well.
+        */
+       lpcr_val = mfspr(SPRN_LPCR) | (u64)LPCR_PECE1;
+       pnv_program_cpu_hotplug_lpcr(cpu, lpcr_val);
+
        DBG("CPU%d coming online...\n", cpu);
 }
 
index e7075aaff1bb62094f8d8f1df706c7d5157496e3..59587b75493db977142b33502cf02a58218a1009 100644 (file)
@@ -354,9 +354,7 @@ static int ps3_setup_storage_dev(const struct ps3_repository_device *repo,
                 repo->dev_index, repo->dev_type, port, blk_size, num_blocks,
                 num_regions);
 
-       p = kzalloc(sizeof(struct ps3_storage_device) +
-                   num_regions * sizeof(struct ps3_storage_region),
-                   GFP_KERNEL);
+       p = kzalloc(struct_size(p, regions, num_regions), GFP_KERNEL);
        if (!p) {
                result = -ENOMEM;
                goto fail_malloc;
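struct_size() (from <linux/overflow.h>) replaces the open-coded allocation size for a structure with a trailing array of regions; it computes the same value but checks the multiplication and addition for overflow, saturating to SIZE_MAX so the allocation fails rather than being undersized. Roughly:

    /* struct_size(p, regions, num_regions) is roughly equivalent to: */
    size_t sz = sizeof(*p) + num_regions * sizeof(p->regions[0]);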
index f5387ad822798125402d76a530a6d7cc2c1733f5..4d65c538002063045950ed03a2fb3c20e572edd4 100644 (file)
@@ -205,11 +205,11 @@ static const struct os_area_db_id os_area_db_id_rtc_diff = {
  *  3) The number of seconds from 1970 to 2000.
  */
 
-struct saved_params {
+static struct saved_params {
        unsigned int valid;
        s64 rtc_diff;
        unsigned int av_multi_out;
-} static saved_params;
+} saved_params;
 
 static struct property property_rtc_diff = {
        .name = "linux,rtc_diff",
index 5cc35d6b94b6c8627527d2232ebf879f73b8e0d6..7c227e784247c0df26da431e512a685ca1cc9ecf 100644 (file)
@@ -37,12 +37,12 @@ static struct device ps3_system_bus = {
 };
 
 /* FIXME: need device usage counters! */
-struct {
+static struct {
        struct mutex mutex;
        int sb_11; /* usb 0 */
        int sb_12; /* usb 0 */
        int gpu;
-} static usage_hack;
+} usage_hack;
 
 static int ps3_is_device(struct ps3_system_bus_device *dev, u64 bus_id,
                         u64 dev_id)
index 2f8e62163602bbaa125a7b1f03d06bce9cee05e6..97feb6e79f1a707f9002371d9d4e98bb95589f07 100644 (file)
@@ -802,6 +802,25 @@ static int dlpar_cpu_add_by_count(u32 cpus_to_add)
        return rc;
 }
 
+int dlpar_cpu_readd(int cpu)
+{
+       struct device_node *dn;
+       struct device *dev;
+       u32 drc_index;
+       int rc;
+
+       dev = get_cpu_device(cpu);
+       dn = dev->of_node;
+
+       rc = of_property_read_u32(dn, "ibm,my-drc-index", &drc_index);
+
+       rc = dlpar_cpu_remove_by_index(drc_index);
+       if (!rc)
+               rc = dlpar_cpu_add(drc_index);
+
+       return rc;
+}
+
 int dlpar_cpu(struct pseries_hp_errorlog *hp_elog)
 {
        u32 count, drc_index;
index 8fc8fe0b9848536e0ee3334bc9f81efa27848695..36eb1ddbac69546355c0d7b7ebb04709cb4f0f45 100644 (file)
@@ -978,7 +978,7 @@ static phys_addr_t ddw_memory_hotplug_max(void)
  * pdn: the parent pe node with the ibm,dma_window property
  * Future: also check if we can remap the base window for our base page size
  *
- * returns the dma offset for use by dma_set_mask
+ * returns the dma offset for use by the direct mapped DMA code.
  */
 static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 {
@@ -1198,87 +1198,37 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
        iommu_add_device(pci->table_group, &dev->dev);
 }
 
-static int dma_set_mask_pSeriesLP(struct device *dev, u64 dma_mask)
+static bool iommu_bypass_supported_pSeriesLP(struct pci_dev *pdev, u64 dma_mask)
 {
-       bool ddw_enabled = false;
-       struct device_node *pdn, *dn;
-       struct pci_dev *pdev;
+       struct device_node *dn = pci_device_to_OF_node(pdev), *pdn;
        const __be32 *dma_window = NULL;
-       u64 dma_offset;
-
-       if (!dev->dma_mask)
-               return -EIO;
-
-       if (!dev_is_pci(dev))
-               goto check_mask;
-
-       pdev = to_pci_dev(dev);
 
        /* only attempt to use a new window if 64-bit DMA is requested */
-       if (!disable_ddw && dma_mask == DMA_BIT_MASK(64)) {
-               dn = pci_device_to_OF_node(pdev);
-               dev_dbg(dev, "node is %pOF\n", dn);
+       if (dma_mask < DMA_BIT_MASK(64))
+               return false;
 
-               /*
-                * the device tree might contain the dma-window properties
-                * per-device and not necessarily for the bus. So we need to
-                * search upwards in the tree until we either hit a dma-window
-                * property, OR find a parent with a table already allocated.
-                */
-               for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->table_group;
-                               pdn = pdn->parent) {
-                       dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
-                       if (dma_window)
-                               break;
-               }
-               if (pdn && PCI_DN(pdn)) {
-                       dma_offset = enable_ddw(pdev, pdn);
-                       if (dma_offset != 0) {
-                               dev_info(dev, "Using 64-bit direct DMA at offset %llx\n", dma_offset);
-                               set_dma_offset(dev, dma_offset);
-                               set_dma_ops(dev, &dma_nommu_ops);
-                               ddw_enabled = true;
-                       }
-               }
-       }
+       dev_dbg(&pdev->dev, "node is %pOF\n", dn);
 
-       /* fall back on iommu ops */
-       if (!ddw_enabled && get_dma_ops(dev) != &dma_iommu_ops) {
-               dev_info(dev, "Restoring 32-bit DMA via iommu\n");
-               set_dma_ops(dev, &dma_iommu_ops);
+       /*
+        * the device tree might contain the dma-window properties
+        * per-device and not necessarily for the bus. So we need to
+        * search upwards in the tree until we either hit a dma-window
+        * property, OR find a parent with a table already allocated.
+        */
+       for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->table_group;
+                       pdn = pdn->parent) {
+               dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
+               if (dma_window)
+                       break;
        }
 
-check_mask:
-       if (!dma_supported(dev, dma_mask))
-               return -EIO;
-
-       *dev->dma_mask = dma_mask;
-       return 0;
-}
-
-static u64 dma_get_required_mask_pSeriesLP(struct device *dev)
-{
-       if (!dev->dma_mask)
-               return 0;
-
-       if (!disable_ddw && dev_is_pci(dev)) {
-               struct pci_dev *pdev = to_pci_dev(dev);
-               struct device_node *dn;
-
-               dn = pci_device_to_OF_node(pdev);
-
-               /* search upwards for ibm,dma-window */
-               for (; dn && PCI_DN(dn) && !PCI_DN(dn)->table_group;
-                               dn = dn->parent)
-                       if (of_get_property(dn, "ibm,dma-window", NULL))
-                               break;
-               /* if there is a ibm,ddw-applicable property require 64 bits */
-               if (dn && PCI_DN(dn) &&
-                               of_get_property(dn, "ibm,ddw-applicable", NULL))
-                       return DMA_BIT_MASK(64);
+       if (pdn && PCI_DN(pdn)) {
+               pdev->dev.archdata.dma_offset = enable_ddw(pdev, pdn);
+               if (pdev->dev.archdata.dma_offset)
+                       return true;
        }
 
-       return dma_iommu_ops.get_required_mask(dev);
+       return false;
 }
 
 static int iommu_mem_notifier(struct notifier_block *nb, unsigned long action,
@@ -1373,8 +1323,9 @@ void iommu_init_early_pSeries(void)
        if (firmware_has_feature(FW_FEATURE_LPAR)) {
                pseries_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pSeriesLP;
                pseries_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pSeriesLP;
-               ppc_md.dma_set_mask = dma_set_mask_pSeriesLP;
-               ppc_md.dma_get_required_mask = dma_get_required_mask_pSeriesLP;
+               if (!disable_ddw)
+                       pseries_pci_controller_ops.iommu_bypass_supported =
+                               iommu_bypass_supported_pSeriesLP;
        } else {
                pseries_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pSeries;
                pseries_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pSeries;
index 794487313cc8d252af91e3ed113e51c624cff3d9..e73c7e30efe638ecc697f34b89ca7996912847f9 100644 (file)
@@ -475,6 +475,7 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v)
                splpar_dispatch_data(m);
 
                seq_printf(m, "purr=%ld\n", get_purr());
+               seq_printf(m, "tbr=%ld\n", mftb());
        } else {                /* non SPLPAR case */
 
                seq_printf(m, "system_active_processors=%d\n",
index 1fad4649735bde3b1a7480e90c935b00e41c9f5a..141795275ccb19bf66da2dc9bcff88684d135df5 100644 (file)
@@ -492,7 +492,9 @@ static void *vio_dma_iommu_alloc_coherent(struct device *dev, size_t size,
                return NULL;
        }
 
-       ret = dma_iommu_ops.alloc(dev, size, dma_handle, flag, attrs);
+       ret = iommu_alloc_coherent(dev, get_iommu_table_base(dev), size,
+                                   dma_handle, dev->coherent_dma_mask, flag,
+                                   dev_to_node(dev));
        if (unlikely(ret == NULL)) {
                vio_cmo_dealloc(viodev, roundup(size, PAGE_SIZE));
                atomic_inc(&viodev->cmo.allocs_failed);
@@ -507,8 +509,7 @@ static void vio_dma_iommu_free_coherent(struct device *dev, size_t size,
 {
        struct vio_dev *viodev = to_vio_dev(dev);
 
-       dma_iommu_ops.free(dev, size, vaddr, dma_handle, attrs);
-
+       iommu_free_coherent(get_iommu_table_base(dev), size, vaddr, dma_handle);
        vio_cmo_dealloc(viodev, roundup(size, PAGE_SIZE));
 }
 
@@ -518,22 +519,22 @@ static dma_addr_t vio_dma_iommu_map_page(struct device *dev, struct page *page,
                                          unsigned long attrs)
 {
        struct vio_dev *viodev = to_vio_dev(dev);
-       struct iommu_table *tbl;
+       struct iommu_table *tbl = get_iommu_table_base(dev);
        dma_addr_t ret = DMA_MAPPING_ERROR;
 
-       tbl = get_iommu_table_base(dev);
-       if (vio_cmo_alloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl)))) {
-               atomic_inc(&viodev->cmo.allocs_failed);
-               return ret;
-       }
-
-       ret = dma_iommu_ops.map_page(dev, page, offset, size, direction, attrs);
-       if (unlikely(dma_mapping_error(dev, ret))) {
-               vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl)));
-               atomic_inc(&viodev->cmo.allocs_failed);
-       }
-
+       if (vio_cmo_alloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl))))
+               goto out_fail;
+       ret = iommu_map_page(dev, tbl, page, offset, size, device_to_mask(dev),
+                       direction, attrs);
+       if (unlikely(ret == DMA_MAPPING_ERROR))
+               goto out_deallocate;
        return ret;
+
+out_deallocate:
+       vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl)));
+out_fail:
+       atomic_inc(&viodev->cmo.allocs_failed);
+       return DMA_MAPPING_ERROR;
 }
 
 static void vio_dma_iommu_unmap_page(struct device *dev, dma_addr_t dma_handle,
@@ -542,11 +543,9 @@ static void vio_dma_iommu_unmap_page(struct device *dev, dma_addr_t dma_handle,
                                     unsigned long attrs)
 {
        struct vio_dev *viodev = to_vio_dev(dev);
-       struct iommu_table *tbl;
-
-       tbl = get_iommu_table_base(dev);
-       dma_iommu_ops.unmap_page(dev, dma_handle, size, direction, attrs);
+       struct iommu_table *tbl = get_iommu_table_base(dev);
 
+       iommu_unmap_page(tbl, dma_handle, size, direction, attrs);
        vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl)));
 }
 
@@ -555,34 +554,32 @@ static int vio_dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist,
                                 unsigned long attrs)
 {
        struct vio_dev *viodev = to_vio_dev(dev);
-       struct iommu_table *tbl;
+       struct iommu_table *tbl = get_iommu_table_base(dev);
        struct scatterlist *sgl;
        int ret, count;
        size_t alloc_size = 0;
 
-       tbl = get_iommu_table_base(dev);
        for_each_sg(sglist, sgl, nelems, count)
                alloc_size += roundup(sgl->length, IOMMU_PAGE_SIZE(tbl));
 
-       if (vio_cmo_alloc(viodev, alloc_size)) {
-               atomic_inc(&viodev->cmo.allocs_failed);
-               return 0;
-       }
-
-       ret = dma_iommu_ops.map_sg(dev, sglist, nelems, direction, attrs);
-
-       if (unlikely(!ret)) {
-               vio_cmo_dealloc(viodev, alloc_size);
-               atomic_inc(&viodev->cmo.allocs_failed);
-               return ret;
-       }
+       if (vio_cmo_alloc(viodev, alloc_size))
+               goto out_fail;
+       ret = ppc_iommu_map_sg(dev, tbl, sglist, nelems, device_to_mask(dev),
+                       direction, attrs);
+       if (unlikely(!ret))
+               goto out_deallocate;
 
        for_each_sg(sglist, sgl, ret, count)
                alloc_size -= roundup(sgl->dma_length, IOMMU_PAGE_SIZE(tbl));
        if (alloc_size)
                vio_cmo_dealloc(viodev, alloc_size);
-
        return ret;
+
+out_deallocate:
+       vio_cmo_dealloc(viodev, alloc_size);
+out_fail:
+       atomic_inc(&viodev->cmo.allocs_failed);
+       return 0;
 }
 
 static void vio_dma_iommu_unmap_sg(struct device *dev,
@@ -591,40 +588,27 @@ static void vio_dma_iommu_unmap_sg(struct device *dev,
                unsigned long attrs)
 {
        struct vio_dev *viodev = to_vio_dev(dev);
-       struct iommu_table *tbl;
+       struct iommu_table *tbl = get_iommu_table_base(dev);
        struct scatterlist *sgl;
        size_t alloc_size = 0;
        int count;
 
-       tbl = get_iommu_table_base(dev);
        for_each_sg(sglist, sgl, nelems, count)
                alloc_size += roundup(sgl->dma_length, IOMMU_PAGE_SIZE(tbl));
 
-       dma_iommu_ops.unmap_sg(dev, sglist, nelems, direction, attrs);
-
+       ppc_iommu_unmap_sg(tbl, sglist, nelems, direction, attrs);
        vio_cmo_dealloc(viodev, alloc_size);
 }
 
-static int vio_dma_iommu_dma_supported(struct device *dev, u64 mask)
-{
-        return dma_iommu_ops.dma_supported(dev, mask);
-}
-
-static u64 vio_dma_get_required_mask(struct device *dev)
-{
-        return dma_iommu_ops.get_required_mask(dev);
-}
-
 static const struct dma_map_ops vio_dma_mapping_ops = {
        .alloc             = vio_dma_iommu_alloc_coherent,
        .free              = vio_dma_iommu_free_coherent,
-       .mmap              = dma_nommu_mmap_coherent,
        .map_sg            = vio_dma_iommu_map_sg,
        .unmap_sg          = vio_dma_iommu_unmap_sg,
        .map_page          = vio_dma_iommu_map_page,
        .unmap_page        = vio_dma_iommu_unmap_page,
-       .dma_supported     = vio_dma_iommu_dma_supported,
-       .get_required_mask = vio_dma_get_required_mask,
+       .dma_supported     = dma_iommu_dma_supported,
+       .get_required_mask = dma_iommu_get_required_mask,
 };
 
 /**
@@ -1715,3 +1699,10 @@ int vio_disable_interrupts(struct vio_dev *dev)
 }
 EXPORT_SYMBOL(vio_disable_interrupts);
 #endif /* CONFIG_PPC_PSERIES */
+
+static int __init vio_init(void)
+{
+       dma_debug_add_bus(&vio_bus_type);
+       return 0;
+}
+fs_initcall(vio_init);
index cf48e9cb257596698a65f88877e85a3bb33c1f8c..6c4aec25c4ba868ca05dbcf5d2848b9761ab4004 100644 (file)
@@ -29,10 +29,9 @@ _GLOBAL(mpc6xx_enter_standby)
        ori     r5, r5, ret_from_standby@l
        mtlr    r5
 
-       CURRENT_THREAD_INFO(r5, r1)
-       lwz     r6, TI_LOCAL_FLAGS(r5)
+       lwz     r6, TI_LOCAL_FLAGS(r2)
        ori     r6, r6, _TLF_SLEEPING
-       stw     r6, TI_LOCAL_FLAGS(r5)
+       stw     r6, TI_LOCAL_FLAGS(r2)
 
        mfmsr   r5
        ori     r5, r5, MSR_EE
index 25bc25fe0d93f3f41354f7ea876aa698c3055e3b..fc5c5c23303edde535e8aa3d75f4dfcf55d88445 100644 (file)
@@ -363,13 +363,6 @@ static void iommu_table_dart_setup(void)
        set_bit(iommu_table_dart.it_size - 1, iommu_table_dart.it_map);
 }
 
-static void pci_dma_dev_setup_dart(struct pci_dev *dev)
-{
-       if (dart_is_u4)
-               set_dma_offset(&dev->dev, DART_U4_BYPASS_BASE);
-       set_iommu_table_base(&dev->dev, &iommu_table_dart);
-}
-
 static void pci_dma_bus_setup_dart(struct pci_bus *bus)
 {
        if (!iommu_table_dart_inited) {
@@ -393,27 +386,18 @@ static bool dart_device_on_pcie(struct device *dev)
        return false;
 }
 
-static int dart_dma_set_mask(struct device *dev, u64 dma_mask)
+static void pci_dma_dev_setup_dart(struct pci_dev *dev)
 {
-       if (!dev->dma_mask || !dma_supported(dev, dma_mask))
-               return -EIO;
-
-       /* U4 supports a DART bypass, we use it for 64-bit capable
-        * devices to improve performances. However, that only works
-        * for devices connected to U4 own PCIe interface, not bridged
-        * through hypertransport. We need the device to support at
-        * least 40 bits of addresses.
-        */
-       if (dart_device_on_pcie(dev) && dma_mask >= DMA_BIT_MASK(40)) {
-               dev_info(dev, "Using 64-bit DMA iommu bypass\n");
-               set_dma_ops(dev, &dma_nommu_ops);
-       } else {
-               dev_info(dev, "Using 32-bit DMA via iommu\n");
-               set_dma_ops(dev, &dma_iommu_ops);
-       }
+       if (dart_is_u4 && dart_device_on_pcie(&dev->dev))
+               dev->dev.archdata.dma_offset = DART_U4_BYPASS_BASE;
+       set_iommu_table_base(&dev->dev, &iommu_table_dart);
+}
 
-       *dev->dma_mask = dma_mask;
-       return 0;
+static bool iommu_bypass_supported_dart(struct pci_dev *dev, u64 mask)
+{
+       return dart_is_u4 &&
+               dart_device_on_pcie(&dev->dev) &&
+               mask >= DMA_BIT_MASK(40);
 }
 
 void __init iommu_init_early_dart(struct pci_controller_ops *controller_ops)
@@ -431,26 +415,20 @@ void __init iommu_init_early_dart(struct pci_controller_ops *controller_ops)
 
        /* Initialize the DART HW */
        if (dart_init(dn) != 0)
-               goto bail;
-
-       /* Setup bypass if supported */
-       if (dart_is_u4)
-               ppc_md.dma_set_mask = dart_dma_set_mask;
+               return;
 
+       /*
+        * U4 supports a DART bypass; we use it for 64-bit capable devices to
+        * improve performance.  However, that only works for devices connected
+        * to U4's own PCIe interface, not those bridged through HyperTransport.
+        * The device also needs to support at least 40 bits of address.
+        */
        controller_ops->dma_dev_setup = pci_dma_dev_setup_dart;
        controller_ops->dma_bus_setup = pci_dma_bus_setup_dart;
+       controller_ops->iommu_bypass_supported = iommu_bypass_supported_dart;
 
        /* Setup pci_dma ops */
        set_pci_dma_ops(&dma_iommu_ops);
-       return;
-
- bail:
-       /* If init failed, use direct iommu and null setup functions */
-       controller_ops->dma_dev_setup = NULL;
-       controller_ops->dma_bus_setup = NULL;
-
-       /* Setup pci_dma ops */
-       set_pci_dma_ops(&dma_nommu_ops);
 }
 
 #ifdef CONFIG_PM
index 918be816b0977c5feba2a9ef79b57111de1901c3..f49aec251a5a0e7753dfa395f76f1e83b3797047 100644 (file)
@@ -40,6 +40,7 @@
 #include <asm/mpc85xx.h>
 #include <asm/disassemble.h>
 #include <asm/ppc-opcode.h>
+#include <asm/swiotlb.h>
 #include <sysdev/fsl_soc.h>
 #include <sysdev/fsl_pci.h>
 
@@ -114,33 +115,33 @@ static struct pci_ops fsl_indirect_pcie_ops =
 static u64 pci64_dma_offset;
 
 #ifdef CONFIG_SWIOTLB
+static void pci_dma_dev_setup_swiotlb(struct pci_dev *pdev)
+{
+       struct pci_controller *hose = pci_bus_to_host(pdev->bus);
+
+       pdev->dev.bus_dma_mask =
+               hose->dma_window_base_cur + hose->dma_window_size;
+}
+
 static void setup_swiotlb_ops(struct pci_controller *hose)
 {
-       if (ppc_swiotlb_enable) {
+       if (ppc_swiotlb_enable)
                hose->controller_ops.dma_dev_setup = pci_dma_dev_setup_swiotlb;
-               set_pci_dma_ops(&powerpc_swiotlb_dma_ops);
-       }
 }
 #else
 static inline void setup_swiotlb_ops(struct pci_controller *hose) {}
 #endif
 
-static int fsl_pci_dma_set_mask(struct device *dev, u64 dma_mask)
+static void fsl_pci_dma_set_mask(struct device *dev, u64 dma_mask)
 {
-       if (!dev->dma_mask || !dma_supported(dev, dma_mask))
-               return -EIO;
-
        /*
         * Fix up PCI devices that are able to DMA to the large inbound
         * mapping that allows addressing any RAM address from across PCI.
         */
        if (dev_is_pci(dev) && dma_mask >= pci64_dma_offset * 2 - 1) {
-               set_dma_ops(dev, &dma_nommu_ops);
-               set_dma_offset(dev, pci64_dma_offset);
+               dev->bus_dma_mask = 0;
+               dev->archdata.dma_offset = pci64_dma_offset;
        }
-
-       *dev->dma_mask = dma_mask;
-       return 0;
 }
 
 static int setup_one_atmu(struct ccsr_pci __iomem *pci,
index 8030a0f55e966bd5d1dab67eb12e268fc5963e7e..fd129c8ecceb85d27911806a7f560eb7d22ab155 100644 (file)
@@ -771,21 +771,6 @@ struct ipic * __init ipic_init(struct device_node *node, unsigned int flags)
        return ipic;
 }
 
-void ipic_set_highest_priority(unsigned int virq)
-{
-       struct ipic *ipic = ipic_from_irq(virq);
-       unsigned int src = virq_to_hw(virq);
-       u32 temp;
-
-       temp = ipic_read(ipic->regs, IPIC_SICFR);
-
-       /* clear and set HPI */
-       temp &= 0x7f000000;
-       temp |= (src & 0x7f) << 24;
-
-       ipic_write(ipic->regs, IPIC_SICFR, temp);
-}
-
 void ipic_set_default_priority(void)
 {
        ipic_write(primary_ipic->regs, IPIC_SIPRR_A, IPIC_PRIORITY_DEFAULT);
@@ -796,26 +781,6 @@ void ipic_set_default_priority(void)
        ipic_write(primary_ipic->regs, IPIC_SMPRR_B, IPIC_PRIORITY_DEFAULT);
 }
 
-void ipic_enable_mcp(enum ipic_mcp_irq mcp_irq)
-{
-       struct ipic *ipic = primary_ipic;
-       u32 temp;
-
-       temp = ipic_read(ipic->regs, IPIC_SERMR);
-       temp |= (1 << (31 - mcp_irq));
-       ipic_write(ipic->regs, IPIC_SERMR, temp);
-}
-
-void ipic_disable_mcp(enum ipic_mcp_irq mcp_irq)
-{
-       struct ipic *ipic = primary_ipic;
-       u32 temp;
-
-       temp = ipic_read(ipic->regs, IPIC_SERMR);
-       temp &= (1 << (31 - mcp_irq));
-       ipic_write(ipic->regs, IPIC_SERMR, temp);
-}
-
 u32 ipic_get_mcp_status(void)
 {
        return primary_ipic ? ipic_read(primary_ipic->regs, IPIC_SERSR) : 0;
index 1fd0717ade02dd2049800ef0c0bc51cbbac28a71..1f1af12f23e22325855b4ec2fd8af99efd969c91 100644 (file)
@@ -51,7 +51,7 @@ phys_addr_t get_csrbase(void)
                const void *prop = of_get_property(tsi, "reg", &size);
                tsi108_csr_base = of_translate_address(tsi, prop);
                of_node_put(tsi);
-       };
+       }
        return tsi108_csr_base;
 }
 
index 94a69a62f5db868a3c97367b64c3e2a3ae9754d3..70a8f9e31a2dc79a7b1db4c0dd63d1ea46793019 100644 (file)
@@ -442,7 +442,7 @@ static void xive_dec_target_count(int cpu)
        struct xive_cpu *xc = per_cpu(xive_cpu, cpu);
        struct xive_q *q = &xc->queue[xive_irq_priority];
 
-       if (unlikely(WARN_ON(cpu < 0 || !xc))) {
+       if (WARN_ON(cpu < 0 || !xc)) {
                pr_err("%s: cpu=%d xc=%p\n", __func__, cpu, xc);
                return;
        }
index 878f9c1d36150c80a021413f23ac36577ef881c0..3050f9323254f0086ce911d93b045311864cbd13 100644 (file)
@@ -5,6 +5,7 @@
 subdir-ccflags-y := $(call cc-disable-warning, builtin-requires-header)
 
 GCOV_PROFILE := n
+KCOV_INSTRUMENT := n
 UBSAN_SANITIZE := n
 
 # Disable ftrace for the entire directory
index 9deea5ee13f652cd6c8a8b7f1f7d58a9229f31ae..27f1e64150360e1cb030d2a3840e835a91da6219 100644 (file)
@@ -158,7 +158,7 @@ int print_insn_powerpc (unsigned long insn, unsigned long memaddr)
     dialect |= (PPC_OPCODE_POWER5 | PPC_OPCODE_POWER6 | PPC_OPCODE_POWER7
                | PPC_OPCODE_POWER8 | PPC_OPCODE_POWER9 | PPC_OPCODE_HTM
                | PPC_OPCODE_ALTIVEC | PPC_OPCODE_ALTIVEC2
-               | PPC_OPCODE_VSX | PPC_OPCODE_VSX3),
+               | PPC_OPCODE_VSX | PPC_OPCODE_VSX3);
 
   /* Get the major opcode of the insn.  */
   opcode = NULL;
index 757b8499aba2ebb52ba16850c846b726f40455f0..a0f44f9923608929d352596e43af8311698c3467 100644 (file)
@@ -2997,7 +2997,7 @@ static void show_task(struct task_struct *tsk)
        printf("%px %016lx %6d %6d %c %2d %s\n", tsk,
                tsk->thread.ksp,
                tsk->pid, rcu_dereference(tsk->parent)->pid,
-               state, task_thread_info(tsk)->cpu,
+               state, task_cpu(tsk),
                tsk->comm);
 }
 
index bd149905a5b5ba2bca9cde32e1fbb8ec0e1282da..b41311f6a94f49a8de9bbd2f9343268f6ce60630 100644 (file)
@@ -90,14 +90,14 @@ config GENERIC_CSUM
 config GENERIC_HWEIGHT
        def_bool y
 
+config FIX_EARLYCON_MEM
+       def_bool y
+
 config PGTABLE_LEVELS
        int
        default 3 if 64BIT
        default 2
 
-config HAVE_KPROBES
-       def_bool n
-
 menu "Platform type"
 
 choice
diff --git a/arch/riscv/include/asm/fixmap.h b/arch/riscv/include/asm/fixmap.h
new file mode 100644 (file)
index 0000000..57afe60
--- /dev/null
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2019 Western Digital Corporation or its affiliates.
+ */
+
+#ifndef _ASM_RISCV_FIXMAP_H
+#define _ASM_RISCV_FIXMAP_H
+
+#include <linux/kernel.h>
+#include <linux/sizes.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+
+/*
+ * Here we define all the compile-time 'special' virtual addresses.
+ * The point is to have a constant address at compile time, but to
+ * set the physical address only in the boot process.
+ *
+ * These 'compile-time allocated' memory buffers are page-sized. Use
+ * set_fixmap(idx,phys) to associate physical memory with fixmap indices.
+ */
+enum fixed_addresses {
+       FIX_HOLE,
+       FIX_EARLYCON_MEM_BASE,
+       __end_of_fixed_addresses
+};
+
+#define FIXADDR_SIZE           (__end_of_fixed_addresses * PAGE_SIZE)
+#define FIXADDR_TOP            (PAGE_OFFSET)
+#define FIXADDR_START          (FIXADDR_TOP - FIXADDR_SIZE)
+
+#define FIXMAP_PAGE_IO         PAGE_KERNEL
+
+#define __early_set_fixmap     __set_fixmap
+
+#define __late_set_fixmap      __set_fixmap
+#define __late_clear_fixmap(idx) __set_fixmap((idx), 0, FIXMAP_PAGE_CLEAR)
+
+extern void __set_fixmap(enum fixed_addresses idx,
+                        phys_addr_t phys, pgprot_t prot);
+
+#include <asm-generic/fixmap.h>
+
+#endif /* _ASM_RISCV_FIXMAP_H */
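
The new fixmap header only defines the slot indices and the address window; the
per-slot virtual address comes from the generic asm-generic/fixmap.h helpers it
includes at the bottom. As a rough, standalone illustration of the arithmetic
(the PAGE_OFFSET value below is a made-up example rather than the real RISC-V
layout, and fix_to_virt() is simplified from the generic macro):

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1ULL << PAGE_SHIFT)
#define PAGE_OFFSET	0xffffffe000000000ULL	/* example value only */

enum fixed_addresses { FIX_HOLE, FIX_EARLYCON_MEM_BASE, __end_of_fixed_addresses };

#define FIXADDR_SIZE	(__end_of_fixed_addresses * PAGE_SIZE)
#define FIXADDR_TOP	(PAGE_OFFSET)
#define FIXADDR_START	(FIXADDR_TOP - FIXADDR_SIZE)

/* Simplified form of the generic __fix_to_virt(): slots grow downwards
 * from FIXADDR_TOP, one page each. */
static unsigned long long fix_to_virt(unsigned int idx)
{
	return FIXADDR_TOP - ((unsigned long long)idx << PAGE_SHIFT);
}

int main(void)
{
	printf("FIXADDR_START         = %#llx\n", FIXADDR_START);
	printf("FIX_EARLYCON_MEM_BASE = %#llx\n", fix_to_virt(FIX_EARLYCON_MEM_BASE));
	return 0;
}

At boot, __set_fixmap() then wires the chosen physical page into that fixed
virtual address, which is how the FIX_EARLYCON_MEM_BASE slot can back an early
console mapping before the normal ioremap() machinery is available.
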
index a8179a8c1491c28a6ad3eeedda34a122bc00a2c3..1141364d990e1d6a01303993a38622f2f685c00d 100644 (file)
@@ -404,6 +404,7 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
 #define kern_addr_valid(addr)   (1) /* FIXME */
 #endif
 
+extern void setup_bootmem(void);
 extern void paging_init(void);
 
 static inline void pgtable_cache_init(void)
index 41aa73b476f44c70c85f469fc49b6c1bd8f9b76a..636a934f013a07f9ee5a49e5ed87fc9bfa3b0cfb 100644 (file)
 #include <linux/thread_info.h>
 
 #define INVALID_HARTID ULONG_MAX
+
+struct seq_file;
+extern unsigned long boot_cpu_hartid;
+
+#ifdef CONFIG_SMP
 /*
  * Mapping between linux logical cpu index and hartid.
  */
 extern unsigned long __cpuid_to_hartid_map[NR_CPUS];
 #define cpuid_to_hartid_map(cpu)    __cpuid_to_hartid_map[cpu]
 
-struct seq_file;
-
-#ifdef CONFIG_SMP
-
 /* print IPI stats */
 void show_ipi_stats(struct seq_file *p, int prec);
 
@@ -58,7 +59,14 @@ static inline void show_ipi_stats(struct seq_file *p, int prec)
 
 static inline int riscv_hartid_to_cpuid(int hartid)
 {
-       return 0;
+       if (hartid == boot_cpu_hartid)
+               return 0;
+
+       return -1;
+}
+static inline unsigned long cpuid_to_hartid_map(int cpu)
+{
+       return boot_cpu_hartid;
 }
 
 static inline void riscv_cpuid_to_hartid_mask(const struct cpumask *in,
index f8fa2c63aa897f943f65fa1e233694978528f749..cf2fca12414a4501a35037ffa020fc6a46e47277 100644 (file)
 #include <asm/smp.h>
 
 /*
- * Returns the hart ID of the given device tree node, or -1 if the device tree
- * node isn't a RISC-V hart.
+ * Returns the hart ID of the given device tree node, or -ENODEV if the node
+ * isn't an enabled and valid RISC-V hart node.
  */
 int riscv_of_processor_hartid(struct device_node *node)
 {
-       const char *isa, *status;
+       const char *isa;
        u32 hart;
 
        if (!of_device_is_compatible(node, "riscv")) {
                pr_warn("Found incompatible CPU\n");
-               return -(ENODEV);
+               return -ENODEV;
        }
 
        if (of_property_read_u32(node, "reg", &hart)) {
                pr_warn("Found CPU without hart ID\n");
-               return -(ENODEV);
-       }
-       if (hart >= NR_CPUS) {
-               pr_info("Found hart ID %d, which is above NR_CPUs.  Disabling this hart\n", hart);
-               return -(ENODEV);
+               return -ENODEV;
        }
 
-       if (of_property_read_string(node, "status", &status)) {
-               pr_warn("CPU with hartid=%d has no \"status\" property\n", hart);
-               return -(ENODEV);
-       }
-       if (strcmp(status, "okay")) {
-               pr_info("CPU with hartid=%d has a non-okay status of \"%s\"\n", hart, status);
-               return -(ENODEV);
+       if (!of_device_is_available(node)) {
+               pr_info("CPU with hartid=%d is not available\n", hart);
+               return -ENODEV;
        }
 
        if (of_property_read_string(node, "riscv,isa", &isa)) {
                pr_warn("CPU with hartid=%d has no \"riscv,isa\" property\n", hart);
-               return -(ENODEV);
+               return -ENODEV;
        }
        if (isa[0] != 'r' || isa[1] != 'v') {
                pr_warn("CPU with hartid=%d has an invalid ISA of \"%s\"\n", hart, isa);
-               return -(ENODEV);
+               return -ENODEV;
        }
 
        return hart;
@@ -106,7 +98,7 @@ static void print_isa(struct seq_file *f, const char *orig_isa)
         * a bit of info describing what went wrong.
         */
        if (isa[0] != '\0')
-               pr_info("unsupported ISA \"%s\" in device tree", orig_isa);
+               pr_info("unsupported ISA \"%s\" in device tree\n", orig_isa);
 }
 
 static void print_mmu(struct seq_file *f, const char *mmu_type)
index a6e369edbbd742663dd98d1a465327f8adfbc659..bc29b010b722f62d18fac0b7bd79c5adb77d4d4f 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/of.h>
 #include <asm/processor.h>
 #include <asm/hwcap.h>
+#include <asm/smp.h>
 
 unsigned long elf_hwcap __read_mostly;
 #ifdef CONFIG_FPU
@@ -28,7 +29,7 @@ bool has_fpu __read_mostly;
 
 void riscv_fill_hwcap(void)
 {
-       struct device_node *node = NULL;
+       struct device_node *node;
        const char *isa;
        size_t i;
        static unsigned long isa2hwcap[256] = {0};
@@ -42,36 +43,39 @@ void riscv_fill_hwcap(void)
 
        elf_hwcap = 0;
 
-       /*
-        * We don't support running Linux on hertergenous ISA systems.  For
-        * now, we just check the ISA of the first "okay" processor.
-        */
-       while ((node = of_find_node_by_type(node, "cpu")))
-               if (riscv_of_processor_hartid(node) >= 0)
-                       break;
-       if (!node) {
-               pr_warning("Unable to find \"cpu\" devicetree entry");
-               return;
-       }
+       for_each_of_cpu_node(node) {
+               unsigned long this_hwcap = 0;
 
-       if (of_property_read_string(node, "riscv,isa", &isa)) {
-               pr_warning("Unable to find \"riscv,isa\" devicetree entry");
-               of_node_put(node);
-               return;
-       }
-       of_node_put(node);
+               if (riscv_of_processor_hartid(node) < 0)
+                       continue;
+
+               if (of_property_read_string(node, "riscv,isa", &isa)) {
+                       pr_warn("Unable to find \"riscv,isa\" devicetree entry\n");
+                       continue;
+               }
 
-       for (i = 0; i < strlen(isa); ++i)
-               elf_hwcap |= isa2hwcap[(unsigned char)(isa[i])];
+               for (i = 0; i < strlen(isa); ++i)
+                       this_hwcap |= isa2hwcap[(unsigned char)(isa[i])];
+
+               /*
+                * All "okay" harts should have the same ISA. Set HWCAP to the
+                * common capabilities of every "okay" hart, in case they
+                * differ.
+                */
+               if (elf_hwcap)
+                       elf_hwcap &= this_hwcap;
+               else
+                       elf_hwcap = this_hwcap;
+       }
 
        /* We don't support systems with F but without D, so mask those out
         * here. */
        if ((elf_hwcap & COMPAT_HWCAP_ISA_F) && !(elf_hwcap & COMPAT_HWCAP_ISA_D)) {
-               pr_info("This kernel does not support systems with F but not D");
+               pr_info("This kernel does not support systems with F but not D\n");
                elf_hwcap &= ~COMPAT_HWCAP_ISA_F;
        }
 
-       pr_info("elf_hwcap is 0x%lx", elf_hwcap);
+       pr_info("elf_hwcap is 0x%lx\n", elf_hwcap);
 
 #ifdef CONFIG_FPU
        if (elf_hwcap & (COMPAT_HWCAP_ISA_F | COMPAT_HWCAP_ISA_D))
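
The rewritten riscv_fill_hwcap() loop intersects the capability bits of every
usable hart instead of trusting the first "cpu" node, so the exported hwcap only
advertises extensions that all harts implement. A standalone sketch of that
intersection with hypothetical ISA strings; the bit encoding is a simplification
of the isa2hwcap[] table used above.

#include <stdio.h>
#include <string.h>

/* One bit per single-letter extension, indexed from 'a'. */
static unsigned long isa_to_bits(const char *isa)
{
	unsigned long bits = 0;
	size_t i;

	for (i = 0; i < strlen(isa); i++)
		if (isa[i] >= 'a' && isa[i] <= 'z')
			bits |= 1UL << (isa[i] - 'a');
	return bits;
}

int main(void)
{
	const char *harts[] = { "rv64imafdc", "rv64imac" };	/* hypothetical DT values */
	unsigned long hwcap = 0;
	size_t i;

	for (i = 0; i < sizeof(harts) / sizeof(harts[0]); i++) {
		/* Skipping the "rv64" prefix is a shortcut; the real loop feeds
		 * every character through isa2hwcap[], which is simply zero for
		 * non-extension characters. */
		unsigned long this_hwcap = isa_to_bits(harts[i] + 4);

		hwcap = hwcap ? (hwcap & this_hwcap) : this_hwcap;
	}
	/* Only i, m, a and c survive: the second hart lacks f and d. */
	printf("common hwcap = %#lx\n", hwcap);
	return 0;
}
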
index a840b7d074f7d3028bb04f7d10e849db68c08c06..b94d8db5ddccbcac050da196c59c926a738894dc 100644 (file)
@@ -32,7 +32,7 @@ static int ftrace_check_current_call(unsigned long hook_pos,
         * return must be -EINVAL on failed comparison
         */
        if (memcmp(expected, replaced, sizeof(replaced))) {
-               pr_err("%p: expected (%08x %08x) but get (%08x %08x)",
+               pr_err("%p: expected (%08x %08x) but got (%08x %08x)\n",
                       (void *)hook_pos, expected[0], expected[1], replaced[0],
                       replaced[1]);
                return -EINVAL;
index 77564310235f4d0a545ddde1478e5adfb7b2d08a..ecb654f6a79ef105931a51950d520c1af845edff 100644 (file)
@@ -23,7 +23,6 @@
 #include <linux/mm.h>
 #include <linux/memblock.h>
 #include <linux/sched.h>
-#include <linux/initrd.h>
 #include <linux/console.h>
 #include <linux/screen_info.h>
 #include <linux/of_fdt.h>
@@ -61,95 +60,9 @@ EXPORT_SYMBOL(empty_zero_page);
 atomic_t hart_lottery;
 unsigned long boot_cpu_hartid;
 
-unsigned long __cpuid_to_hartid_map[NR_CPUS] = {
-       [0 ... NR_CPUS-1] = INVALID_HARTID
-};
-
-void __init smp_setup_processor_id(void)
-{
-       cpuid_to_hartid_map(0) = boot_cpu_hartid;
-}
-
-#ifdef CONFIG_BLK_DEV_INITRD
-static void __init setup_initrd(void)
-{
-       unsigned long size;
-
-       if (initrd_start >= initrd_end) {
-               printk(KERN_INFO "initrd not found or empty");
-               goto disable;
-       }
-       if (__pa(initrd_end) > PFN_PHYS(max_low_pfn)) {
-               printk(KERN_ERR "initrd extends beyond end of memory");
-               goto disable;
-       }
-
-       size =  initrd_end - initrd_start;
-       memblock_reserve(__pa(initrd_start), size);
-       initrd_below_start_ok = 1;
-
-       printk(KERN_INFO "Initial ramdisk at: 0x%p (%lu bytes)\n",
-               (void *)(initrd_start), size);
-       return;
-disable:
-       pr_cont(" - disabling initrd\n");
-       initrd_start = 0;
-       initrd_end = 0;
-}
-#endif /* CONFIG_BLK_DEV_INITRD */
-
-pgd_t swapper_pg_dir[PTRS_PER_PGD] __page_aligned_bss;
-pgd_t trampoline_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE);
-
-#ifndef __PAGETABLE_PMD_FOLDED
-#define NUM_SWAPPER_PMDS ((uintptr_t)-PAGE_OFFSET >> PGDIR_SHIFT)
-pmd_t swapper_pmd[PTRS_PER_PMD*((-PAGE_OFFSET)/PGDIR_SIZE)] __page_aligned_bss;
-pmd_t trampoline_pmd[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE);
-#endif
-
-asmlinkage void __init setup_vm(void)
-{
-       extern char _start;
-       uintptr_t i;
-       uintptr_t pa = (uintptr_t) &_start;
-       pgprot_t prot = __pgprot(pgprot_val(PAGE_KERNEL) | _PAGE_EXEC);
-
-       va_pa_offset = PAGE_OFFSET - pa;
-       pfn_base = PFN_DOWN(pa);
-
-       /* Sanity check alignment and size */
-       BUG_ON((PAGE_OFFSET % PGDIR_SIZE) != 0);
-       BUG_ON((pa % (PAGE_SIZE * PTRS_PER_PTE)) != 0);
-
-#ifndef __PAGETABLE_PMD_FOLDED
-       trampoline_pg_dir[(PAGE_OFFSET >> PGDIR_SHIFT) % PTRS_PER_PGD] =
-               pfn_pgd(PFN_DOWN((uintptr_t)trampoline_pmd),
-                       __pgprot(_PAGE_TABLE));
-       trampoline_pmd[0] = pfn_pmd(PFN_DOWN(pa), prot);
-
-       for (i = 0; i < (-PAGE_OFFSET)/PGDIR_SIZE; ++i) {
-               size_t o = (PAGE_OFFSET >> PGDIR_SHIFT) % PTRS_PER_PGD + i;
-               swapper_pg_dir[o] =
-                       pfn_pgd(PFN_DOWN((uintptr_t)swapper_pmd) + i,
-                               __pgprot(_PAGE_TABLE));
-       }
-       for (i = 0; i < ARRAY_SIZE(swapper_pmd); i++)
-               swapper_pmd[i] = pfn_pmd(PFN_DOWN(pa + i * PMD_SIZE), prot);
-#else
-       trampoline_pg_dir[(PAGE_OFFSET >> PGDIR_SHIFT) % PTRS_PER_PGD] =
-               pfn_pgd(PFN_DOWN(pa), prot);
-
-       for (i = 0; i < (-PAGE_OFFSET)/PGDIR_SIZE; ++i) {
-               size_t o = (PAGE_OFFSET >> PGDIR_SHIFT) % PTRS_PER_PGD + i;
-               swapper_pg_dir[o] =
-                       pfn_pgd(PFN_DOWN(pa + i * PGDIR_SIZE), prot);
-       }
-#endif
-}
-
 void __init parse_dtb(unsigned int hartid, void *dtb)
 {
-       if (!early_init_dt_scan(__va(dtb)))
+       if (early_init_dt_scan(__va(dtb)))
                return;
 
        pr_err("No DTB passed to the kernel\n");
@@ -159,60 +72,17 @@ void __init parse_dtb(unsigned int hartid, void *dtb)
 #endif
 }
 
-static void __init setup_bootmem(void)
-{
-       struct memblock_region *reg;
-       phys_addr_t mem_size = 0;
-
-       /* Find the memory region containing the kernel */
-       for_each_memblock(memory, reg) {
-               phys_addr_t vmlinux_end = __pa(_end);
-               phys_addr_t end = reg->base + reg->size;
-
-               if (reg->base <= vmlinux_end && vmlinux_end <= end) {
-                       /*
-                        * Reserve from the start of the region to the end of
-                        * the kernel
-                        */
-                       memblock_reserve(reg->base, vmlinux_end - reg->base);
-                       mem_size = min(reg->size, (phys_addr_t)-PAGE_OFFSET);
-               }
-       }
-       BUG_ON(mem_size == 0);
-
-       set_max_mapnr(PFN_DOWN(mem_size));
-       max_low_pfn = PFN_DOWN(memblock_end_of_DRAM());
-
-#ifdef CONFIG_BLK_DEV_INITRD
-       setup_initrd();
-#endif /* CONFIG_BLK_DEV_INITRD */
-
-       early_init_fdt_reserve_self();
-       early_init_fdt_scan_reserved_mem();
-       memblock_allow_resize();
-       memblock_dump_all();
-
-       for_each_memblock(memory, reg) {
-               unsigned long start_pfn = memblock_region_memory_base_pfn(reg);
-               unsigned long end_pfn = memblock_region_memory_end_pfn(reg);
-
-               memblock_set_node(PFN_PHYS(start_pfn),
-                                 PFN_PHYS(end_pfn - start_pfn),
-                                 &memblock.memory, 0);
-       }
-}
-
 void __init setup_arch(char **cmdline_p)
 {
-       *cmdline_p = boot_command_line;
-
-       parse_early_param();
-
        init_mm.start_code = (unsigned long) _stext;
        init_mm.end_code   = (unsigned long) _etext;
        init_mm.end_data   = (unsigned long) _edata;
        init_mm.brk        = (unsigned long) _end;
 
+       *cmdline_p = boot_command_line;
+
+       parse_early_param();
+
        setup_bootmem();
        paging_init();
        unflatten_device_tree();
@@ -231,4 +101,3 @@ void __init setup_arch(char **cmdline_p)
 
        riscv_fill_hwcap();
 }
-
index 246635eac7bb5cd4652f4551007a642f37935836..0c41d07ec281e4a6c94e7a53b975c66ac2c44e49 100644 (file)
@@ -36,6 +36,15 @@ enum ipi_message_type {
        IPI_MAX
 };
 
+unsigned long __cpuid_to_hartid_map[NR_CPUS] = {
+       [0 ... NR_CPUS-1] = INVALID_HARTID
+};
+
+void __init smp_setup_processor_id(void)
+{
+       cpuid_to_hartid_map(0) = boot_cpu_hartid;
+}
+
 /* A collection of single bit ipi messages.  */
 static struct {
        unsigned long stats[IPI_MAX] ____cacheline_aligned;
@@ -51,7 +60,6 @@ int riscv_hartid_to_cpuid(int hartid)
                        return i;
 
        pr_err("Couldn't find cpu id for hartid [%d]\n", hartid);
-       BUG();
        return i;
 }
 
index 18cda0e8cf9414310e0ec594ee30360f7938808c..eb533b5c2c8c04d6552664439602fcd204748b0c 100644 (file)
@@ -39,6 +39,7 @@
 
 void *__cpu_up_stack_pointer[NR_CPUS];
 void *__cpu_up_task_pointer[NR_CPUS];
+static DECLARE_COMPLETION(cpu_running);
 
 void __init smp_prepare_boot_cpu(void)
 {
@@ -50,12 +51,12 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 
 void __init setup_smp(void)
 {
-       struct device_node *dn = NULL;
+       struct device_node *dn;
        int hart;
        bool found_boot_cpu = false;
        int cpuid = 1;
 
-       while ((dn = of_find_node_by_type(dn, "cpu"))) {
+       for_each_of_cpu_node(dn) {
                hart = riscv_of_processor_hartid(dn);
                if (hart < 0)
                        continue;
@@ -65,6 +66,11 @@ void __init setup_smp(void)
                        found_boot_cpu = 1;
                        continue;
                }
+               if (cpuid >= NR_CPUS) {
+                       pr_warn("Invalid cpuid [%d] for hartid [%d]\n",
+                               cpuid, hart);
+                       break;
+               }
 
                cpuid_to_hartid_map(cpuid) = hart;
                set_cpu_possible(cpuid, true);
@@ -77,6 +83,7 @@ void __init setup_smp(void)
 
 int __cpu_up(unsigned int cpu, struct task_struct *tidle)
 {
+       int ret = 0;
        int hartid = cpuid_to_hartid_map(cpu);
        tidle->thread_info.cpu = cpu;
 
@@ -92,10 +99,16 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle)
                  task_stack_page(tidle) + THREAD_SIZE);
        WRITE_ONCE(__cpu_up_task_pointer[hartid], tidle);
 
-       while (!cpu_online(cpu))
-               cpu_relax();
+       lockdep_assert_held(&cpu_running);
+       wait_for_completion_timeout(&cpu_running,
+                                           msecs_to_jiffies(1000));
+
+       if (!cpu_online(cpu)) {
+               pr_crit("CPU%u: failed to come online\n", cpu);
+               ret = -EIO;
+       }
 
-       return 0;
+       return ret;
 }
 
 void __init smp_cpus_done(unsigned int max_cpus)
@@ -121,6 +134,7 @@ asmlinkage void __init smp_callin(void)
         * a local TLB flush right now just in case.
         */
        local_flush_tlb_all();
+       complete(&cpu_running);
        /*
         * Disable preemption before enabling interrupts, so we don't try to
         * schedule a CPU that hasn't actually started yet.
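
The __cpu_up()/smp_callin() changes above replace busy-waiting on cpu_online()
with a completion that the secondary hart signals, plus a one second timeout so
a hart that never comes up produces -EIO instead of a hang. Kernel completions
do not exist in user space, so the following is only a rough pthread model of
the same handshake, with all names hypothetical:

#include <pthread.h>
#include <stdio.h>
#include <time.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  done = PTHREAD_COND_INITIALIZER;
static int cpu_running;	/* stands in for the cpu_running completion */

static void *secondary(void *arg)
{
	(void)arg;
	/* ... early per-CPU setup would run here ... */
	pthread_mutex_lock(&lock);
	cpu_running = 1;			/* complete(&cpu_running) */
	pthread_cond_signal(&done);
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;
	struct timespec deadline;
	int ret = 0;

	pthread_create(&t, NULL, secondary, NULL);

	clock_gettime(CLOCK_REALTIME, &deadline);
	deadline.tv_sec += 1;			/* msecs_to_jiffies(1000) */

	pthread_mutex_lock(&lock);
	while (!cpu_running &&
	       pthread_cond_timedwait(&done, &lock, &deadline) == 0)
		;
	if (!cpu_running) {
		fprintf(stderr, "CPU failed to come online\n");
		ret = 1;			/* the kernel returns -EIO */
	}
	pthread_mutex_unlock(&lock);

	pthread_join(t, NULL);
	return ret;
}
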
index 658ebf645f42002d536983f01e828312f0dce9fb..b379a75ac6a6778052b9161612357ba5df620648 100644 (file)
@@ -17,7 +17,9 @@
 #include <linux/initrd.h>
 #include <linux/swap.h>
 #include <linux/sizes.h>
+#include <linux/of_fdt.h>
 
+#include <asm/fixmap.h>
 #include <asm/tlbflush.h>
 #include <asm/sections.h>
 #include <asm/pgtable.h>
@@ -66,7 +68,159 @@ void free_initmem(void)
 }
 
 #ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
+static void __init setup_initrd(void)
 {
+       unsigned long size;
+
+       if (initrd_start >= initrd_end) {
+               pr_info("initrd not found or empty");
+               goto disable;
+       }
+       if (__pa(initrd_end) > PFN_PHYS(max_low_pfn)) {
+               pr_err("initrd extends beyond end of memory");
+               goto disable;
+       }
+
+       size = initrd_end - initrd_start;
+       memblock_reserve(__pa(initrd_start), size);
+       initrd_below_start_ok = 1;
+
+       pr_info("Initial ramdisk at: 0x%p (%lu bytes)\n",
+               (void *)(initrd_start), size);
+       return;
+disable:
+       pr_cont(" - disabling initrd\n");
+       initrd_start = 0;
+       initrd_end = 0;
+}
+
+void __init free_initrd_mem(unsigned long start, unsigned long end)
+{
+       free_reserved_area((void *)start, (void *)end, -1, "initrd");
 }
 #endif /* CONFIG_BLK_DEV_INITRD */
+
+void __init setup_bootmem(void)
+{
+       struct memblock_region *reg;
+       phys_addr_t mem_size = 0;
+
+       /* Find the memory region containing the kernel */
+       for_each_memblock(memory, reg) {
+               phys_addr_t vmlinux_end = __pa(_end);
+               phys_addr_t end = reg->base + reg->size;
+
+               if (reg->base <= vmlinux_end && vmlinux_end <= end) {
+                       /*
+                        * Reserve from the start of the region to the end of
+                        * the kernel
+                        */
+                       memblock_reserve(reg->base, vmlinux_end - reg->base);
+                       mem_size = min(reg->size, (phys_addr_t)-PAGE_OFFSET);
+               }
+       }
+       BUG_ON(mem_size == 0);
+
+       set_max_mapnr(PFN_DOWN(mem_size));
+       max_low_pfn = PFN_DOWN(memblock_end_of_DRAM());
+
+#ifdef CONFIG_BLK_DEV_INITRD
+       setup_initrd();
+#endif /* CONFIG_BLK_DEV_INITRD */
+
+       early_init_fdt_reserve_self();
+       early_init_fdt_scan_reserved_mem();
+       memblock_allow_resize();
+       memblock_dump_all();
+
+       for_each_memblock(memory, reg) {
+               unsigned long start_pfn = memblock_region_memory_base_pfn(reg);
+               unsigned long end_pfn = memblock_region_memory_end_pfn(reg);
+
+               memblock_set_node(PFN_PHYS(start_pfn),
+                                 PFN_PHYS(end_pfn - start_pfn),
+                                 &memblock.memory, 0);
+       }
+}
+
+pgd_t swapper_pg_dir[PTRS_PER_PGD] __page_aligned_bss;
+pgd_t trampoline_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE);
+
+#ifndef __PAGETABLE_PMD_FOLDED
+#define NUM_SWAPPER_PMDS ((uintptr_t)-PAGE_OFFSET >> PGDIR_SHIFT)
+pmd_t swapper_pmd[PTRS_PER_PMD*((-PAGE_OFFSET)/PGDIR_SIZE)] __page_aligned_bss;
+pmd_t trampoline_pmd[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE);
+pmd_t fixmap_pmd[PTRS_PER_PMD] __page_aligned_bss;
+#endif
+
+pte_t fixmap_pte[PTRS_PER_PTE] __page_aligned_bss;
+
+void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t prot)
+{
+       unsigned long addr = __fix_to_virt(idx);
+       pte_t *ptep;
+
+       BUG_ON(idx <= FIX_HOLE || idx >= __end_of_fixed_addresses);
+
+       ptep = &fixmap_pte[pte_index(addr)];
+
+       if (pgprot_val(prot)) {
+               set_pte(ptep, pfn_pte(phys >> PAGE_SHIFT, prot));
+       } else {
+               pte_clear(&init_mm, addr, ptep);
+               local_flush_tlb_page(addr);
+       }
+}
+
+asmlinkage void __init setup_vm(void)
+{
+       extern char _start;
+       uintptr_t i;
+       uintptr_t pa = (uintptr_t) &_start;
+       pgprot_t prot = __pgprot(pgprot_val(PAGE_KERNEL) | _PAGE_EXEC);
+
+       va_pa_offset = PAGE_OFFSET - pa;
+       pfn_base = PFN_DOWN(pa);
+
+       /* Sanity check alignment and size */
+       BUG_ON((PAGE_OFFSET % PGDIR_SIZE) != 0);
+       BUG_ON((pa % (PAGE_SIZE * PTRS_PER_PTE)) != 0);
+
+#ifndef __PAGETABLE_PMD_FOLDED
+       trampoline_pg_dir[(PAGE_OFFSET >> PGDIR_SHIFT) % PTRS_PER_PGD] =
+               pfn_pgd(PFN_DOWN((uintptr_t)trampoline_pmd),
+                       __pgprot(_PAGE_TABLE));
+       trampoline_pmd[0] = pfn_pmd(PFN_DOWN(pa), prot);
+
+       for (i = 0; i < (-PAGE_OFFSET)/PGDIR_SIZE; ++i) {
+               size_t o = (PAGE_OFFSET >> PGDIR_SHIFT) % PTRS_PER_PGD + i;
+
+               swapper_pg_dir[o] =
+                       pfn_pgd(PFN_DOWN((uintptr_t)swapper_pmd) + i,
+                               __pgprot(_PAGE_TABLE));
+       }
+       for (i = 0; i < ARRAY_SIZE(swapper_pmd); i++)
+               swapper_pmd[i] = pfn_pmd(PFN_DOWN(pa + i * PMD_SIZE), prot);
+
+       swapper_pg_dir[(FIXADDR_START >> PGDIR_SHIFT) % PTRS_PER_PGD] =
+               pfn_pgd(PFN_DOWN((uintptr_t)fixmap_pmd),
+                               __pgprot(_PAGE_TABLE));
+       fixmap_pmd[(FIXADDR_START >> PMD_SHIFT) % PTRS_PER_PMD] =
+               pfn_pmd(PFN_DOWN((uintptr_t)fixmap_pte),
+                               __pgprot(_PAGE_TABLE));
+#else
+       trampoline_pg_dir[(PAGE_OFFSET >> PGDIR_SHIFT) % PTRS_PER_PGD] =
+               pfn_pgd(PFN_DOWN(pa), prot);
+
+       for (i = 0; i < (-PAGE_OFFSET)/PGDIR_SIZE; ++i) {
+               size_t o = (PAGE_OFFSET >> PGDIR_SHIFT) % PTRS_PER_PGD + i;
+
+               swapper_pg_dir[o] =
+                       pfn_pgd(PFN_DOWN(pa + i * PGDIR_SIZE), prot);
+       }
+
+       swapper_pg_dir[(FIXADDR_START >> PGDIR_SHIFT) % PTRS_PER_PGD] =
+               pfn_pgd(PFN_DOWN((uintptr_t)fixmap_pte),
+                               __pgprot(_PAGE_TABLE));
+#endif
+}
index 9c5a67d1b9c1b1d40e853b0ac902670465973974..2d8b9d8ca4f8753291bb1487fb0b77d7b6009280 100644 (file)
@@ -187,7 +187,6 @@ cfi-sigframe := $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,
 cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTIONS=1)
 
 # does binutils support specific instructions?
-asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1)
 asinstr += $(call as-instr,pshufb %xmm0$(comma)%xmm0,-DCONFIG_AS_SSSE3=1)
 avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1)
 avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1)
@@ -217,6 +216,11 @@ KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
 # Avoid indirect branches in kernel to deal with Spectre
 ifdef CONFIG_RETPOLINE
   KBUILD_CFLAGS += $(RETPOLINE_CFLAGS)
+  # Additionally, avoid generating expensive indirect jumps which
+  # are subject to retpolines for a small number of switch cases.
+  # clang turns off jump table generation by default when under
+  # retpoline builds; gcc, however, does not for x86.
+  KBUILD_CFLAGS += $(call cc-option,--param=case-values-threshold=20)
 endif
 
 archscripts: scripts_basic
index 9b5adae9cc40cf59a5c8244bb3eb361a4772b626..e2839b5c246c21e45ee2f783e92c2abb04bb93ea 100644 (file)
@@ -100,7 +100,7 @@ $(obj)/zoffset.h: $(obj)/compressed/vmlinux FORCE
 AFLAGS_header.o += -I$(objtree)/$(obj)
 $(obj)/header.o: $(obj)/zoffset.h
 
-LDFLAGS_setup.elf      := -T
+LDFLAGS_setup.elf      := -m elf_i386 -T
 $(obj)/setup.elf: $(src)/setup.ld $(SETUP_OBJS) FORCE
        $(call if_changed,ld)
 
index f0515ac895a43d77443a861e4068fb1656429fe5..6b84afdd75382c2ee3d0aa0f514784c2fd9e87ea 100644 (file)
@@ -84,6 +84,8 @@ ifdef CONFIG_X86_64
        vmlinux-objs-y += $(obj)/pgtable_64.o
 endif
 
+vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o
+
 $(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone
 
 vmlinux-objs-$(CONFIG_EFI_STUB) += $(obj)/eboot.o $(obj)/efi_stub_$(BITS).o \
diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c
new file mode 100644 (file)
index 0000000..0ef4ad5
--- /dev/null
@@ -0,0 +1,338 @@
+// SPDX-License-Identifier: GPL-2.0
+#define BOOT_CTYPE_H
+#include "misc.h"
+#include "error.h"
+#include "../string.h"
+
+#include <linux/numa.h>
+#include <linux/efi.h>
+#include <asm/efi.h>
+
+/*
+ * Longest parameter of 'acpi=' is 'copy_dsdt', plus an extra '\0'
+ * for termination.
+ */
+#define MAX_ACPI_ARG_LENGTH 10
+
+/*
+ * Immovable memory regions representation. Max amount of memory regions is
+ * MAX_NUMNODES*2.
+ */
+struct mem_vector immovable_mem[MAX_NUMNODES*2];
+
+/*
+ * Max length of 64-bit hex address string is 19, prefix "0x" + 16 hex
+ * digits, and '\0' for termination.
+ */
+#define MAX_ADDR_LEN 19
+
+static acpi_physical_address get_acpi_rsdp(void)
+{
+       acpi_physical_address addr = 0;
+
+#ifdef CONFIG_KEXEC
+       char val[MAX_ADDR_LEN] = { };
+       int ret;
+
+       ret = cmdline_find_option("acpi_rsdp", val, MAX_ADDR_LEN);
+       if (ret < 0)
+               return 0;
+
+       if (kstrtoull(val, 16, &addr))
+               return 0;
+#endif
+       return addr;
+}
+
+/* Search EFI system tables for RSDP. */
+static acpi_physical_address efi_get_rsdp_addr(void)
+{
+       acpi_physical_address rsdp_addr = 0;
+
+#ifdef CONFIG_EFI
+       unsigned long systab, systab_tables, config_tables;
+       unsigned int nr_tables;
+       struct efi_info *ei;
+       bool efi_64;
+       int size, i;
+       char *sig;
+
+       ei = &boot_params->efi_info;
+       sig = (char *)&ei->efi_loader_signature;
+
+       if (!strncmp(sig, EFI64_LOADER_SIGNATURE, 4)) {
+               efi_64 = true;
+       } else if (!strncmp(sig, EFI32_LOADER_SIGNATURE, 4)) {
+               efi_64 = false;
+       } else {
+               debug_putstr("Wrong EFI loader signature.\n");
+               return 0;
+       }
+
+       /* Get systab from boot params. */
+#ifdef CONFIG_X86_64
+       systab = ei->efi_systab | ((__u64)ei->efi_systab_hi << 32);
+#else
+       if (ei->efi_systab_hi || ei->efi_memmap_hi) {
+               debug_putstr("Error getting RSDP address: EFI system table located above 4GB.\n");
+               return 0;
+       }
+       systab = ei->efi_systab;
+#endif
+       if (!systab)
+               error("EFI system table not found.");
+
+       /* Handle EFI bitness properly */
+       if (efi_64) {
+               efi_system_table_64_t *stbl = (efi_system_table_64_t *)systab;
+
+               config_tables   = stbl->tables;
+               nr_tables       = stbl->nr_tables;
+               size            = sizeof(efi_config_table_64_t);
+       } else {
+               efi_system_table_32_t *stbl = (efi_system_table_32_t *)systab;
+
+               config_tables   = stbl->tables;
+               nr_tables       = stbl->nr_tables;
+               size            = sizeof(efi_config_table_32_t);
+       }
+
+       if (!config_tables)
+               error("EFI config tables not found.");
+
+       /* Get EFI tables from systab. */
+       for (i = 0; i < nr_tables; i++) {
+               acpi_physical_address table;
+               efi_guid_t guid;
+
+               config_tables += size;
+
+               if (efi_64) {
+                       efi_config_table_64_t *tbl = (efi_config_table_64_t *)config_tables;
+
+                       guid  = tbl->guid;
+                       table = tbl->table;
+
+                       if (!IS_ENABLED(CONFIG_X86_64) && table >> 32) {
+                               debug_putstr("Error getting RSDP address: EFI config table located above 4GB.\n");
+                               return 0;
+                       }
+               } else {
+                       efi_config_table_32_t *tbl = (efi_config_table_32_t *)config_tables;
+
+                       guid  = tbl->guid;
+                       table = tbl->table;
+               }
+
+               if (!(efi_guidcmp(guid, ACPI_TABLE_GUID)))
+                       rsdp_addr = table;
+               else if (!(efi_guidcmp(guid, ACPI_20_TABLE_GUID)))
+                       return table;
+       }
+#endif
+       return rsdp_addr;
+}
+
+static u8 compute_checksum(u8 *buffer, u32 length)
+{
+       u8 *end = buffer + length;
+       u8 sum = 0;
+
+       while (buffer < end)
+               sum += *(buffer++);
+
+       return sum;
+}
+
+/* Search a block of memory for the RSDP signature. */
+static u8 *scan_mem_for_rsdp(u8 *start, u32 length)
+{
+       struct acpi_table_rsdp *rsdp;
+       u8 *address, *end;
+
+       end = start + length;
+
+       /* Search from given start address for the requested length */
+       for (address = start; address < end; address += ACPI_RSDP_SCAN_STEP) {
+               /*
+                * Both RSDP signature and checksum must be correct.
+                * Note: Sometimes there exists more than one RSDP in memory;
+                * the valid RSDP has a valid checksum, all others have an
+                * invalid checksum.
+                */
+               rsdp = (struct acpi_table_rsdp *)address;
+
+               /* BAD Signature */
+               if (!ACPI_VALIDATE_RSDP_SIG(rsdp->signature))
+                       continue;
+
+               /* Check the standard checksum */
+               if (compute_checksum((u8 *)rsdp, ACPI_RSDP_CHECKSUM_LENGTH))
+                       continue;
+
+               /* Check extended checksum if table version >= 2 */
+               if ((rsdp->revision >= 2) &&
+                   (compute_checksum((u8 *)rsdp, ACPI_RSDP_XCHECKSUM_LENGTH)))
+                       continue;
+
+               /* Signature and checksum valid, we have found a real RSDP */
+               return address;
+       }
+       return NULL;
+}
+
+/* Search RSDP address in EBDA. */
+static acpi_physical_address bios_get_rsdp_addr(void)
+{
+       unsigned long address;
+       u8 *rsdp;
+
+       /* Get the location of the Extended BIOS Data Area (EBDA) */
+       address = *(u16 *)ACPI_EBDA_PTR_LOCATION;
+       address <<= 4;
+
+       /*
+        * Search EBDA paragraphs (EBDA is required to be a minimum of
+        * 1K length)
+        */
+       if (address > 0x400) {
+               rsdp = scan_mem_for_rsdp((u8 *)address, ACPI_EBDA_WINDOW_SIZE);
+               if (rsdp)
+                       return (acpi_physical_address)(unsigned long)rsdp;
+       }
+
+       /* Search upper memory: 16-byte boundaries in E0000h-FFFFFh */
+       rsdp = scan_mem_for_rsdp((u8 *) ACPI_HI_RSDP_WINDOW_BASE,
+                                       ACPI_HI_RSDP_WINDOW_SIZE);
+       if (rsdp)
+               return (acpi_physical_address)(unsigned long)rsdp;
+
+       return 0;
+}
+
+/* Return RSDP address on success, otherwise 0. */
+acpi_physical_address get_rsdp_addr(void)
+{
+       acpi_physical_address pa;
+
+       pa = get_acpi_rsdp();
+
+       if (!pa)
+               pa = boot_params->acpi_rsdp_addr;
+
+       if (!pa)
+               pa = efi_get_rsdp_addr();
+
+       if (!pa)
+               pa = bios_get_rsdp_addr();
+
+       return pa;
+}
+
+#if defined(CONFIG_RANDOMIZE_BASE) && defined(CONFIG_MEMORY_HOTREMOVE)
+/* Compute SRAT address from RSDP. */
+static unsigned long get_acpi_srat_table(void)
+{
+       unsigned long root_table, acpi_table;
+       struct acpi_table_header *header;
+       struct acpi_table_rsdp *rsdp;
+       u32 num_entries, size, len;
+       char arg[10];
+       u8 *entry;
+
+       rsdp = (struct acpi_table_rsdp *)(long)boot_params->acpi_rsdp_addr;
+       if (!rsdp)
+               return 0;
+
+       /* Get ACPI root table from RSDP.*/
+       if (!(cmdline_find_option("acpi", arg, sizeof(arg)) == 4 &&
+           !strncmp(arg, "rsdt", 4)) &&
+           rsdp->xsdt_physical_address &&
+           rsdp->revision > 1) {
+               root_table = rsdp->xsdt_physical_address;
+               size = ACPI_XSDT_ENTRY_SIZE;
+       } else {
+               root_table = rsdp->rsdt_physical_address;
+               size = ACPI_RSDT_ENTRY_SIZE;
+       }
+
+       if (!root_table)
+               return 0;
+
+       header = (struct acpi_table_header *)root_table;
+       len = header->length;
+       if (len < sizeof(struct acpi_table_header) + size)
+               return 0;
+
+       num_entries = (len - sizeof(struct acpi_table_header)) / size;
+       entry = (u8 *)(root_table + sizeof(struct acpi_table_header));
+
+       while (num_entries--) {
+               if (size == ACPI_RSDT_ENTRY_SIZE)
+                       acpi_table = *(u32 *)entry;
+               else
+                       acpi_table = *(u64 *)entry;
+
+               if (acpi_table) {
+                       header = (struct acpi_table_header *)acpi_table;
+
+                       if (ACPI_COMPARE_NAME(header->signature, ACPI_SIG_SRAT))
+                               return acpi_table;
+               }
+               entry += size;
+       }
+       return 0;
+}
+
+/**
+ * count_immovable_mem_regions - Parse SRAT and cache the immovable
+ * memory regions into the immovable_mem array.
+ *
+ * Return the number of immovable memory regions on success, 0 on failure:
+ *
+ * - Too many immovable memory regions
+ * - ACPI off or no SRAT found
+ * - No immovable memory region found.
+ */
+int count_immovable_mem_regions(void)
+{
+       unsigned long table_addr, table_end, table;
+       struct acpi_subtable_header *sub_table;
+       struct acpi_table_header *table_header;
+       char arg[MAX_ACPI_ARG_LENGTH];
+       int num = 0;
+
+       if (cmdline_find_option("acpi", arg, sizeof(arg)) == 3 &&
+           !strncmp(arg, "off", 3))
+               return 0;
+
+       table_addr = get_acpi_srat_table();
+       if (!table_addr)
+               return 0;
+
+       table_header = (struct acpi_table_header *)table_addr;
+       table_end = table_addr + table_header->length;
+       table = table_addr + sizeof(struct acpi_table_srat);
+
+       while (table + sizeof(struct acpi_subtable_header) < table_end) {
+               sub_table = (struct acpi_subtable_header *)table;
+               if (sub_table->type == ACPI_SRAT_TYPE_MEMORY_AFFINITY) {
+                       struct acpi_srat_mem_affinity *ma;
+
+                       ma = (struct acpi_srat_mem_affinity *)sub_table;
+                       if (!(ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && ma->length) {
+                               immovable_mem[num].start = ma->base_address;
+                               immovable_mem[num].size = ma->length;
+                               num++;
+                       }
+
+                       if (num >= MAX_NUMNODES*2) {
+                               debug_putstr("Too many immovable memory regions, aborting.\n");
+                               return 0;
+                       }
+               }
+               table += sub_table->length;
+       }
+       return num;
+}
+#endif /* CONFIG_RANDOMIZE_BASE && CONFIG_MEMORY_HOTREMOVE */
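
scan_mem_for_rsdp() above only accepts a candidate whose signature matches and
whose byte checksum comes out to zero: ACPI_RSDP_CHECKSUM_LENGTH covers the
20-byte ACPI 1.0 header and ACPI_RSDP_XCHECKSUM_LENGTH the 36-byte ACPI 2.0+
table. A tiny standalone check of that rule on a made-up buffer:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define RSDP_CHECKSUM_LENGTH	20	/* ACPI 1.0 header */
#define RSDP_XCHECKSUM_LENGTH	36	/* ACPI 2.0+ full table */

static uint8_t compute_checksum(const uint8_t *buf, uint32_t len)
{
	uint8_t sum = 0;

	while (len--)
		sum += *buf++;
	return sum;
}

int main(void)
{
	/* Hypothetical ACPI 1.0 RSDP: signature, checksum byte, OEM id, ... */
	uint8_t rsdp[RSDP_CHECKSUM_LENGTH] = { 0 };

	memcpy(rsdp, "RSD PTR ", 8);
	/* Patch the checksum byte (offset 8) so all 20 bytes sum to 0 mod 256. */
	rsdp[8] = (uint8_t)(0x100 - compute_checksum(rsdp, RSDP_CHECKSUM_LENGTH));

	printf("checksum valid: %s\n",
	       compute_checksum(rsdp, RSDP_CHECKSUM_LENGTH) == 0 ? "yes" : "no");
	return 0;
}
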
index af6cda0b7900f02ef9d975a3b484236c1dbaa0c9..f1add5d85da9d9881e44e2316a2f0018deb364f7 100644 (file)
@@ -1,8 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 #include "misc.h"
 
-#if CONFIG_EARLY_PRINTK || CONFIG_RANDOMIZE_BASE || CONFIG_X86_5LEVEL
-
 static unsigned long fs;
 static inline void set_fs(unsigned long seg)
 {
@@ -30,5 +28,3 @@ int cmdline_find_option_bool(const char *option)
 {
        return __cmdline_find_option_bool(get_cmd_line_ptr(), option);
 }
-
-#endif
index f62e347862ccc61ba417d80dabee304ef28b6ec7..fafb75c6c59253986fc39237abb85cbd43db04b1 100644 (file)
@@ -358,8 +358,11 @@ ENTRY(startup_64)
         * paging_prepare() sets up the trampoline and checks if we need to
         * enable 5-level paging.
         *
-        * Address of the trampoline is returned in RAX.
-        * Non zero RDX on return means we need to enable 5-level paging.
+        * paging_prepare() returns a two-quadword structure which lands
+        * in RDX:RAX:
+        *   - Address of the trampoline is returned in RAX.
+        *   - Non zero RDX means trampoline needs to enable 5-level
+        *     paging.
         *
         * RSI holds real mode data and needs to be preserved across
         * this function call.
@@ -565,7 +568,7 @@ adjust_got:
  *
  * RDI contains the return address (might be above 4G).
  * ECX contains the base address of the trampoline memory.
- * Non zero RDX on return means we need to enable 5-level paging.
+ * Non zero RDX means trampoline needs to enable 5-level paging.
  */
 ENTRY(trampoline_32bit_src)
        /* Set up data and stack segments */
@@ -655,8 +658,6 @@ no_longmode:
        .data
 gdt64:
        .word   gdt_end - gdt
-       .long   0
-       .word   0
        .quad   0
 gdt:
        .word   gdt_end - gdt
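
The reworded comments in this file describe paging_prepare() handing a
two-quadword structure back in RDX:RAX. That is simply the SysV x86-64 ABI rule
for returning a 16-byte aggregate of two integer members: the first member
comes back in RAX, the second in RDX. A small C illustration; the struct and
field names follow the comment's description and should be read as illustrative
rather than as the exact kernel definitions.

#include <stdio.h>

/* Two 8-byte members: the SysV x86-64 ABI returns this by value in
 * RAX (first member) and RDX (second member). */
struct paging_config {
	unsigned long trampoline_start;
	unsigned long l5_required;
};

static struct paging_config paging_prepare(void)
{
	struct paging_config pc = {
		.trampoline_start = 0x9e000,	/* hypothetical low-memory slot */
		.l5_required = 1,
	};
	return pc;
}

int main(void)
{
	struct paging_config pc = paging_prepare();

	printf("trampoline at %#lx, enable 5-level paging: %lu\n",
	       pc.trampoline_start, pc.l5_required);
	return 0;
}

The assembly caller can therefore take the trampoline address straight from RAX
and the 5-level-paging flag from RDX with no extra marshalling.
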
index 9ed9709d9947a58eb8b4ce99c7640e47ef8e0079..fa0332dda9f2cf67d55b143b08bf7e4402821971 100644 (file)
@@ -87,10 +87,6 @@ static unsigned long get_boot_seed(void)
 #define KASLR_COMPRESSED_BOOT
 #include "../../lib/kaslr.c"
 
-struct mem_vector {
-       unsigned long long start;
-       unsigned long long size;
-};
 
 /* Only supporting at most 4 unusable memmap regions with kaslr */
 #define MAX_MEMMAP_REGIONS     4
@@ -101,6 +97,8 @@ static bool memmap_too_large;
 /* Store memory limit specified by "mem=nn[KMG]" or "memmap=nn[KMG]" */
 static unsigned long long mem_limit = ULLONG_MAX;
 
+/* Number of immovable memory regions */
+static int num_immovable_mem;
 
 enum mem_avoid_index {
        MEM_AVOID_ZO_RANGE = 0,
@@ -417,6 +415,9 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size,
        /* Mark the memmap regions we need to avoid */
        handle_mem_options();
 
+       /* Enumerate the immovable memory regions */
+       num_immovable_mem = count_immovable_mem_regions();
+
 #ifdef CONFIG_X86_VERBOSE_BOOTUP
        /* Make sure video RAM can be used. */
        add_identity_map(0, PMD_SIZE);
@@ -572,9 +573,9 @@ static unsigned long slots_fetch_random(void)
        return 0;
 }
 
-static void process_mem_region(struct mem_vector *entry,
-                              unsigned long minimum,
-                              unsigned long image_size)
+static void __process_mem_region(struct mem_vector *entry,
+                                unsigned long minimum,
+                                unsigned long image_size)
 {
        struct mem_vector region, overlap;
        unsigned long start_orig, end;
@@ -650,6 +651,56 @@ static void process_mem_region(struct mem_vector *entry,
        }
 }
 
+static bool process_mem_region(struct mem_vector *region,
+                              unsigned long long minimum,
+                              unsigned long long image_size)
+{
+       int i;
+       /*
+        * If no immovable memory found, or MEMORY_HOTREMOVE disabled,
+        * use @region directly.
+        */
+       if (!num_immovable_mem) {
+               __process_mem_region(region, minimum, image_size);
+
+               if (slot_area_index == MAX_SLOT_AREA) {
+                       debug_putstr("Aborted e820/efi memmap scan (slot_areas full)!\n");
+                       return 1;
+               }
+               return 0;
+       }
+
+#if defined(CONFIG_MEMORY_HOTREMOVE) && defined(CONFIG_ACPI)
+       /*
+        * If immovable memory found, filter the intersection between
+        * immovable memory and @region.
+        */
+       for (i = 0; i < num_immovable_mem; i++) {
+               unsigned long long start, end, entry_end, region_end;
+               struct mem_vector entry;
+
+               if (!mem_overlaps(region, &immovable_mem[i]))
+                       continue;
+
+               start = immovable_mem[i].start;
+               end = start + immovable_mem[i].size;
+               region_end = region->start + region->size;
+
+               entry.start = clamp(region->start, start, end);
+               entry_end = clamp(region_end, start, end);
+               entry.size = entry_end - entry.start;
+
+               __process_mem_region(&entry, minimum, image_size);
+
+               if (slot_area_index == MAX_SLOT_AREA) {
+                       debug_putstr("Aborted e820/efi memmap scan when walking immovable regions (slot_areas full)!\n");
+                       return 1;
+               }
+       }
+       return 0;
+#endif
+}
+
 #ifdef CONFIG_EFI
 /*
  * Returns true if mirror region found (and must have been processed
@@ -715,11 +766,8 @@ process_efi_entries(unsigned long minimum, unsigned long image_size)
 
                region.start = md->phys_addr;
                region.size = md->num_pages << EFI_PAGE_SHIFT;
-               process_mem_region(&region, minimum, image_size);
-               if (slot_area_index == MAX_SLOT_AREA) {
-                       debug_putstr("Aborted EFI scan (slot_areas full)!\n");
+               if (process_mem_region(&region, minimum, image_size))
                        break;
-               }
        }
        return true;
 }
@@ -746,11 +794,8 @@ static void process_e820_entries(unsigned long minimum,
                        continue;
                region.start = entry->addr;
                region.size = entry->size;
-               process_mem_region(&region, minimum, image_size);
-               if (slot_area_index == MAX_SLOT_AREA) {
-                       debug_putstr("Aborted e820 scan (slot_areas full)!\n");
+               if (process_mem_region(&region, minimum, image_size))
                        break;
-               }
        }
 }
 
index 8dd1d5ccae58023fb7cb1e3c0cf83255b6b1988b..c0d6c560df69e0e63941a34539660770304ff612 100644 (file)
@@ -351,6 +351,9 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
        /* Clear flags intended for solely in-kernel use. */
        boot_params->hdr.loadflags &= ~KASLR_FLAG;
 
+       /* Save RSDP address for later use. */
+       boot_params->acpi_rsdp_addr = get_rsdp_addr();
+
        sanitize_boot_params(boot_params);
 
        if (boot_params->screen_info.orig_video_mode == 7) {
index a1d5918765f36be04d95cd503a1378919258652a..fd13655e0f9b016baba58a3ed41832284d866fe3 100644 (file)
@@ -25,6 +25,9 @@
 #include <asm/bootparam.h>
 #include <asm/bootparam_utils.h>
 
+#define BOOT_CTYPE_H
+#include <linux/acpi.h>
+
 #define BOOT_BOOT_H
 #include "../ctype.h"
 
@@ -63,12 +66,14 @@ static inline void debug_puthex(const char *s)
 
 #endif
 
-#if CONFIG_EARLY_PRINTK || CONFIG_RANDOMIZE_BASE
 /* cmdline.c */
 int cmdline_find_option(const char *option, char *buffer, int bufsize);
 int cmdline_find_option_bool(const char *option);
-#endif
 
+struct mem_vector {
+       unsigned long long start;
+       unsigned long long size;
+};
 
 #if CONFIG_RANDOMIZE_BASE
 /* kaslr.c */
@@ -116,3 +121,17 @@ static inline void console_init(void)
 void set_sev_encryption_mask(void);
 
 #endif
+
+/* acpi.c */
+#ifdef CONFIG_ACPI
+acpi_physical_address get_rsdp_addr(void);
+#else
+static inline acpi_physical_address get_rsdp_addr(void) { return 0; }
+#endif
+
+#if defined(CONFIG_RANDOMIZE_BASE) && defined(CONFIG_MEMORY_HOTREMOVE) && defined(CONFIG_ACPI)
+extern struct mem_vector immovable_mem[MAX_NUMNODES*2];
+int count_immovable_mem_regions(void);
+#else
+static inline int count_immovable_mem_regions(void) { return 0; }
+#endif
index 9e215737149103dcb627ebc9138a9b7126b24f99..f8debf7aeb4c144b286c8a12570cad23d46decc8 100644 (file)
@@ -1,5 +1,7 @@
+#include <linux/efi.h>
 #include <asm/e820/types.h>
 #include <asm/processor.h>
+#include <asm/efi.h>
 #include "pgtable.h"
 #include "../string.h"
 
@@ -37,9 +39,10 @@ int cmdline_find_option_bool(const char *option);
 
 static unsigned long find_trampoline_placement(void)
 {
-       unsigned long bios_start, ebda_start;
+       unsigned long bios_start = 0, ebda_start = 0;
        unsigned long trampoline_start;
        struct boot_e820_entry *entry;
+       char *signature;
        int i;
 
        /*
@@ -47,8 +50,18 @@ static unsigned long find_trampoline_placement(void)
         * This code is based on reserve_bios_regions().
         */
 
-       ebda_start = *(unsigned short *)0x40e << 4;
-       bios_start = *(unsigned short *)0x413 << 10;
+       /*
+        * EFI systems may not provide legacy ROM. The memory may not be mapped
+        * at all.
+        *
+        * Only look for values in the legacy ROM for non-EFI systems.
+        */
+       signature = (char *)&boot_params->efi_info.efi_loader_signature;
+       if (strncmp(signature, EFI32_LOADER_SIGNATURE, 4) &&
+           strncmp(signature, EFI64_LOADER_SIGNATURE, 4)) {
+               ebda_start = *(unsigned short *)0x40e << 4;
+               bios_start = *(unsigned short *)0x413 << 10;
+       }
 
        if (bios_start < BIOS_START_MIN || bios_start > BIOS_START_MAX)
                bios_start = BIOS_START_MAX;
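
The hunk above only dereferences the BIOS data area words (EBDA segment at 0x40e, base-memory size at 0x413) when the boot did not come from an EFI loader, judged from the loader signature stored in boot_params. A sketch of that check factored into a helper; the helper name is illustrative, and the struct/constant definitions are assumed to come from asm/bootparam.h and asm/efi.h.

        /* Sketch: decide whether an EFI loader started the kernel by checking
         * the 4-byte loader signature; only non-EFI boots should touch the
         * legacy BIOS data area. */
        static bool booted_via_efi(const struct boot_params *bp)
        {
                const char *sig = (const char *)&bp->efi_info.efi_loader_signature;

                return !strncmp(sig, EFI32_LOADER_SIGNATURE, 4) ||
                       !strncmp(sig, EFI64_LOADER_SIGNATURE, 4);
        }
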
index f491bbde8493142187aa7d9db3d42944d6378d0c..508cfa6828c5d88cd8f616e28df686ec625dd5f5 100644 (file)
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #include <asm-generic/vmlinux.lds.h>
 
-OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT)
+OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT)
 
 #undef i386
 
index 96a6c7563538364d2dee7e307846815156c11c33..0149e41d42c270c2a9337f2890e698c9870eb33c 100644 (file)
@@ -3,7 +3,7 @@
  *
  * Linker script for the i386 setup code
  */
-OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
+OUTPUT_FORMAT("elf32-i386")
 OUTPUT_ARCH(i386)
 ENTRY(_start)
 
index c4428a176973311950429e0ba00c4ce13e9f6fe6..315a67b8896b9588c78b6128014f4702cdf965af 100644 (file)
  */
 
 #include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
 #include <asm/asm.h>
 #include "ctype.h"
 #include "string.h"
 
+#define KSTRTOX_OVERFLOW       (1U << 31)
+
 /*
  * Undef these macros so that the functions that we provide
  * here will have the correct names regardless of how string.h
@@ -187,3 +191,140 @@ char *strchr(const char *s, int c)
                        return NULL;
        return (char *)s;
 }
+
+static inline u64 __div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
+{
+       union {
+               u64 v64;
+               u32 v32[2];
+       } d = { dividend };
+       u32 upper;
+
+       upper = d.v32[1];
+       d.v32[1] = 0;
+       if (upper >= divisor) {
+               d.v32[1] = upper / divisor;
+               upper %= divisor;
+       }
+       asm ("divl %2" : "=a" (d.v32[0]), "=d" (*remainder) :
+               "rm" (divisor), "0" (d.v32[0]), "1" (upper));
+       return d.v64;
+}
+
+static inline u64 __div_u64(u64 dividend, u32 divisor)
+{
+       u32 remainder;
+
+       return __div_u64_rem(dividend, divisor, &remainder);
+}
+
+static inline char _tolower(const char c)
+{
+       return c | 0x20;
+}
+
+static const char *_parse_integer_fixup_radix(const char *s, unsigned int *base)
+{
+       if (*base == 0) {
+               if (s[0] == '0') {
+                       if (_tolower(s[1]) == 'x' && isxdigit(s[2]))
+                               *base = 16;
+                       else
+                               *base = 8;
+               } else
+                       *base = 10;
+       }
+       if (*base == 16 && s[0] == '0' && _tolower(s[1]) == 'x')
+               s += 2;
+       return s;
+}
+
+/*
+ * Convert non-negative integer string representation in explicitly given radix
+ * to an integer.
+ * Return the number of characters consumed, possibly OR-ed with the overflow bit.
+ * If overflow occurs, result integer (incorrect) is still returned.
+ *
+ * Don't you dare use this function.
+ */
+static unsigned int _parse_integer(const char *s,
+                                  unsigned int base,
+                                  unsigned long long *p)
+{
+       unsigned long long res;
+       unsigned int rv;
+
+       res = 0;
+       rv = 0;
+       while (1) {
+               unsigned int c = *s;
+               unsigned int lc = c | 0x20; /* don't tolower() this line */
+               unsigned int val;
+
+               if ('0' <= c && c <= '9')
+                       val = c - '0';
+               else if ('a' <= lc && lc <= 'f')
+                       val = lc - 'a' + 10;
+               else
+                       break;
+
+               if (val >= base)
+                       break;
+               /*
+                * Check for overflow only if we are within range of
+                * it in the max base we support (16)
+                */
+               if (unlikely(res & (~0ull << 60))) {
+                       if (res > __div_u64(ULLONG_MAX - val, base))
+                               rv |= KSTRTOX_OVERFLOW;
+               }
+               res = res * base + val;
+               rv++;
+               s++;
+       }
+       *p = res;
+       return rv;
+}
+
+static int _kstrtoull(const char *s, unsigned int base, unsigned long long *res)
+{
+       unsigned long long _res;
+       unsigned int rv;
+
+       s = _parse_integer_fixup_radix(s, &base);
+       rv = _parse_integer(s, base, &_res);
+       if (rv & KSTRTOX_OVERFLOW)
+               return -ERANGE;
+       if (rv == 0)
+               return -EINVAL;
+       s += rv;
+       if (*s == '\n')
+               s++;
+       if (*s)
+               return -EINVAL;
+       *res = _res;
+       return 0;
+}
+
+/**
+ * kstrtoull - convert a string to an unsigned long long
+ * @s: The start of the string. The string must be null-terminated, and may also
+ *  include a single newline before its terminating null. The first character
+ *  may also be a plus sign, but not a minus sign.
+ * @base: The number base to use. The maximum supported base is 16. If base is
+ *  given as 0, then the base of the string is automatically detected with the
+ *  conventional semantics: if it begins with 0x the number will be parsed as
+ *  hexadecimal (case insensitive); if it otherwise begins with 0, it will be
+ *  parsed as octal; otherwise it will be parsed as decimal.
+ * @res: Where to write the result of the conversion on success.
+ *
+ * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error.
+ * Used as a replacement for the obsolete simple_strtoull. Return code must
+ * be checked.
+ */
+int kstrtoull(const char *s, unsigned int base, unsigned long long *res)
+{
+       if (s[0] == '+')
+               s++;
+       return _kstrtoull(s, base, res);
+}
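
A short usage sketch of the semantics implemented and documented above; it is not part of the patch, and return-value checks are mostly elided for brevity.

        static void kstrtoull_example(void)
        {
                unsigned long long v;

                /* Base 0 auto-detects: "0x" selects hex, a leading '0' selects octal. */
                kstrtoull("0x1000", 0, &v);   /* v == 4096 */
                kstrtoull("0755", 0, &v);     /* v == 493 */

                /* One trailing newline and one leading '+' are tolerated. */
                kstrtoull("1024\n", 10, &v);  /* v == 1024 */
                kstrtoull("+42", 10, &v);     /* v == 42 */

                /* Any other trailing character rejects the whole string. */
                if (kstrtoull("500M", 10, &v))
                        ;                     /* returns -EINVAL, v untouched */
        }
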
index 3d78e27077f414ef671643d44580ec015c936951..38d8f2f5e47e2bf66ff9bb02a994f848dde511b8 100644 (file)
@@ -29,4 +29,5 @@ extern unsigned int atou(const char *s);
 extern unsigned long long simple_strtoull(const char *cp, char **endp,
                                          unsigned int base);
 
+int kstrtoull(const char *s, unsigned int base, unsigned long long *res);
 #endif /* BOOT_STRING_H */
index 7ca67c482f4ca1d6fc510863be1f5ff796c24185..9f908112bbb97e35b87aaf4a440fbd8d47c742e7 100644 (file)
@@ -309,3 +309,5 @@ CONFIG_SECURITY_SELINUX_BOOTPARAM=y
 CONFIG_SECURITY_SELINUX_DISABLE=y
 CONFIG_CRYPTO_AES_586=y
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
+CONFIG_EFI_STUB=y
+CONFIG_ACPI_BGRT=y
index 5d42a20e0986ee212b8fbf3821ca1ca8740b4544..1d3badfda09ee86d5119599a5ad9d8d9e4960ba3 100644 (file)
@@ -307,3 +307,6 @@ CONFIG_SECURITY_SELINUX=y
 CONFIG_SECURITY_SELINUX_BOOTPARAM=y
 CONFIG_SECURITY_SELINUX_DISABLE=y
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
+CONFIG_EFI_STUB=y
+CONFIG_EFI_MIXED=y
+CONFIG_ACPI_BGRT=y
index 17096d3cd616483b0958a90c98f18c4397e9dc0b..7ec265bacb6a2622ceba7d42dcc98ebca7b2ec31 100644 (file)
@@ -4220,6 +4220,8 @@ __init int intel_pmu_init(void)
 
        case INTEL_FAM6_CORE2_MEROM:
                x86_add_quirk(intel_clovertown_quirk);
+               /* fall through */
+
        case INTEL_FAM6_CORE2_MEROM_L:
        case INTEL_FAM6_CORE2_PENRYN:
        case INTEL_FAM6_CORE2_DUNNINGTON:
index c88ed39582a10095b41b364bf6532b0f5298c6e2..580c1b91c454024cf6062b8c1013ac1f8a1d5e5a 100644 (file)
@@ -931,6 +931,7 @@ static int branch_type(unsigned long from, unsigned long to, int abort)
                        ret = X86_BR_ZERO_CALL;
                        break;
                }
+               /* fall through */
        case 0x9a: /* call far absolute */
                ret = X86_BR_CALL;
                break;
index 1908214b91257f1d2442e2e6fc08b2f4c4f5bf65..ce92c4acc913368b8383d85555c6fcac69fcb470 100644 (file)
@@ -7,7 +7,6 @@
 
 #include <asm-generic/asm-prototypes.h>
 
-#include <asm/page.h>
 #include <asm/pgtable.h>
 #include <asm/special_insns.h>
 #include <asm/preempt.h>
index fa2c93cb42a27e9eecd3774a6fba2cef9dffa334..fb04a3ded7ddb2ab284404f0caf0f1e6b1af23aa 100644 (file)
@@ -137,37 +137,25 @@ static inline int copy_fxregs_to_user(struct fxregs_state __user *fx)
 {
        if (IS_ENABLED(CONFIG_X86_32))
                return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx));
-       else if (IS_ENABLED(CONFIG_AS_FXSAVEQ))
+       else
                return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx));
 
-       /* See comment in copy_fxregs_to_kernel() below. */
-       return user_insn(rex64/fxsave (%[fx]), "=m" (*fx), [fx] "R" (fx));
 }
 
 static inline void copy_kernel_to_fxregs(struct fxregs_state *fx)
 {
-       if (IS_ENABLED(CONFIG_X86_32)) {
+       if (IS_ENABLED(CONFIG_X86_32))
                kernel_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
-       } else {
-               if (IS_ENABLED(CONFIG_AS_FXSAVEQ)) {
-                       kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
-               } else {
-                       /* See comment in copy_fxregs_to_kernel() below. */
-                       kernel_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx), "m" (*fx));
-               }
-       }
+       else
+               kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
 }
 
 static inline int copy_user_to_fxregs(struct fxregs_state __user *fx)
 {
        if (IS_ENABLED(CONFIG_X86_32))
                return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
-       else if (IS_ENABLED(CONFIG_AS_FXSAVEQ))
+       else
                return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
-
-       /* See comment in copy_fxregs_to_kernel() below. */
-       return user_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx),
-                         "m" (*fx));
 }
 
 static inline void copy_kernel_to_fregs(struct fregs_state *fx)
@@ -184,34 +172,8 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu)
 {
        if (IS_ENABLED(CONFIG_X86_32))
                asm volatile( "fxsave %[fx]" : [fx] "=m" (fpu->state.fxsave));
-       else if (IS_ENABLED(CONFIG_AS_FXSAVEQ))
+       else
                asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state.fxsave));
-       else {
-               /* Using "rex64; fxsave %0" is broken because, if the memory
-                * operand uses any extended registers for addressing, a second
-                * REX prefix will be generated (to the assembler, rex64
-                * followed by semicolon is a separate instruction), and hence
-                * the 64-bitness is lost.
-                *
-                * Using "fxsaveq %0" would be the ideal choice, but is only
-                * supported starting with gas 2.16.
-                *
-                * Using, as a workaround, the properly prefixed form below
-                * isn't accepted by any binutils version so far released,
-                * complaining that the same type of prefix is used twice if
-                * an extended register is needed for addressing (fix submitted
-                * to mainline 2005-11-21).
-                *
-                *  asm volatile("rex64/fxsave %0" : "=m" (fpu->state.fxsave));
-                *
-                * This, however, we can work around by forcing the compiler to
-                * select an addressing mode that doesn't require extended
-                * registers.
-                */
-               asm volatile( "rex64/fxsave (%[fx])"
-                            : "=m" (fpu->state.fxsave)
-                            : [fx] "R" (&fpu->state.fxsave));
-       }
 }
 
 /* These macros all use (%edi)/(%rdi) as the single memory argument. */
@@ -414,6 +376,13 @@ static inline int copy_fpregs_to_fpstate(struct fpu *fpu)
 {
        if (likely(use_xsave())) {
                copy_xregs_to_kernel(&fpu->state.xsave);
+
+               /*
+                * AVX512 state is tracked here because its use is
+                * known to slow the max clock speed of the core.
+                */
+               if (fpu->state.xsave.header.xfeatures & XFEATURE_MASK_AVX512)
+                       fpu->avx512_timestamp = jiffies;
                return 1;
        }
 
index 202c53918ecfa6d82c616eac1776ee88c5f81426..2e32e178e064592221b14133bf4724591950ec96 100644 (file)
@@ -302,6 +302,13 @@ struct fpu {
         */
        unsigned char                   initialized;
 
+       /*
+        * @avx512_timestamp:
+        *
+        * Records the timestamp of AVX512 use during last context switch.
+        */
+       unsigned long                   avx512_timestamp;
+
        /*
         * @state:
         *
index 9c85b54bf03cad528c20e98df3bc31771b6e5e25..0bb5663156218045f025e9ba3ad1abb1c18564fb 100644 (file)
@@ -259,8 +259,7 @@ extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
 extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
 
 #define gup_fast_permitted gup_fast_permitted
-static inline bool gup_fast_permitted(unsigned long start, int nr_pages,
-               int write)
+static inline bool gup_fast_permitted(unsigned long start, int nr_pages)
 {
        unsigned long len, end;
 
index 33051436c86459131a23005b67ecfdaa952cc123..2bb3a648fc12c966951caa1443063a972ef56e1a 100644 (file)
@@ -742,7 +742,6 @@ enum idle_boot_override {IDLE_NO_OVERRIDE=0, IDLE_HALT, IDLE_NOMWAIT,
 extern void enable_sep_cpu(void);
 extern int sysenter_setup(void);
 
-void early_trap_pf_init(void);
 
 /* Defined in head.S */
 extern struct desc_ptr         early_gdt_descr;
index 62004d22524a7cea9fa04ae83f1610e4ba5d6985..1954dd5552a2e2fbeaf21937ad4c6d98c6ba0aff 100644 (file)
@@ -34,10 +34,7 @@ static inline void set_fs(mm_segment_t fs)
 }
 
 #define segment_eq(a, b)       ((a).seg == (b).seg)
-
 #define user_addr_max() (current->thread.addr_limit.seg)
-#define __addr_ok(addr)        \
-       ((unsigned long __force)(addr) < user_addr_max())
 
 /*
  * Test whether a block of memory is a valid user space address.
index 3f697a9e3f59b37e4fa37cf0f672114a2d84312b..8cfccc3cbbf42eb59f9b815f21164785d515b53c 100644 (file)
@@ -141,7 +141,6 @@ enum uv_memprotect {
  */
 extern s64 uv_bios_call(enum uv_bios_cmd, u64, u64, u64, u64, u64);
 extern s64 uv_bios_call_irqsave(enum uv_bios_cmd, u64, u64, u64, u64, u64);
-extern s64 uv_bios_call_reentrant(enum uv_bios_cmd, u64, u64, u64, u64, u64);
 
 extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *, long *);
 extern s64 uv_bios_freq_base(u64, u64 *);
@@ -152,11 +151,7 @@ extern s64 uv_bios_change_memprotect(u64, u64, enum uv_memprotect);
 extern s64 uv_bios_reserved_page_pa(u64, u64 *, u64 *, u64 *);
 extern int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus);
 
-#ifdef CONFIG_EFI
 extern void uv_bios_init(void);
-#else
-void uv_bios_init(void) { }
-#endif
 
 extern unsigned long sn_rtc_cycles_per_second;
 extern int uv_type;
index 0c26b1b44e51aeffb72a6daa151049880f52892f..4203d4f0c68d02e826a824873eaba1fdcf466eb3 100644 (file)
@@ -90,7 +90,7 @@ ret_point:
 .data
 ALIGN
 ENTRY(saved_magic)     .long   0
-ENTRY(saved_eip)       .long   0
+saved_eip:             .long 0
 
 # saved registers
 saved_idt:     .long   0,0
index 50b8ed0317a34bb1a5d0fb9296462775f62a11bc..510fa12aab73a7232ee6287df33f79227000b4be 100644 (file)
@@ -125,12 +125,12 @@ ENTRY(do_suspend_lowlevel)
 ENDPROC(do_suspend_lowlevel)
 
 .data
-ENTRY(saved_rbp)       .quad   0
-ENTRY(saved_rsi)       .quad   0
-ENTRY(saved_rdi)       .quad   0
-ENTRY(saved_rbx)       .quad   0
+saved_rbp:             .quad   0
+saved_rsi:             .quad   0
+saved_rdi:             .quad   0
+saved_rbx:             .quad   0
 
-ENTRY(saved_rip)       .quad   0
-ENTRY(saved_rsp)       .quad   0
+saved_rip:             .quad   0
+saved_rsp:             .quad   0
 
 ENTRY(saved_magic)     .quad   0
index 2953bbf05c0857e36fc11ec7a7ae272e9e63bb63..264e3221d9233eaf0505338ae6f21c365540f1d6 100644 (file)
@@ -812,6 +812,7 @@ static int irq_polarity(int idx)
                return IOAPIC_POL_HIGH;
        case MP_IRQPOL_RESERVED:
                pr_warn("IOAPIC: Invalid polarity: 2, defaulting to low\n");
+               /* fall through */
        case MP_IRQPOL_ACTIVE_LOW:
        default: /* Pointless default required due to gcc stupidity */
                return IOAPIC_POL_LOW;
@@ -859,6 +860,7 @@ static int irq_trigger(int idx)
                return IOAPIC_EDGE;
        case MP_IRQTRIG_RESERVED:
                pr_warn("IOAPIC: Invalid trigger mode 2 defaulting to level\n");
+               /* fall through */
        case MP_IRQTRIG_LEVEL:
        default: /* Pointless default required due to gcc stupidity */
                return IOAPIC_LEVEL;
index c4d1023fb0abc5fa570c2c3202d229599b9a0d64..395d46f78582bce47fc65a3629bcaee0d9ece52d 100644 (file)
@@ -248,6 +248,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
        switch (leaf) {
        case 1:
                l1 = &l1i;
+               /* fall through */
        case 0:
                if (!l1->val)
                        return;
index 3668c5df90c6997737ddaf5de028e51d943080aa..5bd011737272d48511c1f5218f19768661673dd1 100644 (file)
@@ -296,7 +296,7 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
                        unsigned long sizek)
 {
        unsigned long hole_basek, hole_sizek;
-       unsigned long second_basek, second_sizek;
+       unsigned long second_sizek;
        unsigned long range0_basek, range0_sizek;
        unsigned long range_basek, range_sizek;
        unsigned long chunk_sizek;
@@ -304,7 +304,6 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
 
        hole_basek = 0;
        hole_sizek = 0;
-       second_basek = 0;
        second_sizek = 0;
        chunk_sizek = state->chunk_sizek;
        gran_sizek = state->gran_sizek;
index 14bed6af837735111113511d359dac8cb847bed4..604c0e3bcc830612fb505efe1a677ee0df7aac4e 100644 (file)
 #define CREATE_TRACE_POINTS
 #include "pseudo_lock_event.h"
 
-/*
- * MSR_MISC_FEATURE_CONTROL register enables the modification of hardware
- * prefetcher state. Details about this register can be found in the MSR
- * tables for specific platforms found in Intel's SDM.
- */
-#define MSR_MISC_FEATURE_CONTROL       0x000001a4
-
 /*
  * The bits needed to disable hardware prefetching varies based on the
  * platform. During initialization we will discover which bits to use.
index 50895c2f937d144f9fd2b133c0522aadc7841c9e..a687d10da4178737afcb09b5dd24f6fea3430057 100644 (file)
@@ -671,21 +671,18 @@ __init void e820__reallocate_tables(void)
        int size;
 
        size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table->nr_entries;
-       n = kmalloc(size, GFP_KERNEL);
+       n = kmemdup(e820_table, size, GFP_KERNEL);
        BUG_ON(!n);
-       memcpy(n, e820_table, size);
        e820_table = n;
 
        size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table_kexec->nr_entries;
-       n = kmalloc(size, GFP_KERNEL);
+       n = kmemdup(e820_table_kexec, size, GFP_KERNEL);
        BUG_ON(!n);
-       memcpy(n, e820_table_kexec, size);
        e820_table_kexec = n;
 
        size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table_firmware->nr_entries;
-       n = kmalloc(size, GFP_KERNEL);
+       n = kmemdup(e820_table_firmware, size, GFP_KERNEL);
        BUG_ON(!n);
-       memcpy(n, e820_table_firmware, size);
        e820_table_firmware = n;
 }
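
The three hunks above replace an open-coded allocate-then-copy with kmemdup(). A userspace sketch of the equivalent operation, just to spell out the two steps the single call performs; the function name is illustrative and this is not kernel code.

        #include <stdlib.h>
        #include <string.h>

        /* Allocate a buffer of the requested size, then copy the source into
         * it -- the same two steps the removed kmalloc()+memcpy() performed. */
        static void *memdup_sketch(const void *src, size_t size)
        {
                void *n = malloc(size);

                if (n)
                        memcpy(n, src, size);
                return n;
        }
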
 
index 9cc108456d0be4b2dc52c4e2b4a98c66ddd101eb..d7432c2b105147d9f165e855a46199c5ce9751ae 100644 (file)
@@ -669,7 +669,7 @@ static bool is_supported_xstate_size(unsigned int test_xstate_size)
        return false;
 }
 
-static int init_xstate_size(void)
+static int __init init_xstate_size(void)
 {
        /* Recompute the context size for enabled features: */
        unsigned int possible_xstate_size;
index 34a5c171514870af79195679ad9bb90741a57992..ff9bfd40429efeb7b4868d370628356e28265ec1 100644 (file)
@@ -261,12 +261,8 @@ static int arch_build_bp_info(struct perf_event *bp,
                 * allow kernel breakpoints at all.
                 */
                if (attr->bp_addr >= TASK_SIZE_MAX) {
-#ifdef CONFIG_KPROBES
                        if (within_kprobe_blacklist(attr->bp_addr))
                                return -EINVAL;
-#else
-                       return -EINVAL;
-#endif
                }
 
                hw->type = X86_BREAKPOINT_EXECUTE;
@@ -279,6 +275,7 @@ static int arch_build_bp_info(struct perf_event *bp,
                        hw->len = X86_BREAKPOINT_LEN_X;
                        return 0;
                }
+               /* fall through */
        default:
                return -EINVAL;
        }
index 53917a3ebf949632b33617482549265cd08c6ceb..1f3b77367948d4abd67c7c0f4d4e6bfff85fb380 100644 (file)
@@ -218,6 +218,9 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params,
        params->screen_info.ext_mem_k = 0;
        params->alt_mem_k = 0;
 
+       /* Always fill in RSDP: it is either 0 or a valid value */
+       params->acpi_rsdp_addr = boot_params.acpi_rsdp_addr;
+
        /* Default APM info */
        memset(&params->apm_bios_info, 0, sizeof(params->apm_bios_info));
 
@@ -256,7 +259,6 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params,
        setup_efi_state(params, params_load_addr, efi_map_offset, efi_map_sz,
                        efi_setup_data_offset);
 #endif
-
        /* Setup EDD info */
        memcpy(params->eddbuf, boot_params.eddbuf,
                                EDDMAXNR * sizeof(struct edd_info));
index 5db08425063edef59a449760ac8c67c95409f742..4ff6b4cdb94190827847d5e7c4b51198c958b23d 100644 (file)
@@ -467,6 +467,7 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
                ptr = &remcomInBuffer[1];
                if (kgdb_hex2long(&ptr, &addr))
                        linux_regs->ip = addr;
+               /* fall through */
        case 'D':
        case 'k':
                /* clear the trace bit */
index 4c8acdfdc5a746a23bb012c7860ff0f9b3ed7d9f..ceba408ea9824ad1cd5952aa965058fd3318d69f 100644 (file)
@@ -352,6 +352,8 @@ void machine_kexec(struct kimage *image)
 
 void arch_crash_save_vmcoreinfo(void)
 {
+       u64 sme_mask = sme_me_mask;
+
        VMCOREINFO_NUMBER(phys_base);
        VMCOREINFO_SYMBOL(init_top_pgt);
        vmcoreinfo_append_str("NUMBER(pgtable_l5_enabled)=%d\n",
@@ -364,6 +366,7 @@ void arch_crash_save_vmcoreinfo(void)
        vmcoreinfo_append_str("KERNELOFFSET=%lx\n",
                              kaslr_offset());
        VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE);
+       VMCOREINFO_NUMBER(sme_mask);
 }
 
 /* arch-dependent functionality related to kexec file-based syscall */
index c91ff9f9fe8a7cde3f081698beda1ebeef9b9797..ce1a67b70168e6b354f1916a200dd031d0e488b9 100644 (file)
@@ -150,7 +150,7 @@ static inline void smpboot_restore_warm_reset_vector(void)
  */
 static void smp_callin(void)
 {
-       int cpuid, phys_id;
+       int cpuid;
 
        /*
         * If waken up by an INIT in an 82489DX configuration
@@ -160,11 +160,6 @@ static void smp_callin(void)
         */
        cpuid = smp_processor_id();
 
-       /*
-        * (This works even if the APIC is not enabled.)
-        */
-       phys_id = read_apic_id();
-
        /*
         * the boot CPU has finished the init stage and is spinning
         * on callin_map until we finish. We are free to set up this
index e289ce1332ab4c7459b84ad6ad50f8fef940f332..d26f9e9c3d8308fbb17a8d2c826f002367b74a85 100644 (file)
@@ -881,12 +881,12 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
 dotraplinkage void
 do_device_not_available(struct pt_regs *regs, long error_code)
 {
-       unsigned long cr0;
+       unsigned long cr0 = read_cr0();
 
        RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
 
 #ifdef CONFIG_MATH_EMULATION
-       if (!boot_cpu_has(X86_FEATURE_FPU) && (read_cr0() & X86_CR0_EM)) {
+       if (!boot_cpu_has(X86_FEATURE_FPU) && (cr0 & X86_CR0_EM)) {
                struct math_emu_info info = { };
 
                cond_local_irq_enable(regs);
@@ -898,7 +898,6 @@ do_device_not_available(struct pt_regs *regs, long error_code)
 #endif
 
        /* This should not happen. */
-       cr0 = read_cr0();
        if (WARN(cr0 & X86_CR0_TS, "CR0.TS was set")) {
                /* Try to fix it up and carry on. */
                write_cr0(cr0 & ~X86_CR0_TS);
index 843feb94a950168c60319e127f4df4568e1beb3c..ccf03416e434228a35ec9651b48a9524d6dab1c4 100644 (file)
@@ -745,6 +745,7 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
                 * OPCODE1() of the "short" jmp which checks the same condition.
                 */
                opc1 = OPCODE2(insn) - 0x10;
+               /* fall through */
        default:
                if (!is_cond_jmp_opcode(opc1))
                        return -ENOSYS;
index 0d618ee634ac40cbdd35957fcbe2f17f13446a6f..bad8c51fee6eea6be91d7a594e820470c121c2a9 100644 (file)
@@ -31,7 +31,7 @@
 
 #undef i386     /* in case the preprocessor is a 32bit one */
 
-OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT)
+OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT)
 
 #ifdef CONFIG_X86_32
 OUTPUT_ARCH(i386)
@@ -401,7 +401,7 @@ SECTIONS
  * Per-cpu symbols which need to be offset from __per_cpu_load
  * for the boot processor.
  */
-#define INIT_PER_CPU(x) init_per_cpu__##x = x + __per_cpu_load
+#define INIT_PER_CPU(x) init_per_cpu__##x = ABSOLUTE(x) + __per_cpu_load
 INIT_PER_CPU(gdt_page);
 INIT_PER_CPU(irq_stack_union);
 
index 9119d8e41f1ff59e2c8584a36f0f03d000bb1bbe..cf00ab6c662108280aadb7c8ffe719645b1699c5 100644 (file)
@@ -179,6 +179,8 @@ static int resolve_default_seg(struct insn *insn, struct pt_regs *regs, int off)
                if (insn->addr_bytes == 2)
                        return -EINVAL;
 
+               /* fall through */
+
        case -EDOM:
        case offsetof(struct pt_regs, bx):
        case offsetof(struct pt_regs, si):
index 12d7e7fb4efdf361278fad6a5fca8c8e3f2ee850..19c6abf9ea3170217621143068961e2593fb949b 100644 (file)
@@ -52,7 +52,7 @@ cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot)
                cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot);
 }
 
-static void percpu_setup_debug_store(int cpu)
+static void __init percpu_setup_debug_store(int cpu)
 {
 #ifdef CONFIG_CPU_SUP_INTEL
        int npages;
index e3cdc85ce5b6e06644c654c30fafebe3912e390d..ee8f8ab469417c6eb0f06aa6d4390a0eec8162b1 100644 (file)
@@ -444,7 +444,6 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr,
        int i;
        pud_t *start, *pud_start;
        pgprotval_t prot, eff;
-       pud_t *prev_pud = NULL;
 
        pud_start = start = (pud_t *)p4d_page_vaddr(addr);
 
@@ -462,7 +461,6 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr,
                } else
                        note_page(m, st, __pgprot(0), 0, 3);
 
-               prev_pud = start;
                start++;
        }
 }
index 999d6d8f0beff218a9e0a8dd720d0f5607ff9165..bc4bc7b2f075d3f302ba25dc261b759ab89dab97 100644 (file)
@@ -685,9 +685,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
                 * that UV should be updated so that smp_call_function_many(),
                 * etc, are optimal on UV.
                 */
-               unsigned int cpu;
-
-               cpu = smp_processor_id();
                cpumask = uv_flush_tlb_others(cpumask, info);
                if (cpumask)
                        smp_call_function_many(cpumask, flush_tlb_func_remote,
index eb33432f2f241db7475e32e651ae2bc9ca526361..ef60d789c76ed330eb64cfaaa9c11c7a7df54873 100644 (file)
@@ -45,7 +45,7 @@ static s64 __uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
         * If EFI_OLD_MEMMAP is set, we need to fall back to using our old EFI
         * callback method, which uses efi_call() directly, with the kernel page tables:
         */
-       if (unlikely(test_bit(EFI_OLD_MEMMAP, &efi.flags)))
+       if (unlikely(efi_enabled(EFI_OLD_MEMMAP)))
                ret = efi_call((void *)__va(tab->function), (u64)which, a1, a2, a3, a4, a5);
        else
                ret = efi_call_virt_pointer(tab, function, (u64)which, a1, a2, a3, a4, a5);
@@ -85,18 +85,6 @@ s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
        return ret;
 }
 
-s64 uv_bios_call_reentrant(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
-                                       u64 a4, u64 a5)
-{
-       s64 ret;
-
-       preempt_disable();
-       ret = uv_bios_call(which, a1, a2, a3, a4, a5);
-       preempt_enable();
-
-       return ret;
-}
-
 
 long sn_partition_id;
 EXPORT_SYMBOL_GPL(sn_partition_id);
@@ -207,7 +195,6 @@ int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus)
 }
 EXPORT_SYMBOL_GPL(uv_bios_set_legacy_vga_target);
 
-#ifdef CONFIG_EFI
 void uv_bios_init(void)
 {
        uv_systab = NULL;
@@ -237,4 +224,3 @@ void uv_bios_init(void)
        }
        pr_info("UV: UVsystab: Revision:%x\n", uv_systab->revision);
 }
-#endif
index a4130b84d1ff5650f020802f7ebe7e990db97e71..2c53b0f19329ad9ce57156891d2e578c7e991b60 100644 (file)
@@ -2010,8 +2010,7 @@ static void make_per_cpu_thp(struct bau_control *smaster)
        int cpu;
        size_t hpsz = sizeof(struct hub_and_pnode) * num_possible_cpus();
 
-       smaster->thp = kmalloc_node(hpsz, GFP_KERNEL, smaster->osnode);
-       memset(smaster->thp, 0, hpsz);
+       smaster->thp = kzalloc_node(hpsz, GFP_KERNEL, smaster->osnode);
        for_each_present_cpu(cpu) {
                smaster->thp[cpu].pnode = uv_cpu_hub_info(cpu)->pnode;
                smaster->thp[cpu].uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
@@ -2135,15 +2134,12 @@ static int __init summarize_uvhub_sockets(int nuvhubs,
 static int __init init_per_cpu(int nuvhubs, int base_part_pnode)
 {
        unsigned char *uvhub_mask;
-       void *vp;
        struct uvhub_desc *uvhub_descs;
 
        if (is_uv3_hub() || is_uv2_hub() || is_uv1_hub())
                timeout_us = calculate_destination_timeout();
 
-       vp = kmalloc_array(nuvhubs, sizeof(struct uvhub_desc), GFP_KERNEL);
-       uvhub_descs = (struct uvhub_desc *)vp;
-       memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc));
+       uvhub_descs = kcalloc(nuvhubs, sizeof(struct uvhub_desc), GFP_KERNEL);
        uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL);
 
        if (get_cpu_topology(base_part_pnode, uvhub_descs, uvhub_mask))
index 4463fa72db945b4c43396d347c475230fcb6431d..96cb20de08af8a61836af68bb933f517915f3d1d 100644 (file)
@@ -47,7 +47,7 @@ $(obj)/pasyms.h: $(REALMODE_OBJS) FORCE
 targets += realmode.lds
 $(obj)/realmode.lds: $(obj)/pasyms.h
 
-LDFLAGS_realmode.elf := --emit-relocs -T
+LDFLAGS_realmode.elf := -m elf_i386 --emit-relocs -T
 CPPFLAGS_realmode.lds += -P -C -I$(objtree)/$(obj)
 
 targets += realmode.elf
index df8e11e26bc39bddd3003b2855b3a9d0d19b7f75..3bb980800c5811e91c88c5ea3495d55371c4ffd6 100644 (file)
@@ -9,7 +9,7 @@
 
 #undef i386
 
-OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
+OUTPUT_FORMAT("elf32-i386")
 OUTPUT_ARCH(i386)
 
 SECTIONS
index 963986a48c621d5d25876603551c26daaaa4829b..bacf87ee797573d13e5c9635db8e90614d4b7847 100644 (file)
@@ -5,6 +5,8 @@ config XTENSA
        select ARCH_HAS_SYNC_DMA_FOR_CPU
        select ARCH_HAS_SYNC_DMA_FOR_DEVICE
        select ARCH_NO_COHERENT_DMA_MMAP if !MMU
+       select ARCH_USE_QUEUED_RWLOCKS
+       select ARCH_USE_QUEUED_SPINLOCKS
        select ARCH_WANT_FRAME_POINTERS
        select ARCH_WANT_IPC_PARSE_VERSION
        select BUILDTIME_EXTABLE_SORT
index 809f39ce08c0779c5cd2b2b9d13c3f62ebaf6817..d939e13e8d846fe5d1a24f4948caae1ff798a36a 100644 (file)
@@ -23,6 +23,8 @@ generic-y += mm-arch-hooks.h
 generic-y += param.h
 generic-y += percpu.h
 generic-y += preempt.h
+generic-y += qrwlock.h
+generic-y += qspinlock.h
 generic-y += rwsem.h
 generic-y += sections.h
 generic-y += socket.h
index 201e9009efd81c6cd0bd97c3490931d800a39c95..22a10c715c1fd1801721803300333f6ca1dfa2eb 100644 (file)
@@ -13,6 +13,7 @@
 
 #ifndef __ASSEMBLY__
 
+#include <linux/bits.h>
 #include <linux/stringify.h>
 
 /*
@@ -138,6 +139,28 @@ static inline unsigned long xchg_u32(volatile int * m, unsigned long val)
 #define xchg(ptr,x) \
        ((__typeof__(*(ptr)))__xchg((unsigned long)(x),(ptr),sizeof(*(ptr))))
 
+static inline u32 xchg_small(volatile void *ptr, u32 x, int size)
+{
+       int off = (unsigned long)ptr % sizeof(u32);
+       volatile u32 *p = ptr - off;
+#ifdef __BIG_ENDIAN
+       int bitoff = (sizeof(u32) - size - off) * BITS_PER_BYTE;
+#else
+       int bitoff = off * BITS_PER_BYTE;
+#endif
+       u32 bitmask = ((0x1 << size * BITS_PER_BYTE) - 1) << bitoff;
+       u32 oldv, newv;
+       u32 ret;
+
+       do {
+               oldv = READ_ONCE(*p);
+               ret = (oldv & bitmask) >> bitoff;
+               newv = (oldv & ~bitmask) | (x << bitoff);
+       } while (__cmpxchg_u32(p, oldv, newv) != oldv);
+
+       return ret;
+}
+
 /*
  * This only works if the compiler isn't horribly bad at optimizing.
  * gcc-2.5.8 reportedly can't handle this, but I define that one to
@@ -150,11 +173,16 @@ static __inline__ unsigned long
 __xchg(unsigned long x, volatile void * ptr, int size)
 {
        switch (size) {
-               case 4:
-                       return xchg_u32(ptr, x);
+       case 1:
+               return xchg_small(ptr, x, 1);
+       case 2:
+               return xchg_small(ptr, x, 2);
+       case 4:
+               return xchg_u32(ptr, x);
+       default:
+               __xchg_called_with_bad_pointer();
+               return x;
        }
-       __xchg_called_with_bad_pointer();
-       return x;
 }
 
 #endif /* __ASSEMBLY__ */
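
A standalone sketch of the masking trick xchg_small() uses above: a 1-byte exchange emulated with a word-wide compare-and-swap on the containing aligned 32-bit word. This version assumes little-endian layout and uses the GCC __atomic builtins purely for illustration; the kernel code uses __cmpxchg_u32() and handles both endiannesses.

        #include <stdint.h>

        /* Mask the target byte inside its aligned containing word, swap only
         * those bits, and retry on contention.  The old byte value is
         * extracted from the word that was successfully replaced. */
        static uint8_t xchg_u8_via_u32(volatile uint8_t *ptr, uint8_t x)
        {
                uintptr_t off = (uintptr_t)ptr % sizeof(uint32_t);
                volatile uint32_t *p = (volatile uint32_t *)((uintptr_t)ptr - off);
                unsigned int shift = off * 8;      /* little-endian layout assumed */
                uint32_t mask = 0xffu << shift;
                uint32_t oldv, newv;

                do {
                        oldv = *p;
                        newv = (oldv & ~mask) | ((uint32_t)x << shift);
                } while (!__atomic_compare_exchange_n(p, &oldv, newv, 0,
                                                      __ATOMIC_SEQ_CST,
                                                      __ATOMIC_SEQ_CST));

                return (oldv & mask) >> shift;
        }
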
index c6e1290dcbb7cbb8034437bc47e95d653b968bfb..584b0de6f2ca21fe1552eba57e0e73f226250208 100644 (file)
 #define _XTENSA_SPINLOCK_H
 
 #include <asm/barrier.h>
-#include <asm/processor.h>
+#include <asm/qrwlock.h>
+#include <asm/qspinlock.h>
 
-/*
- * spinlock
- *
- * There is at most one owner of a spinlock.  There are not different
- * types of spinlock owners like there are for rwlocks (see below).
- *
- * When trying to obtain a spinlock, the function "spins" forever, or busy-
- * waits, until the lock is obtained.  When spinning, presumably some other
- * owner will soon give up the spinlock making it available to others.  Use
- * the trylock functions to avoid spinning forever.
- *
- * possible values:
- *
- *    0         nobody owns the spinlock
- *    1         somebody owns the spinlock
- */
-
-#define arch_spin_is_locked(x) ((x)->slock != 0)
-
-static inline void arch_spin_lock(arch_spinlock_t *lock)
-{
-       unsigned long tmp;
-
-       __asm__ __volatile__(
-                       "       movi    %0, 0\n"
-                       "       wsr     %0, scompare1\n"
-                       "1:     movi    %0, 1\n"
-                       "       s32c1i  %0, %1, 0\n"
-                       "       bnez    %0, 1b\n"
-                       : "=&a" (tmp)
-                       : "a" (&lock->slock)
-                       : "memory");
-}
-
-/* Returns 1 if the lock is obtained, 0 otherwise. */
-
-static inline int arch_spin_trylock(arch_spinlock_t *lock)
-{
-       unsigned long tmp;
-
-       __asm__ __volatile__(
-                       "       movi    %0, 0\n"
-                       "       wsr     %0, scompare1\n"
-                       "       movi    %0, 1\n"
-                       "       s32c1i  %0, %1, 0\n"
-                       : "=&a" (tmp)
-                       : "a" (&lock->slock)
-                       : "memory");
-
-       return tmp == 0 ? 1 : 0;
-}
-
-static inline void arch_spin_unlock(arch_spinlock_t *lock)
-{
-       unsigned long tmp;
-
-       __asm__ __volatile__(
-                       "       movi    %0, 0\n"
-                       "       s32ri   %0, %1, 0\n"
-                       : "=&a" (tmp)
-                       : "a" (&lock->slock)
-                       : "memory");
-}
-
-/*
- * rwlock
- *
- * Read-write locks are really a more flexible spinlock.  They allow
- * multiple readers but only one writer.  Write ownership is exclusive
- * (i.e., all other readers and writers are blocked from ownership while
- * there is a write owner).  These rwlocks are unfair to writers.  Writers
- * can be starved for an indefinite time by readers.
- *
- * possible values:
- *
- *   0          nobody owns the rwlock
- *  >0          one or more readers own the rwlock
- *                (the positive value is the actual number of readers)
- *  0x80000000  one writer owns the rwlock, no other writers, no readers
- */
-
-static inline void arch_write_lock(arch_rwlock_t *rw)
-{
-       unsigned long tmp;
-
-       __asm__ __volatile__(
-                       "       movi    %0, 0\n"
-                       "       wsr     %0, scompare1\n"
-                       "1:     movi    %0, 1\n"
-                       "       slli    %0, %0, 31\n"
-                       "       s32c1i  %0, %1, 0\n"
-                       "       bnez    %0, 1b\n"
-                       : "=&a" (tmp)
-                       : "a" (&rw->lock)
-                       : "memory");
-}
-
-/* Returns 1 if the lock is obtained, 0 otherwise. */
-
-static inline int arch_write_trylock(arch_rwlock_t *rw)
-{
-       unsigned long tmp;
-
-       __asm__ __volatile__(
-                       "       movi    %0, 0\n"
-                       "       wsr     %0, scompare1\n"
-                       "       movi    %0, 1\n"
-                       "       slli    %0, %0, 31\n"
-                       "       s32c1i  %0, %1, 0\n"
-                       : "=&a" (tmp)
-                       : "a" (&rw->lock)
-                       : "memory");
-
-       return tmp == 0 ? 1 : 0;
-}
-
-static inline void arch_write_unlock(arch_rwlock_t *rw)
-{
-       unsigned long tmp;
-
-       __asm__ __volatile__(
-                       "       movi    %0, 0\n"
-                       "       s32ri   %0, %1, 0\n"
-                       : "=&a" (tmp)
-                       : "a" (&rw->lock)
-                       : "memory");
-}
-
-static inline void arch_read_lock(arch_rwlock_t *rw)
-{
-       unsigned long tmp;
-       unsigned long result;
-
-       __asm__ __volatile__(
-                       "1:     l32i    %1, %2, 0\n"
-                       "       bltz    %1, 1b\n"
-                       "       wsr     %1, scompare1\n"
-                       "       addi    %0, %1, 1\n"
-                       "       s32c1i  %0, %2, 0\n"
-                       "       bne     %0, %1, 1b\n"
-                       : "=&a" (result), "=&a" (tmp)
-                       : "a" (&rw->lock)
-                       : "memory");
-}
-
-/* Returns 1 if the lock is obtained, 0 otherwise. */
-
-static inline int arch_read_trylock(arch_rwlock_t *rw)
-{
-       unsigned long result;
-       unsigned long tmp;
-
-       __asm__ __volatile__(
-                       "       l32i    %1, %2, 0\n"
-                       "       addi    %0, %1, 1\n"
-                       "       bltz    %0, 1f\n"
-                       "       wsr     %1, scompare1\n"
-                       "       s32c1i  %0, %2, 0\n"
-                       "       sub     %0, %0, %1\n"
-                       "1:\n"
-                       : "=&a" (result), "=&a" (tmp)
-                       : "a" (&rw->lock)
-                       : "memory");
-
-       return result == 0;
-}
-
-static inline void arch_read_unlock(arch_rwlock_t *rw)
-{
-       unsigned long tmp1, tmp2;
-
-       __asm__ __volatile__(
-                       "1:     l32i    %1, %2, 0\n"
-                       "       addi    %0, %1, -1\n"
-                       "       wsr     %1, scompare1\n"
-                       "       s32c1i  %0, %2, 0\n"
-                       "       bne     %0, %1, 1b\n"
-                       : "=&a" (tmp1), "=&a" (tmp2)
-                       : "a" (&rw->lock)
-                       : "memory");
-}
+#define smp_mb__after_spinlock()       smp_mb()
 
 #endif /* _XTENSA_SPINLOCK_H */
index bb1fe6c1816eb1e04733f4e4086c5e2604c1256f..64c9389254f13deb18a950b08fef786fb323da70 100644 (file)
@@ -2,20 +2,11 @@
 #ifndef __ASM_SPINLOCK_TYPES_H
 #define __ASM_SPINLOCK_TYPES_H
 
-#ifndef __LINUX_SPINLOCK_TYPES_H
+#if !defined(__LINUX_SPINLOCK_TYPES_H) && !defined(__ASM_SPINLOCK_H)
 # error "please don't include this file directly"
 #endif
 
-typedef struct {
-       volatile unsigned int slock;
-} arch_spinlock_t;
-
-#define __ARCH_SPIN_LOCK_UNLOCKED      { 0 }
-
-typedef struct {
-       volatile unsigned int lock;
-} arch_rwlock_t;
-
-#define __ARCH_RW_LOCK_UNLOCKED                { 0 }
+#include <asm-generic/qspinlock_types.h>
+#include <asm-generic/qrwlock_types.h>
 
 #endif
index f333f10a7650dbfe9cbd1771d307bae577104f7d..f092cc3f4e66d34f938cfa13a7e497106d0f97bf 100644 (file)
@@ -121,15 +121,6 @@ static inline struct thread_info *current_thread_info(void)
 #define _TIF_WORK_MASK         (_TIF_SYSCALL_TRACE | _TIF_SINGLESTEP | \
                                 _TIF_SYSCALL_TRACEPOINT)
 
-/*
- * Thread-synchronous status.
- *
- * This is different from the flags in that nobody else
- * ever touches our thread-synchronous status, so we don't
- * have to worry about atomic accesses.
- */
-#define TS_USEDFPU             0x0001  /* FPU was used by this task this quantum (SMP) */
-
 #define THREAD_SIZE KERNEL_STACK_SIZE
 #define THREAD_SIZE_ORDER (KERNEL_STACK_SHIFT - PAGE_SHIFT)
 
index 74969a437a37cea70c273281d068b4633364f39f..db278a9e80c7e1d3a3f527e98938f58d30cc1da2 100644 (file)
@@ -52,8 +52,6 @@
 extern void ret_from_fork(void);
 extern void ret_from_kernel_thread(void);
 
-struct task_struct *current_set[NR_CPUS] = {&init_task, };
-
 void (*pm_power_off)(void) = NULL;
 EXPORT_SYMBOL(pm_power_off);
 
@@ -321,8 +319,8 @@ unsigned long get_wchan(struct task_struct *p)
 
                /* Stack layout: sp-4: ra, sp-3: sp' */
 
-               pc = MAKE_PC_FROM_RA(*(unsigned long*)sp - 4, sp);
-               sp = *(unsigned long *)sp - 3;
+               pc = MAKE_PC_FROM_RA(SPILL_SLOT(sp, 0), sp);
+               sp = SPILL_SLOT(sp, 1);
        } while (count++ < 16);
        return 0;
 }
index be1f280c322cd17307e0377736abdb3d955e3ebc..3699d6d3e47997d394752e1689a38d460a6eb914 100644 (file)
@@ -372,8 +372,7 @@ static void send_ipi_message(const struct cpumask *callmask,
        unsigned long mask = 0;
 
        for_each_cpu(index, callmask)
-               if (index != smp_processor_id())
-                       mask |= 1 << index;
+               mask |= 1 << index;
 
        set_er(mask, MIPISET(msg_id));
 }
@@ -412,22 +411,31 @@ irqreturn_t ipi_interrupt(int irq, void *dev_id)
 {
        unsigned int cpu = smp_processor_id();
        struct ipi_data *ipi = &per_cpu(ipi_data, cpu);
-       unsigned int msg;
-       unsigned i;
 
-       msg = get_er(MIPICAUSE(cpu));
-       for (i = 0; i < IPI_MAX; i++)
-               if (msg & (1 << i)) {
-                       set_er(1 << i, MIPICAUSE(cpu));
-                       ++ipi->ipi_count[i];
+       for (;;) {
+               unsigned int msg;
+
+               msg = get_er(MIPICAUSE(cpu));
+               set_er(msg, MIPICAUSE(cpu));
+
+               if (!msg)
+                       break;
+
+               if (msg & (1 << IPI_CALL_FUNC)) {
+                       ++ipi->ipi_count[IPI_CALL_FUNC];
+                       generic_smp_call_function_interrupt();
                }
 
-       if (msg & (1 << IPI_RESCHEDULE))
-               scheduler_ipi();
-       if (msg & (1 << IPI_CALL_FUNC))
-               generic_smp_call_function_interrupt();
-       if (msg & (1 << IPI_CPU_STOP))
-               ipi_cpu_stop(cpu);
+               if (msg & (1 << IPI_RESCHEDULE)) {
+                       ++ipi->ipi_count[IPI_RESCHEDULE];
+                       scheduler_ipi();
+               }
+
+               if (msg & (1 << IPI_CPU_STOP)) {
+                       ++ipi->ipi_count[IPI_CPU_STOP];
+                       ipi_cpu_stop(cpu);
+               }
+       }
 
        return IRQ_HANDLED;
 }
index 378186b5eb401aba9246756b5d235b60a4d697d5..69db8c93c1f992d221f5217dec8fd6606f188a1f 100644 (file)
@@ -52,14 +52,11 @@ static struct clocksource ccount_clocksource = {
        .flags = CLOCK_SOURCE_IS_CONTINUOUS,
 };
 
-static int ccount_timer_set_next_event(unsigned long delta,
-               struct clock_event_device *dev);
 struct ccount_timer {
        struct clock_event_device evt;
        int irq_enabled;
        char name[24];
 };
-static DEFINE_PER_CPU(struct ccount_timer, ccount_timer);
 
 static int ccount_timer_set_next_event(unsigned long delta,
                struct clock_event_device *dev)
@@ -107,7 +104,30 @@ static int ccount_timer_set_oneshot(struct clock_event_device *evt)
        return 0;
 }
 
-static irqreturn_t timer_interrupt(int irq, void *dev_id);
+static DEFINE_PER_CPU(struct ccount_timer, ccount_timer) = {
+       .evt = {
+               .features = CLOCK_EVT_FEAT_ONESHOT,
+               .rating = 300,
+               .set_next_event = ccount_timer_set_next_event,
+               .set_state_shutdown = ccount_timer_shutdown,
+               .set_state_oneshot = ccount_timer_set_oneshot,
+               .tick_resume = ccount_timer_set_oneshot,
+       },
+};
+
+static irqreturn_t timer_interrupt(int irq, void *dev_id)
+{
+       struct clock_event_device *evt = &this_cpu_ptr(&ccount_timer)->evt;
+
+       set_linux_timer(get_linux_timer());
+       evt->event_handler(evt);
+
+       /* Allow platform to do something useful (Wdog). */
+       platform_heartbeat();
+
+       return IRQ_HANDLED;
+}
+
 static struct irqaction timer_irqaction = {
        .handler =      timer_interrupt,
        .flags =        IRQF_TIMER,
@@ -120,14 +140,8 @@ void local_timer_setup(unsigned cpu)
        struct clock_event_device *clockevent = &timer->evt;
 
        timer->irq_enabled = 1;
-       clockevent->name = timer->name;
        snprintf(timer->name, sizeof(timer->name), "ccount_clockevent_%u", cpu);
-       clockevent->features = CLOCK_EVT_FEAT_ONESHOT;
-       clockevent->rating = 300;
-       clockevent->set_next_event = ccount_timer_set_next_event;
-       clockevent->set_state_shutdown = ccount_timer_shutdown;
-       clockevent->set_state_oneshot = ccount_timer_set_oneshot;
-       clockevent->tick_resume = ccount_timer_set_oneshot;
+       clockevent->name = timer->name;
        clockevent->cpumask = cpumask_of(cpu);
        clockevent->irq = irq_create_mapping(NULL, LINUX_TIMER_INT);
        if (WARN(!clockevent->irq, "error: can't map timer irq"))
@@ -190,23 +204,6 @@ void __init time_init(void)
        timer_probe();
 }
 
-/*
- * The timer interrupt is called HZ times per second.
- */
-
-irqreturn_t timer_interrupt(int irq, void *dev_id)
-{
-       struct clock_event_device *evt = &this_cpu_ptr(&ccount_timer)->evt;
-
-       set_linux_timer(get_linux_timer());
-       evt->event_handler(evt);
-
-       /* Allow platform to do something useful (Wdog). */
-       platform_heartbeat();
-
-       return IRQ_HANDLED;
-}
-
 #ifndef CONFIG_GENERIC_CALIBRATE_DELAY
 void calibrate_delay(void)
 {
index e6fa55aa1ccb2e298b9f2bd57754bfb22a07c6c4..454d53096bc996b174d322967fa78c3ec0cc9d7f 100644 (file)
@@ -420,16 +420,15 @@ void __init trap_init(void)
        /* Setup specific handlers. */
 
        for(i = 0; dispatch_init_table[i].cause >= 0; i++) {
-
                int fast = dispatch_init_table[i].fast;
                int cause = dispatch_init_table[i].cause;
                void *handler = dispatch_init_table[i].handler;
 
                if (fast == 0)
                        set_handler(default_handler, cause, handler);
-               if (fast && fast & USER)
+               if ((fast & USER) != 0)
                        set_handler(fast_user_handler, cause, handler);
-               if (fast && fast & KRNL)
+               if ((fast & KRNL) != 0)
                        set_handler(fast_kernel_handler, cause, handler);
        }
 
index 5d28d9e454f5299bd3113c30807679cfebe2c6b6..08f4a512afad204b4798b8b6b7ee8ccff5167db1 100644 (file)
@@ -267,6 +267,7 @@ static int guest_reset(struct cxl *adapter)
        int i, rc;
 
        pr_devel("Adapter reset request\n");
+       spin_lock(&adapter->afu_list_lock);
        for (i = 0; i < adapter->slices; i++) {
                if ((afu = adapter->afu[i])) {
                        pci_error_handlers(afu, CXL_ERROR_DETECTED_EVENT,
@@ -283,6 +284,7 @@ static int guest_reset(struct cxl *adapter)
                        pci_error_handlers(afu, CXL_RESUME_EVENT, 0);
                }
        }
+       spin_unlock(&adapter->afu_list_lock);
        return rc;
 }
 
index c79ba1c699ad1005973474b705065f4c5f8773c7..300531d6136f2efacb9964ebf2daea262a20289b 100644 (file)
@@ -1805,7 +1805,7 @@ static pci_ers_result_t cxl_vphb_error_detected(struct cxl_afu *afu,
        /* There should only be one entry, but go through the list
         * anyway
         */
-       if (afu->phb == NULL)
+       if (afu == NULL || afu->phb == NULL)
                return result;
 
        list_for_each_entry(afu_dev, &afu->phb->bus->devices, bus_list) {
@@ -1832,7 +1832,8 @@ static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
 {
        struct cxl *adapter = pci_get_drvdata(pdev);
        struct cxl_afu *afu;
-       pci_ers_result_t result = PCI_ERS_RESULT_NEED_RESET, afu_result;
+       pci_ers_result_t result = PCI_ERS_RESULT_NEED_RESET;
+       pci_ers_result_t afu_result = PCI_ERS_RESULT_NEED_RESET;
        int i;
 
        /* At this point, we could still have an interrupt pending.
@@ -1843,6 +1844,7 @@ static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
 
        /* If we're permanently dead, give up. */
        if (state == pci_channel_io_perm_failure) {
+               spin_lock(&adapter->afu_list_lock);
                for (i = 0; i < adapter->slices; i++) {
                        afu = adapter->afu[i];
                        /*
@@ -1851,6 +1853,7 @@ static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
                         */
                        cxl_vphb_error_detected(afu, state);
                }
+               spin_unlock(&adapter->afu_list_lock);
                return PCI_ERS_RESULT_DISCONNECT;
        }
 
@@ -1932,11 +1935,17 @@ static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
         *     * In slot_reset, free the old resources and allocate new ones.
         *     * In resume, clear the flag to allow things to start.
         */
+
+       /* Make sure no one else changes the afu list */
+       spin_lock(&adapter->afu_list_lock);
+
        for (i = 0; i < adapter->slices; i++) {
                afu = adapter->afu[i];
 
-               afu_result = cxl_vphb_error_detected(afu, state);
+               if (afu == NULL)
+                       continue;
 
+               afu_result = cxl_vphb_error_detected(afu, state);
                cxl_context_detach_all(afu);
                cxl_ops->afu_deactivate_mode(afu, afu->current_mode);
                pci_deconfigure_afu(afu);
@@ -1948,6 +1957,7 @@ static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
                         (result == PCI_ERS_RESULT_NEED_RESET))
                        result = PCI_ERS_RESULT_NONE;
        }
+       spin_unlock(&adapter->afu_list_lock);
 
        /* should take the context lock here */
        if (cxl_adapter_context_lock(adapter) != 0)
@@ -1980,14 +1990,18 @@ static pci_ers_result_t cxl_pci_slot_reset(struct pci_dev *pdev)
         */
        cxl_adapter_context_unlock(adapter);
 
+       spin_lock(&adapter->afu_list_lock);
        for (i = 0; i < adapter->slices; i++) {
                afu = adapter->afu[i];
 
+               if (afu == NULL)
+                       continue;
+
                if (pci_configure_afu(afu, adapter, pdev))
-                       goto err;
+                       goto err_unlock;
 
                if (cxl_afu_select_best_mode(afu))
-                       goto err;
+                       goto err_unlock;
 
                if (afu->phb == NULL)
                        continue;
@@ -1999,16 +2013,16 @@ static pci_ers_result_t cxl_pci_slot_reset(struct pci_dev *pdev)
                        ctx = cxl_get_context(afu_dev);
 
                        if (ctx && cxl_release_context(ctx))
-                               goto err;
+                               goto err_unlock;
 
                        ctx = cxl_dev_context_init(afu_dev);
                        if (IS_ERR(ctx))
-                               goto err;
+                               goto err_unlock;
 
                        afu_dev->dev.archdata.cxl_ctx = ctx;
 
                        if (cxl_ops->afu_check_and_enable(afu))
-                               goto err;
+                               goto err_unlock;
 
                        afu_dev->error_state = pci_channel_io_normal;
 
@@ -2029,8 +2043,13 @@ static pci_ers_result_t cxl_pci_slot_reset(struct pci_dev *pdev)
                                result = PCI_ERS_RESULT_DISCONNECT;
                }
        }
+
+       spin_unlock(&adapter->afu_list_lock);
        return result;
 
+err_unlock:
+       spin_unlock(&adapter->afu_list_lock);
+
 err:
        /* All the bits that happen in both error_detected and cxl_remove
         * should be idempotent, so we don't need to worry about leaving a mix
@@ -2051,10 +2070,11 @@ static void cxl_pci_resume(struct pci_dev *pdev)
         * This is not the place to be checking if everything came back up
         * properly, because there's no return value: do that in slot_reset.
         */
+       spin_lock(&adapter->afu_list_lock);
        for (i = 0; i < adapter->slices; i++) {
                afu = adapter->afu[i];
 
-               if (afu->phb == NULL)
+               if (afu == NULL || afu->phb == NULL)
                        continue;
 
                list_for_each_entry(afu_dev, &afu->phb->bus->devices, bus_list) {
@@ -2063,6 +2083,7 @@ static void cxl_pci_resume(struct pci_dev *pdev)
                                afu_dev->driver->err_handler->resume(afu_dev);
                }
        }
+       spin_unlock(&adapter->afu_list_lock);
 }
 
 static const struct pci_error_handlers cxl_err_handler = {
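
The cxl hunks above all converge on one pattern: every walk of adapter->afu[] in the error-handling paths now happens under afu_list_lock, and each slot is checked for NULL before it is used. A minimal kernel-style sketch of that pattern (handle_one_afu() is a hypothetical stand-in for the per-AFU work done in error_detected/slot_reset/resume, not a cxl function):

    static void walk_afus(struct cxl *adapter)
    {
            int i;

            spin_lock(&adapter->afu_list_lock);
            for (i = 0; i < adapter->slices; i++) {
                    struct cxl_afu *afu = adapter->afu[i];

                    /* A slot may be empty; skip it rather than dereference NULL. */
                    if (afu == NULL)
                            continue;
                    handle_one_afu(afu);    /* hypothetical per-AFU work */
            }
            spin_unlock(&adapter->afu_list_lock);
    }
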
index 49da2f744bbf1b8ee8a775d7192f7742188c9136..631c5df246d4384e3187a366b3c64a309ec58a5b 100644 (file)
@@ -43,8 +43,7 @@ static bool cxl_pci_enable_device_hook(struct pci_dev *dev)
                return false;
        }
 
-       set_dma_ops(&dev->dev, &dma_nommu_ops);
-       set_dma_offset(&dev->dev, PAGE_OFFSET);
+       dev->dev.archdata.dma_offset = PAGE_OFFSET;
 
        /*
         * Allocate a context to do cxl things too.  If we eventually do real
index d21041554507139415a23b61fb1d857bb1ca862f..a5bf46310f60dc314b5ac5684f7ac9ac23ad15f1 100644 (file)
@@ -1716,6 +1716,7 @@ pasemi_mac_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
                err = -ENODEV;
                goto out;
        }
+       dma_set_mask(&mac->dma_pdev->dev, DMA_BIT_MASK(64));
 
        mac->iob_pdev = pci_get_device(PCI_VENDOR_ID_PASEMI, 0xa001, NULL);
        if (!mac->iob_pdev) {
index 28f87fd6a28e0ba3aa315b7c0ab0265bcbae7064..9f906a5b8e81012b56e50adb06acd46bab5e23c1 100644 (file)
@@ -66,7 +66,7 @@ static void tty_audit_log(const char *description, dev_t dev,
        uid_t loginuid = from_kuid(&init_user_ns, audit_get_loginuid(current));
        unsigned int sessionid = audit_get_sessionid(current);
 
-       ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_TTY);
+       ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_TTY);
        if (ab) {
                char name[sizeof(current->comm)];
 
index 38edeb4729a9d475445bffab51c25ead9c6ae5bb..1a742fe8f6dbf4bdd4ad013b26df6f741279fc4b 100644 (file)
@@ -74,13 +74,13 @@ long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group,
                        ret = eeh_pe_get_state(pe);
                        break;
                case VFIO_EEH_PE_RESET_DEACTIVATE:
-                       ret = eeh_pe_reset(pe, EEH_RESET_DEACTIVATE);
+                       ret = eeh_pe_reset(pe, EEH_RESET_DEACTIVATE, true);
                        break;
                case VFIO_EEH_PE_RESET_HOT:
-                       ret = eeh_pe_reset(pe, EEH_RESET_HOT);
+                       ret = eeh_pe_reset(pe, EEH_RESET_HOT, true);
                        break;
                case VFIO_EEH_PE_RESET_FUNDAMENTAL:
-                       ret = eeh_pe_reset(pe, EEH_RESET_FUNDAMENTAL);
+                       ret = eeh_pe_reset(pe, EEH_RESET_FUNDAMENTAL, true);
                        break;
                case VFIO_EEH_PE_CONFIGURE:
                        ret = eeh_pe_configure(pe);
index 293733f61594bc073fcd98ed94bb19fd8a93da21..23fcd8c164a3579cef3e2f70a5aedf88ee404ba1 100644 (file)
@@ -12,7 +12,8 @@ obj-y :=      open.o read_write.o file_table.o super.o \
                attr.o bad_inode.o file.o filesystems.o namespace.o \
                seq_file.o xattr.o libfs.o fs-writeback.o \
                pnode.o splice.o sync.o utimes.o d_path.o \
-               stack.o fs_struct.o statfs.o fs_pin.o nsfs.o
+               stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
+               fs_types.o
 
 ifeq ($(CONFIG_BLOCK),y)
 obj-y +=       buffer.o block_dev.o direct-io.o mpage.o
index 3b66c957ea6f2a02f49e8581d032cf22745ea80c..5810463dc6d210b274d85e8a8819f7579fcf4dbc 100644 (file)
@@ -9,6 +9,7 @@
 #include <linux/posix_acl_xattr.h>
 #include <linux/posix_acl.h>
 #include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/slab.h>
 
 #include "ctree.h"
@@ -72,8 +73,16 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
        }
 
        if (acl) {
+               unsigned int nofs_flag;
+
                size = posix_acl_xattr_size(acl->a_count);
+               /*
+                * We're holding a transaction handle, so use a NOFS memory
+                * allocation context to avoid deadlock if reclaim happens.
+                */
+               nofs_flag = memalloc_nofs_save();
                value = kmalloc(size, GFP_KERNEL);
+               memalloc_nofs_restore(nofs_flag);
                if (!value) {
                        ret = -ENOMEM;
                        goto out;
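
Both this hunk and the later btrfs_create_tree() change use the scoped NOFS API rather than switching the allocation itself to GFP_NOFS. A minimal sketch of that pattern, assuming a hypothetical caller foo_alloc_under_transaction() (not a btrfs function):

    #include <linux/sched/mm.h>
    #include <linux/slab.h>

    static void *foo_alloc_under_transaction(size_t size)
    {
            unsigned int nofs_flag;
            void *p;

            /*
             * Inside the save/restore window, GFP_KERNEL allocations behave
             * as GFP_NOFS, so reclaim cannot recurse into the filesystem
             * while we hold the transaction handle.
             */
            nofs_flag = memalloc_nofs_save();
            p = kmalloc(size, GFP_KERNEL);
            memalloc_nofs_restore(nofs_flag);
            return p;
    }
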
index d522494698fa4ee50b91adfe94eec125506e62d4..122cb97c79098ec10623a59d1101a191c7038a6a 100644 (file)
@@ -139,13 +139,11 @@ __btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name,
        }
 
        if (flags & WQ_HIGHPRI)
-               ret->normal_wq = alloc_workqueue("%s-%s-high", flags,
-                                                ret->current_active, "btrfs",
-                                                name);
+               ret->normal_wq = alloc_workqueue("btrfs-%s-high", flags,
+                                                ret->current_active, name);
        else
-               ret->normal_wq = alloc_workqueue("%s-%s", flags,
-                                                ret->current_active, "btrfs",
-                                                name);
+               ret->normal_wq = alloc_workqueue("btrfs-%s", flags,
+                                                ret->current_active, name);
        if (!ret->normal_wq) {
                kfree(ret);
                return NULL;
index 78556447e1d5a203a2f087c3342d51cd61cc1134..11459fe84a2965f795cfe6be3f91b9805fb72354 100644 (file)
@@ -712,7 +712,7 @@ out:
  * read tree blocks and add keys where required.
  */
 static int add_missing_keys(struct btrfs_fs_info *fs_info,
-                           struct preftrees *preftrees)
+                           struct preftrees *preftrees, bool lock)
 {
        struct prelim_ref *ref;
        struct extent_buffer *eb;
@@ -737,12 +737,14 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
                        free_extent_buffer(eb);
                        return -EIO;
                }
-               btrfs_tree_read_lock(eb);
+               if (lock)
+                       btrfs_tree_read_lock(eb);
                if (btrfs_header_level(eb) == 0)
                        btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0);
                else
                        btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0);
-               btrfs_tree_read_unlock(eb);
+               if (lock)
+                       btrfs_tree_read_unlock(eb);
                free_extent_buffer(eb);
                prelim_ref_insert(fs_info, &preftrees->indirect, ref, NULL);
                cond_resched();
@@ -1227,7 +1229,7 @@ again:
 
        btrfs_release_path(path);
 
-       ret = add_missing_keys(fs_info, &preftrees);
+       ret = add_missing_keys(fs_info, &preftrees, path->skip_locking == 0);
        if (ret)
                goto out;
 
@@ -1288,11 +1290,15 @@ again:
                                        ret = -EIO;
                                        goto out;
                                }
-                               btrfs_tree_read_lock(eb);
-                               btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+
+                               if (!path->skip_locking) {
+                                       btrfs_tree_read_lock(eb);
+                                       btrfs_set_lock_blocking_read(eb);
+                               }
                                ret = find_extent_in_eb(eb, bytenr,
                                                        *extent_item_pos, &eie, ignore_offset);
-                               btrfs_tree_read_unlock_blocking(eb);
+                               if (!path->skip_locking)
+                                       btrfs_tree_read_unlock_blocking(eb);
                                free_extent_buffer(eb);
                                if (ret < 0)
                                        goto out;
@@ -1650,7 +1656,7 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
                /* make sure we can use eb after releasing the path */
                if (eb != eb_in) {
                        if (!path->skip_locking)
-                               btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+                               btrfs_set_lock_blocking_read(eb);
                        path->nodes[0] = NULL;
                        path->locks[0] = 0;
                }
index 548057630b69e9735d75eb1426869c85c0e521e9..eb8e20b740d6e245ec303da68ab335ce21fcd14a 100644 (file)
@@ -730,6 +730,28 @@ struct heuristic_ws {
        struct list_head list;
 };
 
+static struct workspace_manager heuristic_wsm;
+
+static void heuristic_init_workspace_manager(void)
+{
+       btrfs_init_workspace_manager(&heuristic_wsm, &btrfs_heuristic_compress);
+}
+
+static void heuristic_cleanup_workspace_manager(void)
+{
+       btrfs_cleanup_workspace_manager(&heuristic_wsm);
+}
+
+static struct list_head *heuristic_get_workspace(unsigned int level)
+{
+       return btrfs_get_workspace(&heuristic_wsm, level);
+}
+
+static void heuristic_put_workspace(struct list_head *ws)
+{
+       btrfs_put_workspace(&heuristic_wsm, ws);
+}
+
 static void free_heuristic_ws(struct list_head *ws)
 {
        struct heuristic_ws *workspace;
@@ -742,7 +764,7 @@ static void free_heuristic_ws(struct list_head *ws)
        kfree(workspace);
 }
 
-static struct list_head *alloc_heuristic_ws(void)
+static struct list_head *alloc_heuristic_ws(unsigned int level)
 {
        struct heuristic_ws *ws;
 
@@ -769,65 +791,59 @@ fail:
        return ERR_PTR(-ENOMEM);
 }
 
-struct workspaces_list {
-       struct list_head idle_ws;
-       spinlock_t ws_lock;
-       /* Number of free workspaces */
-       int free_ws;
-       /* Total number of allocated workspaces */
-       atomic_t total_ws;
-       /* Waiters for a free workspace */
-       wait_queue_head_t ws_wait;
+const struct btrfs_compress_op btrfs_heuristic_compress = {
+       .init_workspace_manager = heuristic_init_workspace_manager,
+       .cleanup_workspace_manager = heuristic_cleanup_workspace_manager,
+       .get_workspace = heuristic_get_workspace,
+       .put_workspace = heuristic_put_workspace,
+       .alloc_workspace = alloc_heuristic_ws,
+       .free_workspace = free_heuristic_ws,
 };
 
-static struct workspaces_list btrfs_comp_ws[BTRFS_COMPRESS_TYPES];
-
-static struct workspaces_list btrfs_heuristic_ws;
-
 static const struct btrfs_compress_op * const btrfs_compress_op[] = {
+       /* The heuristic is represented as compression type 0 */
+       &btrfs_heuristic_compress,
        &btrfs_zlib_compress,
        &btrfs_lzo_compress,
        &btrfs_zstd_compress,
 };
 
-void __init btrfs_init_compress(void)
+void btrfs_init_workspace_manager(struct workspace_manager *wsm,
+                                 const struct btrfs_compress_op *ops)
 {
        struct list_head *workspace;
-       int i;
 
-       INIT_LIST_HEAD(&btrfs_heuristic_ws.idle_ws);
-       spin_lock_init(&btrfs_heuristic_ws.ws_lock);
-       atomic_set(&btrfs_heuristic_ws.total_ws, 0);
-       init_waitqueue_head(&btrfs_heuristic_ws.ws_wait);
+       wsm->ops = ops;
 
-       workspace = alloc_heuristic_ws();
+       INIT_LIST_HEAD(&wsm->idle_ws);
+       spin_lock_init(&wsm->ws_lock);
+       atomic_set(&wsm->total_ws, 0);
+       init_waitqueue_head(&wsm->ws_wait);
+
+       /*
+        * Preallocate one workspace for each compression type so we can
+        * guarantee forward progress in the worst case
+        */
+       workspace = wsm->ops->alloc_workspace(0);
        if (IS_ERR(workspace)) {
                pr_warn(
-       "BTRFS: cannot preallocate heuristic workspace, will try later\n");
+       "BTRFS: cannot preallocate compression workspace, will try later\n");
        } else {
-               atomic_set(&btrfs_heuristic_ws.total_ws, 1);
-               btrfs_heuristic_ws.free_ws = 1;
-               list_add(workspace, &btrfs_heuristic_ws.idle_ws);
+               atomic_set(&wsm->total_ws, 1);
+               wsm->free_ws = 1;
+               list_add(workspace, &wsm->idle_ws);
        }
+}
 
-       for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
-               INIT_LIST_HEAD(&btrfs_comp_ws[i].idle_ws);
-               spin_lock_init(&btrfs_comp_ws[i].ws_lock);
-               atomic_set(&btrfs_comp_ws[i].total_ws, 0);
-               init_waitqueue_head(&btrfs_comp_ws[i].ws_wait);
+void btrfs_cleanup_workspace_manager(struct workspace_manager *wsman)
+{
+       struct list_head *ws;
 
-               /*
-                * Preallocate one workspace for each compression type so
-                * we can guarantee forward progress in the worst case
-                */
-               workspace = btrfs_compress_op[i]->alloc_workspace();
-               if (IS_ERR(workspace)) {
-                       pr_warn("BTRFS: cannot preallocate compression workspace, will try later\n");
-               } else {
-                       atomic_set(&btrfs_comp_ws[i].total_ws, 1);
-                       btrfs_comp_ws[i].free_ws = 1;
-                       list_add(workspace, &btrfs_comp_ws[i].idle_ws);
-               }
+       while (!list_empty(&wsman->idle_ws)) {
+               ws = wsman->idle_ws.next;
+               list_del(ws);
+               wsman->ops->free_workspace(ws);
+               atomic_dec(&wsman->total_ws);
        }
 }
 
@@ -837,11 +853,11 @@ void __init btrfs_init_compress(void)
  * Preallocation provides a forward progress guarantee and we do not return
  * errors.
  */
-static struct list_head *__find_workspace(int type, bool heuristic)
+struct list_head *btrfs_get_workspace(struct workspace_manager *wsm,
+                                     unsigned int level)
 {
        struct list_head *workspace;
        int cpus = num_online_cpus();
-       int idx = type - 1;
        unsigned nofs_flag;
        struct list_head *idle_ws;
        spinlock_t *ws_lock;
@@ -849,19 +865,11 @@ static struct list_head *__find_workspace(int type, bool heuristic)
        wait_queue_head_t *ws_wait;
        int *free_ws;
 
-       if (heuristic) {
-               idle_ws  = &btrfs_heuristic_ws.idle_ws;
-               ws_lock  = &btrfs_heuristic_ws.ws_lock;
-               total_ws = &btrfs_heuristic_ws.total_ws;
-               ws_wait  = &btrfs_heuristic_ws.ws_wait;
-               free_ws  = &btrfs_heuristic_ws.free_ws;
-       } else {
-               idle_ws  = &btrfs_comp_ws[idx].idle_ws;
-               ws_lock  = &btrfs_comp_ws[idx].ws_lock;
-               total_ws = &btrfs_comp_ws[idx].total_ws;
-               ws_wait  = &btrfs_comp_ws[idx].ws_wait;
-               free_ws  = &btrfs_comp_ws[idx].free_ws;
-       }
+       idle_ws  = &wsm->idle_ws;
+       ws_lock  = &wsm->ws_lock;
+       total_ws = &wsm->total_ws;
+       ws_wait  = &wsm->ws_wait;
+       free_ws  = &wsm->free_ws;
 
 again:
        spin_lock(ws_lock);
@@ -892,10 +900,7 @@ again:
         * context of btrfs_compress_bio/btrfs_compress_pages
         */
        nofs_flag = memalloc_nofs_save();
-       if (heuristic)
-               workspace = alloc_heuristic_ws();
-       else
-               workspace = btrfs_compress_op[idx]->alloc_workspace();
+       workspace = wsm->ops->alloc_workspace(level);
        memalloc_nofs_restore(nofs_flag);
 
        if (IS_ERR(workspace)) {
@@ -926,85 +931,47 @@ again:
        return workspace;
 }
 
-static struct list_head *find_workspace(int type)
+static struct list_head *get_workspace(int type, int level)
 {
-       return __find_workspace(type, false);
+       return btrfs_compress_op[type]->get_workspace(level);
 }
 
 /*
  * put a workspace struct back on the list or free it if we have enough
  * idle ones sitting around
  */
-static void __free_workspace(int type, struct list_head *workspace,
-                            bool heuristic)
+void btrfs_put_workspace(struct workspace_manager *wsm, struct list_head *ws)
 {
-       int idx = type - 1;
        struct list_head *idle_ws;
        spinlock_t *ws_lock;
        atomic_t *total_ws;
        wait_queue_head_t *ws_wait;
        int *free_ws;
 
-       if (heuristic) {
-               idle_ws  = &btrfs_heuristic_ws.idle_ws;
-               ws_lock  = &btrfs_heuristic_ws.ws_lock;
-               total_ws = &btrfs_heuristic_ws.total_ws;
-               ws_wait  = &btrfs_heuristic_ws.ws_wait;
-               free_ws  = &btrfs_heuristic_ws.free_ws;
-       } else {
-               idle_ws  = &btrfs_comp_ws[idx].idle_ws;
-               ws_lock  = &btrfs_comp_ws[idx].ws_lock;
-               total_ws = &btrfs_comp_ws[idx].total_ws;
-               ws_wait  = &btrfs_comp_ws[idx].ws_wait;
-               free_ws  = &btrfs_comp_ws[idx].free_ws;
-       }
+       idle_ws  = &wsm->idle_ws;
+       ws_lock  = &wsm->ws_lock;
+       total_ws = &wsm->total_ws;
+       ws_wait  = &wsm->ws_wait;
+       free_ws  = &wsm->free_ws;
 
        spin_lock(ws_lock);
        if (*free_ws <= num_online_cpus()) {
-               list_add(workspace, idle_ws);
+               list_add(ws, idle_ws);
                (*free_ws)++;
                spin_unlock(ws_lock);
                goto wake;
        }
        spin_unlock(ws_lock);
 
-       if (heuristic)
-               free_heuristic_ws(workspace);
-       else
-               btrfs_compress_op[idx]->free_workspace(workspace);
+       wsm->ops->free_workspace(ws);
        atomic_dec(total_ws);
 wake:
        cond_wake_up(ws_wait);
 }
 
-static void free_workspace(int type, struct list_head *ws)
+static void put_workspace(int type, struct list_head *ws)
 {
-       return __free_workspace(type, ws, false);
-}
-
-/*
- * cleanup function for module exit
- */
-static void free_workspaces(void)
-{
-       struct list_head *workspace;
-       int i;
-
-       while (!list_empty(&btrfs_heuristic_ws.idle_ws)) {
-               workspace = btrfs_heuristic_ws.idle_ws.next;
-               list_del(workspace);
-               free_heuristic_ws(workspace);
-               atomic_dec(&btrfs_heuristic_ws.total_ws);
-       }
-
-       for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
-               while (!list_empty(&btrfs_comp_ws[i].idle_ws)) {
-                       workspace = btrfs_comp_ws[i].idle_ws.next;
-                       list_del(workspace);
-                       btrfs_compress_op[i]->free_workspace(workspace);
-                       atomic_dec(&btrfs_comp_ws[i].total_ws);
-               }
-       }
+       return btrfs_compress_op[type]->put_workspace(ws);
 }
 
 /*
@@ -1036,18 +1003,17 @@ int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
                         unsigned long *total_in,
                         unsigned long *total_out)
 {
+       int type = btrfs_compress_type(type_level);
+       int level = btrfs_compress_level(type_level);
        struct list_head *workspace;
        int ret;
-       int type = type_level & 0xF;
-
-       workspace = find_workspace(type);
 
-       btrfs_compress_op[type - 1]->set_level(workspace, type_level);
-       ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping,
+       workspace = get_workspace(type, level);
+       ret = btrfs_compress_op[type]->compress_pages(workspace, mapping,
                                                      start, pages,
                                                      out_pages,
                                                      total_in, total_out);
-       free_workspace(type, workspace);
+       put_workspace(type, workspace);
        return ret;
 }
 
@@ -1071,9 +1037,9 @@ static int btrfs_decompress_bio(struct compressed_bio *cb)
        int ret;
        int type = cb->compress_type;
 
-       workspace = find_workspace(type);
-       ret = btrfs_compress_op[type - 1]->decompress_bio(workspace, cb);
-       free_workspace(type, workspace);
+       workspace = get_workspace(type, 0);
+       ret = btrfs_compress_op[type]->decompress_bio(workspace, cb);
+       put_workspace(type, workspace);
 
        return ret;
 }
@@ -1089,19 +1055,29 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
        struct list_head *workspace;
        int ret;
 
-       workspace = find_workspace(type);
-
-       ret = btrfs_compress_op[type-1]->decompress(workspace, data_in,
+       workspace = get_workspace(type, 0);
+       ret = btrfs_compress_op[type]->decompress(workspace, data_in,
                                                  dest_page, start_byte,
                                                  srclen, destlen);
+       put_workspace(type, workspace);
 
-       free_workspace(type, workspace);
        return ret;
 }
 
+void __init btrfs_init_compress(void)
+{
+       int i;
+
+       for (i = 0; i < BTRFS_NR_WORKSPACE_MANAGERS; i++)
+               btrfs_compress_op[i]->init_workspace_manager();
+}
+
 void __cold btrfs_exit_compress(void)
 {
-       free_workspaces();
+       int i;
+
+       for (i = 0; i < BTRFS_NR_WORKSPACE_MANAGERS; i++)
+               btrfs_compress_op[i]->cleanup_workspace_manager();
 }
 
 /*
@@ -1512,7 +1488,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end,
  */
 int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end)
 {
-       struct list_head *ws_list = __find_workspace(0, true);
+       struct list_head *ws_list = get_workspace(0, 0);
        struct heuristic_ws *ws;
        u32 i;
        u8 byte;
@@ -1581,18 +1557,29 @@ int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end)
        }
 
 out:
-       __free_workspace(0, ws_list, true);
+       put_workspace(0, ws_list);
        return ret;
 }
 
-unsigned int btrfs_compress_str2level(const char *str)
+/*
+ * Convert the compression suffix (e.g. the part after "zlib", starting with
+ * ":") to a level; an unrecognized string sets the default level.
+ */
+unsigned int btrfs_compress_str2level(unsigned int type, const char *str)
 {
-       if (strncmp(str, "zlib", 4) != 0)
+       unsigned int level = 0;
+       int ret;
+
+       if (!type)
                return 0;
 
-       /* Accepted form: zlib:1 up to zlib:9 and nothing left after the number */
-       if (str[4] == ':' && '1' <= str[5] && str[5] <= '9' && str[6] == 0)
-               return str[5] - '0';
+       if (str[0] == ':') {
+               ret = kstrtouint(str + 1, 10, &level);
+               if (ret)
+                       level = 0;
+       }
+
+       level = btrfs_compress_op[type]->set_level(level);
 
-       return BTRFS_ZLIB_DEFAULT_LEVEL;
+       return level;
 }
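
For illustration, a hedged user-space re-creation of the new suffix parsing: it approximates kstrtouint() with strtoul(), and the final clamping via ->set_level() is only noted in a comment, so it is not the kernel function itself.

    #include <stdio.h>
    #include <stdlib.h>

    /* Approximate re-creation of the ":N" suffix parsing, for illustration only. */
    static unsigned int parse_level(const char *str)
    {
            unsigned int level = 0;

            if (str[0] == ':') {
                    char *end;
                    unsigned long v = strtoul(str + 1, &end, 10);

                    /* Anything unparseable falls back to 0 (the default). */
                    if (end != str + 1 && *end == '\0')
                            level = (unsigned int)v;
            }
            /* The kernel then clamps this via btrfs_compress_op[type]->set_level(). */
            return level;
    }

    int main(void)
    {
            /* Prints "3 0 0". */
            printf("%u %u %u\n", parse_level(":3"), parse_level(""), parse_level(":x"));
            return 0;
    }
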
index ddda9b80bf2044edc3ae210bb401ce8e728fdbe2..9976fe0f752670923df0e4fd01fa8a2f96e6ad6d 100644 (file)
@@ -64,6 +64,16 @@ struct compressed_bio {
        u32 sums;
 };
 
+static inline unsigned int btrfs_compress_type(unsigned int type_level)
+{
+       return (type_level & 0xF);
+}
+
+static inline unsigned int btrfs_compress_level(unsigned int type_level)
+{
+       return ((type_level & 0xF0) >> 4);
+}
+
 void __init btrfs_init_compress(void);
 void __cold btrfs_exit_compress(void);
 
@@ -87,7 +97,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
 blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
                                 int mirror_num, unsigned long bio_flags);
 
-unsigned btrfs_compress_str2level(const char *str);
+unsigned int btrfs_compress_str2level(unsigned int type, const char *str);
 
 enum btrfs_compression_type {
        BTRFS_COMPRESS_NONE  = 0,
@@ -97,8 +107,35 @@ enum btrfs_compression_type {
        BTRFS_COMPRESS_TYPES = 3,
 };
 
+struct workspace_manager {
+       const struct btrfs_compress_op *ops;
+       struct list_head idle_ws;
+       spinlock_t ws_lock;
+       /* Number of free workspaces */
+       int free_ws;
+       /* Total number of allocated workspaces */
+       atomic_t total_ws;
+       /* Waiters for a free workspace */
+       wait_queue_head_t ws_wait;
+};
+
+void btrfs_init_workspace_manager(struct workspace_manager *wsm,
+                                 const struct btrfs_compress_op *ops);
+struct list_head *btrfs_get_workspace(struct workspace_manager *wsm,
+                                     unsigned int level);
+void btrfs_put_workspace(struct workspace_manager *wsm, struct list_head *ws);
+void btrfs_cleanup_workspace_manager(struct workspace_manager *wsm);
+
 struct btrfs_compress_op {
-       struct list_head *(*alloc_workspace)(void);
+       void (*init_workspace_manager)(void);
+
+       void (*cleanup_workspace_manager)(void);
+
+       struct list_head *(*get_workspace)(unsigned int level);
+
+       void (*put_workspace)(struct list_head *ws);
+
+       struct list_head *(*alloc_workspace)(unsigned int level);
 
        void (*free_workspace)(struct list_head *workspace);
 
@@ -119,9 +156,18 @@ struct btrfs_compress_op {
                          unsigned long start_byte,
                          size_t srclen, size_t destlen);
 
-       void (*set_level)(struct list_head *ws, unsigned int type);
+       /*
+        * This bounds the level set by the user to be within the range
+        * supported by a particular compression type.  It returns the level
+        * that will actually be used: the clamped value if the requested
+        * level is out of bounds, or the default if 0 is passed in.
+        */
+       unsigned int (*set_level)(unsigned int level);
 };
 
+/* The heuristic workspaces are managed via the 0th workspace manager */
+#define BTRFS_NR_WORKSPACE_MANAGERS    (BTRFS_COMPRESS_TYPES + 1)
+
+extern const struct btrfs_compress_op btrfs_heuristic_compress;
 extern const struct btrfs_compress_op btrfs_zlib_compress;
 extern const struct btrfs_compress_op btrfs_lzo_compress;
 extern const struct btrfs_compress_op btrfs_zstd_compress;
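
The type_level value decoded by the two inline helpers above packs the compression type into the low nibble and the level into bits 4-7. A small user-space illustration of the packing; the numeric value for zlib (1) is assumed from the enum order above, not spelled out in this hunk.

    #include <stdio.h>

    /* Mirrors btrfs_compress_type() / btrfs_compress_level() above. */
    static unsigned int compress_type(unsigned int type_level)
    {
            return type_level & 0xF;
    }

    static unsigned int compress_level(unsigned int type_level)
    {
            return (type_level & 0xF0) >> 4;
    }

    int main(void)
    {
            /* Assumed: zlib is type 1; requesting level 3 gives type_level 0x31. */
            unsigned int type_level = 1 | (3 << 4);

            printf("type=%u level=%u\n",
                   compress_type(type_level), compress_level(type_level));
            return 0;
    }
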
index 5a6c39b44c84f4c2f4e39e09797d213ea0f564e7..324df36d28bf7d8da2b7f66186819a6e0a92968f 100644 (file)
@@ -13,6 +13,7 @@
 #include "print-tree.h"
 #include "locking.h"
 #include "volumes.h"
+#include "qgroup.h"
 
 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
                      *root, struct btrfs_path *path, int level);
@@ -45,11 +46,18 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p)
        for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
                if (!p->nodes[i] || !p->locks[i])
                        continue;
-               btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
-               if (p->locks[i] == BTRFS_READ_LOCK)
+               /*
+                * If we currently have a spinning reader or writer lock, this
+                * will bump the count of blocking holders and drop the
+                * spinlock.
+                */
+               if (p->locks[i] == BTRFS_READ_LOCK) {
+                       btrfs_set_lock_blocking_read(p->nodes[i]);
                        p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
-               else if (p->locks[i] == BTRFS_WRITE_LOCK)
+               } else if (p->locks[i] == BTRFS_WRITE_LOCK) {
+                       btrfs_set_lock_blocking_write(p->nodes[i]);
                        p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
+               }
        }
 }
 
@@ -1288,7 +1296,7 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
                return eb;
 
        btrfs_set_path_blocking(path);
-       btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+       btrfs_set_lock_blocking_read(eb);
 
        if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
                BUG_ON(tm->slot != 0);
@@ -1378,7 +1386,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
                free_extent_buffer(eb_root);
                eb = alloc_dummy_extent_buffer(fs_info, logical);
        } else {
-               btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
+               btrfs_set_lock_blocking_read(eb_root);
                eb = btrfs_clone_extent_buffer(eb_root);
                btrfs_tree_read_unlock_blocking(eb_root);
                free_extent_buffer(eb_root);
@@ -1486,9 +1494,16 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
        search_start = buf->start & ~((u64)SZ_1G - 1);
 
        if (parent)
-               btrfs_set_lock_blocking(parent);
-       btrfs_set_lock_blocking(buf);
+               btrfs_set_lock_blocking_write(parent);
+       btrfs_set_lock_blocking_write(buf);
 
+       /*
+        * Before CoWing this block for later modification, check if it's
+        * the subtree root and do the delayed subtree trace if needed.
+        *
+        * Also, we don't care about the error, as it's handled internally.
+        */
+       btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
        ret = __btrfs_cow_block(trans, root, buf, parent,
                                 parent_slot, cow_ret, search_start, 0);
 
@@ -1582,7 +1597,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
        if (parent_nritems <= 1)
                return 0;
 
-       btrfs_set_lock_blocking(parent);
+       btrfs_set_lock_blocking_write(parent);
 
        for (i = start_slot; i <= end_slot; i++) {
                struct btrfs_key first_key;
@@ -1641,7 +1656,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
                        search_start = last_block;
 
                btrfs_tree_lock(cur);
-               btrfs_set_lock_blocking(cur);
+               btrfs_set_lock_blocking_write(cur);
                err = __btrfs_cow_block(trans, root, cur, parent, i,
                                        &cur, search_start,
                                        min(16 * blocksize,
@@ -1856,7 +1871,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                }
 
                btrfs_tree_lock(child);
-               btrfs_set_lock_blocking(child);
+               btrfs_set_lock_blocking_write(child);
                ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
                if (ret) {
                        btrfs_tree_unlock(child);
@@ -1894,7 +1909,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 
        if (left) {
                btrfs_tree_lock(left);
-               btrfs_set_lock_blocking(left);
+               btrfs_set_lock_blocking_write(left);
                wret = btrfs_cow_block(trans, root, left,
                                       parent, pslot - 1, &left);
                if (wret) {
@@ -1909,7 +1924,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 
        if (right) {
                btrfs_tree_lock(right);
-               btrfs_set_lock_blocking(right);
+               btrfs_set_lock_blocking_write(right);
                wret = btrfs_cow_block(trans, root, right,
                                       parent, pslot + 1, &right);
                if (wret) {
@@ -2072,7 +2087,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
                u32 left_nr;
 
                btrfs_tree_lock(left);
-               btrfs_set_lock_blocking(left);
+               btrfs_set_lock_blocking_write(left);
 
                left_nr = btrfs_header_nritems(left);
                if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
@@ -2127,7 +2142,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
                u32 right_nr;
 
                btrfs_tree_lock(right);
-               btrfs_set_lock_blocking(right);
+               btrfs_set_lock_blocking_write(right);
 
                right_nr = btrfs_header_nritems(right);
                if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
@@ -2529,26 +2544,6 @@ done:
        return ret;
 }
 
-static void key_search_validate(struct extent_buffer *b,
-                               const struct btrfs_key *key,
-                               int level)
-{
-#ifdef CONFIG_BTRFS_ASSERT
-       struct btrfs_disk_key disk_key;
-
-       btrfs_cpu_key_to_disk(&disk_key, key);
-
-       if (level == 0)
-               ASSERT(!memcmp_extent_buffer(b, &disk_key,
-                   offsetof(struct btrfs_leaf, items[0].key),
-                   sizeof(disk_key)));
-       else
-               ASSERT(!memcmp_extent_buffer(b, &disk_key,
-                   offsetof(struct btrfs_node, ptrs[0].key),
-                   sizeof(disk_key)));
-#endif
-}
-
 static int key_search(struct extent_buffer *b, const struct btrfs_key *key,
                      int level, int *prev_cmp, int *slot)
 {
@@ -2557,7 +2552,6 @@ static int key_search(struct extent_buffer *b, const struct btrfs_key *key,
                return *prev_cmp;
        }
 
-       key_search_validate(b, key, level);
        *slot = 0;
 
        return 0;
@@ -3005,6 +2999,8 @@ again:
                 */
                prev_cmp = -1;
                ret = key_search(b, key, level, &prev_cmp, &slot);
+               if (ret < 0)
+                       goto done;
 
                if (level != 0) {
                        int dec = 0;
@@ -3771,7 +3767,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
                return 1;
 
        btrfs_tree_lock(right);
-       btrfs_set_lock_blocking(right);
+       btrfs_set_lock_blocking_write(right);
 
        free_space = btrfs_leaf_free_space(fs_info, right);
        if (free_space < data_size)
@@ -4005,7 +4001,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
                return 1;
 
        btrfs_tree_lock(left);
-       btrfs_set_lock_blocking(left);
+       btrfs_set_lock_blocking_write(left);
 
        free_space = btrfs_leaf_free_space(fs_info, left);
        if (free_space < data_size) {
@@ -5156,6 +5152,10 @@ again:
                nritems = btrfs_header_nritems(cur);
                level = btrfs_header_level(cur);
                sret = btrfs_bin_search(cur, min_key, level, &slot);
+               if (sret < 0) {
+                       ret = sret;
+                       goto out;
+               }
 
                /* at the lowest level, we're done, setup the path and exit */
                if (level == path->lowest_level) {
index 94618a028730578112d169c4468be610ed7d55c3..129d26226e70fcab3788d7f0f4478a54a6408e76 100644 (file)
@@ -934,7 +934,8 @@ struct btrfs_fs_info {
 
        spinlock_t delayed_iput_lock;
        struct list_head delayed_iputs;
-       struct mutex cleaner_delayed_iput_mutex;
+       atomic_t nr_delayed_iputs;
+       wait_queue_head_t delayed_iputs_wait;
 
        /* this protects tree_mod_seq_list */
        spinlock_t tree_mod_seq_lock;
@@ -1074,10 +1075,13 @@ struct btrfs_fs_info {
        atomic_t scrubs_paused;
        atomic_t scrub_cancel_req;
        wait_queue_head_t scrub_pause_wait;
-       int scrub_workers_refcnt;
+       /*
+        * The worker pointers are NULL iff the refcount is 0, i.e. scrub is not
+        * running.
+        */
+       refcount_t scrub_workers_refcnt;
        struct btrfs_workqueue *scrub_workers;
        struct btrfs_workqueue *scrub_wr_completion_workers;
-       struct btrfs_workqueue *scrub_nocow_workers;
        struct btrfs_workqueue *scrub_parity_workers;
 
 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
@@ -1199,6 +1203,24 @@ enum {
        BTRFS_ROOT_MULTI_LOG_TASKS,
        BTRFS_ROOT_DIRTY,
        BTRFS_ROOT_DELETING,
+
+       /*
+        * The reloc tree is an orphan, kept here only for the qgroup delayed
+        * subtree scan.
+        *
+        * Set for the subvolume tree owning the reloc tree.
+        */
+       BTRFS_ROOT_DEAD_RELOC_TREE,
+};
+
+/*
+ * Record swapped tree blocks of a subvolume tree for the delayed subtree
+ * trace code. For details, see the comment in fs/btrfs/qgroup.c.
+ */
+struct btrfs_qgroup_swapped_blocks {
+       spinlock_t lock;
+       /* RB_EMPTY_ROOT() of above blocks[] */
+       bool swapped;
+       struct rb_root blocks[BTRFS_MAX_LEVEL];
 };
 
 /*
@@ -1311,6 +1333,14 @@ struct btrfs_root {
        struct list_head ordered_root;
        u64 nr_ordered_extents;
 
+       /*
+        * Not empty if this subvolume root has gone through tree block swap
+        * (relocation)
+        *
+        * Will be used by reloc_control::dirty_subvol_roots.
+        */
+       struct list_head reloc_dirty_list;
+
        /*
         * Number of currently running SEND ioctls to prevent
         * manipulation with the read-only status via SUBVOL_SETFLAGS
@@ -1328,6 +1358,9 @@ struct btrfs_root {
        /* Number of active swapfiles */
        atomic_t nr_swapfiles;
 
+       /* Record pairs of swapped blocks for qgroup */
+       struct btrfs_qgroup_swapped_blocks swapped_blocks;
+
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
        u64 alloc_bytenr;
 #endif
@@ -2775,7 +2808,8 @@ enum btrfs_flush_state {
        FLUSH_DELALLOC          =       5,
        FLUSH_DELALLOC_WAIT     =       6,
        ALLOC_CHUNK             =       7,
-       COMMIT_TRANS            =       8,
+       ALLOC_CHUNK_FORCE       =       8,
+       COMMIT_TRANS            =       9,
 };
 
 int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes);
@@ -3181,8 +3215,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
 
 /* inode.c */
 struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
-               struct page *page, size_t pg_offset, u64 start,
-               u64 len, int create);
+                                          u64 start, u64 len);
 noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
                              u64 *orig_start, u64 *orig_block_len,
                              u64 *ram_bytes);
@@ -3254,6 +3287,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root);
 int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size);
 void btrfs_add_delayed_iput(struct inode *inode);
 void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info);
+int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info);
 int btrfs_prealloc_file_range(struct inode *inode, int mode,
                              u64 start, u64 num_bytes, u64 min_size,
                              loff_t actual_len, u64 *alloc_hint);
@@ -3261,7 +3295,7 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
                                    struct btrfs_trans_handle *trans, int mode,
                                    u64 start, u64 num_bytes, u64 min_size,
                                    loff_t actual_len, u64 *alloc_hint);
-int btrfs_run_delalloc_range(void *private_data, struct page *locked_page,
+int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page,
                u64 start, u64 end, int *page_started, unsigned long *nr_written,
                struct writeback_control *wbc);
 int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end);
@@ -3476,21 +3510,18 @@ do {                                                            \
        rcu_read_unlock();                                      \
 } while (0)
 
-#ifdef CONFIG_BTRFS_ASSERT
-
 __cold
 static inline void assfail(const char *expr, const char *file, int line)
 {
-       pr_err("assertion failed: %s, file: %s, line: %d\n",
-              expr, file, line);
-       BUG();
+       if (IS_ENABLED(CONFIG_BTRFS_ASSERT)) {
+               pr_err("assertion failed: %s, file: %s, line: %d\n",
+                      expr, file, line);
+               BUG();
+       }
 }
 
 #define ASSERT(expr)   \
        (likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
-#else
-#define ASSERT(expr)   ((void)0)
-#endif
 
 /*
  * Use that for functions that are conditionally exported for sanity tests but
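
The ASSERT() rework above keeps the expression compiled and type-checked in every configuration, while the compiler discards the failure path when CONFIG_BTRFS_ASSERT is off. A hedged user-space analogue of that idea, with ASSERT_ENABLED standing in for IS_ENABLED(CONFIG_BTRFS_ASSERT):

    #include <stdio.h>
    #include <stdlib.h>

    #define ASSERT_ENABLED 0        /* flip to 1 to make assertion failures fatal */

    static inline void assfail(const char *expr, const char *file, int line)
    {
            /* Constant-false condition: the whole body folds away at compile time. */
            if (ASSERT_ENABLED) {
                    fprintf(stderr, "assertion failed: %s, file: %s, line: %d\n",
                            expr, file, line);
                    abort();
            }
    }

    #define ASSERT(expr) \
            ((expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))

    int main(void)
    {
            int x = 1;

            ASSERT(x == 1); /* always evaluated and type-checked, never fires here */
            return 0;
    }
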
index cad36c99a483ca8f6c508c32c3ad2ce94289f114..7d2a413df90d57f232c5742652a3d22b902291de 100644 (file)
@@ -602,17 +602,14 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
        RB_CLEAR_NODE(&head_ref->href_node);
        head_ref->processing = 0;
        head_ref->total_ref_mod = count_mod;
-       head_ref->qgroup_reserved = 0;
-       head_ref->qgroup_ref_root = 0;
        spin_lock_init(&head_ref->lock);
        mutex_init(&head_ref->mutex);
 
        if (qrecord) {
                if (ref_root && reserved) {
-                       head_ref->qgroup_ref_root = ref_root;
-                       head_ref->qgroup_reserved = reserved;
+                       qrecord->data_rsv = reserved;
+                       qrecord->data_rsv_refroot = ref_root;
                }
-
                qrecord->bytenr = bytenr;
                qrecord->num_bytes = num_bytes;
                qrecord->old_roots = NULL;
@@ -651,10 +648,6 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
        existing = htree_insert(&delayed_refs->href_root,
                                &head_ref->href_node);
        if (existing) {
-               WARN_ON(qrecord && head_ref->qgroup_ref_root
-                       && head_ref->qgroup_reserved
-                       && existing->qgroup_ref_root
-                       && existing->qgroup_reserved);
                update_existing_head_ref(trans, existing, head_ref,
                                         old_ref_mod);
                /*
@@ -770,7 +763,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
 
        if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
            is_fstree(ref_root)) {
-               record = kmalloc(sizeof(*record), GFP_NOFS);
+               record = kzalloc(sizeof(*record), GFP_NOFS);
                if (!record) {
                        kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
                        kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
@@ -867,7 +860,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
 
        if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
            is_fstree(ref_root)) {
-               record = kmalloc(sizeof(*record), GFP_NOFS);
+               record = kzalloc(sizeof(*record), GFP_NOFS);
                if (!record) {
                        kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
                        kmem_cache_free(btrfs_delayed_ref_head_cachep,
index d2af974f68a1ac2b00c157f8297c33e451fe8d14..70606da440aa7ff8d9d0065229843a98edc6453d 100644 (file)
@@ -102,17 +102,6 @@ struct btrfs_delayed_ref_head {
         */
        int ref_mod;
 
-       /*
-        * For qgroup reserved space freeing.
-        *
-        * ref_root and reserved will be recorded after
-        * BTRFS_ADD_DELAYED_EXTENT is called.
-        * And will be used to free reserved qgroup space at
-        * run_delayed_refs() time.
-        */
-       u64 qgroup_ref_root;
-       u64 qgroup_reserved;
-
        /*
         * when a new extent is allocated, it is just reserved in memory
         * The actual extent isn't inserted into the extent allocation tree
index 8750c835f53548963ad5cad5411cdbfbc3a6817b..ee193c5222b2cd9c86e379fe98c2bd1918dbf92b 100644 (file)
@@ -111,11 +111,11 @@ no_valid_dev_replace_entry_found:
                break;
        case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
        case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
-               dev_replace->srcdev = btrfs_find_device(fs_info, src_devid,
-                                                       NULL, NULL);
-               dev_replace->tgtdev = btrfs_find_device(fs_info,
+               dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices,
+                                               src_devid, NULL, NULL, true);
+               dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices,
                                                        BTRFS_DEV_REPLACE_DEVID,
-                                                       NULL, NULL);
+                                                       NULL, NULL, true);
                /*
                 * allow 'btrfs dev replace_cancel' if src/tgt device is
                 * missing
@@ -862,6 +862,7 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
                        btrfs_destroy_dev_replace_tgtdev(tgt_device);
                break;
        default:
+               up_write(&dev_replace->rwsem);
                result = -EINVAL;
        }
 
index 6a2a2a9517058b429b286557fd0a2b1a512c833d..5216e7b3f9ada29308a1c6fc5cf75b816032aded 100644 (file)
@@ -17,6 +17,7 @@
 #include <linux/semaphore.h>
 #include <linux/error-injection.h>
 #include <linux/crc32c.h>
+#include <linux/sched/mm.h>
 #include <asm/unaligned.h>
 #include "ctree.h"
 #include "disk-io.h"
@@ -341,7 +342,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
 
        if (need_lock) {
                btrfs_tree_read_lock(eb);
-               btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+               btrfs_set_lock_blocking_read(eb);
        }
 
        lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
@@ -1120,7 +1121,7 @@ void clean_tree_block(struct btrfs_fs_info *fs_info,
                                                 -buf->len,
                                                 fs_info->dirty_metadata_batch);
                        /* ugh, clear_extent_buffer_dirty needs to lock the page */
-                       btrfs_set_lock_blocking(buf);
+                       btrfs_set_lock_blocking_write(buf);
                        clear_extent_buffer_dirty(buf);
                }
        }
@@ -1175,6 +1176,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
        INIT_LIST_HEAD(&root->delalloc_root);
        INIT_LIST_HEAD(&root->ordered_extents);
        INIT_LIST_HEAD(&root->ordered_root);
+       INIT_LIST_HEAD(&root->reloc_dirty_list);
        INIT_LIST_HEAD(&root->logged_list[0]);
        INIT_LIST_HEAD(&root->logged_list[1]);
        spin_lock_init(&root->inode_lock);
@@ -1218,6 +1220,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
        root->anon_dev = 0;
 
        spin_lock_init(&root->root_item_lock);
+       btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks);
 }
 
 static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
@@ -1258,10 +1261,17 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
        struct btrfs_root *tree_root = fs_info->tree_root;
        struct btrfs_root *root;
        struct btrfs_key key;
+       unsigned int nofs_flag;
        int ret = 0;
        uuid_le uuid = NULL_UUID_LE;
 
+       /*
+        * We're holding a transaction handle, so use a NOFS memory allocation
+        * context to avoid deadlock if reclaim happens.
+        */
+       nofs_flag = memalloc_nofs_save();
        root = btrfs_alloc_root(fs_info, GFP_KERNEL);
+       memalloc_nofs_restore(nofs_flag);
        if (!root)
                return ERR_PTR(-ENOMEM);
 
@@ -1707,9 +1717,7 @@ static int cleaner_kthread(void *arg)
                        goto sleep;
                }
 
-               mutex_lock(&fs_info->cleaner_delayed_iput_mutex);
                btrfs_run_delayed_iputs(fs_info);
-               mutex_unlock(&fs_info->cleaner_delayed_iput_mutex);
 
                again = btrfs_clean_one_deleted_snapshot(root);
                mutex_unlock(&fs_info->cleaner_mutex);
@@ -2101,7 +2109,7 @@ static void btrfs_init_scrub(struct btrfs_fs_info *fs_info)
        atomic_set(&fs_info->scrubs_paused, 0);
        atomic_set(&fs_info->scrub_cancel_req, 0);
        init_waitqueue_head(&fs_info->scrub_pause_wait);
-       fs_info->scrub_workers_refcnt = 0;
+       refcount_set(&fs_info->scrub_workers_refcnt, 0);
 }
 
 static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
@@ -2666,7 +2674,6 @@ int open_ctree(struct super_block *sb,
        mutex_init(&fs_info->delete_unused_bgs_mutex);
        mutex_init(&fs_info->reloc_mutex);
        mutex_init(&fs_info->delalloc_root_mutex);
-       mutex_init(&fs_info->cleaner_delayed_iput_mutex);
        seqlock_init(&fs_info->profiles_lock);
 
        INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
@@ -2688,6 +2695,7 @@ int open_ctree(struct super_block *sb,
        atomic_set(&fs_info->defrag_running, 0);
        atomic_set(&fs_info->qgroup_op_seq, 0);
        atomic_set(&fs_info->reada_works_cnt, 0);
+       atomic_set(&fs_info->nr_delayed_iputs, 0);
        atomic64_set(&fs_info->tree_mod_seq, 0);
        fs_info->sb = sb;
        fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
@@ -2765,6 +2773,7 @@ int open_ctree(struct super_block *sb,
        init_waitqueue_head(&fs_info->transaction_wait);
        init_waitqueue_head(&fs_info->transaction_blocked_wait);
        init_waitqueue_head(&fs_info->async_submit_wait);
+       init_waitqueue_head(&fs_info->delayed_iputs_wait);
 
        INIT_LIST_HEAD(&fs_info->pinned_chunks);
 
@@ -4238,16 +4247,9 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
 
                head = rb_entry(node, struct btrfs_delayed_ref_head,
                                href_node);
-               if (!mutex_trylock(&head->mutex)) {
-                       refcount_inc(&head->refs);
-                       spin_unlock(&delayed_refs->lock);
-
-                       mutex_lock(&head->mutex);
-                       mutex_unlock(&head->mutex);
-                       btrfs_put_delayed_ref_head(head);
-                       spin_lock(&delayed_refs->lock);
+               if (btrfs_delayed_ref_lock(delayed_refs, head))
                        continue;
-               }
+
                spin_lock(&head->lock);
                while ((n = rb_first_cached(&head->ref_tree)) != NULL) {
                        ref = rb_entry(n, struct btrfs_delayed_ref_node,
@@ -4263,12 +4265,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
                if (head->must_insert_reserved)
                        pin_bytes = true;
                btrfs_free_delayed_extent_op(head->extent_op);
-               delayed_refs->num_heads--;
-               if (head->processing == 0)
-                       delayed_refs->num_heads_ready--;
-               atomic_dec(&delayed_refs->num_entries);
-               rb_erase_cached(&head->href_node, &delayed_refs->href_root);
-               RB_CLEAR_NODE(&head->href_node);
+               btrfs_delete_ref_head(delayed_refs, head);
                spin_unlock(&head->lock);
                spin_unlock(&delayed_refs->lock);
                mutex_unlock(&head->mutex);
index d81035b7ea7d597e229975691fef48f8813a908a..994f0cc41799304581207b23570d68ed44354f94 100644 (file)
@@ -2492,9 +2492,6 @@ void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
                }
        }
 
-       /* Also free its reserved qgroup space */
-       btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root,
-                                     head->qgroup_reserved);
        btrfs_delayed_refs_rsv_release(fs_info, nr_items);
 }
 
@@ -3013,8 +3010,7 @@ again:
        }
 
        if (run_all) {
-               if (!list_empty(&trans->new_bgs))
-                       btrfs_create_pending_block_groups(trans);
+               btrfs_create_pending_block_groups(trans);
 
                spin_lock(&delayed_refs->lock);
                node = rb_first_cached(&delayed_refs->href_root);
@@ -4280,10 +4276,14 @@ commit_trans:
                                /*
                                 * The cleaner kthread might still be doing iput
                                 * operations. Wait for it to finish so that
-                                * more space is released.
+                                * more space is released.  We don't need to
+                                * explicitly run the delayed iputs here because
+                                * the commit_transaction would have woken up
+                                * the cleaner.
                                 */
-                               mutex_lock(&fs_info->cleaner_delayed_iput_mutex);
-                               mutex_unlock(&fs_info->cleaner_delayed_iput_mutex);
+                               ret = btrfs_wait_on_delayed_iputs(fs_info);
+                               if (ret)
+                                       return ret;
                                goto again;
                        } else {
                                btrfs_end_transaction(trans);
@@ -4396,21 +4396,12 @@ static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
 static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
                              struct btrfs_space_info *sinfo, int force)
 {
-       struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
        u64 bytes_used = btrfs_space_info_used(sinfo, false);
        u64 thresh;
 
        if (force == CHUNK_ALLOC_FORCE)
                return 1;
 
-       /*
-        * We need to take into account the global rsv because for all intents
-        * and purposes it's used space.  Don't worry about locking the
-        * global_rsv, it doesn't change except when the transaction commits.
-        */
-       if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
-               bytes_used += calc_global_rsv_need_space(global_rsv);
-
        /*
         * in limited mode, we want to have some free space up to
         * about 1% of the FS size.
@@ -4741,7 +4732,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
        struct btrfs_space_info *space_info;
        struct btrfs_trans_handle *trans;
        u64 delalloc_bytes;
-       u64 max_reclaim;
+       u64 async_pages;
        u64 items;
        long time_left;
        unsigned long nr_pages;
@@ -4766,25 +4757,36 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
 
        loops = 0;
        while (delalloc_bytes && loops < 3) {
-               max_reclaim = min(delalloc_bytes, to_reclaim);
-               nr_pages = max_reclaim >> PAGE_SHIFT;
+               nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
+
+               /*
+                * Triggers inode writeback for up to nr_pages. This will invoke
+                * the ->writepages callback and trigger delalloc filling
+                * (btrfs_run_delalloc_range()).
+                */
                btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
+
                /*
-                * We need to wait for the async pages to actually start before
-                * we do anything.
+                * We need to wait for the compressed pages to start before
+                * we continue.
                 */
-               max_reclaim = atomic_read(&fs_info->async_delalloc_pages);
-               if (!max_reclaim)
+               async_pages = atomic_read(&fs_info->async_delalloc_pages);
+               if (!async_pages)
                        goto skip_async;
 
-               if (max_reclaim <= nr_pages)
-                       max_reclaim = 0;
+               /*
+                * Calculate how many compressed pages we want to be written
+                * before we continue, i.e. if there are more async pages than we
+                * require, wait_event will wait until nr_pages are written.
+                */
+               if (async_pages <= nr_pages)
+                       async_pages = 0;
                else
-                       max_reclaim -= nr_pages;
+                       async_pages -= nr_pages;
 
                wait_event(fs_info->async_submit_wait,
                           atomic_read(&fs_info->async_delalloc_pages) <=
-                          (int)max_reclaim);
+                          (int)async_pages);
 skip_async:
                spin_lock(&space_info->lock);
                if (list_empty(&space_info->tickets) &&
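
The wait target computed just above is easy to misread: we do not wait for the async page counter to reach nr_pages, we wait until nr_pages of the currently queued pages have completed, i.e. until the counter drops to async_pages - nr_pages (or to zero if fewer than nr_pages were queued). A standalone sketch of that calculation; the helper name and the sample numbers are made up for illustration:

#include <stdio.h>

/*
 * async_pages: compressed pages currently queued for writeback
 * nr_pages:    pages we just asked the writeback machinery to flush
 *
 * Return how many queued pages may still be outstanding before we are
 * allowed to continue.
 */
static unsigned long wait_target(unsigned long async_pages, unsigned long nr_pages)
{
        if (async_pages <= nr_pages)
                return 0;               /* fewer queued than requested: drain them all */
        return async_pages - nr_pages;  /* continue once nr_pages of them completed */
}

int main(void)
{
        printf("%lu\n", wait_target(1000, 256)); /* wait until <= 744 remain */
        printf("%lu\n", wait_target(100, 256));  /* wait until the queue is empty */
        return 0;
}
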
@@ -4808,6 +4810,7 @@ skip_async:
 }
 
 struct reserve_ticket {
+       u64 orig_bytes;
        u64 bytes;
        int error;
        struct list_head list;
@@ -4851,10 +4854,19 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
        if (!bytes_needed)
                return 0;
 
-       /* See if there is enough pinned space to make this reservation */
-       if (__percpu_counter_compare(&space_info->total_bytes_pinned,
-                                  bytes_needed,
-                                  BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
+       trans = btrfs_join_transaction(fs_info->extent_root);
+       if (IS_ERR(trans))
+               return PTR_ERR(trans);
+
+       /*
+        * See if there is enough pinned space to make this reservation, or if
+        * we have block groups that are going to be freed, allowing us to
+        * possibly do a chunk allocation the next loop through.
+        */
+       if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
+           __percpu_counter_compare(&space_info->total_bytes_pinned,
+                                    bytes_needed,
+                                    BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
                goto commit;
 
        /*
@@ -4862,7 +4874,7 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
         * this reservation.
         */
        if (space_info != delayed_rsv->space_info)
-               return -ENOSPC;
+               goto enospc;
 
        spin_lock(&delayed_rsv->lock);
        reclaim_bytes += delayed_rsv->reserved;
@@ -4877,16 +4889,14 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
 
        if (__percpu_counter_compare(&space_info->total_bytes_pinned,
                                   bytes_needed,
-                                  BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) {
-               return -ENOSPC;
-       }
+                                  BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
+               goto enospc;
 
 commit:
-       trans = btrfs_join_transaction(fs_info->extent_root);
-       if (IS_ERR(trans))
-               return -ENOSPC;
-
        return btrfs_commit_transaction(trans);
+enospc:
+       btrfs_end_transaction(trans);
+       return -ENOSPC;
 }
 
 /*
@@ -4939,6 +4949,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
                btrfs_end_transaction(trans);
                break;
        case ALLOC_CHUNK:
+       case ALLOC_CHUNK_FORCE:
                trans = btrfs_join_transaction(root);
                if (IS_ERR(trans)) {
                        ret = PTR_ERR(trans);
@@ -4946,7 +4957,8 @@ static void flush_space(struct btrfs_fs_info *fs_info,
                }
                ret = do_chunk_alloc(trans,
                                     btrfs_metadata_alloc_profile(fs_info),
-                                    CHUNK_ALLOC_NO_FORCE);
+                                    (state == ALLOC_CHUNK) ?
+                                     CHUNK_ALLOC_NO_FORCE : CHUNK_ALLOC_FORCE);
                btrfs_end_transaction(trans);
                if (ret > 0 || ret == -ENOSPC)
                        ret = 0;
@@ -4957,9 +4969,8 @@ static void flush_space(struct btrfs_fs_info *fs_info,
                 * bunch of pinned space, so make sure we run the iputs before
                 * we do our pinned bytes check below.
                 */
-               mutex_lock(&fs_info->cleaner_delayed_iput_mutex);
                btrfs_run_delayed_iputs(fs_info);
-               mutex_unlock(&fs_info->cleaner_delayed_iput_mutex);
+               btrfs_wait_on_delayed_iputs(fs_info);
 
                ret = may_commit_transaction(fs_info, space_info);
                break;
@@ -5030,7 +5041,7 @@ static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
                !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
 }
 
-static void wake_all_tickets(struct list_head *head)
+static bool wake_all_tickets(struct list_head *head)
 {
        struct reserve_ticket *ticket;
 
@@ -5039,7 +5050,10 @@ static void wake_all_tickets(struct list_head *head)
                list_del_init(&ticket->list);
                ticket->error = -ENOSPC;
                wake_up(&ticket->wait);
+               if (ticket->bytes != ticket->orig_bytes)
+                       return true;
        }
+       return false;
 }
 
 /*
@@ -5091,11 +5105,28 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
                                commit_cycles--;
                }
 
+               /*
+                * We don't want to force a chunk allocation until we've tried
+                * pretty hard to reclaim space.  Think of the case where we
+                * freed up a bunch of space and so have a lot of pinned space
+                * to reclaim.  We would rather use that than possibly create an
+                * underutilized metadata chunk.  So if this is our first run
+                * through the flushing state machine skip ALLOC_CHUNK_FORCE and
+                * commit the transaction.  If nothing has changed the next go
+                * around then we can force a chunk allocation.
+                */
+               if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
+                       flush_state++;
+
                if (flush_state > COMMIT_TRANS) {
                        commit_cycles++;
                        if (commit_cycles > 2) {
-                               wake_all_tickets(&space_info->tickets);
-                               space_info->flush = 0;
+                               if (wake_all_tickets(&space_info->tickets)) {
+                                       flush_state = FLUSH_DELAYED_ITEMS_NR;
+                                       commit_cycles--;
+                               } else {
+                                       space_info->flush = 0;
+                               }
                        } else {
                                flush_state = FLUSH_DELAYED_ITEMS_NR;
                        }
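
Taken together, the two additions above change the shape of the async reclaim loop: ALLOC_CHUNK_FORCE is skipped while commit_cycles is still zero (the first trip through the states), and once the loop would normally give up, wake_all_tickets() reporting that some ticket made partial progress sends it back to the first state instead of clearing space_info->flush. A compact sketch of that control flow; the stub tickets_made_progress() and the restart counting are invented purely so the example terminates:

#include <stdbool.h>
#include <stdio.h>

enum flush_state {
        FLUSH_DELAYED_ITEMS_NR, FLUSH_DELAYED_ITEMS, FLUSH_DELALLOC,
        FLUSH_DELALLOC_WAIT, ALLOC_CHUNK, ALLOC_CHUNK_FORCE, COMMIT_TRANS,
};

/* Stub: pretend tickets keep making partial progress for two restarts. */
static bool tickets_made_progress(int restarts)
{
        return restarts < 2;
}

static int reclaim_loop(void)
{
        int flush_state = FLUSH_DELAYED_ITEMS_NR;
        int commit_cycles = 0;
        int restarts = 0;

        for (;;) {
                /* flush_space(flush_state) would do the actual reclaim work here */
                flush_state++;

                /* don't force a chunk allocation on the very first cycle */
                if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
                        flush_state++;

                if (flush_state > COMMIT_TRANS) {
                        commit_cycles++;
                        if (commit_cycles > 2) {
                                if (tickets_made_progress(restarts)) {
                                        /* partial progress: run the ladder again */
                                        flush_state = FLUSH_DELAYED_ITEMS_NR;
                                        commit_cycles--;
                                        restarts++;
                                } else {
                                        return restarts; /* give up, fail the tickets */
                                }
                        } else {
                                flush_state = FLUSH_DELAYED_ITEMS_NR;
                        }
                }
        }
}

int main(void)
{
        printf("gave up after %d partial-progress restarts\n", reclaim_loop());
        return 0;
}
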
@@ -5109,12 +5140,18 @@ void btrfs_init_async_reclaim_work(struct work_struct *work)
        INIT_WORK(work, btrfs_async_reclaim_metadata_space);
 }
 
+static const enum btrfs_flush_state priority_flush_states[] = {
+       FLUSH_DELAYED_ITEMS_NR,
+       FLUSH_DELAYED_ITEMS,
+       ALLOC_CHUNK,
+};
+
 static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
                                            struct btrfs_space_info *space_info,
                                            struct reserve_ticket *ticket)
 {
        u64 to_reclaim;
-       int flush_state = FLUSH_DELAYED_ITEMS_NR;
+       int flush_state;
 
        spin_lock(&space_info->lock);
        to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
@@ -5125,8 +5162,10 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
        }
        spin_unlock(&space_info->lock);
 
+       flush_state = 0;
        do {
-               flush_space(fs_info, space_info, to_reclaim, flush_state);
+               flush_space(fs_info, space_info, to_reclaim,
+                           priority_flush_states[flush_state]);
                flush_state++;
                spin_lock(&space_info->lock);
                if (ticket->bytes == 0) {
@@ -5134,23 +5173,16 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
                        return;
                }
                spin_unlock(&space_info->lock);
-
-               /*
-                * Priority flushers can't wait on delalloc without
-                * deadlocking.
-                */
-               if (flush_state == FLUSH_DELALLOC ||
-                   flush_state == FLUSH_DELALLOC_WAIT)
-                       flush_state = ALLOC_CHUNK;
-       } while (flush_state < COMMIT_TRANS);
+       } while (flush_state < ARRAY_SIZE(priority_flush_states));
 }
 
 static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
                               struct btrfs_space_info *space_info,
-                              struct reserve_ticket *ticket, u64 orig_bytes)
+                              struct reserve_ticket *ticket)
 
 {
        DEFINE_WAIT(wait);
+       u64 reclaim_bytes = 0;
        int ret = 0;
 
        spin_lock(&space_info->lock);
@@ -5171,14 +5203,12 @@ static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
                ret = ticket->error;
        if (!list_empty(&ticket->list))
                list_del_init(&ticket->list);
-       if (ticket->bytes && ticket->bytes < orig_bytes) {
-               u64 num_bytes = orig_bytes - ticket->bytes;
-               update_bytes_may_use(space_info, -num_bytes);
-               trace_btrfs_space_reservation(fs_info, "space_info",
-                                             space_info->flags, num_bytes, 0);
-       }
+       if (ticket->bytes && ticket->bytes < ticket->orig_bytes)
+               reclaim_bytes = ticket->orig_bytes - ticket->bytes;
        spin_unlock(&space_info->lock);
 
+       if (reclaim_bytes)
+               space_info_add_old_bytes(fs_info, space_info, reclaim_bytes);
        return ret;
 }
 
@@ -5204,6 +5234,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
 {
        struct reserve_ticket ticket;
        u64 used;
+       u64 reclaim_bytes = 0;
        int ret = 0;
 
        ASSERT(orig_bytes);
@@ -5239,6 +5270,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
         * the list and we will do our own flushing further down.
         */
        if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
+               ticket.orig_bytes = orig_bytes;
                ticket.bytes = orig_bytes;
                ticket.error = 0;
                init_waitqueue_head(&ticket.wait);
@@ -5279,25 +5311,21 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
                return ret;
 
        if (flush == BTRFS_RESERVE_FLUSH_ALL)
-               return wait_reserve_ticket(fs_info, space_info, &ticket,
-                                          orig_bytes);
+               return wait_reserve_ticket(fs_info, space_info, &ticket);
 
        ret = 0;
        priority_reclaim_metadata_space(fs_info, space_info, &ticket);
        spin_lock(&space_info->lock);
        if (ticket.bytes) {
-               if (ticket.bytes < orig_bytes) {
-                       u64 num_bytes = orig_bytes - ticket.bytes;
-                       update_bytes_may_use(space_info, -num_bytes);
-                       trace_btrfs_space_reservation(fs_info, "space_info",
-                                                     space_info->flags,
-                                                     num_bytes, 0);
-
-               }
+               if (ticket.bytes < orig_bytes)
+                       reclaim_bytes = orig_bytes - ticket.bytes;
                list_del_init(&ticket.list);
                ret = -ENOSPC;
        }
        spin_unlock(&space_info->lock);
+
+       if (reclaim_bytes)
+               space_info_add_old_bytes(fs_info, space_info, reclaim_bytes);
        ASSERT(list_empty(&ticket.list));
        return ret;
 }
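
Both callers of the ticket path now follow the same shape: the amount of over-reserved space is computed while space_info->lock is held, but the actual give-back via space_info_add_old_bytes() only happens after the lock is dropped. A minimal sketch of that compute-under-lock, release-after-unlock pattern, with a pthread mutex and made-up helpers standing in for the kernel primitives:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

struct space_info {
        pthread_mutex_t lock;
        uint64_t bytes_may_use;
};

/* Stand-in for space_info_add_old_bytes(): must not run under ->lock. */
static void give_back(struct space_info *s, uint64_t bytes)
{
        pthread_mutex_lock(&s->lock);
        s->bytes_may_use -= bytes;
        pthread_mutex_unlock(&s->lock);
}

static void finish_ticket(struct space_info *s, uint64_t orig_bytes,
                          uint64_t still_needed)
{
        uint64_t reclaim_bytes = 0;

        pthread_mutex_lock(&s->lock);
        /* figure out how much of the original reservation went unused ... */
        if (still_needed && still_needed < orig_bytes)
                reclaim_bytes = orig_bytes - still_needed;
        pthread_mutex_unlock(&s->lock);

        /* ... and hand it back only after the lock has been dropped */
        if (reclaim_bytes)
                give_back(s, reclaim_bytes);
}

int main(void)
{
        struct space_info s = { PTHREAD_MUTEX_INITIALIZER, 1 << 20 };

        finish_ticket(&s, 65536, 4096);
        printf("bytes_may_use is now %llu\n", (unsigned long long)s.bytes_may_use);
        return 0;
}
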
@@ -5775,6 +5803,21 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
        return ret;
 }
 
+static void calc_refill_bytes(struct btrfs_block_rsv *block_rsv,
+                               u64 *metadata_bytes, u64 *qgroup_bytes)
+{
+       *metadata_bytes = 0;
+       *qgroup_bytes = 0;
+
+       spin_lock(&block_rsv->lock);
+       if (block_rsv->reserved < block_rsv->size)
+               *metadata_bytes = block_rsv->size - block_rsv->reserved;
+       if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size)
+               *qgroup_bytes = block_rsv->qgroup_rsv_size -
+                       block_rsv->qgroup_rsv_reserved;
+       spin_unlock(&block_rsv->lock);
+}
+
 /**
  * btrfs_inode_rsv_refill - refill the inode block rsv.
  * @inode - the inode we are refilling.
@@ -5790,25 +5833,42 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
 {
        struct btrfs_root *root = inode->root;
        struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
-       u64 num_bytes = 0;
-       u64 qgroup_num_bytes = 0;
+       u64 num_bytes, last = 0;
+       u64 qgroup_num_bytes;
        int ret = -ENOSPC;
 
-       spin_lock(&block_rsv->lock);
-       if (block_rsv->reserved < block_rsv->size)
-               num_bytes = block_rsv->size - block_rsv->reserved;
-       if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size)
-               qgroup_num_bytes = block_rsv->qgroup_rsv_size -
-                                  block_rsv->qgroup_rsv_reserved;
-       spin_unlock(&block_rsv->lock);
-
+       calc_refill_bytes(block_rsv, &num_bytes, &qgroup_num_bytes);
        if (num_bytes == 0)
                return 0;
 
-       ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes, true);
-       if (ret)
-               return ret;
-       ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
+       do {
+               ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes,
+                                                        true);
+               if (ret)
+                       return ret;
+               ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
+               if (ret) {
+                       btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
+                       last = num_bytes;
+                       /*
+                        * If we are fragmented we can end up with a lot of
+                        * outstanding extents which will make our size be much
+                        * larger than our reserved amount.
+                        *
+                        * If delalloc flushing happens, the reservation we
+                        * attempted here may turn out to be much bigger than
+                        * what is needed in the end.
+                        *
+                        * If that is the case, recalculate and try the reserve again.
+                        */
+                       if (flush == BTRFS_RESERVE_FLUSH_ALL)
+                               calc_refill_bytes(block_rsv, &num_bytes,
+                                                  &qgroup_num_bytes);
+                       if (num_bytes == 0)
+                               return 0;
+               }
+       } while (ret && last != num_bytes);
+
        if (!ret) {
                block_rsv_add_bytes(block_rsv, num_bytes, false);
                trace_btrfs_space_reservation(root->fs_info, "delalloc",
@@ -5818,8 +5878,7 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
                spin_lock(&block_rsv->lock);
                block_rsv->qgroup_rsv_reserved += qgroup_num_bytes;
                spin_unlock(&block_rsv->lock);
-       } else
-               btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
+       }
        return ret;
 }
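
The new loop in btrfs_inode_rsv_refill() retries the reservation rather than failing outright: on failure the qgroup prealloc is released, the needed amounts are recomputed (for BTRFS_RESERVE_FLUSH_ALL the flushing done by the failed attempt may have shrunk them), and the loop stops once the recomputed amount is no different from the one that just failed. A self-contained sketch of that termination logic, with the reservation call and the shrinking need faked for illustration:

#include <stdint.h>
#include <stdio.h>

/* Pretend the outstanding need shrinks as background flushing makes progress. */
static uint64_t compute_needed(int attempt)
{
        static const uint64_t need[] = { 1 << 20, 1 << 18, 1 << 16 };

        return attempt < 3 ? need[attempt] : need[2];
}

/* Pretend reservations only succeed once the need has shrunk enough. */
static int try_reserve(uint64_t bytes)
{
        return bytes <= (1 << 16) ? 0 : -28; /* -ENOSPC */
}

int main(void)
{
        int attempt = 0;
        uint64_t num_bytes = compute_needed(attempt);
        uint64_t last = 0;
        int ret;

        do {
                ret = try_reserve(num_bytes);
                if (ret) {
                        last = num_bytes;
                        /* recompute: flushing may have satisfied part of the need */
                        num_bytes = compute_needed(++attempt);
                        if (num_bytes == 0)
                                break; /* nothing left to reserve */
                }
        } while (ret && last != num_bytes); /* stop once no further progress is made */

        printf("finished with ret=%d after %d retries\n", ret, attempt);
        return 0;
}
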
 
@@ -8066,6 +8125,15 @@ loop:
        return ret;
 }
 
+#define DUMP_BLOCK_RSV(fs_info, rsv_name)                              \
+do {                                                                   \
+       struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name;           \
+       spin_lock(&__rsv->lock);                                        \
+       btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu",      \
+                  __rsv->size, __rsv->reserved);                       \
+       spin_unlock(&__rsv->lock);                                      \
+} while (0)
+
 static void dump_space_info(struct btrfs_fs_info *fs_info,
                            struct btrfs_space_info *info, u64 bytes,
                            int dump_block_groups)
@@ -8085,6 +8153,12 @@ static void dump_space_info(struct btrfs_fs_info *fs_info,
                info->bytes_readonly);
        spin_unlock(&info->lock);
 
+       DUMP_BLOCK_RSV(fs_info, global_block_rsv);
+       DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
+       DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
+       DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
+       DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
+
        if (!dump_block_groups)
                return;
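
DUMP_BLOCK_RSV takes the rsv's member name rather than a pointer so that a single macro can both resolve the field (&(fs_info)->rsv_name) and stringify it (#rsv_name) for the message, and the do { } while (0) wrapper keeps the multi-statement body usable as a single statement. A reduced sketch of the same pattern, with printf and a pthread mutex standing in for btrfs_info and the spinlock:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

struct block_rsv {
        pthread_mutex_t lock;
        uint64_t size;
        uint64_t reserved;
};

struct fs_info {
        struct block_rsv global_block_rsv;
        struct block_rsv trans_block_rsv;
};

/* #name stringifies the member while &(fs)->name resolves it: one macro per rsv. */
#define DUMP_BLOCK_RSV(fs, name)                                        \
do {                                                                    \
        struct block_rsv *__rsv = &(fs)->name;                          \
        pthread_mutex_lock(&__rsv->lock);                               \
        printf(#name ": size %llu reserved %llu\n",                     \
               (unsigned long long)__rsv->size,                         \
               (unsigned long long)__rsv->reserved);                    \
        pthread_mutex_unlock(&__rsv->lock);                             \
} while (0)

int main(void)
{
        struct fs_info fs = {
                .global_block_rsv = { PTHREAD_MUTEX_INITIALIZER, 1 << 20, 4096 },
                .trans_block_rsv  = { PTHREAD_MUTEX_INITIALIZER, 1 << 16, 0 },
        };

        if (1)                          /* do-while(0) keeps this safe without braces */
                DUMP_BLOCK_RSV(&fs, global_block_rsv);
        DUMP_BLOCK_RSV(&fs, trans_block_rsv);
        return 0;
}
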
 
@@ -8492,7 +8566,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
        clean_tree_block(fs_info, buf);
        clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
 
-       btrfs_set_lock_blocking(buf);
+       btrfs_set_lock_blocking_write(buf);
        set_extent_buffer_uptodate(buf);
 
        memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
@@ -8917,7 +8991,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
                reada = 1;
        }
        btrfs_tree_lock(next);
-       btrfs_set_lock_blocking(next);
+       btrfs_set_lock_blocking_write(next);
 
        ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
                                       &wc->refs[level - 1],
@@ -8977,7 +9051,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
                        return -EIO;
                }
                btrfs_tree_lock(next);
-               btrfs_set_lock_blocking(next);
+               btrfs_set_lock_blocking_write(next);
        }
 
        level--;
@@ -9089,7 +9163,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
                if (!path->locks[level]) {
                        BUG_ON(level == 0);
                        btrfs_tree_lock(eb);
-                       btrfs_set_lock_blocking(eb);
+                       btrfs_set_lock_blocking_write(eb);
                        path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
 
                        ret = btrfs_lookup_extent_info(trans, fs_info,
@@ -9131,7 +9205,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
                if (!path->locks[level] &&
                    btrfs_header_generation(eb) == trans->transid) {
                        btrfs_tree_lock(eb);
-                       btrfs_set_lock_blocking(eb);
+                       btrfs_set_lock_blocking_write(eb);
                        path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
                }
                clean_tree_block(fs_info, eb);
@@ -9298,7 +9372,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
        if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
                level = btrfs_header_level(root->node);
                path->nodes[level] = btrfs_lock_root_node(root);
-               btrfs_set_lock_blocking(path->nodes[level]);
+               btrfs_set_lock_blocking_write(path->nodes[level]);
                path->slots[level] = 0;
                path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
                memset(&wc->update_progress, 0,
@@ -9328,7 +9402,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
                level = btrfs_header_level(root->node);
                while (1) {
                        btrfs_tree_lock(path->nodes[level]);
-                       btrfs_set_lock_blocking(path->nodes[level]);
+                       btrfs_set_lock_blocking_write(path->nodes[level]);
                        path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
 
                        ret = btrfs_lookup_extent_info(trans, fs_info,
@@ -9595,6 +9669,7 @@ static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
 {
        struct btrfs_space_info *sinfo = cache->space_info;
        u64 num_bytes;
+       u64 sinfo_used;
        u64 min_allocable_bytes;
        int ret = -ENOSPC;
 
@@ -9621,9 +9696,10 @@ static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
 
        num_bytes = cache->key.offset - cache->reserved - cache->pinned -
                    cache->bytes_super - btrfs_block_group_used(&cache->item);
+       sinfo_used = btrfs_space_info_used(sinfo, true);
 
-       if (btrfs_space_info_used(sinfo, true) + num_bytes +
-           min_allocable_bytes <= sinfo->total_bytes) {
+       if (sinfo_used + num_bytes + min_allocable_bytes <=
+           sinfo->total_bytes) {
                sinfo->bytes_readonly += num_bytes;
                cache->ro++;
                list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
@@ -9632,6 +9708,15 @@ static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
 out:
        spin_unlock(&cache->lock);
        spin_unlock(&sinfo->lock);
+       if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
+               btrfs_info(cache->fs_info,
+                       "unable to make block group %llu ro",
+                       cache->key.objectid);
+               btrfs_info(cache->fs_info,
+                       "sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu",
+                       sinfo_used, num_bytes, min_allocable_bytes);
+               dump_space_info(cache->fs_info, cache->space_info, 0, 0);
+       }
        return ret;
 }
 
@@ -10781,13 +10866,10 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
        }
 
        spin_lock(&trans->transaction->dirty_bgs_lock);
-       if (!list_empty(&block_group->dirty_list)) {
-               WARN_ON(1);
-       }
-       if (!list_empty(&block_group->io_list)) {
-               WARN_ON(1);
-       }
+       WARN_ON(!list_empty(&block_group->dirty_list));
+       WARN_ON(!list_empty(&block_group->io_list));
        spin_unlock(&trans->transaction->dirty_bgs_lock);
+
        btrfs_remove_free_space_cache(block_group);
 
        spin_lock(&block_group->space_info->lock);
index 52abe408268088575db22ce79b851ee2f529a0c0..ca259c75bbcd1a32f462cfada377362ddfa2c8ca 100644 (file)
@@ -147,7 +147,38 @@ static int add_extent_changeset(struct extent_state *state, unsigned bits,
        return ret;
 }
 
-static void flush_write_bio(struct extent_page_data *epd);
+static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
+                                      unsigned long bio_flags)
+{
+       blk_status_t ret = 0;
+       struct bio_vec *bvec = bio_last_bvec_all(bio);
+       struct page *page = bvec->bv_page;
+       struct extent_io_tree *tree = bio->bi_private;
+       u64 start;
+
+       start = page_offset(page) + bvec->bv_offset;
+
+       bio->bi_private = NULL;
+
+       if (tree->ops)
+               ret = tree->ops->submit_bio_hook(tree->private_data, bio,
+                                          mirror_num, bio_flags, start);
+       else
+               btrfsic_submit_bio(bio);
+
+       return blk_status_to_errno(ret);
+}
+
+static void flush_write_bio(struct extent_page_data *epd)
+{
+       if (epd->bio) {
+               int ret;
+
+               ret = submit_one_bio(epd->bio, 0, 0);
+               BUG_ON(ret < 0); /* -ENOMEM */
+               epd->bio = NULL;
+       }
+}
 
 int __init extent_io_init(void)
 {
@@ -281,8 +312,8 @@ do_insert:
 }
 
 static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
-                                     struct rb_node **prev_ret,
                                      struct rb_node **next_ret,
+                                     struct rb_node **prev_ret,
                                      struct rb_node ***p_ret,
                                      struct rb_node **parent_ret)
 {
@@ -311,23 +342,23 @@ static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
        if (parent_ret)
                *parent_ret = prev;
 
-       if (prev_ret) {
+       if (next_ret) {
                orig_prev = prev;
                while (prev && offset > prev_entry->end) {
                        prev = rb_next(prev);
                        prev_entry = rb_entry(prev, struct tree_entry, rb_node);
                }
-               *prev_ret = prev;
+               *next_ret = prev;
                prev = orig_prev;
        }
 
-       if (next_ret) {
+       if (prev_ret) {
                prev_entry = rb_entry(prev, struct tree_entry, rb_node);
                while (prev && offset < prev_entry->start) {
                        prev = rb_prev(prev);
                        prev_entry = rb_entry(prev, struct tree_entry, rb_node);
                }
-               *next_ret = prev;
+               *prev_ret = prev;
        }
        return NULL;
 }
@@ -338,12 +369,12 @@ tree_search_for_insert(struct extent_io_tree *tree,
                       struct rb_node ***p_ret,
                       struct rb_node **parent_ret)
 {
-       struct rb_node *prev = NULL;
+       struct rb_node *next = NULL;
        struct rb_node *ret;
 
-       ret = __etree_search(tree, offset, &prev, NULL, p_ret, parent_ret);
+       ret = __etree_search(tree, offset, &next, NULL, p_ret, parent_ret);
        if (!ret)
-               return prev;
+               return next;
        return ret;
 }
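
After the rename the contract of the search reads more naturally: __etree_search() returns the entry that contains offset, and when there is none it reports, via next_ret, the first entry lying after offset, which is what tree_search_for_insert() falls back to. A toy sketch of that "containing entry or the next one" lookup over a sorted array (the rbtree plumbing is left out and all names are illustrative):

#include <stdio.h>
#include <stddef.h>

struct entry { unsigned long start, end; };   /* inclusive [start, end] range */

/*
 * Return the entry containing offset, or the first entry starting after it,
 * or NULL if offset is past the end of everything.
 */
static const struct entry *search_or_next(const struct entry *e, size_t n,
                                          unsigned long offset)
{
        for (size_t i = 0; i < n; i++) {
                if (offset <= e[i].end)     /* covers offset, or is the next one */
                        return &e[i];
        }
        return NULL;
}

int main(void)
{
        const struct entry tree[] = { { 0, 4095 }, { 8192, 12287 } };
        const struct entry *hit = search_or_next(tree, 2, 5000);

        if (hit)
                printf("offset 5000 -> next entry [%lu, %lu]\n", hit->start, hit->end);
        return 0;
}
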
 
@@ -585,7 +616,6 @@ int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 
        if (delete)
                bits |= ~EXTENT_CTLBITS;
-       bits |= EXTENT_FIRST_DELALLOC;
 
        if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
                clear = 1;
@@ -850,7 +880,6 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 
        btrfs_debug_check_extent_io_range(tree, start, end);
 
-       bits |= EXTENT_FIRST_DELALLOC;
 again:
        if (!prealloc && gfpflags_allow_blocking(mask)) {
                /*
@@ -2692,28 +2721,6 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
        return bio;
 }
 
-static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
-                                      unsigned long bio_flags)
-{
-       blk_status_t ret = 0;
-       struct bio_vec *bvec = bio_last_bvec_all(bio);
-       struct page *page = bvec->bv_page;
-       struct extent_io_tree *tree = bio->bi_private;
-       u64 start;
-
-       start = page_offset(page) + bvec->bv_offset;
-
-       bio->bi_private = NULL;
-
-       if (tree->ops)
-               ret = tree->ops->submit_bio_hook(tree->private_data, bio,
-                                          mirror_num, bio_flags, start);
-       else
-               btrfsic_submit_bio(bio);
-
-       return blk_status_to_errno(ret);
-}
-
 /*
  * @opf:       bio REQ_OP_* and REQ_* flags as one value
  * @tree:      tree so we can call our merge_bio hook
@@ -4007,17 +4014,6 @@ retry:
        return ret;
 }
 
-static void flush_write_bio(struct extent_page_data *epd)
-{
-       if (epd->bio) {
-               int ret;
-
-               ret = submit_one_bio(epd->bio, 0, 0);
-               BUG_ON(ret < 0); /* -ENOMEM */
-               epd->bio = NULL;
-       }
-}
-
 int extent_write_full_page(struct page *page, struct writeback_control *wbc)
 {
        int ret;
@@ -4259,8 +4255,7 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
                if (len == 0)
                        break;
                len = ALIGN(len, sectorsize);
-               em = btrfs_get_extent_fiemap(BTRFS_I(inode), NULL, 0, offset,
-                               len, 0);
+               em = btrfs_get_extent_fiemap(BTRFS_I(inode), offset, len);
                if (IS_ERR_OR_NULL(em))
                        return em;
 
index 9673be3f3d1f6a51f264cb8e774fc021281ede88..08749e0b9c32d52a8c70ce1c08a8f8f2cda2f614 100644 (file)
 #define EXTENT_BOUNDARY                (1U << 9)
 #define EXTENT_NODATASUM       (1U << 10)
 #define EXTENT_CLEAR_META_RESV (1U << 11)
-#define EXTENT_FIRST_DELALLOC  (1U << 12)
-#define EXTENT_NEED_WAIT       (1U << 13)
-#define EXTENT_DAMAGED         (1U << 14)
-#define EXTENT_NORESERVE       (1U << 15)
-#define EXTENT_QGROUP_RESERVED (1U << 16)
-#define EXTENT_CLEAR_DATA_RESV (1U << 17)
-#define EXTENT_DELALLOC_NEW    (1U << 18)
+#define EXTENT_NEED_WAIT       (1U << 12)
+#define EXTENT_DAMAGED         (1U << 13)
+#define EXTENT_NORESERVE       (1U << 14)
+#define EXTENT_QGROUP_RESERVED (1U << 15)
+#define EXTENT_CLEAR_DATA_RESV (1U << 16)
+#define EXTENT_DELALLOC_NEW    (1U << 17)
 #define EXTENT_IOBITS          (EXTENT_LOCKED | EXTENT_WRITEBACK)
 #define EXTENT_DO_ACCOUNTING    (EXTENT_CLEAR_META_RESV | \
                                 EXTENT_CLEAR_DATA_RESV)
-#define EXTENT_CTLBITS         (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
+#define EXTENT_CTLBITS         (EXTENT_DO_ACCOUNTING)
 
 /*
  * flags for bio submission. The high bits indicate the compression
index a042a193c12085b63e7b1437e97e4d388f14b36b..928f729c55baf61b02e7a4fb612eaebffbdc2d50 100644 (file)
@@ -210,6 +210,9 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
        if (!list_empty(&prev->list) || !list_empty(&next->list))
                return 0;
 
+       ASSERT(next->block_start != EXTENT_MAP_DELALLOC &&
+              prev->block_start != EXTENT_MAP_DELALLOC);
+
        if (extent_map_end(prev) == next->start &&
            prev->flags == next->flags &&
            prev->bdev == next->bdev &&
@@ -217,8 +220,6 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
              prev->block_start == EXTENT_MAP_HOLE) ||
             (next->block_start == EXTENT_MAP_INLINE &&
              prev->block_start == EXTENT_MAP_INLINE) ||
-            (next->block_start == EXTENT_MAP_DELALLOC &&
-             prev->block_start == EXTENT_MAP_DELALLOC) ||
             (next->block_start < EXTENT_MAP_LAST_BYTE - 1 &&
              next->block_start == extent_map_block_end(prev)))) {
                return 1;
index ef05a0121652e8f54d9e29d21445e2dd1cdb936e..473f039fcd7c783f222a1be18e19c5b62b0e9271 100644 (file)
@@ -9,6 +9,7 @@
 #define EXTENT_MAP_LAST_BYTE ((u64)-4)
 #define EXTENT_MAP_HOLE ((u64)-3)
 #define EXTENT_MAP_INLINE ((u64)-2)
+/* used only during fiemap calls */
 #define EXTENT_MAP_DELALLOC ((u64)-1)
 
 /* bits for the extent_map::flags field */
index d38dc8c31533773df9ac8cef15254319df824cba..34fe8a58b0e9cb65d5faddfcec1b655858ed8589 100644 (file)
@@ -3218,8 +3218,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
                         &cached_state);
 
        while (start < inode->i_size) {
-               em = btrfs_get_extent_fiemap(BTRFS_I(inode), NULL, 0,
-                               start, len, 0);
+               em = btrfs_get_extent_fiemap(BTRFS_I(inode), start, len);
                if (IS_ERR(em)) {
                        ret = PTR_ERR(em);
                        em = NULL;
index 5c349667c761b5fed05a15d565c69eff015709e8..3f180b857e202bc628a65ff0955606f880193de5 100644 (file)
@@ -453,7 +453,6 @@ static noinline void compress_file_range(struct inode *inode,
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        u64 blocksize = fs_info->sectorsize;
        u64 actual_end;
-       u64 isize = i_size_read(inode);
        int ret = 0;
        struct page **pages = NULL;
        unsigned long nr_pages;
@@ -467,7 +466,7 @@ static noinline void compress_file_range(struct inode *inode,
        inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
                        SZ_16K);
 
-       actual_end = min_t(u64, isize, end + 1);
+       actual_end = min_t(u64, i_size_read(inode), end + 1);
 again:
        will_compress = 0;
        nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
@@ -714,9 +713,9 @@ static void free_async_extent_pages(struct async_extent *async_extent)
  * queued.  We walk all the async extents created by compress_file_range
  * and send them down to the disk.
  */
-static noinline void submit_compressed_extents(struct inode *inode,
-                                             struct async_cow *async_cow)
+static noinline void submit_compressed_extents(struct async_cow *async_cow)
 {
+       struct inode *inode = async_cow->inode;
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct async_extent *async_extent;
        u64 alloc_hint = 0;
@@ -1166,8 +1165,14 @@ static noinline void async_cow_submit(struct btrfs_work *work)
            5 * SZ_1M)
                cond_wake_up_nomb(&fs_info->async_submit_wait);
 
+       /*
+        * ->inode could be NULL if async_cow_start has failed to compress,
+        * in which case we don't have anything to submit, yet we need to
+        * always adjust ->async_delalloc_pages as it's paired with the init
+        * happening in cow_file_range_async
+        */
        if (async_cow->inode)
-               submit_compressed_extents(async_cow->inode, async_cow);
+               submit_compressed_extents(async_cow);
 }
 
 static noinline void async_cow_free(struct btrfs_work *work)
@@ -1194,7 +1199,12 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
        while (start < end) {
                async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
                BUG_ON(!async_cow); /* -ENOMEM */
-               async_cow->inode = igrab(inode);
+               /*
+                * igrab is called higher up in the call chain, so take only the
+                * lightweight reference for the callback lifetime
+                */
+               ihold(inode);
+               async_cow->inode = inode;
                async_cow->fs_info = fs_info;
                async_cow->locked_page = locked_page;
                async_cow->start = start;
@@ -1586,11 +1596,10 @@ static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
  * Function to process delayed allocation (create CoW) for ranges which are
  * being touched for the first time.
  */
-int btrfs_run_delalloc_range(void *private_data, struct page *locked_page,
+int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page,
                u64 start, u64 end, int *page_started, unsigned long *nr_written,
                struct writeback_control *wbc)
 {
-       struct inode *inode = private_data;
        int ret;
        int force_cow = need_force_cow(inode, start, end);
        unsigned int write_flags = wbc_to_write_flags(wbc);
@@ -3247,6 +3256,7 @@ void btrfs_add_delayed_iput(struct inode *inode)
        if (atomic_add_unless(&inode->i_count, -1, 1))
                return;
 
+       atomic_inc(&fs_info->nr_delayed_iputs);
        spin_lock(&fs_info->delayed_iput_lock);
        ASSERT(list_empty(&binode->delayed_iput));
        list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
@@ -3267,11 +3277,32 @@ void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
                list_del_init(&inode->delayed_iput);
                spin_unlock(&fs_info->delayed_iput_lock);
                iput(&inode->vfs_inode);
+               if (atomic_dec_and_test(&fs_info->nr_delayed_iputs))
+                       wake_up(&fs_info->delayed_iputs_wait);
                spin_lock(&fs_info->delayed_iput_lock);
        }
        spin_unlock(&fs_info->delayed_iput_lock);
 }
 
+/**
+ * btrfs_wait_on_delayed_iputs - wait on the delayed iputs to be done running
+ * @fs_info - the fs_info for this fs
+ * @return - EINTR if we were killed, 0 if nothing's pending
+ *
+ * This will wait on any delayed iputs that are currently running with KILLABLE
+ * set.  Once they are all done running we will return, unless we are killed in
+ * which case we return EINTR. This helps user operations like fallocate, etc.,
+ * that might get blocked on the iputs.
+ */
+int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info)
+{
+       int ret = wait_event_killable(fs_info->delayed_iputs_wait,
+                       atomic_read(&fs_info->nr_delayed_iputs) == 0);
+       if (ret)
+               return -EINTR;
+       return 0;
+}
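
The three pieces above are a classic outstanding-work counter: btrfs_add_delayed_iput() bumps nr_delayed_iputs, btrfs_run_delayed_iputs() drops it per iput and wakes delayed_iputs_wait when it hits zero, and btrfs_wait_on_delayed_iputs() sleeps killably until the counter is zero. A portable sketch of the same pattern with a mutex and condition variable standing in for the atomic counter and waitqueue (all names here are illustrative):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  done = PTHREAD_COND_INITIALIZER;
static int nr_pending;                  /* plays the role of nr_delayed_iputs */

static void queue_work(void)            /* cf. queueing a delayed iput */
{
        pthread_mutex_lock(&lock);
        nr_pending++;
        pthread_mutex_unlock(&lock);
}

static void run_work(void)              /* cf. running one delayed iput */
{
        pthread_mutex_lock(&lock);
        if (--nr_pending == 0)          /* last one out wakes the waiters */
                pthread_cond_broadcast(&done);
        pthread_mutex_unlock(&lock);
}

static void wait_for_all(void)          /* cf. waiting for the iputs to drain */
{
        pthread_mutex_lock(&lock);
        while (nr_pending)
                pthread_cond_wait(&done, &lock);
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        queue_work();
        queue_work();
        run_work();
        run_work();
        wait_for_all();                 /* returns immediately: count is zero */
        printf("all pending work drained\n");
        return 0;
}
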
+
 /*
  * This creates an orphan entry for the given inode in case something goes wrong
  * in the middle of an unlink.
@@ -5262,13 +5293,15 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
 {
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+       u64 delayed_refs_extra = btrfs_calc_trans_metadata_size(fs_info, 1);
        int failures = 0;
 
        for (;;) {
                struct btrfs_trans_handle *trans;
                int ret;
 
-               ret = btrfs_block_rsv_refill(root, rsv, rsv->size,
+               ret = btrfs_block_rsv_refill(root, rsv,
+                                            rsv->size + delayed_refs_extra,
                                             BTRFS_RESERVE_FLUSH_LIMIT);
 
                if (ret && ++failures > 2) {
@@ -5277,9 +5310,28 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
                        return ERR_PTR(-ENOSPC);
                }
 
+               /*
+                * Evict can generate a large amount of delayed refs without
+                * having a way to add space back since we exhaust our temporary
+                * block rsv.  We aren't allowed to do FLUSH_ALL in this case
+                * because we could deadlock with so many things in the flushing
+                * code, so we have to try and hold some extra space to
+                * compensate for our delayed ref generation.  If we can't get
+                * that space then we need to see if we can steal our minimum from
+                * the global reserve.  We will be ratelimited by the amount of
+                * space we have for the delayed refs rsv, so we'll end up
+                * committing and trying again.
+                */
                trans = btrfs_join_transaction(root);
-               if (IS_ERR(trans) || !ret)
+               if (IS_ERR(trans) || !ret) {
+                       if (!IS_ERR(trans)) {
+                               trans->block_rsv = &fs_info->trans_block_rsv;
+                               trans->bytes_reserved = delayed_refs_extra;
+                               btrfs_block_rsv_migrate(rsv, trans->block_rsv,
+                                                       delayed_refs_extra, 1);
+                       }
                        return trans;
+               }
 
                /*
                 * Try to steal from the global reserve if there is space for
@@ -6731,7 +6783,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
        u64 extent_start = 0;
        u64 extent_end = 0;
        u64 objectid = btrfs_ino(inode);
-       u32 found_type;
+       u8 extent_type;
        struct btrfs_path *path = NULL;
        struct btrfs_root *root = inode->root;
        struct btrfs_file_extent_item *item;
@@ -6786,9 +6838,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
        if (ret < 0) {
                err = ret;
                goto out;
-       }
-
-       if (ret != 0) {
+       } else if (ret > 0) {
                if (path->slots[0] == 0)
                        goto not_found;
                path->slots[0]--;
@@ -6797,11 +6847,9 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
        leaf = path->nodes[0];
        item = btrfs_item_ptr(leaf, path->slots[0],
                              struct btrfs_file_extent_item);
-       /* are we inside the extent that was found? */
        btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-       found_type = found_key.type;
        if (found_key.objectid != objectid ||
-           found_type != BTRFS_EXTENT_DATA_KEY) {
+           found_key.type != BTRFS_EXTENT_DATA_KEY) {
                /*
                 * If we backup past the first extent we want to move forward
                 * and see if there is an extent in front of us, otherwise we'll
@@ -6812,16 +6860,16 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
                goto next;
        }
 
-       found_type = btrfs_file_extent_type(leaf, item);
+       extent_type = btrfs_file_extent_type(leaf, item);
        extent_start = found_key.offset;
-       if (found_type == BTRFS_FILE_EXTENT_REG ||
-           found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+       if (extent_type == BTRFS_FILE_EXTENT_REG ||
+           extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
                extent_end = extent_start +
                       btrfs_file_extent_num_bytes(leaf, item);
 
                trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,
                                                       extent_start);
-       } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+       } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
                size_t size;
 
                size = btrfs_file_extent_ram_bytes(leaf, item);
@@ -6840,9 +6888,9 @@ next:
                        if (ret < 0) {
                                err = ret;
                                goto out;
-                       }
-                       if (ret > 0)
+                       } else if (ret > 0) {
                                goto not_found;
+                       }
                        leaf = path->nodes[0];
                }
                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
@@ -6853,19 +6901,22 @@ next:
                        goto not_found;
                if (start > found_key.offset)
                        goto next;
+
+               /* New extent overlaps with existing one */
                em->start = start;
                em->orig_start = start;
                em->len = found_key.offset - start;
-               goto not_found_em;
+               em->block_start = EXTENT_MAP_HOLE;
+               goto insert;
        }
 
        btrfs_extent_item_to_extent_map(inode, path, item,
                        new_inline, em);
 
-       if (found_type == BTRFS_FILE_EXTENT_REG ||
-           found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+       if (extent_type == BTRFS_FILE_EXTENT_REG ||
+           extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
                goto insert;
-       } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+       } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
                unsigned long ptr;
                char *map;
                size_t size;
@@ -6916,7 +6967,6 @@ not_found:
        em->start = start;
        em->orig_start = start;
        em->len = len;
-not_found_em:
        em->block_start = EXTENT_MAP_HOLE;
 insert:
        btrfs_release_path(path);
@@ -6946,19 +6996,17 @@ out:
 }
 
 struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
-               struct page *page,
-               size_t pg_offset, u64 start, u64 len,
-               int create)
+                                          u64 start, u64 len)
 {
        struct extent_map *em;
        struct extent_map *hole_em = NULL;
-       u64 range_start = start;
+       u64 delalloc_start = start;
        u64 end;
-       u64 found;
-       u64 found_end;
+       u64 delalloc_len;
+       u64 delalloc_end;
        int err = 0;
 
-       em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
+       em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
        if (IS_ERR(em))
                return em;
        /*
@@ -6983,80 +7031,84 @@ struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
        em = NULL;
 
        /* ok, we didn't find anything, lets look for delalloc */
-       found = count_range_bits(&inode->io_tree, &range_start,
+       delalloc_len = count_range_bits(&inode->io_tree, &delalloc_start,
                                 end, len, EXTENT_DELALLOC, 1);
-       found_end = range_start + found;
-       if (found_end < range_start)
-               found_end = (u64)-1;
+       delalloc_end = delalloc_start + delalloc_len;
+       if (delalloc_end < delalloc_start)
+               delalloc_end = (u64)-1;
 
        /*
-        * we didn't find anything useful, return
-        * the original results from get_extent()
+        * We didn't find anything useful, return the original results from
+        * get_extent()
         */
-       if (range_start > end || found_end <= start) {
+       if (delalloc_start > end || delalloc_end <= start) {
                em = hole_em;
                hole_em = NULL;
                goto out;
        }
 
-       /* adjust the range_start to make sure it doesn't
-        * go backwards from the start they passed in
+       /*
+        * Adjust the delalloc_start to make sure it doesn't go backwards from
+        * the start they passed in
         */
-       range_start = max(start, range_start);
-       found = found_end - range_start;
+       delalloc_start = max(start, delalloc_start);
+       delalloc_len = delalloc_end - delalloc_start;
 
-       if (found > 0) {
-               u64 hole_start = start;
-               u64 hole_len = len;
+       if (delalloc_len > 0) {
+               u64 hole_start;
+               u64 hole_len;
+               const u64 hole_end = extent_map_end(hole_em);
 
                em = alloc_extent_map();
                if (!em) {
                        err = -ENOMEM;
                        goto out;
                }
+               em->bdev = NULL;
+
+               ASSERT(hole_em);
                /*
-                * when btrfs_get_extent can't find anything it
-                * returns one huge hole
+                * When btrfs_get_extent can't find anything it returns one
+                * huge hole
                 *
-                * make sure what it found really fits our range, and
-                * adjust to make sure it is based on the start from
-                * the caller
+                * Make sure what it found really fits our range, and adjust to
+                * make sure it is based on the start from the caller
                 */
-               if (hole_em) {
-                       u64 calc_end = extent_map_end(hole_em);
-
-                       if (calc_end <= start || (hole_em->start > end)) {
-                               free_extent_map(hole_em);
-                               hole_em = NULL;
-                       } else {
-                               hole_start = max(hole_em->start, start);
-                               hole_len = calc_end - hole_start;
-                       }
+               if (hole_end <= start || hole_em->start > end) {
+                       free_extent_map(hole_em);
+                       hole_em = NULL;
+               } else {
+                       hole_start = max(hole_em->start, start);
+                       hole_len = hole_end - hole_start;
                }
-               em->bdev = NULL;
-               if (hole_em && range_start > hole_start) {
-                       /* our hole starts before our delalloc, so we
-                        * have to return just the parts of the hole
-                        * that go until  the delalloc starts
+
+               if (hole_em && delalloc_start > hole_start) {
+                       /*
+                        * Our hole starts before our delalloc, so we have to
+                        * return just the parts of the hole that go until the
+                        * delalloc starts
                         */
-                       em->len = min(hole_len,
-                                     range_start - hole_start);
+                       em->len = min(hole_len, delalloc_start - hole_start);
                        em->start = hole_start;
                        em->orig_start = hole_start;
                        /*
-                        * don't adjust block start at all,
-                        * it is fixed at EXTENT_MAP_HOLE
+                        * Don't adjust block start at all, it is fixed at
+                        * EXTENT_MAP_HOLE
                         */
                        em->block_start = hole_em->block_start;
                        em->block_len = hole_len;
                        if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
                                set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
                } else {
-                       em->start = range_start;
-                       em->len = found;
-                       em->orig_start = range_start;
+                       /*
+                        * The hole is out of the passed range or it starts
+                        * after the delalloc range
+                        */
+                       em->start = delalloc_start;
+                       em->len = delalloc_len;
+                       em->orig_start = delalloc_start;
                        em->block_start = EXTENT_MAP_DELALLOC;
-                       em->block_len = found;
+                       em->block_len = delalloc_len;
                }
        } else {
                return hole_em;
@@ -9910,7 +9962,6 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode
        init_completion(&work->completion);
        INIT_LIST_HEAD(&work->list);
        work->inode = inode;
-       WARN_ON_ONCE(!inode);
        btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
                        btrfs_run_delalloc_work, NULL, NULL);
 
index 9c8e1734429c7649721eeadd3dca941ff2ee490c..494f0f10d70e091664bfa14fd1c6639b3a0b4714 100644 (file)
@@ -1642,7 +1642,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
                btrfs_info(fs_info, "resizing devid %llu", devid);
        }
 
-       device = btrfs_find_device(fs_info, devid, NULL, NULL);
+       device = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
        if (!device) {
                btrfs_info(fs_info, "resizer unable to find device %llu",
                           devid);
@@ -3178,7 +3178,8 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,
                s_uuid = di_args->uuid;
 
        rcu_read_lock();
-       dev = btrfs_find_device(fs_info, di_args->devid, s_uuid, NULL);
+       dev = btrfs_find_device(fs_info->fs_devices, di_args->devid, s_uuid,
+                               NULL, true);
 
        if (!dev) {
                ret = -ENODEV;
@@ -3241,32 +3242,17 @@ static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
        lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
 }
 
-static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 olen,
+static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
                                   struct inode *dst, u64 dst_loff)
 {
-       u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
        int ret;
-       u64 len = olen;
-
-       if (loff + len == src->i_size)
-               len = ALIGN(src->i_size, bs) - loff;
-       /*
-        * For same inode case we don't want our length pushed out past i_size
-        * as comparing that data range makes no sense.
-        *
-        * This effectively means we require aligned extents for the single
-        * inode case, whereas the other cases allow an unaligned length so long
-        * as it ends at i_size.
-        */
-       if (dst == src && len != olen)
-               return -EINVAL;
 
        /*
         * Lock destination range to serialize with concurrent readpages() and
         * source range to serialize with relocation.
         */
        btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
-       ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1);
+       ret = btrfs_clone(src, dst, loff, len, len, dst_loff, 1);
        btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
 
        return ret;
@@ -3278,21 +3264,10 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
                             struct inode *dst, u64 dst_loff)
 {
        int ret;
-       int num_pages = PAGE_ALIGN(BTRFS_MAX_DEDUPE_LEN) >> PAGE_SHIFT;
        u64 i, tail_len, chunk_count;
 
-       /* don't make the dst file partly checksummed */
-       if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
-           (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM))
-               return -EINVAL;
-
-       if (IS_SWAPFILE(src) || IS_SWAPFILE(dst))
-               return -ETXTBSY;
-
        tail_len = olen % BTRFS_MAX_DEDUPE_LEN;
        chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN);
-       if (chunk_count == 0)
-               num_pages = PAGE_ALIGN(tail_len) >> PAGE_SHIFT;
 
        for (i = 0; i < chunk_count; i++) {
                ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN,
@@ -3908,14 +3883,6 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
         *   be either compressed or non-compressed.
         */
 
-       /* don't make the dst file partly checksummed */
-       if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
-           (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM))
-               return -EINVAL;
-
-       if (IS_SWAPFILE(src) || IS_SWAPFILE(inode))
-               return -ETXTBSY;
-
        /*
         * VFS's generic_remap_file_range_prep() protects us from cloning the
         * eof block into the middle of a file, which would result in corruption
@@ -3991,6 +3958,13 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
        else
                btrfs_double_inode_lock(inode_in, inode_out);
 
+       /* don't make the dst file partly checksummed */
+       if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) !=
+           (BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) {
+               ret = -EINVAL;
+               goto out_unlock;
+       }
+
        /*
         * Now that the inodes are locked, we need to start writeback ourselves
         * and can not rely on the writeback from the VFS's generic helper
@@ -4381,7 +4355,7 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
                              &sa->progress, sa->flags & BTRFS_SCRUB_READONLY,
                              0);
 
-       if (copy_to_user(arg, sa, sizeof(*sa)))
+       if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa)))
                ret = -EFAULT;
 
        if (!(sa->flags & BTRFS_SCRUB_READONLY))
@@ -4414,7 +4388,7 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_fs_info *fs_info,
 
        ret = btrfs_scrub_progress(fs_info, sa->devid, &sa->progress);
 
-       if (copy_to_user(arg, sa, sizeof(*sa)))
+       if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa)))
                ret = -EFAULT;
 
        kfree(sa);
@@ -4438,7 +4412,7 @@ static long btrfs_ioctl_get_dev_stats(struct btrfs_fs_info *fs_info,
 
        ret = btrfs_get_dev_stats(fs_info, sa);
 
-       if (copy_to_user(arg, sa, sizeof(*sa)))
+       if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa)))
                ret = -EFAULT;
 
        kfree(sa);
@@ -4484,7 +4458,7 @@ static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info,
                break;
        }
 
-       if (copy_to_user(arg, p, sizeof(*p)))
+       if ((ret == 0 || ret == -ECANCELED) && copy_to_user(arg, p, sizeof(*p)))
                ret = -EFAULT;
 out:
        kfree(p);
@@ -4790,7 +4764,7 @@ do_balance:
        ret = btrfs_balance(fs_info, bctl, bargs);
        bctl = NULL;
 
-       if (arg) {
+       if ((ret == 0 || ret == -ECANCELED) && arg) {
                if (copy_to_user(arg, bargs, sizeof(*bargs)))
                        ret = -EFAULT;
        }
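
The ioctl changes above all apply the same rule: copy the argument structure back to user space only when the operation produced meaningful results (success, or -ECANCELED for balance and dev-replace, where progress is still valid), so a failing copy cannot overwrite the real error with -EFAULT and stale data is not reported. A small sketch of that guard, with copy_out() standing in for copy_to_user():

#include <errno.h>
#include <stdio.h>
#include <string.h>

/* Stand-in for copy_to_user(); always succeeds in this sketch, nonzero would mean failure. */
static int copy_out(void *dst, const void *src, unsigned long n)
{
        memcpy(dst, src, n);
        return 0;
}

/* Only copy results back on success; a failing copy must not mask the real error. */
static int do_ioctl(int op_ret, void *user_arg, const void *result, unsigned long n)
{
        int ret = op_ret;

        if (ret == 0 && copy_out(user_arg, result, n))
                ret = -EFAULT;
        return ret;
}

int main(void)
{
        char user_buf[8];
        const char result[8] = "done";

        printf("%d\n", do_ioctl(0, user_buf, result, sizeof(result)));       /* 0 */
        printf("%d\n", do_ioctl(-ENOSPC, user_buf, result, sizeof(result))); /* -ENOSPC preserved */
        return 0;
}
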
index 1da768e5ef75bda36142c084d6ba7c2b5c4bc809..82b84e4daad1f336fc8b6de895d445682308f3ef 100644 (file)
 
 static void btrfs_assert_tree_read_locked(struct extent_buffer *eb);
 
-/*
- * if we currently have a spinning reader or writer lock
- * (indicated by the rw flag) this will bump the count
- * of blocking holders and drop the spinlock.
- */
-void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
+void btrfs_set_lock_blocking_read(struct extent_buffer *eb)
 {
        /*
-        * no lock is required.  The lock owner may change if
-        * we have a read lock, but it won't change to or away
-        * from us.  If we have the write lock, we are the owner
-        * and it'll never change.
+        * No lock is required.  The lock owner may change if we have a read
+        * lock, but it won't change to or away from us.  If we have the write
+        * lock, we are the owner and it'll never change.
         */
        if (eb->lock_nested && current->pid == eb->lock_owner)
                return;
-       if (rw == BTRFS_WRITE_LOCK) {
-               if (atomic_read(&eb->blocking_writers) == 0) {
-                       WARN_ON(atomic_read(&eb->spinning_writers) != 1);
-                       atomic_dec(&eb->spinning_writers);
-                       btrfs_assert_tree_locked(eb);
-                       atomic_inc(&eb->blocking_writers);
-                       write_unlock(&eb->lock);
-               }
-       } else if (rw == BTRFS_READ_LOCK) {
-               btrfs_assert_tree_read_locked(eb);
-               atomic_inc(&eb->blocking_readers);
-               WARN_ON(atomic_read(&eb->spinning_readers) == 0);
-               atomic_dec(&eb->spinning_readers);
-               read_unlock(&eb->lock);
+       btrfs_assert_tree_read_locked(eb);
+       atomic_inc(&eb->blocking_readers);
+       WARN_ON(atomic_read(&eb->spinning_readers) == 0);
+       atomic_dec(&eb->spinning_readers);
+       read_unlock(&eb->lock);
+}
+
+void btrfs_set_lock_blocking_write(struct extent_buffer *eb)
+{
+       /*
+        * No lock is required.  The lock owner may change if we have a read
+        * lock, but it won't change to or away from us.  If we have the write
+        * lock, we are the owner and it'll never change.
+        */
+       if (eb->lock_nested && current->pid == eb->lock_owner)
+               return;
+       if (atomic_read(&eb->blocking_writers) == 0) {
+               WARN_ON(atomic_read(&eb->spinning_writers) != 1);
+               atomic_dec(&eb->spinning_writers);
+               btrfs_assert_tree_locked(eb);
+               atomic_inc(&eb->blocking_writers);
+               write_unlock(&eb->lock);
        }
 }
 
-/*
- * if we currently have a blocking lock, take the spinlock
- * and drop our blocking count
- */
-void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
+void btrfs_clear_lock_blocking_read(struct extent_buffer *eb)
+{
+       /*
+        * No lock is required.  The lock owner may change if we have a read
+        * lock, but it won't change to or away from us.  If we have the write
+        * lock, we are the owner and it'll never change.
+        */
+       if (eb->lock_nested && current->pid == eb->lock_owner)
+               return;
+       BUG_ON(atomic_read(&eb->blocking_readers) == 0);
+       read_lock(&eb->lock);
+       atomic_inc(&eb->spinning_readers);
+       /* atomic_dec_and_test implies a barrier */
+       if (atomic_dec_and_test(&eb->blocking_readers))
+               cond_wake_up_nomb(&eb->read_lock_wq);
+}
+
+void btrfs_clear_lock_blocking_write(struct extent_buffer *eb)
 {
        /*
         * no lock is required.  The lock owner may change if
@@ -60,23 +75,13 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
         */
        if (eb->lock_nested && current->pid == eb->lock_owner)
                return;
-
-       if (rw == BTRFS_WRITE_LOCK_BLOCKING) {
-               BUG_ON(atomic_read(&eb->blocking_writers) != 1);
-               write_lock(&eb->lock);
-               WARN_ON(atomic_read(&eb->spinning_writers));
-               atomic_inc(&eb->spinning_writers);
-               /* atomic_dec_and_test implies a barrier */
-               if (atomic_dec_and_test(&eb->blocking_writers))
-                       cond_wake_up_nomb(&eb->write_lock_wq);
-       } else if (rw == BTRFS_READ_LOCK_BLOCKING) {
-               BUG_ON(atomic_read(&eb->blocking_readers) == 0);
-               read_lock(&eb->lock);
-               atomic_inc(&eb->spinning_readers);
-               /* atomic_dec_and_test implies a barrier */
-               if (atomic_dec_and_test(&eb->blocking_readers))
-                       cond_wake_up_nomb(&eb->read_lock_wq);
-       }
+       BUG_ON(atomic_read(&eb->blocking_writers) != 1);
+       write_lock(&eb->lock);
+       WARN_ON(atomic_read(&eb->spinning_writers));
+       atomic_inc(&eb->spinning_writers);
+       /* atomic_dec_and_test implies a barrier */
+       if (atomic_dec_and_test(&eb->blocking_writers))
+               cond_wake_up_nomb(&eb->write_lock_wq);
 }
 
 /*
@@ -232,16 +237,9 @@ again:
        wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0);
        wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
        write_lock(&eb->lock);
-       if (atomic_read(&eb->blocking_readers)) {
+       if (atomic_read(&eb->blocking_readers) ||
+           atomic_read(&eb->blocking_writers)) {
                write_unlock(&eb->lock);
-               wait_event(eb->read_lock_wq,
-                          atomic_read(&eb->blocking_readers) == 0);
-               goto again;
-       }
-       if (atomic_read(&eb->blocking_writers)) {
-               write_unlock(&eb->lock);
-               wait_event(eb->write_lock_wq,
-                          atomic_read(&eb->blocking_writers) == 0);
                goto again;
        }
        WARN_ON(atomic_read(&eb->spinning_writers));
index 29135def468e97ab240ddf11d0c51c2555dbb5dc..595014f64830f822d6d9135641e528353fdfe4f3 100644 (file)
@@ -17,8 +17,10 @@ void btrfs_tree_unlock(struct extent_buffer *eb);
 void btrfs_tree_read_lock(struct extent_buffer *eb);
 void btrfs_tree_read_unlock(struct extent_buffer *eb);
 void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb);
-void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw);
-void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw);
+void btrfs_set_lock_blocking_read(struct extent_buffer *eb);
+void btrfs_set_lock_blocking_write(struct extent_buffer *eb);
+void btrfs_clear_lock_blocking_read(struct extent_buffer *eb);
+void btrfs_clear_lock_blocking_write(struct extent_buffer *eb);
 void btrfs_assert_tree_locked(struct extent_buffer *eb);
 int btrfs_try_tree_read_lock(struct extent_buffer *eb);
 int btrfs_try_tree_write_lock(struct extent_buffer *eb);
@@ -37,13 +39,4 @@ static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw)
                BUG();
 }
 
-static inline void btrfs_set_lock_blocking(struct extent_buffer *eb)
-{
-       btrfs_set_lock_blocking_rw(eb, BTRFS_WRITE_LOCK);
-}
-
-static inline void btrfs_clear_lock_blocking(struct extent_buffer *eb)
-{
-       btrfs_clear_lock_blocking_rw(eb, BTRFS_WRITE_LOCK_BLOCKING);
-}
 #endif
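
Editorial note: the rw-flag helpers are split into explicit read and write variants, so callers name the lock type in the function itself instead of passing BTRFS_READ_LOCK / BTRFS_WRITE_LOCK_BLOCKING. A sketch of the typical read-side caller after the change (illustrative, not taken from the patch):

	btrfs_tree_read_lock(eb);
	/* was: btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK) */
	btrfs_set_lock_blocking_read(eb);

	/* ... work that may sleep while the buffer stays logically read-locked ... */

	btrfs_tree_read_unlock_blocking(eb);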
index 90639140439fe8f5586ccc1cd676b320cb010659..579d53ae256f3e63396d415f7efa8f976b7f5229 100644 (file)
@@ -61,6 +61,28 @@ struct workspace {
        struct list_head list;
 };
 
+static struct workspace_manager wsm;
+
+static void lzo_init_workspace_manager(void)
+{
+       btrfs_init_workspace_manager(&wsm, &btrfs_lzo_compress);
+}
+
+static void lzo_cleanup_workspace_manager(void)
+{
+       btrfs_cleanup_workspace_manager(&wsm);
+}
+
+static struct list_head *lzo_get_workspace(unsigned int level)
+{
+       return btrfs_get_workspace(&wsm, level);
+}
+
+static void lzo_put_workspace(struct list_head *ws)
+{
+       btrfs_put_workspace(&wsm, ws);
+}
+
 static void lzo_free_workspace(struct list_head *ws)
 {
        struct workspace *workspace = list_entry(ws, struct workspace, list);
@@ -71,7 +93,7 @@ static void lzo_free_workspace(struct list_head *ws)
        kfree(workspace);
 }
 
-static struct list_head *lzo_alloc_workspace(void)
+static struct list_head *lzo_alloc_workspace(unsigned int level)
 {
        struct workspace *workspace;
 
@@ -485,11 +507,16 @@ out:
        return ret;
 }
 
-static void lzo_set_level(struct list_head *ws, unsigned int type)
+static unsigned int lzo_set_level(unsigned int level)
 {
+       return 0;
 }
 
 const struct btrfs_compress_op btrfs_lzo_compress = {
+       .init_workspace_manager = lzo_init_workspace_manager,
+       .cleanup_workspace_manager = lzo_cleanup_workspace_manager,
+       .get_workspace          = lzo_get_workspace,
+       .put_workspace          = lzo_put_workspace,
        .alloc_workspace        = lzo_alloc_workspace,
        .free_workspace         = lzo_free_workspace,
        .compress_pages         = lzo_compress_pages,
index 4e473a9982191b0bc0f2fd9036cfdb88fe51194b..c1cd5558a6462ce6b3fc4520d9dc4ec011c991d5 100644 (file)
@@ -1546,12 +1546,18 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
                parent_node = *p;
                entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record,
                                 node);
-               if (bytenr < entry->bytenr)
+               if (bytenr < entry->bytenr) {
                        p = &(*p)->rb_left;
-               else if (bytenr > entry->bytenr)
+               } else if (bytenr > entry->bytenr) {
                        p = &(*p)->rb_right;
-               else
+               } else {
+                       if (record->data_rsv && !entry->data_rsv) {
+                               entry->data_rsv = record->data_rsv;
+                               entry->data_rsv_refroot =
+                                       record->data_rsv_refroot;
+                       }
                        return 1;
+               }
        }
 
        rb_link_node(&record->node, parent_node, p);
@@ -1597,7 +1603,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)
            || bytenr == 0 || num_bytes == 0)
                return 0;
-       record = kmalloc(sizeof(*record), gfp_flag);
+       record = kzalloc(sizeof(*record), gfp_flag);
        if (!record)
                return -ENOMEM;
 
@@ -1832,7 +1838,7 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
                        src_path->nodes[cur_level] = eb;
 
                        btrfs_tree_read_lock(eb);
-                       btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+                       btrfs_set_lock_blocking_read(eb);
                        src_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
                }
 
@@ -1973,7 +1979,7 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
                dst_path->slots[cur_level] = 0;
 
                btrfs_tree_read_lock(eb);
-               btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+               btrfs_set_lock_blocking_read(eb);
                dst_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
                need_cleanup = true;
        }
@@ -2017,86 +2023,30 @@ out:
        return ret;
 }
 
-/*
- * Inform qgroup to trace subtree swap used in balance.
- *
- * Unlike btrfs_qgroup_trace_subtree(), this function will only trace
- * new tree blocks whose generation is equal to (or larger than) @last_snapshot.
- *
- * Will go down the tree block pointed by @dst_eb (pointed by @dst_parent and
- * @dst_slot), and find any tree blocks whose generation is at @last_snapshot,
- * and then go down @src_eb (pointed by @src_parent and @src_slot) to find
- * the counterpart of the tree block, then mark both tree blocks as qgroup dirty,
- * and skip all tree blocks whose generation is smaller than last_snapshot.
- *
- * This would skip tons of tree blocks of original btrfs_qgroup_trace_subtree(),
- * which could be the cause of very slow balance if the file tree is large.
- *
- * @src_parent, @src_slot: pointer to src (file tree) eb.
- * @dst_parent, @dst_slot: pointer to dst (reloc tree) eb.
- */
-int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
-                               struct btrfs_block_group_cache *bg_cache,
-                               struct extent_buffer *src_parent, int src_slot,
-                               struct extent_buffer *dst_parent, int dst_slot,
-                               u64 last_snapshot)
+static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
+                               struct extent_buffer *src_eb,
+                               struct extent_buffer *dst_eb,
+                               u64 last_snapshot, bool trace_leaf)
 {
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_path *dst_path = NULL;
-       struct btrfs_key first_key;
-       struct extent_buffer *src_eb = NULL;
-       struct extent_buffer *dst_eb = NULL;
-       bool trace_leaf = false;
-       u64 child_gen;
-       u64 child_bytenr;
        int level;
        int ret;
 
        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
                return 0;
 
-       /* Check parameter order */
-       if (btrfs_node_ptr_generation(src_parent, src_slot) >
-           btrfs_node_ptr_generation(dst_parent, dst_slot)) {
+       /* Wrong parameter order */
+       if (btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb)) {
                btrfs_err_rl(fs_info,
                "%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__,
-                       btrfs_node_ptr_generation(src_parent, src_slot),
-                       btrfs_node_ptr_generation(dst_parent, dst_slot));
+                            btrfs_header_generation(src_eb),
+                            btrfs_header_generation(dst_eb));
                return -EUCLEAN;
        }
 
-       /*
-        * Only trace leaf if we're relocating data block groups, this could
-        * reduce tons of data extents tracing for meta/sys bg relocation.
-        */
-       if (bg_cache->flags & BTRFS_BLOCK_GROUP_DATA)
-               trace_leaf = true;
-       /* Read out real @src_eb, pointed by @src_parent and @src_slot */
-       child_bytenr = btrfs_node_blockptr(src_parent, src_slot);
-       child_gen = btrfs_node_ptr_generation(src_parent, src_slot);
-       btrfs_node_key_to_cpu(src_parent, &first_key, src_slot);
-
-       src_eb = read_tree_block(fs_info, child_bytenr, child_gen,
-                       btrfs_header_level(src_parent) - 1, &first_key);
-       if (IS_ERR(src_eb)) {
-               ret = PTR_ERR(src_eb);
-               goto out;
-       }
-
-       /* Read out real @dst_eb, pointed by @src_parent and @src_slot */
-       child_bytenr = btrfs_node_blockptr(dst_parent, dst_slot);
-       child_gen = btrfs_node_ptr_generation(dst_parent, dst_slot);
-       btrfs_node_key_to_cpu(dst_parent, &first_key, dst_slot);
-
-       dst_eb = read_tree_block(fs_info, child_bytenr, child_gen,
-                       btrfs_header_level(dst_parent) - 1, &first_key);
-       if (IS_ERR(dst_eb)) {
-               ret = PTR_ERR(dst_eb);
-               goto out;
-       }
-
        if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) {
-               ret = -EINVAL;
+               ret = -EIO;
                goto out;
        }
 
@@ -2106,14 +2056,13 @@ int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
                ret = -ENOMEM;
                goto out;
        }
-
        /* For dst_path */
        extent_buffer_get(dst_eb);
        dst_path->nodes[level] = dst_eb;
        dst_path->slots[level] = 0;
        dst_path->locks[level] = 0;
 
-       /* Do the generation-aware breadth-first search */
+       /* Do the generation aware breadth-first search */
        ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, level,
                                              level, last_snapshot, trace_leaf);
        if (ret < 0)
@@ -2121,8 +2070,6 @@ int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
        ret = 0;
 
 out:
-       free_extent_buffer(src_eb);
-       free_extent_buffer(dst_eb);
        btrfs_free_path(dst_path);
        if (ret < 0)
                fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
@@ -2207,7 +2154,7 @@ walk_down:
                        path->slots[level] = 0;
 
                        btrfs_tree_read_lock(eb);
-                       btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+                       btrfs_set_lock_blocking_read(eb);
                        path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
 
                        ret = btrfs_qgroup_trace_extent(trans, child_bytenr,
@@ -2576,6 +2523,11 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
                                        goto cleanup;
                        }
 
+                       /* Free the reserved data space */
+                       btrfs_qgroup_free_refroot(fs_info,
+                                       record->data_rsv_refroot,
+                                       record->data_rsv,
+                                       BTRFS_QGROUP_RSV_DATA);
                        /*
                         * Use SEQ_LAST as time_seq to do special search, which
                         * doesn't lock tree or delayed_refs and search current
@@ -2842,16 +2794,15 @@ out:
 /*
  * Two limits to commit transaction in advance.
  *
- * For RATIO, it will be 1/RATIO of the remaining limit
- * (excluding data and prealloc meta) as threshold.
+ * For RATIO, it will be 1/RATIO of the remaining limit as threshold.
  * For SIZE, it will be in byte unit as threshold.
  */
-#define QGROUP_PERTRANS_RATIO          32
-#define QGROUP_PERTRANS_SIZE           SZ_32M
+#define QGROUP_FREE_RATIO              32
+#define QGROUP_FREE_SIZE               SZ_32M
 static bool qgroup_check_limits(struct btrfs_fs_info *fs_info,
                                const struct btrfs_qgroup *qg, u64 num_bytes)
 {
-       u64 limit;
+       u64 free;
        u64 threshold;
 
        if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
@@ -2870,20 +2821,21 @@ static bool qgroup_check_limits(struct btrfs_fs_info *fs_info,
         */
        if ((qg->lim_flags & (BTRFS_QGROUP_LIMIT_MAX_RFER |
                              BTRFS_QGROUP_LIMIT_MAX_EXCL))) {
-               if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL)
-                       limit = qg->max_excl;
-               else
-                       limit = qg->max_rfer;
-               threshold = (limit - qg->rsv.values[BTRFS_QGROUP_RSV_DATA] -
-                           qg->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC]) /
-                           QGROUP_PERTRANS_RATIO;
-               threshold = min_t(u64, threshold, QGROUP_PERTRANS_SIZE);
+               if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) {
+                       free = qg->max_excl - qgroup_rsv_total(qg) - qg->excl;
+                       threshold = min_t(u64, qg->max_excl / QGROUP_FREE_RATIO,
+                                         QGROUP_FREE_SIZE);
+               } else {
+                       free = qg->max_rfer - qgroup_rsv_total(qg) - qg->rfer;
+                       threshold = min_t(u64, qg->max_rfer / QGROUP_FREE_RATIO,
+                                         QGROUP_FREE_SIZE);
+               }
 
                /*
                 * Use transaction_kthread to commit transaction, so we no
                 * longer need to bother nested transaction nor lock context.
                 */
-               if (qg->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] > threshold)
+               if (free < threshold)
                        btrfs_commit_transaction_locksafe(fs_info);
        }
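
Editorial note: the hunk above changes the early-commit heuristic. Instead of looking only at per-transaction metadata reservations, it kicks an async commit (btrfs_commit_transaction_locksafe()) once the free room under the qgroup limit drops below min(limit / QGROUP_FREE_RATIO, QGROUP_FREE_SIZE). A standalone illustration of the arithmetic with made-up numbers (userspace C, not kernel code):

	#include <stdio.h>
	#include <stdint.h>

	#define SZ_32M            (32ULL << 20)
	#define QGROUP_FREE_RATIO 32
	#define QGROUP_FREE_SIZE  SZ_32M

	int main(void)
	{
		uint64_t max_excl  = 10ULL << 30;	/* 10 GiB exclusive limit */
		uint64_t rsv_total = 220ULL << 20;	/* 220 MiB reserved, all types */
		uint64_t excl      = 10000ULL << 20;	/* ~9.8 GiB already exclusive */

		uint64_t free_room = max_excl - rsv_total - excl;
		uint64_t threshold = max_excl / QGROUP_FREE_RATIO;

		if (threshold > QGROUP_FREE_SIZE)
			threshold = QGROUP_FREE_SIZE;

		/* prints: free=20 MiB threshold=32 MiB -> start async commit */
		printf("free=%llu MiB threshold=%llu MiB -> %s\n",
		       (unsigned long long)(free_room >> 20),
		       (unsigned long long)(threshold >> 20),
		       free_room < threshold ? "start async commit" : "no commit yet");
		return 0;
	}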
 
@@ -2959,7 +2911,6 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
 
                qg = unode_aux_to_qgroup(unode);
 
-               trace_qgroup_update_reserve(fs_info, qg, num_bytes, type);
                qgroup_rsv_add(fs_info, qg, num_bytes, type);
        }
 
@@ -3026,7 +2977,6 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
 
                qg = unode_aux_to_qgroup(unode);
 
-               trace_qgroup_update_reserve(fs_info, qg, -(s64)num_bytes, type);
                qgroup_rsv_release(fs_info, qg, num_bytes, type);
 
                list_for_each_entry(glist, &qg->groups, next_group) {
@@ -3783,3 +3733,241 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode)
        }
        extent_changeset_release(&changeset);
 }
+
+void btrfs_qgroup_init_swapped_blocks(
+       struct btrfs_qgroup_swapped_blocks *swapped_blocks)
+{
+       int i;
+
+       spin_lock_init(&swapped_blocks->lock);
+       for (i = 0; i < BTRFS_MAX_LEVEL; i++)
+               swapped_blocks->blocks[i] = RB_ROOT;
+       swapped_blocks->swapped = false;
+}
+
+/*
+ * Delete all swapped block records of @root.
+ * Every record here means we skipped a full subtree scan for qgroup.
+ *
+ * Gets called when committing one transaction.
+ */
+void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root)
+{
+       struct btrfs_qgroup_swapped_blocks *swapped_blocks;
+       int i;
+
+       swapped_blocks = &root->swapped_blocks;
+
+       spin_lock(&swapped_blocks->lock);
+       if (!swapped_blocks->swapped)
+               goto out;
+       for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+               struct rb_root *cur_root = &swapped_blocks->blocks[i];
+               struct btrfs_qgroup_swapped_block *entry;
+               struct btrfs_qgroup_swapped_block *next;
+
+               rbtree_postorder_for_each_entry_safe(entry, next, cur_root,
+                                                    node)
+                       kfree(entry);
+               swapped_blocks->blocks[i] = RB_ROOT;
+       }
+       swapped_blocks->swapped = false;
+out:
+       spin_unlock(&swapped_blocks->lock);
+}
+
+/*
+ * Add a record of the swapped subtree roots into @subvol_root.
+ *
+ * @subvol_root:       tree root of the subvolume tree that gets swapped
+ * @bg:                        block group under balance
+ * @subvol_parent/slot:        pointer to the subtree root in subvolume tree
+ * @reloc_parent/slot: pointer to the subtree root in reloc tree
+ *                     BOTH POINTERS ARE BEFORE TREE SWAP
+ * @last_snapshot:     last snapshot generation of the subvolume tree
+ */
+int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
+               struct btrfs_root *subvol_root,
+               struct btrfs_block_group_cache *bg,
+               struct extent_buffer *subvol_parent, int subvol_slot,
+               struct extent_buffer *reloc_parent, int reloc_slot,
+               u64 last_snapshot)
+{
+       struct btrfs_fs_info *fs_info = subvol_root->fs_info;
+       struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks;
+       struct btrfs_qgroup_swapped_block *block;
+       struct rb_node **cur;
+       struct rb_node *parent = NULL;
+       int level = btrfs_header_level(subvol_parent) - 1;
+       int ret = 0;
+
+       if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+               return 0;
+
+       if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) >
+           btrfs_node_ptr_generation(reloc_parent, reloc_slot)) {
+               btrfs_err_rl(fs_info,
+               "%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu",
+                       __func__,
+                       btrfs_node_ptr_generation(subvol_parent, subvol_slot),
+                       btrfs_node_ptr_generation(reloc_parent, reloc_slot));
+               return -EUCLEAN;
+       }
+
+       block = kmalloc(sizeof(*block), GFP_NOFS);
+       if (!block) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       /*
+        * @reloc_parent/slot is still before swap, while @block is going to
+        * record the bytenr after swap, so we do the swap here.
+        */
+       block->subvol_bytenr = btrfs_node_blockptr(reloc_parent, reloc_slot);
+       block->subvol_generation = btrfs_node_ptr_generation(reloc_parent,
+                                                            reloc_slot);
+       block->reloc_bytenr = btrfs_node_blockptr(subvol_parent, subvol_slot);
+       block->reloc_generation = btrfs_node_ptr_generation(subvol_parent,
+                                                           subvol_slot);
+       block->last_snapshot = last_snapshot;
+       block->level = level;
+       if (bg->flags & BTRFS_BLOCK_GROUP_DATA)
+               block->trace_leaf = true;
+       else
+               block->trace_leaf = false;
+       btrfs_node_key_to_cpu(reloc_parent, &block->first_key, reloc_slot);
+
+       /* Insert @block into @blocks */
+       spin_lock(&blocks->lock);
+       cur = &blocks->blocks[level].rb_node;
+       while (*cur) {
+               struct btrfs_qgroup_swapped_block *entry;
+
+               parent = *cur;
+               entry = rb_entry(parent, struct btrfs_qgroup_swapped_block,
+                                node);
+
+               if (entry->subvol_bytenr < block->subvol_bytenr) {
+                       cur = &(*cur)->rb_left;
+               } else if (entry->subvol_bytenr > block->subvol_bytenr) {
+                       cur = &(*cur)->rb_right;
+               } else {
+                       if (entry->subvol_generation !=
+                                       block->subvol_generation ||
+                           entry->reloc_bytenr != block->reloc_bytenr ||
+                           entry->reloc_generation !=
+                                       block->reloc_generation) {
+                               /*
+                                * Duplicated but mismatch entry found.
+                                * Shouldn't happen.
+                                *
+                                * Marking qgroup inconsistent should be enough
+                                * for end users.
+                                */
+                               WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+                               ret = -EEXIST;
+                       }
+                       kfree(block);
+                       goto out_unlock;
+               }
+       }
+       rb_link_node(&block->node, parent, cur);
+       rb_insert_color(&block->node, &blocks->blocks[level]);
+       blocks->swapped = true;
+out_unlock:
+       spin_unlock(&blocks->lock);
+out:
+       if (ret < 0)
+               fs_info->qgroup_flags |=
+                       BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+       return ret;
+}
+
+/*
+ * Check if the tree block is a subtree root, and if so do the needed
+ * delayed subtree trace for qgroup.
+ *
+ * This is called during btrfs_cow_block().
+ */
+int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
+                                        struct btrfs_root *root,
+                                        struct extent_buffer *subvol_eb)
+{
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks;
+       struct btrfs_qgroup_swapped_block *block;
+       struct extent_buffer *reloc_eb = NULL;
+       struct rb_node *node;
+       bool found = false;
+       bool swapped = false;
+       int level = btrfs_header_level(subvol_eb);
+       int ret = 0;
+       int i;
+
+       if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+               return 0;
+       if (!is_fstree(root->root_key.objectid) || !root->reloc_root)
+               return 0;
+
+       spin_lock(&blocks->lock);
+       if (!blocks->swapped) {
+               spin_unlock(&blocks->lock);
+               return 0;
+       }
+       node = blocks->blocks[level].rb_node;
+
+       while (node) {
+               block = rb_entry(node, struct btrfs_qgroup_swapped_block, node);
+               if (block->subvol_bytenr < subvol_eb->start) {
+                       node = node->rb_left;
+               } else if (block->subvol_bytenr > subvol_eb->start) {
+                       node = node->rb_right;
+               } else {
+                       found = true;
+                       break;
+               }
+       }
+       if (!found) {
+               spin_unlock(&blocks->lock);
+               goto out;
+       }
+       /* Found one, remove it from @blocks first and update blocks->swapped */
+       rb_erase(&block->node, &blocks->blocks[level]);
+       for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+               if (RB_EMPTY_ROOT(&blocks->blocks[i])) {
+                       swapped = true;
+                       break;
+               }
+       }
+       blocks->swapped = swapped;
+       spin_unlock(&blocks->lock);
+
+       /* Read out reloc subtree root */
+       reloc_eb = read_tree_block(fs_info, block->reloc_bytenr,
+                                  block->reloc_generation, block->level,
+                                  &block->first_key);
+       if (IS_ERR(reloc_eb)) {
+               ret = PTR_ERR(reloc_eb);
+               reloc_eb = NULL;
+               goto free_out;
+       }
+       if (!extent_buffer_uptodate(reloc_eb)) {
+               ret = -EIO;
+               goto free_out;
+       }
+
+       ret = qgroup_trace_subtree_swap(trans, reloc_eb, subvol_eb,
+                       block->last_snapshot, block->trace_leaf);
+free_out:
+       kfree(block);
+       free_extent_buffer(reloc_eb);
+out:
+       if (ret < 0) {
+               btrfs_err_rl(fs_info,
+                            "failed to account subtree at bytenr %llu: %d",
+                            subvol_eb->start, ret);
+               fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+       }
+       return ret;
+}
index 20c6bd5fa701d2563623a4507a0afdd60cfc8c07..46ba7bd2961cd1edcbbabb341b3fb350416e929b 100644 (file)
@@ -6,6 +6,8 @@
 #ifndef BTRFS_QGROUP_H
 #define BTRFS_QGROUP_H
 
+#include <linux/spinlock.h>
+#include <linux/rbtree.h>
 #include "ulist.h"
 #include "delayed-ref.h"
 
  *    Normally at qgroup rescan and transaction commit time.
  */
 
+/*
+ * Special performance optimization for balance.
+ *
+ * For balance, we need to swap subtrees of the subvolume and reloc trees.
+ * In theory, we need to trace all subtree blocks of both subvolume and reloc
+ * trees, since their owner has changed during such swap.
+ *
+ * However, since balance has ensured that both subtrees contain the same
+ * contents and have the same tree structure, such a swap won't change the
+ * qgroup numbers.
+ *
+ * But there is a race window between subtree swap and transaction commit,
+ * during that window, if we increase/decrease tree level or merge/split tree
+ * blocks, we still need to trace the original subtrees.
+ *
+ * So for balance, we use a delayed subtree tracing, whose workflow is:
+ *
+ * 1) Record the subtree root block that gets swapped.
+ *
+ *    During subtree swap:
+ *    O = Old tree blocks
+ *    N = New tree blocks
+ *          reloc tree                     subvolume tree X
+ *             Root                               Root
+ *            /    \                             /    \
+ *          NA     OB                          OA      OB
+ *        /  |     |  \                      /  |      |  \
+ *      NC  ND     OE  OF                   OC  OD     OE  OF
+ *
+ *   In this case, NA and OA are going to be swapped, so record (NA, OA)
+ *   into subvolume tree X.
+ *
+ * 2) After subtree swap.
+ *          reloc tree                     subvolume tree X
+ *             Root                               Root
+ *            /    \                             /    \
+ *          OA     OB                          NA      OB
+ *        /  |     |  \                      /  |      |  \
+ *      OC  OD     OE  OF                   NC  ND     OE  OF
+ *
+ * 3a) COW happens for OB
+ *     If we are going to COW tree block OB, we check OB's bytenr against
+ *     tree X's swapped_blocks structure.
+ *     If it doesn't match any record, nothing will happen.
+ *
+ * 3b) COW happens for NA
+ *     Check NA's bytenr against tree X's swapped_blocks, and get a hit.
+ *     Then we do a subtree scan on both subtrees OA and NA, resulting in
+ *     6 tree blocks to be scanned (OA, OC, OD, NA, NC, ND).
+ *
+ *     Then no matter what we do to subvolume tree X, qgroup numbers will
+ *     still be correct.
+ *     Then NA's record gets removed from X's swapped_blocks.
+ *
+ * 4)  Transaction commit
+ *     Any record left in X's swapped_blocks gets removed, since there was
+ *     no modification to the swapped subtrees and thus no need to trigger
+ *     a heavy qgroup subtree rescan for them.
+ */
+
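
Editorial note: a condensed sketch of how the new entry points are wired together (simplified, error handling omitted; identifiers such as dest, rc and buf are just the caller-side names used in the relocation and COW paths elsewhere in this series):

	/* 1) During the subtree swap in balance (relocation's replace_path()): */
	ret = btrfs_qgroup_add_swapped_blocks(trans, dest, rc->block_group,
					      parent, slot,
					      path->nodes[level],
					      path->slots[level],
					      last_snapshot);

	/* 2) When a recorded subvolume block is COWed again before commit
	 *    (called from the btrfs_cow_block() path): */
	ret = btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);

	/* 3) At transaction commit, records that never got COWed are simply
	 *    dropped -- no subtree rescan is needed for them: */
	btrfs_qgroup_clean_swapped_blocks(root);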
 /*
  * Record a dirty extent, and info qgroup to update quota on it
  * TODO: Use kmem cache to alloc it.
@@ -45,9 +107,38 @@ struct btrfs_qgroup_extent_record {
        struct rb_node node;
        u64 bytenr;
        u64 num_bytes;
+
+       /*
+        * For qgroup reserved data space freeing.
+        *
+        * @data_rsv_refroot and @data_rsv are recorded after
+        * BTRFS_ADD_DELAYED_EXTENT is called, and are used to free the
+        * reserved qgroup space at transaction commit time.
+        */
+       u32 data_rsv;           /* reserved data space needs to be freed */
+       u64 data_rsv_refroot;   /* which root the reserved data belongs to */
        struct ulist *old_roots;
 };
 
+struct btrfs_qgroup_swapped_block {
+       struct rb_node node;
+
+       int level;
+       bool trace_leaf;
+
+       /* bytenr/generation of the tree block in subvolume tree after swap */
+       u64 subvol_bytenr;
+       u64 subvol_generation;
+
+       /* bytenr/generation of the tree block in reloc tree after swap */
+       u64 reloc_bytenr;
+       u64 reloc_generation;
+
+       u64 last_snapshot;
+       struct btrfs_key first_key;
+};
+
 /*
  * Qgroup reservation types:
  *
@@ -236,12 +327,6 @@ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
 int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
                               struct extent_buffer *root_eb,
                               u64 root_gen, int root_level);
-
-int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
-                               struct btrfs_block_group_cache *bg_cache,
-                               struct extent_buffer *src_parent, int src_slot,
-                               struct extent_buffer *dst_parent, int dst_slot,
-                               u64 last_snapshot);
 int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
                                u64 num_bytes, struct ulist *old_roots,
                                struct ulist *new_roots);
@@ -252,15 +337,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
 void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
                               u64 ref_root, u64 num_bytes,
                               enum btrfs_qgroup_rsv_type type);
-static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info,
-                                                u64 ref_root, u64 num_bytes)
-{
-       if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
-               return;
-       trace_btrfs_qgroup_free_delayed_ref(fs_info, ref_root, num_bytes);
-       btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes,
-                                 BTRFS_QGROUP_RSV_DATA);
-}
 
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
@@ -325,4 +401,18 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes);
 
 void btrfs_qgroup_check_reserved_leak(struct inode *inode);
 
+/* btrfs_qgroup_swapped_blocks related functions */
+void btrfs_qgroup_init_swapped_blocks(
+       struct btrfs_qgroup_swapped_blocks *swapped_blocks);
+
+void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root);
+int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
+               struct btrfs_root *subvol_root,
+               struct btrfs_block_group_cache *bg,
+               struct extent_buffer *subvol_parent, int subvol_slot,
+               struct extent_buffer *reloc_parent, int reloc_slot,
+               u64 last_snapshot);
+int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
+               struct btrfs_root *root, struct extent_buffer *eb);
+
 #endif
index c3557c12656b1f5f7403ad50222c4741fe83d52f..d09b6cdb785a0a979a91d471e34844549d102df5 100644 (file)
@@ -583,7 +583,7 @@ static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path,
                                return -EIO;
                        }
                        btrfs_tree_read_lock(eb);
-                       btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+                       btrfs_set_lock_blocking_read(eb);
                        path->nodes[level-1] = eb;
                        path->slots[level-1] = 0;
                        path->locks[level-1] = BTRFS_READ_LOCK_BLOCKING;
@@ -987,7 +987,7 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
                return -ENOMEM;
 
        eb = btrfs_read_lock_root_node(fs_info->extent_root);
-       btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+       btrfs_set_lock_blocking_read(eb);
        level = btrfs_header_level(eb);
        path->nodes[level] = eb;
        path->slots[level] = 0;
index 272b287f8cf0dc438cdb6fd170f4ee467f4a69e1..ddf028509931289ab54c6a6feab771e6ed36f2d5 100644 (file)
@@ -162,6 +162,8 @@ struct reloc_control {
        struct mapping_tree reloc_root_tree;
        /* list of reloc trees */
        struct list_head reloc_roots;
+       /* list of subvolume trees that get relocated */
+       struct list_head dirty_subvol_roots;
        /* size of metadata reservation for merging reloc trees */
        u64 merging_rsv_size;
        /* size of relocated tree nodes */
@@ -1467,15 +1469,17 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
        struct btrfs_root_item *root_item;
        int ret;
 
-       if (!root->reloc_root)
+       if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state) ||
+           !root->reloc_root)
                goto out;
 
        reloc_root = root->reloc_root;
        root_item = &reloc_root->root_item;
 
+       /* root->reloc_root will stay until the current relocation finishes */
        if (fs_info->reloc_ctl->merge_reloc_tree &&
            btrfs_root_refs(root_item) == 0) {
-               root->reloc_root = NULL;
+               set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
                __del_reloc_root(reloc_root);
        }
 
@@ -1773,7 +1777,7 @@ again:
        btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
 
        eb = btrfs_lock_root_node(dest);
-       btrfs_set_lock_blocking(eb);
+       btrfs_set_lock_blocking_write(eb);
        level = btrfs_header_level(eb);
 
        if (level < lowest_level) {
@@ -1786,7 +1790,7 @@ again:
                ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
                BUG_ON(ret);
        }
-       btrfs_set_lock_blocking(eb);
+       btrfs_set_lock_blocking_write(eb);
 
        if (next_key) {
                next_key->objectid = (u64)-1;
@@ -1802,6 +1806,8 @@ again:
                BUG_ON(level < lowest_level);
 
                ret = btrfs_bin_search(parent, &key, level, &slot);
+               if (ret < 0)
+                       break;
                if (ret && slot > 0)
                        slot--;
 
@@ -1852,7 +1858,7 @@ again:
                                                      slot, &eb);
                                BUG_ON(ret);
                        }
-                       btrfs_set_lock_blocking(eb);
+                       btrfs_set_lock_blocking_write(eb);
 
                        btrfs_tree_unlock(parent);
                        free_extent_buffer(parent);
@@ -1885,15 +1891,18 @@ again:
                 *    If not traced, we will leak data numbers
                 * 2) Fs subtree
                 *    If not traced, we will double count old data
-                *    and tree block numbers, if current trans doesn't free
-                *    data reloc tree inode.
+                *
+                * We don't scan the subtree right now, but only record
+                * the swapped tree blocks.
+                * The real subtree rescan is delayed until we have new
+                * CoW on the subtree root node before transaction commit.
                 */
-               ret = btrfs_qgroup_trace_subtree_swap(trans, rc->block_group,
-                               parent, slot, path->nodes[level],
-                               path->slots[level], last_snapshot);
+               ret = btrfs_qgroup_add_swapped_blocks(trans, dest,
+                               rc->block_group, parent, slot,
+                               path->nodes[level], path->slots[level],
+                               last_snapshot);
                if (ret < 0)
                        break;
-
                /*
                 * swap blocks in fs tree and reloc tree.
                 */
@@ -2120,6 +2129,58 @@ static int find_next_key(struct btrfs_path *path, int level,
        return 1;
 }
 
+/*
+ * Insert current subvolume into reloc_control::dirty_subvol_roots
+ */
+static void insert_dirty_subvol(struct btrfs_trans_handle *trans,
+                               struct reloc_control *rc,
+                               struct btrfs_root *root)
+{
+       struct btrfs_root *reloc_root = root->reloc_root;
+       struct btrfs_root_item *reloc_root_item;
+
+       /* @root must be a subvolume tree root with a valid reloc tree */
+       ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
+       ASSERT(reloc_root);
+
+       reloc_root_item = &reloc_root->root_item;
+       memset(&reloc_root_item->drop_progress, 0,
+               sizeof(reloc_root_item->drop_progress));
+       reloc_root_item->drop_level = 0;
+       btrfs_set_root_refs(reloc_root_item, 0);
+       btrfs_update_reloc_root(trans, root);
+
+       if (list_empty(&root->reloc_dirty_list)) {
+               btrfs_grab_fs_root(root);
+               list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots);
+       }
+}
+
+static int clean_dirty_subvols(struct reloc_control *rc)
+{
+       struct btrfs_root *root;
+       struct btrfs_root *next;
+       int ret = 0;
+
+       list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots,
+                                reloc_dirty_list) {
+               struct btrfs_root *reloc_root = root->reloc_root;
+
+               clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
+               list_del_init(&root->reloc_dirty_list);
+               root->reloc_root = NULL;
+               if (reloc_root) {
+                       int ret2;
+
+                       ret2 = btrfs_drop_snapshot(reloc_root, NULL, 0, 1);
+                       if (ret2 < 0 && !ret)
+                               ret = ret2;
+               }
+               btrfs_put_fs_root(root);
+       }
+       return ret;
+}
+
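
Editorial note: together with the BTRFS_ROOT_DEAD_RELOC_TREE bit set in btrfs_update_reloc_root() above, these two helpers defer dropping the reloc trees. The subvolume is queued on rc->dirty_subvol_roots when its reloc tree is merged, and the reloc roots are only dropped after the final transaction commit, so the delayed qgroup subtree trace introduced earlier in this series still has a live reloc tree to read. A condensed view of the flow (names taken from the surrounding hunks, not a verbatim excerpt):

	/* merge_reloc_root(): queue the subvolume instead of dropping now */
	if (err == 0)
		insert_dirty_subvol(trans, rc, root);

	/* relocate_block_group() / btrfs_recover_relocation(): only after
	 * the final commit is it safe to drop the deferred reloc trees */
	btrfs_commit_transaction(trans);
	ret = clean_dirty_subvols(rc);
	if (ret < 0 && !err)
		err = ret;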
 /*
  * merge the relocated tree blocks in reloc tree with corresponding
  * fs tree.
@@ -2128,7 +2189,6 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
                                               struct btrfs_root *root)
 {
        struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
-       LIST_HEAD(inode_list);
        struct btrfs_key key;
        struct btrfs_key next_key;
        struct btrfs_trans_handle *trans = NULL;
@@ -2259,13 +2319,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
 out:
        btrfs_free_path(path);
 
-       if (err == 0) {
-               memset(&root_item->drop_progress, 0,
-                      sizeof(root_item->drop_progress));
-               root_item->drop_level = 0;
-               btrfs_set_root_refs(root_item, 0);
-               btrfs_update_reloc_root(trans, root);
-       }
+       if (err == 0)
+               insert_dirty_subvol(trans, rc, root);
 
        if (trans)
                btrfs_end_transaction_throttle(trans);
@@ -2410,14 +2465,6 @@ again:
                } else {
                        list_del_init(&reloc_root->root_list);
                }
-
-               ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1);
-               if (ret < 0) {
-                       if (list_empty(&reloc_root->root_list))
-                               list_add_tail(&reloc_root->root_list,
-                                             &reloc_roots);
-                       goto out;
-               }
        }
 
        if (found) {
@@ -2685,6 +2732,10 @@ static int do_relocation(struct btrfs_trans_handle *trans,
                        if (!lowest) {
                                ret = btrfs_bin_search(upper->eb, key,
                                                       upper->level, &slot);
+                               if (ret < 0) {
+                                       err = ret;
+                                       goto next;
+                               }
                                BUG_ON(ret);
                                bytenr = btrfs_node_blockptr(upper->eb, slot);
                                if (node->eb->start == bytenr)
@@ -2720,6 +2771,10 @@ static int do_relocation(struct btrfs_trans_handle *trans,
                } else {
                        ret = btrfs_bin_search(upper->eb, key, upper->level,
                                               &slot);
+                       if (ret < 0) {
+                               err = ret;
+                               goto next;
+                       }
                        BUG_ON(ret);
                }
 
@@ -2752,7 +2807,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
                        goto next;
                }
                btrfs_tree_lock(eb);
-               btrfs_set_lock_blocking(eb);
+               btrfs_set_lock_blocking_write(eb);
 
                if (!node->eb) {
                        ret = btrfs_cow_block(trans, root, eb, upper->eb,
@@ -4079,6 +4134,9 @@ restart:
                goto out_free;
        }
        btrfs_commit_transaction(trans);
+       ret = clean_dirty_subvols(rc);
+       if (ret < 0 && !err)
+               err = ret;
 out_free:
        btrfs_free_block_rsv(fs_info, rc->block_rsv);
        btrfs_free_path(path);
@@ -4173,6 +4231,7 @@ static struct reloc_control *alloc_reloc_control(void)
                return NULL;
 
        INIT_LIST_HEAD(&rc->reloc_roots);
+       INIT_LIST_HEAD(&rc->dirty_subvol_roots);
        backref_cache_init(&rc->backref_cache);
        mapping_tree_init(&rc->reloc_root_tree);
        extent_io_tree_init(&rc->processed_blocks, NULL);
@@ -4468,6 +4527,10 @@ int btrfs_recover_relocation(struct btrfs_root *root)
                goto out_free;
        }
        err = btrfs_commit_transaction(trans);
+
+       ret = clean_dirty_subvols(rc);
+       if (ret < 0 && !err)
+               err = ret;
 out_free:
        kfree(rc);
 out:
index 65bda0682928babb4671a154fbad886badbc3e7f..0d2b957ca3a31f4aa44ccfd7c0fd43f006c5922a 100644 (file)
@@ -21,12 +21,12 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot,
                                struct btrfs_root_item *item)
 {
        uuid_le uuid;
-       int len;
+       u32 len;
        int need_reset = 0;
 
        len = btrfs_item_size_nr(eb, slot);
        read_extent_buffer(eb, item, btrfs_item_ptr_offset(eb, slot),
-                       min_t(int, len, (int)sizeof(*item)));
+                          min_t(u32, len, sizeof(*item)));
        if (len < sizeof(*item))
                need_reset = 1;
        if (!need_reset && btrfs_root_generation(item)
index 6dcd36d7b84906bc4585c861ea7b55659d365fe3..a99588536c79e5fcbdbef4caf6a86852ea83205c 100644 (file)
@@ -584,6 +584,7 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
        sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
        sctx->curr = -1;
        sctx->fs_info = fs_info;
+       INIT_LIST_HEAD(&sctx->csum_list);
        for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
                struct scrub_bio *sbio;
 
@@ -608,7 +609,6 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
        atomic_set(&sctx->workers_pending, 0);
        atomic_set(&sctx->cancel_req, 0);
        sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
-       INIT_LIST_HEAD(&sctx->csum_list);
 
        spin_lock_init(&sctx->list_lock);
        spin_lock_init(&sctx->stat_lock);
@@ -3741,25 +3741,33 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
        unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
        int max_active = fs_info->thread_pool_size;
 
-       if (fs_info->scrub_workers_refcnt == 0) {
+       lockdep_assert_held(&fs_info->scrub_lock);
+
+       if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
+               ASSERT(fs_info->scrub_workers == NULL);
                fs_info->scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub",
                                flags, is_dev_replace ? 1 : max_active, 4);
                if (!fs_info->scrub_workers)
                        goto fail_scrub_workers;
 
+               ASSERT(fs_info->scrub_wr_completion_workers == NULL);
                fs_info->scrub_wr_completion_workers =
                        btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
                                              max_active, 2);
                if (!fs_info->scrub_wr_completion_workers)
                        goto fail_scrub_wr_completion_workers;
 
+               ASSERT(fs_info->scrub_parity_workers == NULL);
                fs_info->scrub_parity_workers =
                        btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
                                              max_active, 2);
                if (!fs_info->scrub_parity_workers)
                        goto fail_scrub_parity_workers;
+
+               refcount_set(&fs_info->scrub_workers_refcnt, 1);
+       } else {
+               refcount_inc(&fs_info->scrub_workers_refcnt);
        }
-       ++fs_info->scrub_workers_refcnt;
        return 0;
 
 fail_scrub_parity_workers:
@@ -3770,16 +3778,6 @@ fail_scrub_workers:
        return -ENOMEM;
 }
 
-static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
-{
-       if (--fs_info->scrub_workers_refcnt == 0) {
-               btrfs_destroy_workqueue(fs_info->scrub_workers);
-               btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
-               btrfs_destroy_workqueue(fs_info->scrub_parity_workers);
-       }
-       WARN_ON(fs_info->scrub_workers_refcnt < 0);
-}
-
 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
                    u64 end, struct btrfs_scrub_progress *progress,
                    int readonly, int is_dev_replace)
@@ -3788,6 +3786,9 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
        int ret;
        struct btrfs_device *dev;
        unsigned int nofs_flag;
+       struct btrfs_workqueue *scrub_workers = NULL;
+       struct btrfs_workqueue *scrub_wr_comp = NULL;
+       struct btrfs_workqueue *scrub_parity = NULL;
 
        if (btrfs_fs_closing(fs_info))
                return -EINVAL;
@@ -3835,7 +3836,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
                return PTR_ERR(sctx);
 
        mutex_lock(&fs_info->fs_devices->device_list_mutex);
-       dev = btrfs_find_device(fs_info, devid, NULL, NULL);
+       dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
        if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
                     !is_dev_replace)) {
                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
@@ -3903,6 +3904,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
         */
        nofs_flag = memalloc_nofs_save();
        if (!is_dev_replace) {
+               btrfs_info(fs_info, "scrub: started on devid %llu", devid);
                /*
                 * by holding device list mutex, we can
                 * kick off writing super in log tree sync.
@@ -3925,11 +3927,26 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
        if (progress)
                memcpy(progress, &sctx->stat, sizeof(*progress));
 
+       if (!is_dev_replace)
+               btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d",
+                       ret ? "not finished" : "finished", devid, ret);
+
        mutex_lock(&fs_info->scrub_lock);
        dev->scrub_ctx = NULL;
-       scrub_workers_put(fs_info);
+       if (refcount_dec_and_test(&fs_info->scrub_workers_refcnt)) {
+               scrub_workers = fs_info->scrub_workers;
+               scrub_wr_comp = fs_info->scrub_wr_completion_workers;
+               scrub_parity = fs_info->scrub_parity_workers;
+
+               fs_info->scrub_workers = NULL;
+               fs_info->scrub_wr_completion_workers = NULL;
+               fs_info->scrub_parity_workers = NULL;
+       }
        mutex_unlock(&fs_info->scrub_lock);
 
+       btrfs_destroy_workqueue(scrub_workers);
+       btrfs_destroy_workqueue(scrub_wr_comp);
+       btrfs_destroy_workqueue(scrub_parity);
        scrub_put_ctx(sctx);
 
        return ret;
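
Editorial note: scrub_workers_refcnt becomes a refcount_t and the workqueue teardown moves out from under scrub_lock. The last scrubber detaches the three workqueue pointers while holding the mutex and destroys them only after unlocking; the unconditional btrfs_destroy_workqueue() calls above rely on it tolerating a NULL pointer. A sketch of the pattern for a single queue:

	struct btrfs_workqueue *wq = NULL;

	mutex_lock(&fs_info->scrub_lock);
	if (refcount_dec_and_test(&fs_info->scrub_workers_refcnt)) {
		wq = fs_info->scrub_workers;		/* detach under the lock */
		fs_info->scrub_workers = NULL;
	}
	mutex_unlock(&fs_info->scrub_lock);

	btrfs_destroy_workqueue(wq);			/* may be NULL, see above */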
@@ -4012,7 +4029,7 @@ int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
        struct scrub_ctx *sctx = NULL;
 
        mutex_lock(&fs_info->fs_devices->device_list_mutex);
-       dev = btrfs_find_device(fs_info, devid, NULL, NULL);
+       dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
        if (dev)
                sctx = dev->scrub_ctx;
        if (sctx)
index 0a3f122dd61fe1858461c0c760b94b78412f3368..120e4340792aa363a848286f4f27aa89c1d05b1c 100644 (file)
@@ -529,7 +529,9 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
                                if (token != Opt_compress &&
                                    token != Opt_compress_force)
                                        info->compress_level =
-                                         btrfs_compress_str2level(args[0].from);
+                                         btrfs_compress_str2level(
+                                                       BTRFS_COMPRESS_ZLIB,
+                                                       args[0].from + 4);
                                btrfs_set_opt(info->mount_opt, COMPRESS);
                                btrfs_clear_opt(info->mount_opt, NODATACOW);
                                btrfs_clear_opt(info->mount_opt, NODATASUM);
@@ -542,9 +544,13 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
                                btrfs_clear_opt(info->mount_opt, NODATASUM);
                                btrfs_set_fs_incompat(info, COMPRESS_LZO);
                                no_compress = 0;
-                       } else if (strcmp(args[0].from, "zstd") == 0) {
+                       } else if (strncmp(args[0].from, "zstd", 4) == 0) {
                                compress_type = "zstd";
                                info->compress_type = BTRFS_COMPRESS_ZSTD;
+                               info->compress_level =
+                                       btrfs_compress_str2level(
+                                                        BTRFS_COMPRESS_ZSTD,
+                                                        args[0].from + 4);
                                btrfs_set_opt(info->mount_opt, COMPRESS);
                                btrfs_clear_opt(info->mount_opt, NODATACOW);
                                btrfs_clear_opt(info->mount_opt, NODATASUM);
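
Editorial note: with the strncmp() change, the option parser accepts a level suffix for zstd the same way it already did for zlib, e.g. mount -o compress=zstd:3, handing the text after the algorithm name to btrfs_compress_str2level() together with the compression type. A standalone illustration of the "alg:N" convention (userspace C, not the kernel parser):

	#include <stdio.h>
	#include <stdlib.h>

	/* arg points just past the algorithm name, i.e. "" or ":<level>" */
	static unsigned int str2level(const char *arg)
	{
		if (arg[0] == ':' && arg[1])
			return (unsigned int)strtoul(arg + 1, NULL, 10);
		return 0;	/* 0 = let the algorithm pick its default level */
	}

	int main(void)
	{
		const char *opt = "zstd:3";

		printf("compression level for '%s' = %u\n", opt, str2level(opt + 4));
		return 0;
	}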
@@ -2190,6 +2196,9 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
                ret = PTR_ERR_OR_ZERO(device);
                mutex_unlock(&uuid_mutex);
                break;
+       case BTRFS_IOC_FORGET_DEV:
+               ret = btrfs_forget_devices(vol->name);
+               break;
        case BTRFS_IOC_DEVICES_READY:
                mutex_lock(&uuid_mutex);
                device = btrfs_scan_one_device(vol->name, FMODE_READ,
index 4ec2b660d014fcef0b71650ff23bf140722368d1..acdad6d658f54bda7cf9c379867d212a41d1c24b 100644 (file)
@@ -122,6 +122,7 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans)
                if (is_fstree(root->root_key.objectid))
                        btrfs_unpin_free_ino(root);
                clear_btree_io_tree(&root->dirty_log_pages);
+               btrfs_qgroup_clean_swapped_blocks(root);
        }
 
        /* We can free old roots now. */
@@ -845,8 +846,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
        btrfs_trans_release_metadata(trans);
        trans->block_rsv = NULL;
 
-       if (!list_empty(&trans->new_bgs))
-               btrfs_create_pending_block_groups(trans);
+       btrfs_create_pending_block_groups(trans);
 
        btrfs_trans_release_chunk_metadata(trans);
 
@@ -1532,7 +1532,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
                goto fail;
        }
 
-       btrfs_set_lock_blocking(old);
+       btrfs_set_lock_blocking_write(old);
 
        ret = btrfs_copy_root(trans, root, old, &tmp, objectid);
        /* clean up in any case */
@@ -1943,8 +1943,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
        cur_trans->delayed_refs.flushing = 1;
        smp_wmb();
 
-       if (!list_empty(&trans->new_bgs))
-               btrfs_create_pending_block_groups(trans);
+       btrfs_create_pending_block_groups(trans);
 
        ret = btrfs_run_delayed_refs(trans, 0);
        if (ret) {
index 3c0987ab587d5ec33f799b156fe30692f53c42a8..5f9e2dd413af9fbf77bf06f2d579e3cd930c29a7 100644 (file)
@@ -52,7 +52,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
                u32 nritems;
 
                root_node = btrfs_lock_root_node(root);
-               btrfs_set_lock_blocking(root_node);
+               btrfs_set_lock_blocking_write(root_node);
                nritems = btrfs_header_nritems(root_node);
                root->defrag_max.objectid = 0;
                /* from above we know this is not a leaf */
index ac232b3d6d7e2f3ba34d991ef8b2a17f96588f13..f06454a55e00cb4df0f71f03eb0013adbae1e4f4 100644 (file)
@@ -27,6 +27,7 @@
 #define LOG_INODE_ALL 0
 #define LOG_INODE_EXISTS 1
 #define LOG_OTHER_INODE 2
+#define LOG_OTHER_INODE_ALL 3
 
 /*
  * directory trouble cases
@@ -1330,6 +1331,67 @@ out:
        return ret;
 }
 
+static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+                   struct inode *dir, struct inode *inode, const char *name,
+                   int namelen, u64 ref_index)
+{
+       struct btrfs_dir_item *dir_item;
+       struct btrfs_key key;
+       struct btrfs_path *path;
+       struct inode *other_inode = NULL;
+       int ret;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       dir_item = btrfs_lookup_dir_item(NULL, root, path,
+                                        btrfs_ino(BTRFS_I(dir)),
+                                        name, namelen, 0);
+       if (!dir_item) {
+               btrfs_release_path(path);
+               goto add_link;
+       } else if (IS_ERR(dir_item)) {
+               ret = PTR_ERR(dir_item);
+               goto out;
+       }
+
+       /*
+        * Our inode's dentry collides with the dentry of another inode which is
+        * in the log but not yet processed since it has a higher inode number.
+        * So delete that other dentry.
+        */
+       btrfs_dir_item_key_to_cpu(path->nodes[0], dir_item, &key);
+       btrfs_release_path(path);
+       other_inode = read_one_inode(root, key.objectid);
+       if (!other_inode) {
+               ret = -ENOENT;
+               goto out;
+       }
+       ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), BTRFS_I(other_inode),
+                                name, namelen);
+       if (ret)
+               goto out;
+       /*
+        * If we dropped the link count to 0, bump it so that later the iput()
+        * on the inode will not free it. We will fixup the link count later.
+        */
+       if (other_inode->i_nlink == 0)
+               inc_nlink(other_inode);
+
+       ret = btrfs_run_delayed_items(trans);
+       if (ret)
+               goto out;
+add_link:
+       ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
+                            name, namelen, 0, ref_index);
+out:
+       iput(other_inode);
+       btrfs_free_path(path);
+
+       return ret;
+}
+
 /*
  * replay one inode back reference item found in the log tree.
  * eb, slot and key refer to the buffer and key found in the log tree.
@@ -1466,9 +1528,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
                                goto out;
 
                        /* insert our name */
-                       ret = btrfs_add_link(trans, BTRFS_I(dir),
-                                       BTRFS_I(inode),
-                                       name, namelen, 0, ref_index);
+                       ret = add_link(trans, root, dir, inode, name, namelen,
+                                      ref_index);
                        if (ret)
                                goto out;
 
@@ -2663,7 +2724,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 
                                if (trans) {
                                        btrfs_tree_lock(next);
-                                       btrfs_set_lock_blocking(next);
+                                       btrfs_set_lock_blocking_write(next);
                                        clean_tree_block(fs_info, next);
                                        btrfs_wait_tree_block_writeback(next);
                                        btrfs_tree_unlock(next);
@@ -2747,7 +2808,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
 
                                if (trans) {
                                        btrfs_tree_lock(next);
-                                       btrfs_set_lock_blocking(next);
+                                       btrfs_set_lock_blocking_write(next);
                                        clean_tree_block(fs_info, next);
                                        btrfs_wait_tree_block_writeback(next);
                                        btrfs_tree_unlock(next);
@@ -2829,7 +2890,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
 
                        if (trans) {
                                btrfs_tree_lock(next);
-                               btrfs_set_lock_blocking(next);
+                               btrfs_set_lock_blocking_write(next);
                                clean_tree_block(fs_info, next);
                                btrfs_wait_tree_block_writeback(next);
                                btrfs_tree_unlock(next);
@@ -3706,6 +3767,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
                found_key.type = 0;
                ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
                                       &start_slot);
+               if (ret < 0)
+                       break;
 
                ret = btrfs_del_items(trans, log, path, start_slot,
                                      path->slots[0] - start_slot + 1);
@@ -4717,7 +4780,7 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
                                         const int slot,
                                         const struct btrfs_key *key,
                                         struct btrfs_inode *inode,
-                                        u64 *other_ino)
+                                        u64 *other_ino, u64 *other_parent)
 {
        int ret;
        struct btrfs_path *search_path;
@@ -4780,8 +4843,13 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
                        btrfs_dir_item_key_to_cpu(search_path->nodes[0],
                                                  di, &di_key);
                        if (di_key.type == BTRFS_INODE_ITEM_KEY) {
-                               ret = 1;
-                               *other_ino = di_key.objectid;
+                               if (di_key.objectid != key->objectid) {
+                                       ret = 1;
+                                       *other_ino = di_key.objectid;
+                                       *other_parent = parent;
+                               } else {
+                                       ret = 0;
+                               }
                        } else {
                                ret = -EAGAIN;
                        }
@@ -4801,6 +4869,144 @@ out:
        return ret;
 }
 
+struct btrfs_ino_list {
+       u64 ino;
+       u64 parent;
+       struct list_head list;
+};
+
+static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
+                                 struct btrfs_root *root,
+                                 struct btrfs_path *path,
+                                 struct btrfs_log_ctx *ctx,
+                                 u64 ino, u64 parent)
+{
+       struct btrfs_ino_list *ino_elem;
+       LIST_HEAD(inode_list);
+       int ret = 0;
+
+       ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
+       if (!ino_elem)
+               return -ENOMEM;
+       ino_elem->ino = ino;
+       ino_elem->parent = parent;
+       list_add_tail(&ino_elem->list, &inode_list);
+
+       while (!list_empty(&inode_list)) {
+               struct btrfs_fs_info *fs_info = root->fs_info;
+               struct btrfs_key key;
+               struct inode *inode;
+
+               ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list,
+                                           list);
+               ino = ino_elem->ino;
+               parent = ino_elem->parent;
+               list_del(&ino_elem->list);
+               kfree(ino_elem);
+               if (ret)
+                       continue;
+
+               btrfs_release_path(path);
+
+               key.objectid = ino;
+               key.type = BTRFS_INODE_ITEM_KEY;
+               key.offset = 0;
+               inode = btrfs_iget(fs_info->sb, &key, root, NULL);
+               /*
+                * If the other inode that had a conflicting dir entry was
+                * deleted in the current transaction, we need to log its parent
+                * directory.
+                */
+               if (IS_ERR(inode)) {
+                       ret = PTR_ERR(inode);
+                       if (ret == -ENOENT) {
+                               key.objectid = parent;
+                               inode = btrfs_iget(fs_info->sb, &key, root,
+                                                  NULL);
+                               if (IS_ERR(inode)) {
+                                       ret = PTR_ERR(inode);
+                               } else {
+                                       ret = btrfs_log_inode(trans, root,
+                                                     BTRFS_I(inode),
+                                                     LOG_OTHER_INODE_ALL,
+                                                     0, LLONG_MAX, ctx);
+                                       iput(inode);
+                               }
+                       }
+                       continue;
+               }
+               /*
+                * We are safe logging the other inode without acquiring its
+                * lock as long as we log with the LOG_INODE_EXISTS mode. We
+                * are safe against concurrent renames of the other inode as
+                * well because during a rename we pin the log and update the
+                * log with the new name before we unpin it.
+                */
+               ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
+                                     LOG_OTHER_INODE, 0, LLONG_MAX, ctx);
+               if (ret) {
+                       iput(inode);
+                       continue;
+               }
+
+               key.objectid = ino;
+               key.type = BTRFS_INODE_REF_KEY;
+               key.offset = 0;
+               ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+               if (ret < 0) {
+                       iput(inode);
+                       continue;
+               }
+
+               while (true) {
+                       struct extent_buffer *leaf = path->nodes[0];
+                       int slot = path->slots[0];
+                       u64 other_ino = 0;
+                       u64 other_parent = 0;
+
+                       if (slot >= btrfs_header_nritems(leaf)) {
+                               ret = btrfs_next_leaf(root, path);
+                               if (ret < 0) {
+                                       break;
+                               } else if (ret > 0) {
+                                       ret = 0;
+                                       break;
+                               }
+                               continue;
+                       }
+
+                       btrfs_item_key_to_cpu(leaf, &key, slot);
+                       if (key.objectid != ino ||
+                           (key.type != BTRFS_INODE_REF_KEY &&
+                            key.type != BTRFS_INODE_EXTREF_KEY)) {
+                               ret = 0;
+                               break;
+                       }
+
+                       ret = btrfs_check_ref_name_override(leaf, slot, &key,
+                                       BTRFS_I(inode), &other_ino,
+                                       &other_parent);
+                       if (ret < 0)
+                               break;
+                       if (ret > 0) {
+                               ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
+                               if (!ino_elem) {
+                                       ret = -ENOMEM;
+                                       break;
+                               }
+                               ino_elem->ino = other_ino;
+                               ino_elem->parent = other_parent;
+                               list_add_tail(&ino_elem->list, &inode_list);
+                               ret = 0;
+                       }
+                       path->slots[0]++;
+               }
+               iput(inode);
+       }
+
+       return ret;
+}
+
 /* log a single inode in the tree log.
  * At least one parent directory for this inode must exist in the tree
  * or be logged already.
@@ -4840,6 +5046,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
        u64 logged_isize = 0;
        bool need_log_inode_item = true;
        bool xattrs_logged = false;
+       bool recursive_logging = false;
 
        path = btrfs_alloc_path();
        if (!path)
@@ -4885,8 +5092,12 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
                return ret;
        }
 
-       if (inode_only == LOG_OTHER_INODE) {
-               inode_only = LOG_INODE_EXISTS;
+       if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) {
+               recursive_logging = true;
+               if (inode_only == LOG_OTHER_INODE)
+                       inode_only = LOG_INODE_EXISTS;
+               else
+                       inode_only = LOG_INODE_ALL;
                mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING);
        } else {
                mutex_lock(&inode->log_mutex);
@@ -4981,20 +5192,19 @@ again:
 
                if ((min_key.type == BTRFS_INODE_REF_KEY ||
                     min_key.type == BTRFS_INODE_EXTREF_KEY) &&
-                   inode->generation == trans->transid) {
+                   inode->generation == trans->transid &&
+                   !recursive_logging) {
                        u64 other_ino = 0;
+                       u64 other_parent = 0;
 
                        ret = btrfs_check_ref_name_override(path->nodes[0],
                                        path->slots[0], &min_key, inode,
-                                       &other_ino);
+                                       &other_ino, &other_parent);
                        if (ret < 0) {
                                err = ret;
                                goto out_unlock;
                        } else if (ret > 0 && ctx &&
                                   other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
-                               struct btrfs_key inode_key;
-                               struct inode *other_inode;
-
                                if (ins_nr > 0) {
                                        ins_nr++;
                                } else {
@@ -5010,43 +5220,13 @@ again:
                                        goto out_unlock;
                                }
                                ins_nr = 0;
-                               btrfs_release_path(path);
-                               inode_key.objectid = other_ino;
-                               inode_key.type = BTRFS_INODE_ITEM_KEY;
-                               inode_key.offset = 0;
-                               other_inode = btrfs_iget(fs_info->sb,
-                                                        &inode_key, root,
-                                                        NULL);
-                               /*
-                                * If the other inode that had a conflicting dir
-                                * entry was deleted in the current transaction,
-                                * we don't need to do more work nor fallback to
-                                * a transaction commit.
-                                */
-                               if (other_inode == ERR_PTR(-ENOENT)) {
-                                       goto next_key;
-                               } else if (IS_ERR(other_inode)) {
-                                       err = PTR_ERR(other_inode);
-                                       goto out_unlock;
-                               }
-                               /*
-                                * We are safe logging the other inode without
-                                * acquiring its i_mutex as long as we log with
-                                * the LOG_INODE_EXISTS mode. We're safe against
-                                * concurrent renames of the other inode as well
-                                * because during a rename we pin the log and
-                                * update the log with the new name before we
-                                * unpin it.
-                                */
-                               err = btrfs_log_inode(trans, root,
-                                               BTRFS_I(other_inode),
-                                               LOG_OTHER_INODE, 0, LLONG_MAX,
-                                               ctx);
-                               iput(other_inode);
+
+                               err = log_conflicting_inodes(trans, root, path,
+                                               ctx, other_ino, other_parent);
                                if (err)
                                        goto out_unlock;
-                               else
-                                       goto next_key;
+                               btrfs_release_path(path);
+                               goto next_key;
                        }
                }
 
index 15561926ab32cbc8c0ba8a4006fcb8a45e87e6d5..9024eee889b9838caa2799ca51f439106a955ff0 100644 (file)
@@ -415,27 +415,6 @@ static struct btrfs_device *__alloc_device(void)
        return dev;
 }
 
-/*
- * Find a device specified by @devid or @uuid in the list of @fs_devices, or
- * return NULL.
- *
- * If devid and uuid are both specified, the match must be exact, otherwise
- * only devid is used.
- */
-static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices,
-               u64 devid, const u8 *uuid)
-{
-       struct btrfs_device *dev;
-
-       list_for_each_entry(dev, &fs_devices->devices, dev_list) {
-               if (dev->devid == devid &&
-                   (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
-                       return dev;
-               }
-       }
-       return NULL;
-}
-
 static noinline struct btrfs_fs_devices *find_fsid(
                const u8 *fsid, const u8 *metadata_fsid)
 {
@@ -734,6 +713,17 @@ static void pending_bios_fn(struct btrfs_work *work)
        run_scheduled_bios(device);
 }
 
+static bool device_path_matched(const char *path, struct btrfs_device *device)
+{
+       int found;
+
+       rcu_read_lock();
+       found = strcmp(rcu_str_deref(device->name), path);
+       rcu_read_unlock();
+
+       return found == 0;
+}
+
 /*
  *  Search and remove all stale (devices which are not mounted) devices.
  *  When both inputs are NULL, it will search and release all stale devices.
@@ -741,52 +731,57 @@ static void pending_bios_fn(struct btrfs_work *work)
  *             matching this path only.
  *  skip_dev:  Optional. Will skip this device when searching for the stale
  *             devices.
+ *  Return:    0 for success or if @path is NULL.
+ *             -EBUSY if @path is a mounted device.
+ *             -ENOENT if @path does not match any device in the list.
  */
-static void btrfs_free_stale_devices(const char *path,
+static int btrfs_free_stale_devices(const char *path,
                                     struct btrfs_device *skip_device)
 {
        struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
        struct btrfs_device *device, *tmp_device;
+       int ret = 0;
+
+       if (path)
+               ret = -ENOENT;
 
        list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
-               mutex_lock(&fs_devices->device_list_mutex);
-               if (fs_devices->opened) {
-                       mutex_unlock(&fs_devices->device_list_mutex);
-                       continue;
-               }
 
+               mutex_lock(&fs_devices->device_list_mutex);
                list_for_each_entry_safe(device, tmp_device,
                                         &fs_devices->devices, dev_list) {
-                       int not_found = 0;
-
                        if (skip_device && skip_device == device)
                                continue;
                        if (path && !device->name)
                                continue;
-
-                       rcu_read_lock();
-                       if (path)
-                               not_found = strcmp(rcu_str_deref(device->name),
-                                                  path);
-                       rcu_read_unlock();
-                       if (not_found)
+                       if (path && !device_path_matched(path, device))
                                continue;
+                       if (fs_devices->opened) {
+                               /* for an already deleted device return 0 */
+                               if (path && ret != 0)
+                                       ret = -EBUSY;
+                               break;
+                       }
 
                        /* delete the stale device */
                        fs_devices->num_devices--;
                        list_del(&device->dev_list);
                        btrfs_free_device(device);
 
+                       ret = 0;
                        if (fs_devices->num_devices == 0)
                                break;
                }
                mutex_unlock(&fs_devices->device_list_mutex);
+
                if (fs_devices->num_devices == 0) {
                        btrfs_sysfs_remove_fsid(fs_devices);
                        list_del(&fs_devices->fs_list);
                        free_fs_devices(fs_devices);
                }
        }
+
+       return ret;
 }
 
 static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
@@ -968,8 +963,8 @@ static noinline struct btrfs_device *device_list_add(const char *path,
                device = NULL;
        } else {
                mutex_lock(&fs_devices->device_list_mutex);
-               device = find_device(fs_devices, devid,
-                               disk_super->dev_item.uuid);
+               device = btrfs_find_device(fs_devices, devid,
+                               disk_super->dev_item.uuid, NULL, false);
 
                /*
                 * If this disk has been pulled into an fs devices created by
@@ -1134,7 +1129,6 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
        mutex_lock(&orig->device_list_mutex);
        fs_devices->total_devices = orig->total_devices;
 
-       /* We have held the volume lock, it is safe to get the devices. */
        list_for_each_entry(orig_dev, &orig->devices, dev_list) {
                struct rcu_string *name;
 
@@ -1451,6 +1445,17 @@ static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
        return 0;
 }
 
+int btrfs_forget_devices(const char *path)
+{
+       int ret;
+
+       mutex_lock(&uuid_mutex);
+       ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL);
+       mutex_unlock(&uuid_mutex);
+
+       return ret;
+}
+
 /*
  * Look for a btrfs signature on a device. This may be called out of the mount path
  * and we are not allowed to call set_blocksize during the scan. The superblock
@@ -2385,11 +2390,11 @@ static struct btrfs_device *btrfs_find_device_by_path(
        devid = btrfs_stack_device_id(&disk_super->dev_item);
        dev_uuid = disk_super->dev_item.uuid;
        if (btrfs_fs_incompat(fs_info, METADATA_UUID))
-               device = btrfs_find_device(fs_info, devid, dev_uuid,
-                               disk_super->metadata_uuid);
+               device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
+                                          disk_super->metadata_uuid, true);
        else
-               device = btrfs_find_device(fs_info, devid,
-                               dev_uuid, disk_super->fsid);
+               device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
+                                          disk_super->fsid, true);
 
        brelse(bh);
        if (!device)
@@ -2398,50 +2403,38 @@ static struct btrfs_device *btrfs_find_device_by_path(
        return device;
 }
 
-static struct btrfs_device *btrfs_find_device_missing_or_by_path(
-               struct btrfs_fs_info *fs_info, const char *device_path)
-{
-       struct btrfs_device *device = NULL;
-       if (strcmp(device_path, "missing") == 0) {
-               struct list_head *devices;
-               struct btrfs_device *tmp;
-
-               devices = &fs_info->fs_devices->devices;
-               list_for_each_entry(tmp, devices, dev_list) {
-                       if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
-                                       &tmp->dev_state) && !tmp->bdev) {
-                               device = tmp;
-                               break;
-                       }
-               }
-
-               if (!device)
-                       return ERR_PTR(-ENOENT);
-       } else {
-               device = btrfs_find_device_by_path(fs_info, device_path);
-       }
-
-       return device;
-}
-
 /*
  * Lookup a device given by device id, or the path if the id is 0.
  */
 struct btrfs_device *btrfs_find_device_by_devspec(
-               struct btrfs_fs_info *fs_info, u64 devid, const char *devpath)
+               struct btrfs_fs_info *fs_info, u64 devid,
+               const char *device_path)
 {
        struct btrfs_device *device;
 
        if (devid) {
-               device = btrfs_find_device(fs_info, devid, NULL, NULL);
+               device = btrfs_find_device(fs_info->fs_devices, devid, NULL,
+                                          NULL, true);
                if (!device)
                        return ERR_PTR(-ENOENT);
-       } else {
-               if (!devpath || !devpath[0])
-                       return ERR_PTR(-EINVAL);
-               device = btrfs_find_device_missing_or_by_path(fs_info, devpath);
+               return device;
        }
-       return device;
+
+       if (!device_path || !device_path[0])
+               return ERR_PTR(-EINVAL);
+
+       if (strcmp(device_path, "missing") == 0) {
+               /* Find first missing device */
+               list_for_each_entry(device, &fs_info->fs_devices->devices,
+                                   dev_list) {
+                       if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
+                                    &device->dev_state) && !device->bdev)
+                               return device;
+               }
+               return ERR_PTR(-ENOENT);
+       }
+
+       return btrfs_find_device_by_path(fs_info, device_path);
 }
 
 /*
@@ -2563,7 +2556,8 @@ next_slot:
                                   BTRFS_UUID_SIZE);
                read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
                                   BTRFS_FSID_SIZE);
-               device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
+               device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
+                                          fs_uuid, true);
                BUG_ON(!device); /* Logic error */
 
                if (device->fs_devices->seeding) {
@@ -6616,21 +6610,36 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
        return BLK_STS_OK;
 }
 
-struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
-                                      u8 *uuid, u8 *fsid)
+/*
+ * Find a device specified by @devid or @uuid in the list of @fs_devices, or
+ * return NULL.
+ *
+ * If devid and uuid are both specified, the match must be exact, otherwise
+ * only devid is used.
+ *
+ * If @seed is true, traverse through the seed devices.
+ */
+struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
+                                      u64 devid, u8 *uuid, u8 *fsid,
+                                      bool seed)
 {
        struct btrfs_device *device;
-       struct btrfs_fs_devices *cur_devices;
 
-       cur_devices = fs_info->fs_devices;
-       while (cur_devices) {
+       while (fs_devices) {
                if (!fsid ||
-                   !memcmp(cur_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
-                       device = find_device(cur_devices, devid, uuid);
-                       if (device)
-                               return device;
+                   !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
+                       list_for_each_entry(device, &fs_devices->devices,
+                                           dev_list) {
+                               if (device->devid == devid &&
+                                   (!uuid || memcmp(device->uuid, uuid,
+                                                    BTRFS_UUID_SIZE) == 0))
+                                       return device;
+                       }
                }
-               cur_devices = cur_devices->seed;
+               if (seed)
+                       fs_devices = fs_devices->seed;
+               else
+                       return NULL;
        }
        return NULL;
 }
@@ -6782,10 +6791,10 @@ static int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info,
        }
 
        if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) ||
-           (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes < 1) ||
+           (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes != 2) ||
            (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) ||
            (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) ||
-           (type & BTRFS_BLOCK_GROUP_DUP && num_stripes > 2) ||
+           (type & BTRFS_BLOCK_GROUP_DUP && num_stripes != 2) ||
            ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 &&
             num_stripes != 1)) {
                btrfs_err(fs_info,
@@ -6875,8 +6884,8 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
                read_extent_buffer(leaf, uuid, (unsigned long)
                                   btrfs_stripe_dev_uuid_nr(chunk, i),
                                   BTRFS_UUID_SIZE);
-               map->stripes[i].dev = btrfs_find_device(fs_info, devid,
-                                                       uuid, NULL);
+               map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices,
+                                                       devid, uuid, NULL, true);
                if (!map->stripes[i].dev &&
                    !btrfs_test_opt(fs_info, DEGRADED)) {
                        free_extent_map(em);
@@ -7015,7 +7024,8 @@ static int read_one_dev(struct btrfs_fs_info *fs_info,
                        return PTR_ERR(fs_devices);
        }
 
-       device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
+       device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
+                                  fs_uuid, true);
        if (!device) {
                if (!btrfs_test_opt(fs_info, DEGRADED)) {
                        btrfs_report_missing_device(fs_info, devid,
@@ -7605,7 +7615,8 @@ int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
        int i;
 
        mutex_lock(&fs_devices->device_list_mutex);
-       dev = btrfs_find_device(fs_info, stats->devid, NULL, NULL);
+       dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL,
+                               true);
        mutex_unlock(&fs_devices->device_list_mutex);
 
        if (!dev) {
@@ -7819,7 +7830,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
        }
 
        /* Make sure no dev extent is beyond device boundary */
-       dev = btrfs_find_device(fs_info, devid, NULL, NULL);
+       dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
        if (!dev) {
                btrfs_err(fs_info, "failed to find devid %llu", devid);
                ret = -EUCLEAN;
@@ -7828,7 +7839,8 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
 
        /* It's possible this device is a dummy for seed device */
        if (dev->disk_total_bytes == 0) {
-               dev = find_device(fs_info->fs_devices->seed, devid, NULL);
+               dev = btrfs_find_device(fs_info->fs_devices->seed, devid, NULL,
+                                       NULL, false);
                if (!dev) {
                        btrfs_err(fs_info, "failed to find seed devid %llu",
                                  devid);
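
btrfs_find_device() now takes the fs_devices list directly plus a seed flag, but the matching rule restated in its new comment is unchanged: the devid must match, and a uuid, when supplied, must match as well. A small userspace sketch of just that rule, with illustrative structures rather than the kernel's:

#include <stddef.h>
#include <stdio.h>
#include <string.h>

#define DEMO_UUID_SIZE 16

/* Illustrative structure only; not the kernel's btrfs_device. */
struct demo_device {
        unsigned long long devid;
        unsigned char uuid[DEMO_UUID_SIZE];
};

static struct demo_device *find_device(struct demo_device *devs, size_t n,
                                       unsigned long long devid,
                                       const unsigned char *uuid)
{
        size_t i;

        for (i = 0; i < n; i++) {
                if (devs[i].devid != devid)
                        continue;
                /* a NULL uuid means "match on devid alone" */
                if (!uuid || memcmp(devs[i].uuid, uuid, DEMO_UUID_SIZE) == 0)
                        return &devs[i];
        }
        return NULL;
}

int main(void)
{
        struct demo_device devs[] = {
                { 1, "aaaaaaaaaaaaaaa" },
                { 2, "bbbbbbbbbbbbbbb" },
        };

        printf("devid 2, no uuid:    %s\n",
               find_device(devs, 2, 2, NULL) ? "found" : "missing");
        printf("devid 2, wrong uuid: %s\n",
               find_device(devs, 2, 2, (const unsigned char *)"ccccccccccccccc")
               ? "found" : "missing");
        return 0;
}
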
index ed806649a473f4557792f6c2df182c303207b4c7..3ad9d58d1b6618aea5d79220e48fdf98ac243360 100644 (file)
@@ -416,6 +416,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
                       fmode_t flags, void *holder);
 struct btrfs_device *btrfs_scan_one_device(const char *path,
                                           fmode_t flags, void *holder);
+int btrfs_forget_devices(const char *path);
 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
 void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step);
 void btrfs_assign_next_active_device(struct btrfs_device *device,
@@ -433,8 +434,8 @@ void __exit btrfs_cleanup_fs_uuids(void);
 int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
 int btrfs_grow_device(struct btrfs_trans_handle *trans,
                      struct btrfs_device *device, u64 new_size);
-struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
-                                      u8 *uuid, u8 *fsid);
+struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
+                                      u64 devid, u8 *uuid, u8 *fsid, bool seed);
 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
 int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *path);
 int btrfs_balance(struct btrfs_fs_info *fs_info,
index 970ff3e35bb345e598854e5767b8c0981bfbb15c..b86b7ad6b900a6d2785eb77283cf148cc3d3258f 100644 (file)
@@ -27,6 +27,33 @@ struct workspace {
        int level;
 };
 
+static struct workspace_manager wsm;
+
+static void zlib_init_workspace_manager(void)
+{
+       btrfs_init_workspace_manager(&wsm, &btrfs_zlib_compress);
+}
+
+static void zlib_cleanup_workspace_manager(void)
+{
+       btrfs_cleanup_workspace_manager(&wsm);
+}
+
+static struct list_head *zlib_get_workspace(unsigned int level)
+{
+       struct list_head *ws = btrfs_get_workspace(&wsm, level);
+       struct workspace *workspace = list_entry(ws, struct workspace, list);
+
+       workspace->level = level;
+
+       return ws;
+}
+
+static void zlib_put_workspace(struct list_head *ws)
+{
+       btrfs_put_workspace(&wsm, ws);
+}
+
 static void zlib_free_workspace(struct list_head *ws)
 {
        struct workspace *workspace = list_entry(ws, struct workspace, list);
@@ -36,7 +63,7 @@ static void zlib_free_workspace(struct list_head *ws)
        kfree(workspace);
 }
 
-static struct list_head *zlib_alloc_workspace(void)
+static struct list_head *zlib_alloc_workspace(unsigned int level)
 {
        struct workspace *workspace;
        int workspacesize;
@@ -48,6 +75,7 @@ static struct list_head *zlib_alloc_workspace(void)
        workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
                        zlib_inflate_workspacesize());
        workspace->strm.workspace = kvmalloc(workspacesize, GFP_KERNEL);
+       workspace->level = level;
        workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
        if (!workspace->strm.workspace || !workspace->buf)
                goto fail;
@@ -390,18 +418,19 @@ next:
        return ret;
 }
 
-static void zlib_set_level(struct list_head *ws, unsigned int type)
+static unsigned int zlib_set_level(unsigned int level)
 {
-       struct workspace *workspace = list_entry(ws, struct workspace, list);
-       unsigned level = (type & 0xF0) >> 4;
-
-       if (level > 9)
-               level = 9;
+       if (!level)
+               return BTRFS_ZLIB_DEFAULT_LEVEL;
 
-       workspace->level = level > 0 ? level : 3;
+       return min_t(unsigned int, level, 9);
 }
 
 const struct btrfs_compress_op btrfs_zlib_compress = {
+       .init_workspace_manager = zlib_init_workspace_manager,
+       .cleanup_workspace_manager = zlib_cleanup_workspace_manager,
+       .get_workspace          = zlib_get_workspace,
+       .put_workspace          = zlib_put_workspace,
        .alloc_workspace        = zlib_alloc_workspace,
        .free_workspace         = zlib_free_workspace,
        .compress_pages         = zlib_compress_pages,
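
The reworked zlib_set_level() maps a requested level of 0 to the default and clamps anything else to zlib's maximum of 9. A tiny sketch of that clamping; the value 3 for the default mirrors the usual BTRFS_ZLIB_DEFAULT_LEVEL and is an assumption made for this demo.

#include <stdio.h>

#define DEMO_ZLIB_DEFAULT_LEVEL 3       /* stand-in for BTRFS_ZLIB_DEFAULT_LEVEL */

/* Same rule as zlib_set_level(): 0 selects the default, >9 is capped. */
static unsigned int set_level(unsigned int level)
{
        if (!level)
                return DEMO_ZLIB_DEFAULT_LEVEL;
        return level > 9 ? 9 : level;
}

int main(void)
{
        printf("%u %u %u\n", set_level(0), set_level(7), set_level(12)); /* 3 7 9 */
        return 0;
}
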
index af6ec59972f5160e24c7499f3ea384b4ee298c9d..3e418a3aeb114297c443af142ee5340e2bf4af61 100644 (file)
@@ -6,25 +6,31 @@
  */
 
 #include <linux/bio.h>
+#include <linux/bitmap.h>
 #include <linux/err.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
+#include <linux/sched/mm.h>
 #include <linux/pagemap.h>
 #include <linux/refcount.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/zstd.h>
 #include "compression.h"
+#include "ctree.h"
 
 #define ZSTD_BTRFS_MAX_WINDOWLOG 17
 #define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG)
 #define ZSTD_BTRFS_DEFAULT_LEVEL 3
+#define ZSTD_BTRFS_MAX_LEVEL 15
+/* 307s to avoid pathologically clashing with transaction commit */
+#define ZSTD_BTRFS_RECLAIM_JIFFIES (307 * HZ)
 
-static ZSTD_parameters zstd_get_btrfs_parameters(size_t src_len)
+static ZSTD_parameters zstd_get_btrfs_parameters(unsigned int level,
+                                                size_t src_len)
 {
-       ZSTD_parameters params = ZSTD_getParams(ZSTD_BTRFS_DEFAULT_LEVEL,
-                                               src_len, 0);
+       ZSTD_parameters params = ZSTD_getParams(level, src_len, 0);
 
        if (params.cParams.windowLog > ZSTD_BTRFS_MAX_WINDOWLOG)
                params.cParams.windowLog = ZSTD_BTRFS_MAX_WINDOWLOG;
@@ -36,11 +42,290 @@ struct workspace {
        void *mem;
        size_t size;
        char *buf;
+       unsigned int level;
+       unsigned int req_level;
+       unsigned long last_used; /* jiffies */
        struct list_head list;
+       struct list_head lru_list;
        ZSTD_inBuffer in_buf;
        ZSTD_outBuffer out_buf;
 };
 
+/*
+ * Zstd Workspace Management
+ *
+ * Zstd workspaces have different memory requirements depending on the level.
+ * The zstd workspaces are managed by having individual lists for each level
+ * and a global lru.  Forward progress is maintained by protecting a max level
+ * workspace.
+ *
+ * Getting a workspace uses the bitmap to identify the levels that have
+ * available workspaces and scans upward.  This lets us recycle higher level
+ * workspaces because of the monotonic memory guarantee.  A workspace's
+ * last_used is only updated if it is being used by the corresponding memory
+ * level.  Putting a workspace involves adding it back to the appropriate places
+ * and adding it back to the lru if necessary.
+ *
+ * A timer is used to reclaim workspaces if they have not been used for
+ * ZSTD_BTRFS_RECLAIM_JIFFIES.  This helps keep only active workspaces around.
+ * The upper bound is provided by the workqueue limit which is 2 (percpu limit).
+ */
+
+struct zstd_workspace_manager {
+       const struct btrfs_compress_op *ops;
+       spinlock_t lock;
+       struct list_head lru_list;
+       struct list_head idle_ws[ZSTD_BTRFS_MAX_LEVEL];
+       unsigned long active_map;
+       wait_queue_head_t wait;
+       struct timer_list timer;
+};
+
+static struct zstd_workspace_manager wsm;
+
+static size_t zstd_ws_mem_sizes[ZSTD_BTRFS_MAX_LEVEL];
+
+static inline struct workspace *list_to_workspace(struct list_head *list)
+{
+       return container_of(list, struct workspace, list);
+}
+
+/*
+ * zstd_reclaim_timer_fn - reclaim timer
+ * @timer: timer struct
+ *
+ * This scans the lru_list and attempts to reclaim any workspace that hasn't
+ * been used for ZSTD_BTRFS_RECLAIM_JIFFIES.
+ */
+static void zstd_reclaim_timer_fn(struct timer_list *timer)
+{
+       unsigned long reclaim_threshold = jiffies - ZSTD_BTRFS_RECLAIM_JIFFIES;
+       struct list_head *pos, *next;
+
+       spin_lock(&wsm.lock);
+
+       if (list_empty(&wsm.lru_list)) {
+               spin_unlock(&wsm.lock);
+               return;
+       }
+
+       list_for_each_prev_safe(pos, next, &wsm.lru_list) {
+               struct workspace *victim = container_of(pos, struct workspace,
+                                                       lru_list);
+               unsigned int level;
+
+               if (time_after(victim->last_used, reclaim_threshold))
+                       break;
+
+               /* workspace is in use */
+               if (victim->req_level)
+                       continue;
+
+               level = victim->level;
+               list_del(&victim->lru_list);
+               list_del(&victim->list);
+               wsm.ops->free_workspace(&victim->list);
+
+               if (list_empty(&wsm.idle_ws[level - 1]))
+                       clear_bit(level - 1, &wsm.active_map);
+
+       }
+
+       if (!list_empty(&wsm.lru_list))
+               mod_timer(&wsm.timer, jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES);
+
+       spin_unlock(&wsm.lock);
+}
+
+/*
+ * zstd_calc_ws_mem_sizes - calculate monotonic memory bounds
+ *
+ * It is possible based on the level configurations that a higher level
+ * workspace uses less memory than a lower level workspace.  In order to reuse
+ * workspaces, this must be made a monotonic relationship.  This precomputes
+ * the required memory for each level and enforces the monotonicity between
+ * level and memory required.
+ */
+static void zstd_calc_ws_mem_sizes(void)
+{
+       size_t max_size = 0;
+       unsigned int level;
+
+       for (level = 1; level <= ZSTD_BTRFS_MAX_LEVEL; level++) {
+               ZSTD_parameters params =
+                       zstd_get_btrfs_parameters(level, ZSTD_BTRFS_MAX_INPUT);
+               size_t level_size =
+                       max_t(size_t,
+                             ZSTD_CStreamWorkspaceBound(params.cParams),
+                             ZSTD_DStreamWorkspaceBound(ZSTD_BTRFS_MAX_INPUT));
+
+               max_size = max_t(size_t, max_size, level_size);
+               zstd_ws_mem_sizes[level - 1] = max_size;
+       }
+}
+
+static void zstd_init_workspace_manager(void)
+{
+       struct list_head *ws;
+       int i;
+
+       zstd_calc_ws_mem_sizes();
+
+       wsm.ops = &btrfs_zstd_compress;
+       spin_lock_init(&wsm.lock);
+       init_waitqueue_head(&wsm.wait);
+       timer_setup(&wsm.timer, zstd_reclaim_timer_fn, 0);
+
+       INIT_LIST_HEAD(&wsm.lru_list);
+       for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++)
+               INIT_LIST_HEAD(&wsm.idle_ws[i]);
+
+       ws = wsm.ops->alloc_workspace(ZSTD_BTRFS_MAX_LEVEL);
+       if (IS_ERR(ws)) {
+               pr_warn(
+               "BTRFS: cannot preallocate zstd compression workspace\n");
+       } else {
+               set_bit(ZSTD_BTRFS_MAX_LEVEL - 1, &wsm.active_map);
+               list_add(ws, &wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1]);
+       }
+}
+
+static void zstd_cleanup_workspace_manager(void)
+{
+       struct workspace *workspace;
+       int i;
+
+       del_timer(&wsm.timer);
+
+       for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) {
+               while (!list_empty(&wsm.idle_ws[i])) {
+                       workspace = container_of(wsm.idle_ws[i].next,
+                                                struct workspace, list);
+                       list_del(&workspace->list);
+                       list_del(&workspace->lru_list);
+                       wsm.ops->free_workspace(&workspace->list);
+               }
+       }
+}
+
+/*
+ * zstd_find_workspace - find workspace
+ * @level: compression level
+ *
+ * This iterates over the set bits in the active_map beginning at the requested
+ * compression level.  This lets us utilize already allocated workspaces before
+ * allocating a new one.  If the workspace is of a larger size, it is used, but
+ * the place in the lru_list and last_used times are not updated.  This is to
+ * offer the opportunity to reclaim the workspace in favor of allocating an
+ * appropriately sized one in the future.
+ */
+static struct list_head *zstd_find_workspace(unsigned int level)
+{
+       struct list_head *ws;
+       struct workspace *workspace;
+       int i = level - 1;
+
+       spin_lock(&wsm.lock);
+       for_each_set_bit_from(i, &wsm.active_map, ZSTD_BTRFS_MAX_LEVEL) {
+               if (!list_empty(&wsm.idle_ws[i])) {
+                       ws = wsm.idle_ws[i].next;
+                       workspace = list_to_workspace(ws);
+                       list_del_init(ws);
+                       /* keep its place if it's a lower level using this */
+                       workspace->req_level = level;
+                       if (level == workspace->level)
+                               list_del(&workspace->lru_list);
+                       if (list_empty(&wsm.idle_ws[i]))
+                               clear_bit(i, &wsm.active_map);
+                       spin_unlock(&wsm.lock);
+                       return ws;
+               }
+       }
+       spin_unlock(&wsm.lock);
+
+       return NULL;
+}
+
+/*
+ * zstd_get_workspace - zstd's get_workspace
+ * @level: compression level
+ *
+ * If @level is 0, then any compression level can be used.  Therefore, we begin
+ * scanning from 1.  We first scan through possible workspaces and only then
+ * attempt to allocate a new workspace.  If we fail to allocate one due to
+ * memory pressure, go to sleep waiting for the max level workspace to free up.
+ */
+static struct list_head *zstd_get_workspace(unsigned int level)
+{
+       struct list_head *ws;
+       unsigned int nofs_flag;
+
+       /* level == 0 means we can use any workspace */
+       if (!level)
+               level = 1;
+
+again:
+       ws = zstd_find_workspace(level);
+       if (ws)
+               return ws;
+
+       nofs_flag = memalloc_nofs_save();
+       ws = wsm.ops->alloc_workspace(level);
+       memalloc_nofs_restore(nofs_flag);
+
+       if (IS_ERR(ws)) {
+               DEFINE_WAIT(wait);
+
+               prepare_to_wait(&wsm.wait, &wait, TASK_UNINTERRUPTIBLE);
+               schedule();
+               finish_wait(&wsm.wait, &wait);
+
+               goto again;
+       }
+
+       return ws;
+}
+
+/*
+ * zstd_put_workspace - zstd put_workspace
+ * @ws: list_head for the workspace
+ *
+ * When putting back a workspace, we only need to update the LRU if we are of
+ * the requested compression level.  Here is where we continue to protect the
+ * max level workspace or update last_used accordingly.  If the reclaim timer
+ * isn't set, it is also set here.  Only the max level workspace attempts to
+ * wake up waiters.
+ */
+static void zstd_put_workspace(struct list_head *ws)
+{
+       struct workspace *workspace = list_to_workspace(ws);
+
+       spin_lock(&wsm.lock);
+
+       /* A node is only taken off the lru if we are the corresponding level */
+       if (workspace->req_level == workspace->level) {
+               /* Hide a max level workspace from reclaim */
+               if (list_empty(&wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1])) {
+                       INIT_LIST_HEAD(&workspace->lru_list);
+               } else {
+                       workspace->last_used = jiffies;
+                       list_add(&workspace->lru_list, &wsm.lru_list);
+                       if (!timer_pending(&wsm.timer))
+                               mod_timer(&wsm.timer,
+                                         jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES);
+               }
+       }
+
+       set_bit(workspace->level - 1, &wsm.active_map);
+       list_add(&workspace->list, &wsm.idle_ws[workspace->level - 1]);
+       workspace->req_level = 0;
+
+       spin_unlock(&wsm.lock);
+
+       if (workspace->level == ZSTD_BTRFS_MAX_LEVEL)
+               cond_wake_up(&wsm.wait);
+}
+
 static void zstd_free_workspace(struct list_head *ws)
 {
        struct workspace *workspace = list_entry(ws, struct workspace, list);
@@ -50,25 +335,25 @@ static void zstd_free_workspace(struct list_head *ws)
        kfree(workspace);
 }
 
-static struct list_head *zstd_alloc_workspace(void)
+static struct list_head *zstd_alloc_workspace(unsigned int level)
 {
-       ZSTD_parameters params =
-                       zstd_get_btrfs_parameters(ZSTD_BTRFS_MAX_INPUT);
        struct workspace *workspace;
 
        workspace = kzalloc(sizeof(*workspace), GFP_KERNEL);
        if (!workspace)
                return ERR_PTR(-ENOMEM);
 
-       workspace->size = max_t(size_t,
-                       ZSTD_CStreamWorkspaceBound(params.cParams),
-                       ZSTD_DStreamWorkspaceBound(ZSTD_BTRFS_MAX_INPUT));
+       workspace->size = zstd_ws_mem_sizes[level - 1];
+       workspace->level = level;
+       workspace->req_level = level;
+       workspace->last_used = jiffies;
        workspace->mem = kvmalloc(workspace->size, GFP_KERNEL);
        workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
        if (!workspace->mem || !workspace->buf)
                goto fail;
 
        INIT_LIST_HEAD(&workspace->list);
+       INIT_LIST_HEAD(&workspace->lru_list);
 
        return &workspace->list;
 fail:
@@ -95,7 +380,8 @@ static int zstd_compress_pages(struct list_head *ws,
        unsigned long len = *total_out;
        const unsigned long nr_dest_pages = *out_pages;
        unsigned long max_out = nr_dest_pages * PAGE_SIZE;
-       ZSTD_parameters params = zstd_get_btrfs_parameters(len);
+       ZSTD_parameters params = zstd_get_btrfs_parameters(workspace->req_level,
+                                                          len);
 
        *out_pages = 0;
        *total_out = 0;
@@ -419,11 +705,19 @@ finish:
        return ret;
 }
 
-static void zstd_set_level(struct list_head *ws, unsigned int type)
+static unsigned int zstd_set_level(unsigned int level)
 {
+       if (!level)
+               return ZSTD_BTRFS_DEFAULT_LEVEL;
+
+       return min_t(unsigned int, level, ZSTD_BTRFS_MAX_LEVEL);
 }
 
 const struct btrfs_compress_op btrfs_zstd_compress = {
+       .init_workspace_manager = zstd_init_workspace_manager,
+       .cleanup_workspace_manager = zstd_cleanup_workspace_manager,
+       .get_workspace = zstd_get_workspace,
+       .put_workspace = zstd_put_workspace,
        .alloc_workspace = zstd_alloc_workspace,
        .free_workspace = zstd_free_workspace,
        .compress_pages = zstd_compress_pages,
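
The zstd workspace manager keeps one idle list per compression level plus a bitmap of levels that currently have an idle workspace; because zstd_calc_ws_mem_sizes() forces memory requirements to grow monotonically with level, a request can be satisfied by scanning the bitmap upward from the requested level and reusing any higher-level workspace. A stand-alone sketch of that upward scan follows; the bitmap value and helper name are invented.

#include <stdio.h>

#define DEMO_MAX_LEVEL 15       /* mirrors ZSTD_BTRFS_MAX_LEVEL above */

/*
 * Scan the per-level "has an idle workspace" bitmap upward from the
 * requested level, as zstd_find_workspace() does with
 * for_each_set_bit_from(); monotonic sizing is what makes a higher
 * level's workspace safe to reuse.
 */
static int find_usable_level(unsigned long active_map, unsigned int level)
{
        unsigned int i;

        for (i = level - 1; i < DEMO_MAX_LEVEL; i++)
                if (active_map & (1UL << i))
                        return (int)(i + 1);
        return -1;      /* nothing idle: the caller allocates or sleeps */
}

int main(void)
{
        unsigned long active_map = (1UL << 2) | (1UL << 14);    /* levels 3 and 15 idle */

        printf("request 1 -> reuse level %d\n", find_usable_level(active_map, 1));
        printf("request 5 -> reuse level %d\n", find_usable_level(active_map, 5));
        return 0;
}
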
index 3b8114def69309aabfced283d49cd7d81546b8b7..13318e255ebf7c4870cade0e26bdad640bc5265d 100644 (file)
@@ -252,33 +252,10 @@ ext2_validate_entry(char *base, unsigned offset, unsigned mask)
        return (char *)p - base;
 }
 
-static unsigned char ext2_filetype_table[EXT2_FT_MAX] = {
-       [EXT2_FT_UNKNOWN]       = DT_UNKNOWN,
-       [EXT2_FT_REG_FILE]      = DT_REG,
-       [EXT2_FT_DIR]           = DT_DIR,
-       [EXT2_FT_CHRDEV]        = DT_CHR,
-       [EXT2_FT_BLKDEV]        = DT_BLK,
-       [EXT2_FT_FIFO]          = DT_FIFO,
-       [EXT2_FT_SOCK]          = DT_SOCK,
-       [EXT2_FT_SYMLINK]       = DT_LNK,
-};
-
-#define S_SHIFT 12
-static unsigned char ext2_type_by_mode[S_IFMT >> S_SHIFT] = {
-       [S_IFREG >> S_SHIFT]    = EXT2_FT_REG_FILE,
-       [S_IFDIR >> S_SHIFT]    = EXT2_FT_DIR,
-       [S_IFCHR >> S_SHIFT]    = EXT2_FT_CHRDEV,
-       [S_IFBLK >> S_SHIFT]    = EXT2_FT_BLKDEV,
-       [S_IFIFO >> S_SHIFT]    = EXT2_FT_FIFO,
-       [S_IFSOCK >> S_SHIFT]   = EXT2_FT_SOCK,
-       [S_IFLNK >> S_SHIFT]    = EXT2_FT_SYMLINK,
-};
-
 static inline void ext2_set_de_type(ext2_dirent *de, struct inode *inode)
 {
-       umode_t mode = inode->i_mode;
        if (EXT2_HAS_INCOMPAT_FEATURE(inode->i_sb, EXT2_FEATURE_INCOMPAT_FILETYPE))
-               de->file_type = ext2_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
+               de->file_type = fs_umode_to_ftype(inode->i_mode);
        else
                de->file_type = 0;
 }
@@ -293,14 +270,14 @@ ext2_readdir(struct file *file, struct dir_context *ctx)
        unsigned long n = pos >> PAGE_SHIFT;
        unsigned long npages = dir_pages(inode);
        unsigned chunk_mask = ~(ext2_chunk_size(inode)-1);
-       unsigned char *types = NULL;
        bool need_revalidate = !inode_eq_iversion(inode, file->f_version);
+       bool has_filetype;
 
        if (pos > inode->i_size - EXT2_DIR_REC_LEN(1))
                return 0;
 
-       if (EXT2_HAS_INCOMPAT_FEATURE(sb, EXT2_FEATURE_INCOMPAT_FILETYPE))
-               types = ext2_filetype_table;
+       has_filetype =
+               EXT2_HAS_INCOMPAT_FEATURE(sb, EXT2_FEATURE_INCOMPAT_FILETYPE);
 
        for ( ; n < npages; n++, offset = 0) {
                char *kaddr, *limit;
@@ -335,8 +312,8 @@ ext2_readdir(struct file *file, struct dir_context *ctx)
                        if (de->inode) {
                                unsigned char d_type = DT_UNKNOWN;
 
-                               if (types && de->file_type < EXT2_FT_MAX)
-                                       d_type = types[de->file_type];
+                               if (has_filetype)
+                                       d_type = fs_ftype_to_dtype(de->file_type);
 
                                if (!dir_emit(ctx, de->name, de->name_len,
                                                le32_to_cpu(de->inode),
index e770cd100a6ab2f3ac6ac66e28641167b024117b..10ab238de9a65e4df112e0f2869bd12e84f8287a 100644 (file)
@@ -603,22 +603,6 @@ struct ext2_dir_entry_2 {
        char    name[];                 /* File name, up to EXT2_NAME_LEN */
 };
 
-/*
- * Ext2 directory file types.  Only the low 3 bits are used.  The
- * other bits are reserved for now.
- */
-enum {
-       EXT2_FT_UNKNOWN         = 0,
-       EXT2_FT_REG_FILE        = 1,
-       EXT2_FT_DIR             = 2,
-       EXT2_FT_CHRDEV          = 3,
-       EXT2_FT_BLKDEV          = 4,
-       EXT2_FT_FIFO            = 5,
-       EXT2_FT_SOCK            = 6,
-       EXT2_FT_SYMLINK         = 7,
-       EXT2_FT_MAX
-};
-
 /*
  * EXT2_DIR_PAD defines the directory entries boundaries
  *
@@ -774,6 +758,7 @@ extern int ext2_write_inode (struct inode *, struct writeback_control *);
 extern void ext2_evict_inode(struct inode *);
 extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int);
 extern int ext2_setattr (struct dentry *, struct iattr *);
+extern int ext2_getattr (const struct path *, struct kstat *, u32, unsigned int);
 extern void ext2_set_inode_flags(struct inode *inode);
 extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                       u64 start, u64 len);
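
The two ext2 hunks above drop the filesystem-private filetype tables and the EXT2_FT_* enum in favor of the generic fs_umode_to_ftype()/fs_ftype_to_dtype() helpers. A userspace approximation of what those two mappings do, reusing the same on-disk FT_* values the removed enum defined; the helper names here are made up for the demo.

#include <dirent.h>
#include <stdio.h>
#include <sys/stat.h>

/* Same on-disk encoding as the removed EXT2_FT_* enum. */
enum {
        FT_UNKNOWN, FT_REG_FILE, FT_DIR, FT_CHRDEV,
        FT_BLKDEV, FT_FIFO, FT_SOCK, FT_SYMLINK,
};

/* Map an inode mode to an on-disk file type byte. */
static unsigned char umode_to_ftype(mode_t mode)
{
        switch (mode & S_IFMT) {
        case S_IFREG:   return FT_REG_FILE;
        case S_IFDIR:   return FT_DIR;
        case S_IFCHR:   return FT_CHRDEV;
        case S_IFBLK:   return FT_BLKDEV;
        case S_IFIFO:   return FT_FIFO;
        case S_IFSOCK:  return FT_SOCK;
        case S_IFLNK:   return FT_SYMLINK;
        default:        return FT_UNKNOWN;
        }
}

/* Map the on-disk file type byte to a readdir d_type. */
static unsigned char ftype_to_dtype(unsigned char ftype)
{
        static const unsigned char table[] = {
                DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR,
                DT_BLK, DT_FIFO, DT_SOCK, DT_LNK,
        };

        return ftype < sizeof(table) ? table[ftype] : DT_UNKNOWN;
}

int main(void)
{
        unsigned char ft = umode_to_ftype(S_IFDIR);

        printf("directory: ftype %u -> d_type %u\n", ft, ftype_to_dtype(ft));
        return 0;
}
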
index 28b2609f25c1c9502a813c49eacf71d7a7c24256..39c4772e96c9d5decd93d71a2c5303fd2af95917 100644 (file)
@@ -199,6 +199,7 @@ const struct inode_operations ext2_file_inode_operations = {
 #ifdef CONFIG_EXT2_FS_XATTR
        .listxattr      = ext2_listxattr,
 #endif
+       .getattr        = ext2_getattr,
        .setattr        = ext2_setattr,
        .get_acl        = ext2_get_acl,
        .set_acl        = ext2_set_acl,
index 5c3d7b7e49755ccbe22f65b607e8197d854df091..a0c5ea91fcd499d8676301eb578147dcd06a56f0 100644 (file)
@@ -222,8 +222,6 @@ static int find_group_dir(struct super_block *sb, struct inode *parent)
                        best_desc = desc;
                }
        }
-       if (!best_desc)
-               return -1;
 
        return best_group;
 }
index e4bb9386c04551e1af155154213285c6da688531..c27c27300d95894c9de2ae0614e36c3df11bad47 100644 (file)
@@ -717,7 +717,7 @@ static int ext2_get_blocks(struct inode *inode,
        /* the number of blocks need to allocate for [d,t]indirect blocks */
        indirect_blks = (chain + depth) - partial - 1;
        /*
-        * Next look up the indirect map to count the totoal number of
+        * Next look up the indirect map to count the total number of
         * direct blocks to allocate for this branch.
         */
        count = ext2_blks_to_allocate(partial, indirect_blks,
@@ -1239,6 +1239,7 @@ do_indirects:
                                mark_inode_dirty(inode);
                                ext2_free_branches(inode, &nr, &nr+1, 1);
                        }
+                       /* fall through */
                case EXT2_IND_BLOCK:
                        nr = i_data[EXT2_DIND_BLOCK];
                        if (nr) {
@@ -1246,6 +1247,7 @@ do_indirects:
                                mark_inode_dirty(inode);
                                ext2_free_branches(inode, &nr, &nr+1, 2);
                        }
+                       /* fall through */
                case EXT2_DIND_BLOCK:
                        nr = i_data[EXT2_TIND_BLOCK];
                        if (nr) {
@@ -1635,6 +1637,32 @@ int ext2_write_inode(struct inode *inode, struct writeback_control *wbc)
        return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
 }
 
+int ext2_getattr(const struct path *path, struct kstat *stat,
+               u32 request_mask, unsigned int query_flags)
+{
+       struct inode *inode = d_inode(path->dentry);
+       struct ext2_inode_info *ei = EXT2_I(inode);
+       unsigned int flags;
+
+       flags = ei->i_flags & EXT2_FL_USER_VISIBLE;
+       if (flags & EXT2_APPEND_FL)
+               stat->attributes |= STATX_ATTR_APPEND;
+       if (flags & EXT2_COMPR_FL)
+               stat->attributes |= STATX_ATTR_COMPRESSED;
+       if (flags & EXT2_IMMUTABLE_FL)
+               stat->attributes |= STATX_ATTR_IMMUTABLE;
+       if (flags & EXT2_NODUMP_FL)
+               stat->attributes |= STATX_ATTR_NODUMP;
+       stat->attributes_mask |= (STATX_ATTR_APPEND |
+                       STATX_ATTR_COMPRESSED |
+                       STATX_ATTR_ENCRYPTED |
+                       STATX_ATTR_IMMUTABLE |
+                       STATX_ATTR_NODUMP);
+
+       generic_fillattr(inode, stat);
+       return 0;
+}
+
 int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
 {
        struct inode *inode = d_inode(dentry);
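Note: with ext2_getattr() exported and hooked into the ext2 inode_operations tables, the inode flags become visible to userspace through statx(2). A minimal sketch of how a program might check them (my own illustration, not part of this series; assumes a libc that exposes statx(), e.g. glibc 2.28+):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

int main(int argc, char **argv)
{
        struct statx stx;

        if (argc < 2)
                return 2;
        /* STATX_BASIC_STATS is enough; stx_attributes is filled regardless */
        if (statx(AT_FDCWD, argv[1], 0, STATX_BASIC_STATS, &stx))
                return 1;

        /* Only trust attribute bits the filesystem reports in the mask */
        if ((stx.stx_attributes_mask & STATX_ATTR_IMMUTABLE) &&
            (stx.stx_attributes & STATX_ATTR_IMMUTABLE))
                printf("%s: immutable\n", argv[1]);
        if ((stx.stx_attributes_mask & STATX_ATTR_APPEND) &&
            (stx.stx_attributes & STATX_ATTR_APPEND))
                printf("%s: append-only\n", argv[1]);
        return 0;
}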
index 0c26dcc5d85014d57c6f73d153af9314c69e9ca9..ccfbbf59e2fc9cfb4986e8f0c312bb1cf7affb01 100644 (file)
@@ -416,6 +416,7 @@ const struct inode_operations ext2_dir_inode_operations = {
 #ifdef CONFIG_EXT2_FS_XATTR
        .listxattr      = ext2_listxattr,
 #endif
+       .getattr        = ext2_getattr,
        .setattr        = ext2_setattr,
        .get_acl        = ext2_get_acl,
        .set_acl        = ext2_set_acl,
@@ -426,6 +427,7 @@ const struct inode_operations ext2_special_inode_operations = {
 #ifdef CONFIG_EXT2_FS_XATTR
        .listxattr      = ext2_listxattr,
 #endif
+       .getattr        = ext2_getattr,
        .setattr        = ext2_setattr,
        .get_acl        = ext2_get_acl,
        .set_acl        = ext2_set_acl,
index 73b2d528237f21012ccd8933167d599791cfeafb..0128010a0874e0e558056cd6b74b30cff21f69c8 100644 (file)
@@ -757,7 +757,8 @@ static loff_t ext2_max_size(int bits)
 {
        loff_t res = EXT2_NDIR_BLOCKS;
        int meta_blocks;
-       loff_t upper_limit;
+       unsigned int upper_limit;
+       unsigned int ppb = 1 << (bits-2);
 
        /* This is calculated to be the largest file size for a
         * dense file, such that the total number of
@@ -771,24 +772,34 @@ static loff_t ext2_max_size(int bits)
        /* total blocks in file system block size */
        upper_limit >>= (bits - 9);
 
+       /* Compute how many blocks we can address by block tree */
+       res += 1LL << (bits-2);
+       res += 1LL << (2*(bits-2));
+       res += 1LL << (3*(bits-2));
+       /* Does block tree limit file size? */
+       if (res < upper_limit)
+               goto check_lfs;
 
+       res = upper_limit;
+       /* How many metadata blocks are needed for addressing upper_limit? */
+       upper_limit -= EXT2_NDIR_BLOCKS;
        /* indirect blocks */
        meta_blocks = 1;
+       upper_limit -= ppb;
        /* double indirect blocks */
-       meta_blocks += 1 + (1LL << (bits-2));
-       /* tripple indirect blocks */
-       meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2)));
-
-       upper_limit -= meta_blocks;
-       upper_limit <<= bits;
-
-       res += 1LL << (bits-2);
-       res += 1LL << (2*(bits-2));
-       res += 1LL << (3*(bits-2));
+       if (upper_limit < ppb * ppb) {
+               meta_blocks += 1 + DIV_ROUND_UP(upper_limit, ppb);
+               res -= meta_blocks;
+               goto check_lfs;
+       }
+       meta_blocks += 1 + ppb;
+       upper_limit -= ppb * ppb;
+       /* triple indirect blocks for the rest */
+       meta_blocks += 1 + DIV_ROUND_UP(upper_limit, ppb) +
+               DIV_ROUND_UP(upper_limit, ppb*ppb);
+       res -= meta_blocks;
+check_lfs:
        res <<= bits;
-       if (res > upper_limit)
-               res = upper_limit;
-
        if (res > MAX_LFS_FILESIZE)
                res = MAX_LFS_FILESIZE;
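For a sense of what the reworked limit computation yields (my own arithmetic, not part of the patch): with 1 KiB blocks (bits = 10) an indirect block holds 2^(10-2) = 256 pointers, so the block tree can address 12 + 256 + 256^2 + 256^3 = 16,843,020 blocks, roughly 16 GiB; that is far below the i_blocks-derived upper_limit, so the tree is the binding constraint and the code jumps straight to check_lfs. With 4 KiB blocks the tree could address about 4 TiB, but upper_limit is (2^32 - 1) sectors, about 2 TiB, so the new code subtracts only the metadata blocks actually needed to map those ~536 million data blocks, instead of the metadata of a fully populated tree that the old code subtracted unconditionally.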
 
@@ -1024,8 +1035,6 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
        sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group);
        sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
 
-       if (EXT2_INODE_SIZE(sb) == 0)
-               goto cantfind_ext2;
        sbi->s_inodes_per_block = sb->s_blocksize / EXT2_INODE_SIZE(sb);
        if (sbi->s_inodes_per_block == 0 || sbi->s_inodes_per_group == 0)
                goto cantfind_ext2;
@@ -1087,12 +1096,14 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
                                           sizeof(struct buffer_head *),
                                           GFP_KERNEL);
        if (sbi->s_group_desc == NULL) {
+               ret = -ENOMEM;
                ext2_msg(sb, KERN_ERR, "error: not enough memory");
                goto failed_mount;
        }
        bgl_lock_init(sbi->s_blockgroup_lock);
        sbi->s_debts = kcalloc(sbi->s_groups_count, sizeof(*sbi->s_debts), GFP_KERNEL);
        if (!sbi->s_debts) {
+               ret = -ENOMEM;
                ext2_msg(sb, KERN_ERR, "error: not enough memory");
                goto failed_mount_group_desc;
        }
@@ -1148,6 +1159,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 #ifdef CONFIG_EXT2_FS_XATTR
        sbi->s_ea_block_cache = ext2_xattr_create_cache();
        if (!sbi->s_ea_block_cache) {
+               ret = -ENOMEM;
                ext2_msg(sb, KERN_ERR, "Failed to create ea_block_cache");
                goto failed_mount3;
        }
index d5589ddcc281fc8d872fae64a16c446da150d9c7..00cdb86794861d77b161f3297895c7cc11e0c7c0 100644 (file)
@@ -23,6 +23,7 @@
 
 const struct inode_operations ext2_symlink_inode_operations = {
        .get_link       = page_get_link,
+       .getattr        = ext2_getattr,
        .setattr        = ext2_setattr,
 #ifdef CONFIG_EXT2_FS_XATTR
        .listxattr      = ext2_listxattr,
@@ -31,6 +32,7 @@ const struct inode_operations ext2_symlink_inode_operations = {
  
 const struct inode_operations ext2_fast_symlink_inode_operations = {
        .get_link       = simple_get_link,
+       .getattr        = ext2_getattr,
        .setattr        = ext2_setattr,
 #ifdef CONFIG_EXT2_FS_XATTR
        .listxattr      = ext2_listxattr,
index 4f30876ee325d26a91d63b3ffb1b19c4683b8579..1e33e0ac8cf1f8dbbaa8e364f5a6ca08a24d84ef 100644 (file)
@@ -342,6 +342,7 @@ static void ext2_xattr_update_super_block(struct super_block *sb)
                return;
 
        spin_lock(&EXT2_SB(sb)->s_lock);
+       ext2_update_dynamic_rev(sb);
        EXT2_SET_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR);
        spin_unlock(&EXT2_SB(sb)->s_lock);
        mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
diff --git a/fs/fs_types.c b/fs/fs_types.c
new file mode 100644 (file)
index 0000000..78365e5
--- /dev/null
@@ -0,0 +1,105 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/fs.h>
+#include <linux/export.h>
+
+/*
+ * fs on-disk file type to dirent file type conversion
+ */
+static const unsigned char fs_dtype_by_ftype[FT_MAX] = {
+       [FT_UNKNOWN]    = DT_UNKNOWN,
+       [FT_REG_FILE]   = DT_REG,
+       [FT_DIR]        = DT_DIR,
+       [FT_CHRDEV]     = DT_CHR,
+       [FT_BLKDEV]     = DT_BLK,
+       [FT_FIFO]       = DT_FIFO,
+       [FT_SOCK]       = DT_SOCK,
+       [FT_SYMLINK]    = DT_LNK
+};
+
+/**
+ * fs_ftype_to_dtype() - fs on-disk file type to dirent type.
+ * @filetype: The on-disk file type to convert.
+ *
+ * This function converts the on-disk file type value (FT_*) to the directory
+ * entry type (DT_*).
+ *
+ * Context: Any context.
+ * Return:
+ * * DT_UNKNOWN                - Unknown type
+ * * DT_FIFO           - FIFO
+ * * DT_CHR            - Character device
+ * * DT_DIR            - Directory
+ * * DT_BLK            - Block device
+ * * DT_REG            - Regular file
+ * * DT_LNK            - Symbolic link
+ * * DT_SOCK           - Local-domain socket
+ */
+unsigned char fs_ftype_to_dtype(unsigned int filetype)
+{
+       if (filetype >= FT_MAX)
+               return DT_UNKNOWN;
+
+       return fs_dtype_by_ftype[filetype];
+}
+EXPORT_SYMBOL_GPL(fs_ftype_to_dtype);
+
+/*
+ * dirent file type to fs on-disk file type conversion
+ * Values not initialized explicitly are FT_UNKNOWN (0).
+ */
+static const unsigned char fs_ftype_by_dtype[DT_MAX] = {
+       [DT_REG]        = FT_REG_FILE,
+       [DT_DIR]        = FT_DIR,
+       [DT_LNK]        = FT_SYMLINK,
+       [DT_CHR]        = FT_CHRDEV,
+       [DT_BLK]        = FT_BLKDEV,
+       [DT_FIFO]       = FT_FIFO,
+       [DT_SOCK]       = FT_SOCK,
+};
+
+/**
+ * fs_umode_to_ftype() - file mode to on-disk file type.
+ * @mode: The file mode to convert.
+ *
+ * This function converts the file mode value to the on-disk file type (FT_*).
+ *
+ * Context: Any context.
+ * Return:
+ * * FT_UNKNOWN                - Unknown type
+ * * FT_REG_FILE       - Regular file
+ * * FT_DIR            - Directory
+ * * FT_CHRDEV         - Character device
+ * * FT_BLKDEV         - Block device
+ * * FT_FIFO           - FIFO
+ * * FT_SOCK           - Local-domain socket
+ * * FT_SYMLINK                - Symbolic link
+ */
+unsigned char fs_umode_to_ftype(umode_t mode)
+{
+       return fs_ftype_by_dtype[S_DT(mode)];
+}
+EXPORT_SYMBOL_GPL(fs_umode_to_ftype);
+
+/**
+ * fs_umode_to_dtype() - file mode to dirent file type.
+ * @mode: The file mode to convert.
+ *
+ * This function converts the file mode value to the directory
+ * entry type (DT_*).
+ *
+ * Context: Any context.
+ * Return:
+ * * DT_UNKNOWN                - Unknown type
+ * * DT_FIFO           - FIFO
+ * * DT_CHR            - Character device
+ * * DT_DIR            - Directory
+ * * DT_BLK            - Block device
+ * * DT_REG            - Regular file
+ * * DT_LNK            - Symbolic link
+ * * DT_SOCK           - Local-domain socket
+ */
+unsigned char fs_umode_to_dtype(umode_t mode)
+{
+       return fs_ftype_to_dtype(fs_umode_to_ftype(mode));
+}
+EXPORT_SYMBOL_GPL(fs_umode_to_dtype);
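The ext2 readdir hunk earlier in this series uses the FT_*-to-DT_* direction; the reverse helpers are what a filesystem calls when it writes a dirent. A hedged sketch of an ext2-style helper after the conversion (ext2's actual ext2_set_de_type() change is not shown in this excerpt, so treat this as an illustration of the intended usage):

static void example_set_de_type(struct ext2_dir_entry_2 *de,
                                struct inode *inode)
{
        /* On-disk EXT2_FT_* values are identical to the generic FT_* values */
        de->file_type = fs_umode_to_ftype(inode->i_mode);
}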
index d604f6b3bcc3162f9cec2900e46c94a21d9adc9f..0a8c5c27f90ece13c76f1a701c241ecbdbfc40a8 100644 (file)
@@ -2718,7 +2718,7 @@ filename_mountpoint(int dfd, struct filename *name, struct path *path,
        if (unlikely(error == -ESTALE))
                error = path_mountpoint(&nd, flags | LOOKUP_REVAL, path);
        if (likely(!error))
-               audit_inode(name, path->dentry, 0);
+               audit_inode(name, path->dentry, flags & LOOKUP_NO_EVAL);
        restore_nameidata();
        putname(name);
        return error;
index c4e83d94840cb296425fdc473aa0131c3e279179..98a8c182af4f04e0752752751d731802b02172a9 100644 (file)
@@ -1640,6 +1640,8 @@ int ksys_umount(char __user *name, int flags)
        if (!(flags & UMOUNT_NOFOLLOW))
                lookup_flags |= LOOKUP_FOLLOW;
 
+       lookup_flags |= LOOKUP_NO_EVAL;
+
        retval = user_path_mountpoint_at(AT_FDCWD, name, lookup_flags, &path);
        if (retval)
                goto out;
index 41355ce74ac0be6233e34d56d5eb040fcddb3112..735bfb2e9190eedb6ee68886cfc37033fb0abf57 100644 (file)
@@ -2,6 +2,7 @@ config FANOTIFY
        bool "Filesystem wide access notification"
        select FSNOTIFY
        select ANON_INODES
+       select EXPORTFS
        default n
        ---help---
           Say Y here to enable fanotify support.  fanotify is a file access
index 3723f3d18d2072dabc85c2c66d1afe8a4df3dc7e..6b9c27548997162420250a1dfc810b4d293e3436 100644 (file)
 #include <linux/wait.h>
 #include <linux/audit.h>
 #include <linux/sched/mm.h>
+#include <linux/statfs.h>
 
 #include "fanotify.h"
 
 static bool should_merge(struct fsnotify_event *old_fsn,
                         struct fsnotify_event *new_fsn)
 {
-       struct fanotify_event_info *old, *new;
+       struct fanotify_event *old, *new;
 
        pr_debug("%s: old=%p new=%p\n", __func__, old_fsn, new_fsn);
        old = FANOTIFY_E(old_fsn);
        new = FANOTIFY_E(new_fsn);
 
-       if (old_fsn->inode == new_fsn->inode && old->pid == new->pid &&
-           old->path.mnt == new->path.mnt &&
-           old->path.dentry == new->path.dentry)
-               return true;
+       if (old_fsn->inode != new_fsn->inode || old->pid != new->pid ||
+           old->fh_type != new->fh_type || old->fh_len != new->fh_len)
+               return false;
+
+       if (fanotify_event_has_path(old)) {
+               return old->path.mnt == new->path.mnt &&
+                       old->path.dentry == new->path.dentry;
+       } else if (fanotify_event_has_fid(old)) {
+               /*
+                * We want to merge many dirent events in the same dir (i.e.
+                * creates/unlinks/renames), but we do not want to merge dirent
+                * events referring to subdirs with dirent events referring to
+                * non-subdirs; otherwise, the user won't be able to tell from
+                * a mask FAN_CREATE|FAN_DELETE|FAN_ONDIR whether it describes
+                * a mkdir+unlink pair or a rmdir+create pair of events.
+                */
+               return (old->mask & FS_ISDIR) == (new->mask & FS_ISDIR) &&
+                       fanotify_fid_equal(&old->fid, &new->fid, old->fh_len);
+       }
+
+       /* Do not merge events if we failed to encode fid */
        return false;
 }
 
@@ -36,20 +54,22 @@ static bool should_merge(struct fsnotify_event *old_fsn,
 static int fanotify_merge(struct list_head *list, struct fsnotify_event *event)
 {
        struct fsnotify_event *test_event;
+       struct fanotify_event *new;
 
        pr_debug("%s: list=%p event=%p\n", __func__, list, event);
+       new = FANOTIFY_E(event);
 
        /*
         * Don't merge a permission event with any other event so that we know
         * the event structure we have created in fanotify_handle_event() is the
         * one we should check for permission response.
         */
-       if (fanotify_is_perm_event(event->mask))
+       if (fanotify_is_perm_event(new->mask))
                return 0;
 
        list_for_each_entry_reverse(test_event, list, list) {
                if (should_merge(test_event, event)) {
-                       test_event->mask |= event->mask;
+                       FANOTIFY_E(test_event)->mask |= new->mask;
                        return 1;
                }
        }
@@ -57,15 +77,44 @@ static int fanotify_merge(struct list_head *list, struct fsnotify_event *event)
        return 0;
 }
 
+/*
+ * Wait for response to permission event. The function also takes care of
+ * freeing the permission event (or offloads that in case the wait is canceled
+ * by a signal). The function returns 0 in case access got allowed by userspace,
+ * -EPERM in case userspace disallowed the access, and -ERESTARTSYS in case
+ * the wait got interrupted by a signal.
+ */
 static int fanotify_get_response(struct fsnotify_group *group,
-                                struct fanotify_perm_event_info *event,
+                                struct fanotify_perm_event *event,
                                 struct fsnotify_iter_info *iter_info)
 {
        int ret;
 
        pr_debug("%s: group=%p event=%p\n", __func__, group, event);
 
-       wait_event(group->fanotify_data.access_waitq, event->response);
+       ret = wait_event_killable(group->fanotify_data.access_waitq,
+                                 event->state == FAN_EVENT_ANSWERED);
+       /* Signal pending? */
+       if (ret < 0) {
+               spin_lock(&group->notification_lock);
+               /* Event reported to userspace and no answer yet? */
+               if (event->state == FAN_EVENT_REPORTED) {
+                       /* Event will get freed once userspace answers to it */
+                       event->state = FAN_EVENT_CANCELED;
+                       spin_unlock(&group->notification_lock);
+                       return ret;
+               }
+               /* Event not yet reported? Just remove it. */
+               if (event->state == FAN_EVENT_INIT)
+                       fsnotify_remove_queued_event(group, &event->fae.fse);
+               /*
+                * Event may be also answered in case signal delivery raced
+                * with wakeup. In that case we have nothing to do besides
+                * freeing the event and reporting error.
+                */
+               spin_unlock(&group->notification_lock);
+               goto out;
+       }
 
        /* userspace responded, convert to something usable */
        switch (event->response & ~FAN_AUDIT) {
@@ -81,11 +130,11 @@ static int fanotify_get_response(struct fsnotify_group *group,
        if (event->response & FAN_AUDIT)
                audit_fanotify(event->response & ~FAN_AUDIT);
 
-       event->response = 0;
-
        pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__,
                 group, event, ret);
-       
+out:
+       fsnotify_destroy_event(group, &event->fae.fse);
+
        return ret;
 }
 
@@ -95,11 +144,13 @@ static int fanotify_get_response(struct fsnotify_group *group,
  * been included within the event mask, but have not been explicitly
  * requested by the user, will not be present in the returned mask.
  */
-static u32 fanotify_group_event_mask(struct fsnotify_iter_info *iter_info,
-                                      u32 event_mask, const void *data,
-                                      int data_type)
+static u32 fanotify_group_event_mask(struct fsnotify_group *group,
+                                    struct fsnotify_iter_info *iter_info,
+                                    u32 event_mask, const void *data,
+                                    int data_type)
 {
        __u32 marks_mask = 0, marks_ignored_mask = 0;
+       __u32 test_mask, user_mask = FANOTIFY_OUTGOING_EVENTS;
        const struct path *path = data;
        struct fsnotify_mark *mark;
        int type;
@@ -107,14 +158,14 @@ static u32 fanotify_group_event_mask(struct fsnotify_iter_info *iter_info,
        pr_debug("%s: report_mask=%x mask=%x data=%p data_type=%d\n",
                 __func__, iter_info->report_mask, event_mask, data, data_type);
 
-       /* If we don't have enough info to send an event to userspace say no */
-       if (data_type != FSNOTIFY_EVENT_PATH)
-               return 0;
-
-       /* Sorry, fanotify only gives a damn about files and dirs */
-       if (!d_is_reg(path->dentry) &&
-           !d_can_lookup(path->dentry))
-               return 0;
+       if (!FAN_GROUP_FLAG(group, FAN_REPORT_FID)) {
+               /* Do we have path to open a file descriptor? */
+               if (data_type != FSNOTIFY_EVENT_PATH)
+                       return 0;
+               /* Path type events are only relevant for files and dirs */
+               if (!d_is_reg(path->dentry) && !d_can_lookup(path->dentry))
+                       return 0;
+       }
 
        fsnotify_foreach_obj_type(type) {
                if (!fsnotify_iter_should_report_type(iter_info, type))
@@ -133,20 +184,106 @@ static u32 fanotify_group_event_mask(struct fsnotify_iter_info *iter_info,
                marks_ignored_mask |= mark->ignored_mask;
        }
 
-       if (d_is_dir(path->dentry) &&
+       test_mask = event_mask & marks_mask & ~marks_ignored_mask;
+
+       /*
+        * dirent modification events (create/delete/move) do not carry the
+        * child entry name/inode information. Instead, we report FAN_ONDIR
+        * for mkdir/rmdir so user can differentiate them from creat/unlink.
+        *
+        * For backward compatibility and consistency, do not report FAN_ONDIR
+        * to user in legacy fanotify mode (reporting fd) and report FAN_ONDIR
+        * to user in FAN_REPORT_FID mode for all event types.
+        */
+       if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) {
+               /* Do not report FAN_ONDIR without any event */
+               if (!(test_mask & ~FAN_ONDIR))
+                       return 0;
+       } else {
+               user_mask &= ~FAN_ONDIR;
+       }
+
+       if (event_mask & FS_ISDIR &&
            !(marks_mask & FS_ISDIR & ~marks_ignored_mask))
                return 0;
 
-       return event_mask & FANOTIFY_OUTGOING_EVENTS & marks_mask &
-               ~marks_ignored_mask;
+       return test_mask & user_mask;
+}
+
+static int fanotify_encode_fid(struct fanotify_event *event,
+                              struct inode *inode, gfp_t gfp,
+                              __kernel_fsid_t *fsid)
+{
+       struct fanotify_fid *fid = &event->fid;
+       int dwords, bytes = 0;
+       int err, type;
+
+       fid->ext_fh = NULL;
+       dwords = 0;
+       err = -ENOENT;
+       type = exportfs_encode_inode_fh(inode, NULL, &dwords, NULL);
+       if (!dwords)
+               goto out_err;
+
+       bytes = dwords << 2;
+       if (bytes > FANOTIFY_INLINE_FH_LEN) {
+               /* Treat failure to allocate fh as failure to allocate event */
+               err = -ENOMEM;
+               fid->ext_fh = kmalloc(bytes, gfp);
+               if (!fid->ext_fh)
+                       goto out_err;
+       }
+
+       type = exportfs_encode_inode_fh(inode, fanotify_fid_fh(fid, bytes),
+                                       &dwords, NULL);
+       err = -EINVAL;
+       if (!type || type == FILEID_INVALID || bytes != dwords << 2)
+               goto out_err;
+
+       fid->fsid = *fsid;
+       event->fh_len = bytes;
+
+       return type;
+
+out_err:
+       pr_warn_ratelimited("fanotify: failed to encode fid (fsid=%x.%x, "
+                           "type=%d, bytes=%d, err=%i)\n",
+                           fsid->val[0], fsid->val[1], type, bytes, err);
+       kfree(fid->ext_fh);
+       fid->ext_fh = NULL;
+       event->fh_len = 0;
+
+       return FILEID_INVALID;
 }
 
-struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group,
-                                                struct inode *inode, u32 mask,
-                                                const struct path *path)
+/*
+ * The inode to use as identifier when reporting fid depends on the event.
+ * Report the modified directory inode on dirent modification events.
+ * Report the "victim" inode otherwise.
+ * For example:
+ * FS_ATTRIB reports the child inode even if reported on a watched parent.
+ * FS_CREATE reports the modified dir inode and not the created inode.
+ */
+static struct inode *fanotify_fid_inode(struct inode *to_tell, u32 event_mask,
+                                       const void *data, int data_type)
 {
-       struct fanotify_event_info *event = NULL;
+       if (event_mask & ALL_FSNOTIFY_DIRENT_EVENTS)
+               return to_tell;
+       else if (data_type == FSNOTIFY_EVENT_INODE)
+               return (struct inode *)data;
+       else if (data_type == FSNOTIFY_EVENT_PATH)
+               return d_inode(((struct path *)data)->dentry);
+       return NULL;
+}
+
+struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
+                                           struct inode *inode, u32 mask,
+                                           const void *data, int data_type,
+                                           __kernel_fsid_t *fsid)
+{
+       struct fanotify_event *event = NULL;
        gfp_t gfp = GFP_KERNEL_ACCOUNT;
+       struct inode *id = fanotify_fid_inode(inode, mask, data, data_type);
 
        /*
         * For queues with unlimited length lost events are not expected and
@@ -160,28 +297,36 @@ struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group,
        memalloc_use_memcg(group->memcg);
 
        if (fanotify_is_perm_event(mask)) {
-               struct fanotify_perm_event_info *pevent;
+               struct fanotify_perm_event *pevent;
 
                pevent = kmem_cache_alloc(fanotify_perm_event_cachep, gfp);
                if (!pevent)
                        goto out;
                event = &pevent->fae;
                pevent->response = 0;
+               pevent->state = FAN_EVENT_INIT;
                goto init;
        }
        event = kmem_cache_alloc(fanotify_event_cachep, gfp);
        if (!event)
                goto out;
 init: __maybe_unused
-       fsnotify_init_event(&event->fse, inode, mask);
+       fsnotify_init_event(&event->fse, inode);
+       event->mask = mask;
        if (FAN_GROUP_FLAG(group, FAN_REPORT_TID))
                event->pid = get_pid(task_pid(current));
        else
                event->pid = get_pid(task_tgid(current));
-       if (path) {
-               event->path = *path;
+       event->fh_len = 0;
+       if (id && FAN_GROUP_FLAG(group, FAN_REPORT_FID)) {
+               /* Report the event without a file identifier on encode error */
+               event->fh_type = fanotify_encode_fid(event, id, gfp, fsid);
+       } else if (data_type == FSNOTIFY_EVENT_PATH) {
+               event->fh_type = FILEID_ROOT;
+               event->path = *((struct path *)data);
                path_get(&event->path);
        } else {
+               event->fh_type = FILEID_INVALID;
                event->path.mnt = NULL;
                event->path.dentry = NULL;
        }
@@ -190,6 +335,29 @@ out:
        return event;
 }
 
+/*
+ * Get cached fsid of the filesystem containing the object from any connector.
+ * All connectors are supposed to have the same fsid, but we do not verify that
+ * here.
+ */
+static __kernel_fsid_t fanotify_get_fsid(struct fsnotify_iter_info *iter_info)
+{
+       int type;
+       __kernel_fsid_t fsid = {};
+
+       fsnotify_foreach_obj_type(type) {
+               if (!fsnotify_iter_should_report_type(iter_info, type))
+                       continue;
+
+               fsid = iter_info->marks[type]->connector->fsid;
+               if (WARN_ON_ONCE(!fsid.val[0] && !fsid.val[1]))
+                       continue;
+               return fsid;
+       }
+
+       return fsid;
+}
+
 static int fanotify_handle_event(struct fsnotify_group *group,
                                 struct inode *inode,
                                 u32 mask, const void *data, int data_type,
@@ -197,14 +365,22 @@ static int fanotify_handle_event(struct fsnotify_group *group,
                                 struct fsnotify_iter_info *iter_info)
 {
        int ret = 0;
-       struct fanotify_event_info *event;
+       struct fanotify_event *event;
        struct fsnotify_event *fsn_event;
+       __kernel_fsid_t fsid = {};
 
        BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS);
        BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY);
+       BUILD_BUG_ON(FAN_ATTRIB != FS_ATTRIB);
        BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
        BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE);
        BUILD_BUG_ON(FAN_OPEN != FS_OPEN);
+       BUILD_BUG_ON(FAN_MOVED_TO != FS_MOVED_TO);
+       BUILD_BUG_ON(FAN_MOVED_FROM != FS_MOVED_FROM);
+       BUILD_BUG_ON(FAN_CREATE != FS_CREATE);
+       BUILD_BUG_ON(FAN_DELETE != FS_DELETE);
+       BUILD_BUG_ON(FAN_DELETE_SELF != FS_DELETE_SELF);
+       BUILD_BUG_ON(FAN_MOVE_SELF != FS_MOVE_SELF);
        BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD);
        BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW);
        BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM);
@@ -213,9 +389,10 @@ static int fanotify_handle_event(struct fsnotify_group *group,
        BUILD_BUG_ON(FAN_OPEN_EXEC != FS_OPEN_EXEC);
        BUILD_BUG_ON(FAN_OPEN_EXEC_PERM != FS_OPEN_EXEC_PERM);
 
-       BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 12);
+       BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 19);
 
-       mask = fanotify_group_event_mask(iter_info, mask, data, data_type);
+       mask = fanotify_group_event_mask(group, iter_info, mask, data,
+                                        data_type);
        if (!mask)
                return 0;
 
@@ -231,7 +408,11 @@ static int fanotify_handle_event(struct fsnotify_group *group,
                        return 0;
        }
 
-       event = fanotify_alloc_event(group, inode, mask, data);
+       if (FAN_GROUP_FLAG(group, FAN_REPORT_FID))
+               fsid = fanotify_get_fsid(iter_info);
+
+       event = fanotify_alloc_event(group, inode, mask, data, data_type,
+                                    &fsid);
        ret = -ENOMEM;
        if (unlikely(!event)) {
                /*
@@ -255,7 +436,6 @@ static int fanotify_handle_event(struct fsnotify_group *group,
        } else if (fanotify_is_perm_event(mask)) {
                ret = fanotify_get_response(group, FANOTIFY_PE(fsn_event),
                                            iter_info);
-               fsnotify_destroy_event(group, fsn_event);
        }
 finish:
        if (fanotify_is_perm_event(mask))
@@ -275,12 +455,15 @@ static void fanotify_free_group_priv(struct fsnotify_group *group)
 
 static void fanotify_free_event(struct fsnotify_event *fsn_event)
 {
-       struct fanotify_event_info *event;
+       struct fanotify_event *event;
 
        event = FANOTIFY_E(fsn_event);
-       path_put(&event->path);
+       if (fanotify_event_has_path(event))
+               path_put(&event->path);
+       else if (fanotify_event_has_ext_fh(event))
+               kfree(event->fid.ext_fh);
        put_pid(event->pid);
-       if (fanotify_is_perm_event(fsn_event->mask)) {
+       if (fanotify_is_perm_event(event->mask)) {
                kmem_cache_free(fanotify_perm_event_cachep,
                                FANOTIFY_PE(fsn_event));
                return;
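Taken together with the fanotify_user.c changes further below, the permission-event rework amounts to a small per-event state machine; a plain-text summary of the transitions (my reading of the code, not from the changelog):

  FAN_EVENT_INIT      --read() dequeues the event-->   FAN_EVENT_REPORTED
  FAN_EVENT_REPORTED  --write() of a response-->       FAN_EVENT_ANSWERED (event freed by the waiter)
  FAN_EVENT_INIT      --fatal signal in the waiter-->  removed from the queue and freed by the waiter
  FAN_EVENT_REPORTED  --fatal signal in the waiter-->  FAN_EVENT_CANCELED (freed later by finish_permission_event())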
index ea05b8a401e79dc9d9322a2105407bccf56fa834..68b30504284c0c30d6bb87f7905c24dcd6f9c4e9 100644 (file)
 #include <linux/fsnotify_backend.h>
 #include <linux/path.h>
 #include <linux/slab.h>
+#include <linux/exportfs.h>
 
 extern struct kmem_cache *fanotify_mark_cache;
 extern struct kmem_cache *fanotify_event_cachep;
 extern struct kmem_cache *fanotify_perm_event_cachep;
 
+/* Possible states of the permission event */
+enum {
+       FAN_EVENT_INIT,
+       FAN_EVENT_REPORTED,
+       FAN_EVENT_ANSWERED,
+       FAN_EVENT_CANCELED,
+};
+
+/*
+ * 3 dwords are sufficient for most local fs (64bit ino, 32bit generation).
+ * For 32bit arch, fid increases the size of fanotify_event by 12 bytes and
+ * fh_* fields increase the size of fanotify_event by another 4 bytes.
+ * For 64bit arch, fid increases the size of fanotify_fid by 8 bytes and
+ * fh_* fields are packed in a hole after mask.
+ */
+#if BITS_PER_LONG == 32
+#define FANOTIFY_INLINE_FH_LEN (3 << 2)
+#else
+#define FANOTIFY_INLINE_FH_LEN (4 << 2)
+#endif
+
+struct fanotify_fid {
+       __kernel_fsid_t fsid;
+       union {
+               unsigned char fh[FANOTIFY_INLINE_FH_LEN];
+               unsigned char *ext_fh;
+       };
+};
+
+static inline void *fanotify_fid_fh(struct fanotify_fid *fid,
+                                   unsigned int fh_len)
+{
+       return fh_len <= FANOTIFY_INLINE_FH_LEN ? fid->fh : fid->ext_fh;
+}
+
+static inline bool fanotify_fid_equal(struct fanotify_fid *fid1,
+                                     struct fanotify_fid *fid2,
+                                     unsigned int fh_len)
+{
+       return fid1->fsid.val[0] == fid2->fsid.val[0] &&
+               fid1->fsid.val[1] == fid2->fsid.val[1] &&
+               !memcmp(fanotify_fid_fh(fid1, fh_len),
+                       fanotify_fid_fh(fid2, fh_len), fh_len);
+}
+
 /*
  * Structure for normal fanotify events. It gets allocated in
  * fanotify_handle_event() and freed when the information is retrieved by
  * userspace
  */
-struct fanotify_event_info {
+struct fanotify_event {
        struct fsnotify_event fse;
+       u32 mask;
        /*
-        * We hold ref to this path so it may be dereferenced at any point
-        * during this object's lifetime
+        * Those fields are outside fanotify_fid to pack fanotify_event nicely
+        * on 64bit arch and to use fh_type as an indication of whether path
+        * or fid are used in the union:
+        * FILEID_ROOT (0) for path, > 0 for fid, FILEID_INVALID for neither.
         */
-       struct path path;
+       u8 fh_type;
+       u8 fh_len;
+       u16 pad;
+       union {
+               /*
+                * We hold ref to this path so it may be dereferenced at any
+                * point during this object's lifetime
+                */
+               struct path path;
+               /*
+                * With FAN_REPORT_FID, we do not hold any reference on the
+                * victim object. Instead we store its NFS file handle and its
+                * filesystem's fsid as a unique identifier.
+                */
+               struct fanotify_fid fid;
+       };
        struct pid *pid;
 };
 
+static inline bool fanotify_event_has_path(struct fanotify_event *event)
+{
+       return event->fh_type == FILEID_ROOT;
+}
+
+static inline bool fanotify_event_has_fid(struct fanotify_event *event)
+{
+       return event->fh_type != FILEID_ROOT &&
+               event->fh_type != FILEID_INVALID;
+}
+
+static inline bool fanotify_event_has_ext_fh(struct fanotify_event *event)
+{
+       return fanotify_event_has_fid(event) &&
+               event->fh_len > FANOTIFY_INLINE_FH_LEN;
+}
+
+static inline void *fanotify_event_fh(struct fanotify_event *event)
+{
+       return fanotify_fid_fh(&event->fid, event->fh_len);
+}
+
 /*
  * Structure for permission fanotify events. It gets allocated and freed in
  * fanotify_handle_event() since we wait there for user response. When the
@@ -29,16 +115,17 @@ struct fanotify_event_info {
  * group->notification_list to group->fanotify_data.access_list to wait for
  * user response.
  */
-struct fanotify_perm_event_info {
-       struct fanotify_event_info fae;
-       int response;   /* userspace answer to question */
+struct fanotify_perm_event {
+       struct fanotify_event fae;
+       unsigned short response;        /* userspace answer to the event */
+       unsigned short state;           /* state of the event */
        int fd;         /* fd we passed to userspace for this event */
 };
 
-static inline struct fanotify_perm_event_info *
+static inline struct fanotify_perm_event *
 FANOTIFY_PE(struct fsnotify_event *fse)
 {
-       return container_of(fse, struct fanotify_perm_event_info, fae.fse);
+       return container_of(fse, struct fanotify_perm_event, fae.fse);
 }
 
 static inline bool fanotify_is_perm_event(u32 mask)
@@ -47,11 +134,12 @@ static inline bool fanotify_is_perm_event(u32 mask)
                mask & FANOTIFY_PERM_EVENTS;
 }
 
-static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse)
+static inline struct fanotify_event *FANOTIFY_E(struct fsnotify_event *fse)
 {
-       return container_of(fse, struct fanotify_event_info, fse);
+       return container_of(fse, struct fanotify_event, fse);
 }
 
-struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group,
-                                                struct inode *inode, u32 mask,
-                                                const struct path *path);
+struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
+                                           struct inode *inode, u32 mask,
+                                           const void *data, int data_type,
+                                           __kernel_fsid_t *fsid);
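For reference, a minimal userspace sketch of consuming FAN_REPORT_FID events (my own illustration, not part of this series; it assumes the new uapi definitions from this series — FAN_REPORT_FID, FAN_EVENT_INFO_TYPE_FID, struct fanotify_event_info_fid — are available via <sys/fanotify.h>/<linux/fanotify.h>, that the caller has CAP_SYS_ADMIN, and that "/mnt" stands in for any filesystem that passes fanotify_test_fid()):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/fanotify.h>

int main(void)
{
        char buf[4096] __attribute__((aligned(8)));
        struct fanotify_event_metadata *md;
        struct fanotify_event_info_fid *fid;
        ssize_t len;
        int fd;

        fd = fanotify_init(FAN_CLASS_NOTIF | FAN_REPORT_FID, 0);
        if (fd < 0)
                return 1;
        /* Watch a whole filesystem for dirent and attribute events */
        if (fanotify_mark(fd, FAN_MARK_ADD | FAN_MARK_FILESYSTEM,
                          FAN_CREATE | FAN_DELETE | FAN_ATTRIB | FAN_ONDIR,
                          AT_FDCWD, "/mnt"))
                return 1;

        len = read(fd, buf, sizeof(buf));
        if (len <= 0)
                return 1;

        for (md = (struct fanotify_event_metadata *)buf; FAN_EVENT_OK(md, len);
             md = FAN_EVENT_NEXT(md, len)) {
                /* With FAN_REPORT_FID there is no fd; a fid record follows */
                if (md->event_len <= FAN_EVENT_METADATA_LEN)
                        continue;
                fid = (struct fanotify_event_info_fid *)(md + 1);
                if (fid->hdr.info_type != FAN_EVENT_INFO_TYPE_FID)
                        continue;
                printf("mask 0x%llx on fsid %x.%x\n",
                       (unsigned long long)md->mask,
                       fid->fsid.val[0], fid->fsid.val[1]);
        }
        close(fd);
        return 0;
}

The struct file_handle embedded after the fsid can be handed to open_by_handle_at(2) to resolve the object, which is why fanotify_test_fid() insists on an exportfs implementation with fh_to_dentry.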
index 9c870b0d2b560fd84f885e19ba95ad6e97bd32ea..56992b32c6bbb63839bbc95a50cb83a57398d8bb 100644 (file)
@@ -17,6 +17,8 @@
 #include <linux/compat.h>
 #include <linux/sched/signal.h>
 #include <linux/memcontrol.h>
+#include <linux/statfs.h>
+#include <linux/exportfs.h>
 
 #include <asm/ioctls.h>
 
@@ -47,33 +49,55 @@ struct kmem_cache *fanotify_mark_cache __read_mostly;
 struct kmem_cache *fanotify_event_cachep __read_mostly;
 struct kmem_cache *fanotify_perm_event_cachep __read_mostly;
 
+#define FANOTIFY_EVENT_ALIGN 4
+
+static int fanotify_event_info_len(struct fanotify_event *event)
+{
+       if (!fanotify_event_has_fid(event))
+               return 0;
+
+       return roundup(sizeof(struct fanotify_event_info_fid) +
+                      sizeof(struct file_handle) + event->fh_len,
+                      FANOTIFY_EVENT_ALIGN);
+}
+
 /*
  * Get an fsnotify notification event if one exists and is small
  * enough to fit in "count". Return an error pointer if the count
- * is not large enough.
- *
- * Called with the group->notification_lock held.
+ * is not large enough. When permission event is dequeued, its state is
+ * updated accordingly.
  */
 static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
                                            size_t count)
 {
-       assert_spin_locked(&group->notification_lock);
+       size_t event_size = FAN_EVENT_METADATA_LEN;
+       struct fsnotify_event *fsn_event = NULL;
 
        pr_debug("%s: group=%p count=%zd\n", __func__, group, count);
 
+       spin_lock(&group->notification_lock);
        if (fsnotify_notify_queue_is_empty(group))
-               return NULL;
+               goto out;
 
-       if (FAN_EVENT_METADATA_LEN > count)
-               return ERR_PTR(-EINVAL);
+       if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) {
+               event_size += fanotify_event_info_len(
+                       FANOTIFY_E(fsnotify_peek_first_event(group)));
+       }
 
-       /* held the notification_lock the whole time, so this is the
-        * same event we peeked above */
-       return fsnotify_remove_first_event(group);
+       if (event_size > count) {
+               fsn_event = ERR_PTR(-EINVAL);
+               goto out;
+       }
+       fsn_event = fsnotify_remove_first_event(group);
+       if (fanotify_is_perm_event(FANOTIFY_E(fsn_event)->mask))
+               FANOTIFY_PE(fsn_event)->state = FAN_EVENT_REPORTED;
+out:
+       spin_unlock(&group->notification_lock);
+       return fsn_event;
 }
 
 static int create_fd(struct fsnotify_group *group,
-                    struct fanotify_event_info *event,
+                    struct fanotify_event *event,
                     struct file **file)
 {
        int client_fd;
@@ -114,62 +138,32 @@ static int create_fd(struct fsnotify_group *group,
        return client_fd;
 }
 
-static int fill_event_metadata(struct fsnotify_group *group,
-                              struct fanotify_event_metadata *metadata,
-                              struct fsnotify_event *fsn_event,
-                              struct file **file)
-{
-       int ret = 0;
-       struct fanotify_event_info *event;
-
-       pr_debug("%s: group=%p metadata=%p event=%p\n", __func__,
-                group, metadata, fsn_event);
-
-       *file = NULL;
-       event = container_of(fsn_event, struct fanotify_event_info, fse);
-       metadata->event_len = FAN_EVENT_METADATA_LEN;
-       metadata->metadata_len = FAN_EVENT_METADATA_LEN;
-       metadata->vers = FANOTIFY_METADATA_VERSION;
-       metadata->reserved = 0;
-       metadata->mask = fsn_event->mask & FANOTIFY_OUTGOING_EVENTS;
-       metadata->pid = pid_vnr(event->pid);
-       if (unlikely(fsn_event->mask & FAN_Q_OVERFLOW))
-               metadata->fd = FAN_NOFD;
-       else {
-               metadata->fd = create_fd(group, event, file);
-               if (metadata->fd < 0)
-                       ret = metadata->fd;
-       }
-
-       return ret;
-}
-
-static struct fanotify_perm_event_info *dequeue_event(
-                               struct fsnotify_group *group, int fd)
+/*
+ * Finish processing of a permission event by setting it to the ANSWERED
+ * state and dropping group->notification_lock.
+ */
+static void finish_permission_event(struct fsnotify_group *group,
+                                   struct fanotify_perm_event *event,
+                                   unsigned int response)
+                                   __releases(&group->notification_lock)
 {
-       struct fanotify_perm_event_info *event, *return_e = NULL;
-
-       spin_lock(&group->notification_lock);
-       list_for_each_entry(event, &group->fanotify_data.access_list,
-                           fae.fse.list) {
-               if (event->fd != fd)
-                       continue;
+       bool destroy = false;
 
-               list_del_init(&event->fae.fse.list);
-               return_e = event;
-               break;
-       }
+       assert_spin_locked(&group->notification_lock);
+       event->response = response;
+       if (event->state == FAN_EVENT_CANCELED)
+               destroy = true;
+       else
+               event->state = FAN_EVENT_ANSWERED;
        spin_unlock(&group->notification_lock);
-
-       pr_debug("%s: found return_re=%p\n", __func__, return_e);
-
-       return return_e;
+       if (destroy)
+               fsnotify_destroy_event(group, &event->fae.fse);
 }
 
 static int process_access_response(struct fsnotify_group *group,
                                   struct fanotify_response *response_struct)
 {
-       struct fanotify_perm_event_info *event;
+       struct fanotify_perm_event *event;
        int fd = response_struct->fd;
        int response = response_struct->response;
 
@@ -194,48 +188,115 @@ static int process_access_response(struct fsnotify_group *group,
        if ((response & FAN_AUDIT) && !FAN_GROUP_FLAG(group, FAN_ENABLE_AUDIT))
                return -EINVAL;
 
-       event = dequeue_event(group, fd);
-       if (!event)
-               return -ENOENT;
+       spin_lock(&group->notification_lock);
+       list_for_each_entry(event, &group->fanotify_data.access_list,
+                           fae.fse.list) {
+               if (event->fd != fd)
+                       continue;
 
-       event->response = response;
-       wake_up(&group->fanotify_data.access_waitq);
+               list_del_init(&event->fae.fse.list);
+               finish_permission_event(group, event, response);
+               wake_up(&group->fanotify_data.access_waitq);
+               return 0;
+       }
+       spin_unlock(&group->notification_lock);
+
+       return -ENOENT;
+}
+
+static int copy_fid_to_user(struct fanotify_event *event, char __user *buf)
+{
+       struct fanotify_event_info_fid info = { };
+       struct file_handle handle = { };
+       size_t fh_len = event->fh_len;
+       size_t len = fanotify_event_info_len(event);
+
+       if (!len)
+               return 0;
+
+       if (WARN_ON_ONCE(len < sizeof(info) + sizeof(handle) + fh_len))
+               return -EFAULT;
+
+       /* Copy event info fid header followed by variable sized file handle */
+       info.hdr.info_type = FAN_EVENT_INFO_TYPE_FID;
+       info.hdr.len = len;
+       info.fsid = event->fid.fsid;
+       if (copy_to_user(buf, &info, sizeof(info)))
+               return -EFAULT;
+
+       buf += sizeof(info);
+       len -= sizeof(info);
+       handle.handle_type = event->fh_type;
+       handle.handle_bytes = fh_len;
+       if (copy_to_user(buf, &handle, sizeof(handle)))
+               return -EFAULT;
+
+       buf += sizeof(handle);
+       len -= sizeof(handle);
+       if (copy_to_user(buf, fanotify_event_fh(event), fh_len))
+               return -EFAULT;
+
+       /* Pad with 0's */
+       buf += fh_len;
+       len -= fh_len;
+       WARN_ON_ONCE(len < 0 || len >= FANOTIFY_EVENT_ALIGN);
+       if (len > 0 && clear_user(buf, len))
+               return -EFAULT;
 
        return 0;
 }
 
 static ssize_t copy_event_to_user(struct fsnotify_group *group,
-                                 struct fsnotify_event *event,
+                                 struct fsnotify_event *fsn_event,
                                  char __user *buf, size_t count)
 {
-       struct fanotify_event_metadata fanotify_event_metadata;
-       struct file *f;
-       int fd, ret;
-
-       pr_debug("%s: group=%p event=%p\n", __func__, group, event);
-
-       ret = fill_event_metadata(group, &fanotify_event_metadata, event, &f);
-       if (ret < 0)
-               return ret;
+       struct fanotify_event_metadata metadata;
+       struct fanotify_event *event;
+       struct file *f = NULL;
+       int ret, fd = FAN_NOFD;
+
+       pr_debug("%s: group=%p event=%p\n", __func__, group, fsn_event);
+
+       event = container_of(fsn_event, struct fanotify_event, fse);
+       metadata.event_len = FAN_EVENT_METADATA_LEN;
+       metadata.metadata_len = FAN_EVENT_METADATA_LEN;
+       metadata.vers = FANOTIFY_METADATA_VERSION;
+       metadata.reserved = 0;
+       metadata.mask = event->mask & FANOTIFY_OUTGOING_EVENTS;
+       metadata.pid = pid_vnr(event->pid);
+
+       if (fanotify_event_has_path(event)) {
+               fd = create_fd(group, event, &f);
+               if (fd < 0)
+                       return fd;
+       } else if (fanotify_event_has_fid(event)) {
+               metadata.event_len += fanotify_event_info_len(event);
+       }
+       metadata.fd = fd;
 
-       fd = fanotify_event_metadata.fd;
        ret = -EFAULT;
        /*
         * Sanity check copy size in case get_one_event() and
         * fill_event_metadata() event_len sizes ever get out of sync.
         */
-       if (WARN_ON_ONCE(fanotify_event_metadata.event_len > count))
+       if (WARN_ON_ONCE(metadata.event_len > count))
                goto out_close_fd;
-       if (copy_to_user(buf, &fanotify_event_metadata,
-                        fanotify_event_metadata.event_len))
+
+       if (copy_to_user(buf, &metadata, FAN_EVENT_METADATA_LEN))
                goto out_close_fd;
 
        if (fanotify_is_perm_event(event->mask))
-               FANOTIFY_PE(event)->fd = fd;
+               FANOTIFY_PE(fsn_event)->fd = fd;
 
-       if (fd != FAN_NOFD)
+       if (fanotify_event_has_path(event)) {
                fd_install(fd, f);
-       return fanotify_event_metadata.event_len;
+       } else if (fanotify_event_has_fid(event)) {
+               ret = copy_fid_to_user(event, buf + FAN_EVENT_METADATA_LEN);
+               if (ret < 0)
+                       return ret;
+       }
+
+       return metadata.event_len;
 
 out_close_fd:
        if (fd != FAN_NOFD) {
@@ -276,10 +337,7 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
 
        add_wait_queue(&group->notification_waitq, &wait);
        while (1) {
-               spin_lock(&group->notification_lock);
                kevent = get_one_event(group, count);
-               spin_unlock(&group->notification_lock);
-
                if (IS_ERR(kevent)) {
                        ret = PTR_ERR(kevent);
                        break;
@@ -316,11 +374,13 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
                 * Permission events get queued to wait for response.  Other
                 * events can be destroyed now.
                 */
-               if (!fanotify_is_perm_event(kevent->mask)) {
+               if (!fanotify_is_perm_event(FANOTIFY_E(kevent)->mask)) {
                        fsnotify_destroy_event(group, kevent);
                } else {
                        if (ret <= 0) {
-                               FANOTIFY_PE(kevent)->response = FAN_DENY;
+                               spin_lock(&group->notification_lock);
+                               finish_permission_event(group,
+                                       FANOTIFY_PE(kevent), FAN_DENY);
                                wake_up(&group->fanotify_data.access_waitq);
                        } else {
                                spin_lock(&group->notification_lock);
@@ -370,7 +430,7 @@ static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t
 static int fanotify_release(struct inode *ignored, struct file *file)
 {
        struct fsnotify_group *group = file->private_data;
-       struct fanotify_perm_event_info *event, *next;
+       struct fanotify_perm_event *event;
        struct fsnotify_event *fsn_event;
 
        /*
@@ -385,13 +445,12 @@ static int fanotify_release(struct inode *ignored, struct file *file)
         * and simulate reply from userspace.
         */
        spin_lock(&group->notification_lock);
-       list_for_each_entry_safe(event, next, &group->fanotify_data.access_list,
-                                fae.fse.list) {
-               pr_debug("%s: found group=%p event=%p\n", __func__, group,
-                        event);
-
+       while (!list_empty(&group->fanotify_data.access_list)) {
+               event = list_first_entry(&group->fanotify_data.access_list,
+                               struct fanotify_perm_event, fae.fse.list);
                list_del_init(&event->fae.fse.list);
-               event->response = FAN_ALLOW;
+               finish_permission_event(group, event, FAN_ALLOW);
+               spin_lock(&group->notification_lock);
        }
 
        /*
@@ -401,13 +460,14 @@ static int fanotify_release(struct inode *ignored, struct file *file)
         */
        while (!fsnotify_notify_queue_is_empty(group)) {
                fsn_event = fsnotify_remove_first_event(group);
-               if (!(fsn_event->mask & FANOTIFY_PERM_EVENTS)) {
+               if (!(FANOTIFY_E(fsn_event)->mask & FANOTIFY_PERM_EVENTS)) {
                        spin_unlock(&group->notification_lock);
                        fsnotify_destroy_event(group, fsn_event);
-                       spin_lock(&group->notification_lock);
                } else {
-                       FANOTIFY_PE(fsn_event)->response = FAN_ALLOW;
+                       finish_permission_event(group, FANOTIFY_PE(fsn_event),
+                                               FAN_ALLOW);
                }
+               spin_lock(&group->notification_lock);
        }
        spin_unlock(&group->notification_lock);
 
@@ -598,7 +658,8 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
 
 static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
                                                   fsnotify_connp_t *connp,
-                                                  unsigned int type)
+                                                  unsigned int type,
+                                                  __kernel_fsid_t *fsid)
 {
        struct fsnotify_mark *mark;
        int ret;
@@ -611,7 +672,7 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
                return ERR_PTR(-ENOMEM);
 
        fsnotify_init_mark(mark, group);
-       ret = fsnotify_add_mark_locked(mark, connp, type, 0);
+       ret = fsnotify_add_mark_locked(mark, connp, type, 0, fsid);
        if (ret) {
                fsnotify_put_mark(mark);
                return ERR_PTR(ret);
@@ -623,7 +684,8 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
 
 static int fanotify_add_mark(struct fsnotify_group *group,
                             fsnotify_connp_t *connp, unsigned int type,
-                            __u32 mask, unsigned int flags)
+                            __u32 mask, unsigned int flags,
+                            __kernel_fsid_t *fsid)
 {
        struct fsnotify_mark *fsn_mark;
        __u32 added;
@@ -631,7 +693,7 @@ static int fanotify_add_mark(struct fsnotify_group *group,
        mutex_lock(&group->mark_mutex);
        fsn_mark = fsnotify_find_mark(connp, group);
        if (!fsn_mark) {
-               fsn_mark = fanotify_add_new_mark(group, connp, type);
+               fsn_mark = fanotify_add_new_mark(group, connp, type, fsid);
                if (IS_ERR(fsn_mark)) {
                        mutex_unlock(&group->mark_mutex);
                        return PTR_ERR(fsn_mark);
@@ -648,23 +710,23 @@ static int fanotify_add_mark(struct fsnotify_group *group,
 
 static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
                                      struct vfsmount *mnt, __u32 mask,
-                                     unsigned int flags)
+                                     unsigned int flags, __kernel_fsid_t *fsid)
 {
        return fanotify_add_mark(group, &real_mount(mnt)->mnt_fsnotify_marks,
-                                FSNOTIFY_OBJ_TYPE_VFSMOUNT, mask, flags);
+                                FSNOTIFY_OBJ_TYPE_VFSMOUNT, mask, flags, fsid);
 }
 
 static int fanotify_add_sb_mark(struct fsnotify_group *group,
-                                     struct super_block *sb, __u32 mask,
-                                     unsigned int flags)
+                               struct super_block *sb, __u32 mask,
+                               unsigned int flags, __kernel_fsid_t *fsid)
 {
        return fanotify_add_mark(group, &sb->s_fsnotify_marks,
-                                FSNOTIFY_OBJ_TYPE_SB, mask, flags);
+                                FSNOTIFY_OBJ_TYPE_SB, mask, flags, fsid);
 }
 
 static int fanotify_add_inode_mark(struct fsnotify_group *group,
                                   struct inode *inode, __u32 mask,
-                                  unsigned int flags)
+                                  unsigned int flags, __kernel_fsid_t *fsid)
 {
        pr_debug("%s: group=%p inode=%p\n", __func__, group, inode);
 
@@ -679,7 +741,7 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
                return 0;
 
        return fanotify_add_mark(group, &inode->i_fsnotify_marks,
-                                FSNOTIFY_OBJ_TYPE_INODE, mask, flags);
+                                FSNOTIFY_OBJ_TYPE_INODE, mask, flags, fsid);
 }
 
 /* fanotify syscalls */
@@ -688,7 +750,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
        struct fsnotify_group *group;
        int f_flags, fd;
        struct user_struct *user;
-       struct fanotify_event_info *oevent;
+       struct fanotify_event *oevent;
 
        pr_debug("%s: flags=%x event_f_flags=%x\n",
                 __func__, flags, event_f_flags);
@@ -715,6 +777,10 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
                return -EINVAL;
        }
 
+       if ((flags & FAN_REPORT_FID) &&
+           (flags & FANOTIFY_CLASS_BITS) != FAN_CLASS_NOTIF)
+               return -EINVAL;
+
        user = get_current_user();
        if (atomic_read(&user->fanotify_listeners) > FANOTIFY_DEFAULT_MAX_LISTENERS) {
                free_uid(user);
@@ -739,7 +805,8 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
        atomic_inc(&user->fanotify_listeners);
        group->memcg = get_mem_cgroup_from_mm(current->mm);
 
-       oevent = fanotify_alloc_event(group, NULL, FS_Q_OVERFLOW, NULL);
+       oevent = fanotify_alloc_event(group, NULL, FS_Q_OVERFLOW, NULL,
+                                     FSNOTIFY_EVENT_NONE, NULL);
        if (unlikely(!oevent)) {
                fd = -ENOMEM;
                goto out_destroy_group;
@@ -801,6 +868,48 @@ out_destroy_group:
        return fd;
 }
 
+/* Check if filesystem can encode a unique fid */
+static int fanotify_test_fid(struct path *path, __kernel_fsid_t *fsid)
+{
+       __kernel_fsid_t root_fsid;
+       int err;
+
+       /*
+        * Make sure path is not in filesystem with zero fsid (e.g. tmpfs).
+        */
+       err = vfs_get_fsid(path->dentry, fsid);
+       if (err)
+               return err;
+
+       if (!fsid->val[0] && !fsid->val[1])
+               return -ENODEV;
+
+       /*
+        * Make sure path is not inside a filesystem subvolume (e.g. btrfs)
+        * which uses a different fsid than sb root.
+        */
+       err = vfs_get_fsid(path->dentry->d_sb->s_root, &root_fsid);
+       if (err)
+               return err;
+
+       if (root_fsid.val[0] != fsid->val[0] ||
+           root_fsid.val[1] != fsid->val[1])
+               return -EXDEV;
+
+       /*
+        * We need to make sure that the file system supports at least
+        * encoding a file handle so that the user can use name_to_handle_at()
+        * to compare the fid returned with an event to the file handle of
+        * watched objects. However, name_to_handle_at() requires that the
+        * filesystem also supports decoding file handles.
+        */
+       if (!path->dentry->d_sb->s_export_op ||
+           !path->dentry->d_sb->s_export_op->fh_to_dentry)
+               return -EOPNOTSUPP;
+
+       return 0;
+}
+
 static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
                            int dfd, const char  __user *pathname)
 {
@@ -809,6 +918,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
        struct fsnotify_group *group;
        struct fd f;
        struct path path;
+       __kernel_fsid_t __fsid, *fsid = NULL;
        u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS;
        unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
        int ret;
@@ -871,6 +981,18 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
            group->priority == FS_PRIO_0)
                goto fput_and_out;
 
+       /*
+        * Events with data type inode do not carry enough information to report
+        * event->fd, so we do not allow setting a mask for inode events unless
+        * the group supports reporting fid.
+        * inode events are not supported on a mount mark, because they do not
+        * carry enough information (i.e. path) to be filtered by mount point.
+        */
+       if (mask & FANOTIFY_INODE_EVENTS &&
+           (!FAN_GROUP_FLAG(group, FAN_REPORT_FID) ||
+            mark_type == FAN_MARK_MOUNT))
+               goto fput_and_out;
+
        if (flags & FAN_MARK_FLUSH) {
                ret = 0;
                if (mark_type == FAN_MARK_MOUNT)
@@ -886,6 +1008,14 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
        if (ret)
                goto fput_and_out;
 
+       if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) {
+               ret = fanotify_test_fid(&path, &__fsid);
+               if (ret)
+                       goto path_put_and_out;
+
+               fsid = &__fsid;
+       }
+
        /* inode held in place by reference to path; group by fget on fd */
        if (mark_type == FAN_MARK_INODE)
                inode = path.dentry->d_inode;
@@ -896,24 +1026,31 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
        switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)) {
        case FAN_MARK_ADD:
                if (mark_type == FAN_MARK_MOUNT)
-                       ret = fanotify_add_vfsmount_mark(group, mnt, mask, flags);
+                       ret = fanotify_add_vfsmount_mark(group, mnt, mask,
+                                                        flags, fsid);
                else if (mark_type == FAN_MARK_FILESYSTEM)
-                       ret = fanotify_add_sb_mark(group, mnt->mnt_sb, mask, flags);
+                       ret = fanotify_add_sb_mark(group, mnt->mnt_sb, mask,
+                                                  flags, fsid);
                else
-                       ret = fanotify_add_inode_mark(group, inode, mask, flags);
+                       ret = fanotify_add_inode_mark(group, inode, mask,
+                                                     flags, fsid);
                break;
        case FAN_MARK_REMOVE:
                if (mark_type == FAN_MARK_MOUNT)
-                       ret = fanotify_remove_vfsmount_mark(group, mnt, mask, flags);
+                       ret = fanotify_remove_vfsmount_mark(group, mnt, mask,
+                                                           flags);
                else if (mark_type == FAN_MARK_FILESYSTEM)
-                       ret = fanotify_remove_sb_mark(group, mnt->mnt_sb, mask, flags);
+                       ret = fanotify_remove_sb_mark(group, mnt->mnt_sb, mask,
+                                                     flags);
                else
-                       ret = fanotify_remove_inode_mark(group, inode, mask, flags);
+                       ret = fanotify_remove_inode_mark(group, inode, mask,
+                                                        flags);
                break;
        default:
                ret = -EINVAL;
        }
 
+path_put_and_out:
        path_put(&path);
 fput_and_out:
        fdput(f);
@@ -950,15 +1087,15 @@ COMPAT_SYSCALL_DEFINE6(fanotify_mark,
  */
 static int __init fanotify_user_setup(void)
 {
-       BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 7);
+       BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 8);
        BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9);
 
        fanotify_mark_cache = KMEM_CACHE(fsnotify_mark,
                                         SLAB_PANIC|SLAB_ACCOUNT);
-       fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, SLAB_PANIC);
+       fanotify_event_cachep = KMEM_CACHE(fanotify_event, SLAB_PANIC);
        if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) {
                fanotify_perm_event_cachep =
-                       KMEM_CACHE(fanotify_perm_event_info, SLAB_PANIC);
+                       KMEM_CACHE(fanotify_perm_event, SLAB_PANIC);
        }
 
        return 0;
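
A minimal userspace sketch of the new FAN_REPORT_FID mode (error handling trimmed; fanotify_init() needs CAP_SYS_ADMIN and open_by_handle_at() needs CAP_DAC_READ_SEARCH; the FAN_CREATE/FAN_DELETE directory-entry events are assumed to come from the same series): the group is created with FAN_CLASS_NOTIF | FAN_REPORT_FID, a filesystem mark is placed on the given path, and each event's fid info record is resolved back to an open descriptor.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/fanotify.h>

int main(int argc, char *argv[])
{
	char buf[4096];
	struct fanotify_event_metadata *md;
	ssize_t len;
	int fan_fd, mount_fd;

	if (argc < 2)
		return 1;
	/* FAN_REPORT_FID is only allowed with FAN_CLASS_NOTIF (see above). */
	fan_fd = fanotify_init(FAN_CLASS_NOTIF | FAN_REPORT_FID, 0);
	mount_fd = open(argv[1], O_DIRECTORY | O_RDONLY);
	if (fan_fd < 0 || mount_fd < 0)
		return 1;
	fanotify_mark(fan_fd, FAN_MARK_ADD | FAN_MARK_FILESYSTEM,
		      FAN_CREATE | FAN_DELETE | FAN_ONDIR, AT_FDCWD, argv[1]);

	len = read(fan_fd, buf, sizeof(buf));
	for (md = (void *)buf; FAN_EVENT_OK(md, len);
	     md = FAN_EVENT_NEXT(md, len)) {
		/* With FAN_REPORT_FID a fid info record follows the metadata. */
		struct fanotify_event_info_fid *fid = (void *)(md + 1);
		struct file_handle *fh = (struct file_handle *)fid->handle;
		int fd = open_by_handle_at(mount_fd, fh, O_RDONLY);

		printf("mask 0x%llx fsid %x.%x fd %d\n",
		       (unsigned long long)md->mask,
		       fid->fsid.val[0], fid->fsid.val[1], fd);
		if (fd >= 0)
			close(fd);
	}
	return 0;
}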
index ecf09b6243d926b6ed7f6f474aca69f526c3824a..df06f3da166c1e64160e0d7c90811a17863d9fbd 100644 (file)
@@ -328,16 +328,15 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
             const unsigned char *file_name, u32 cookie)
 {
        struct fsnotify_iter_info iter_info = {};
-       struct super_block *sb = NULL;
+       struct super_block *sb = to_tell->i_sb;
        struct mount *mnt = NULL;
-       __u32 mnt_or_sb_mask = 0;
+       __u32 mnt_or_sb_mask = sb->s_fsnotify_mask;
        int ret = 0;
        __u32 test_mask = (mask & ALL_FSNOTIFY_EVENTS);
 
        if (data_is == FSNOTIFY_EVENT_PATH) {
                mnt = real_mount(((const struct path *)data)->mnt);
-               sb = mnt->mnt.mnt_sb;
-               mnt_or_sb_mask = mnt->mnt_fsnotify_mask | sb->s_fsnotify_mask;
+               mnt_or_sb_mask |= mnt->mnt_fsnotify_mask;
        }
        /* An event "on child" is not intended for a mount/sb mark */
        if (mask & FS_EVENT_ON_CHILD)
@@ -350,8 +349,8 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
         * SRCU because we have no references to any objects and do not
         * need SRCU to keep them "alive".
         */
-       if (!to_tell->i_fsnotify_marks &&
-           (!mnt || (!mnt->mnt_fsnotify_marks && !sb->s_fsnotify_marks)))
+       if (!to_tell->i_fsnotify_marks && !sb->s_fsnotify_marks &&
+           (!mnt || !mnt->mnt_fsnotify_marks))
                return 0;
        /*
         * if this is a modify event we may need to clear the ignored masks
@@ -366,11 +365,11 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
 
        iter_info.marks[FSNOTIFY_OBJ_TYPE_INODE] =
                fsnotify_first_mark(&to_tell->i_fsnotify_marks);
+       iter_info.marks[FSNOTIFY_OBJ_TYPE_SB] =
+               fsnotify_first_mark(&sb->s_fsnotify_marks);
        if (mnt) {
                iter_info.marks[FSNOTIFY_OBJ_TYPE_VFSMOUNT] =
                        fsnotify_first_mark(&mnt->mnt_fsnotify_marks);
-               iter_info.marks[FSNOTIFY_OBJ_TYPE_SB] =
-                       fsnotify_first_mark(&sb->s_fsnotify_marks);
        }
 
        /*
index 7e4578d35b613ce97b87a53e50b2b34df0561e37..74ae6030518989584a4471b07dc1e57ecd25cb9f 100644 (file)
@@ -5,6 +5,7 @@
 
 struct inotify_event_info {
        struct fsnotify_event fse;
+       u32 mask;
        int wd;
        u32 sync_cookie;
        int name_len;
index f4184b4f38154443816a61e89d461f352e83d76f..ff30abd6a49b1ad6d93e9ddc3b54c3b7c5ab2d74 100644 (file)
@@ -43,11 +43,11 @@ static bool event_compare(struct fsnotify_event *old_fsn,
 {
        struct inotify_event_info *old, *new;
 
-       if (old_fsn->mask & FS_IN_IGNORED)
-               return false;
        old = INOTIFY_E(old_fsn);
        new = INOTIFY_E(new_fsn);
-       if ((old_fsn->mask == new_fsn->mask) &&
+       if (old->mask & FS_IN_IGNORED)
+               return false;
+       if ((old->mask == new->mask) &&
            (old_fsn->inode == new_fsn->inode) &&
            (old->name_len == new->name_len) &&
            (!old->name_len || !strcmp(old->name, new->name)))
@@ -113,8 +113,18 @@ int inotify_handle_event(struct fsnotify_group *group,
                return -ENOMEM;
        }
 
+       /*
+        * We now report the FS_ISDIR flag with MOVE_SELF and DELETE_SELF
+        * events for fanotify. inotify never reported IN_ISDIR with those
+        * events. That looks like an oversight, but to avoid the risk of
+        * breaking existing inotify programs, mask the flag out of those
+        * events.
+        */
+       if (mask & (IN_MOVE_SELF | IN_DELETE_SELF))
+               mask &= ~IN_ISDIR;
+
        fsn_event = &event->fse;
-       fsnotify_init_event(fsn_event, inode, mask);
+       fsnotify_init_event(fsn_event, inode);
+       event->mask = mask;
        event->wd = i_mark->wd;
        event->sync_cookie = cookie;
        event->name_len = len;
index 798f1253141aee87fe11a1106ada42d33ccaadd2..e2901fbb9f76c97a7abe1607f5aa51f53a842966 100644 (file)
@@ -189,7 +189,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
         */
        pad_name_len = round_event_name_len(fsn_event);
        inotify_event.len = pad_name_len;
-       inotify_event.mask = inotify_mask_to_arg(fsn_event->mask);
+       inotify_event.mask = inotify_mask_to_arg(event->mask);
        inotify_event.wd = event->wd;
        inotify_event.cookie = event->sync_cookie;
 
@@ -634,7 +634,8 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events)
                return ERR_PTR(-ENOMEM);
        }
        group->overflow_event = &oevent->fse;
-       fsnotify_init_event(group->overflow_event, NULL, FS_Q_OVERFLOW);
+       fsnotify_init_event(group->overflow_event, NULL);
+       oevent->mask = FS_Q_OVERFLOW;
        oevent->wd = -1;
        oevent->sync_cookie = 0;
        oevent->name_len = 0;
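
For comparison, a minimal inotify watcher built only on the long-standing inotify(7) API; with the hunk above, IN_DELETE_SELF and IN_MOVE_SELF keep being delivered without IN_ISDIR even when the watched object is a directory, so the isdir flag below stays 0 for those events.

#include <stdio.h>
#include <unistd.h>
#include <sys/inotify.h>

int main(int argc, char *argv[])
{
	char buf[4096];
	ssize_t len;
	char *p;
	int fd = inotify_init1(0);

	if (argc < 2 || fd < 0)
		return 1;
	if (inotify_add_watch(fd, argv[1], IN_DELETE_SELF | IN_MOVE_SELF) < 0)
		return 1;

	len = read(fd, buf, sizeof(buf));
	if (len <= 0)
		return 1;
	for (p = buf; p < buf + len; ) {
		struct inotify_event *ev = (struct inotify_event *)p;

		printf("wd=%d mask=0x%x isdir=%d\n", ev->wd, ev->mask,
		       !!(ev->mask & IN_ISDIR));
		p += sizeof(*ev) + ev->len;
	}
	return 0;
}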
index d2dd16cb5989144d4289cc879a4a34d2af81c216..d593d42695618f20f585e97ddb835305f7d72e00 100644 (file)
@@ -82,6 +82,7 @@
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/srcu.h>
+#include <linux/ratelimit.h>
 
 #include <linux/atomic.h>
 
@@ -481,7 +482,8 @@ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b)
 }
 
 static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
-                                              unsigned int type)
+                                              unsigned int type,
+                                              __kernel_fsid_t *fsid)
 {
        struct inode *inode = NULL;
        struct fsnotify_mark_connector *conn;
@@ -493,6 +495,11 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
        INIT_HLIST_HEAD(&conn->list);
        conn->type = type;
        conn->obj = connp;
+       /* Cache fsid of filesystem containing the object */
+       if (fsid)
+               conn->fsid = *fsid;
+       else
+               conn->fsid.val[0] = conn->fsid.val[1] = 0;
        if (conn->type == FSNOTIFY_OBJ_TYPE_INODE)
                inode = igrab(fsnotify_conn_inode(conn));
        /*
@@ -544,7 +551,7 @@ out:
  */
 static int fsnotify_add_mark_list(struct fsnotify_mark *mark,
                                  fsnotify_connp_t *connp, unsigned int type,
-                                 int allow_dups)
+                                 int allow_dups, __kernel_fsid_t *fsid)
 {
        struct fsnotify_mark *lmark, *last = NULL;
        struct fsnotify_mark_connector *conn;
@@ -553,15 +560,36 @@ static int fsnotify_add_mark_list(struct fsnotify_mark *mark,
 
        if (WARN_ON(!fsnotify_valid_obj_type(type)))
                return -EINVAL;
+
+       /* The backend is expected to check for a zero fsid (e.g. tmpfs) */
+       if (fsid && WARN_ON_ONCE(!fsid->val[0] && !fsid->val[1]))
+               return -ENODEV;
+
 restart:
        spin_lock(&mark->lock);
        conn = fsnotify_grab_connector(connp);
        if (!conn) {
                spin_unlock(&mark->lock);
-               err = fsnotify_attach_connector_to_object(connp, type);
+               err = fsnotify_attach_connector_to_object(connp, type, fsid);
                if (err)
                        return err;
                goto restart;
+       } else if (fsid && (conn->fsid.val[0] || conn->fsid.val[1]) &&
+                  (fsid->val[0] != conn->fsid.val[0] ||
+                   fsid->val[1] != conn->fsid.val[1])) {
+               /*
+                * The backend is expected to check for a non-uniform fsid
+                * (e.g. btrfs), but maybe we missed something?
+                * Only allow setting conn->fsid once, to a non-zero fsid.
+                * inotify and non-fid fanotify groups neither set nor test
+                * conn->fsid.
+                */
+               pr_warn_ratelimited("%s: fsid mismatch on object of type %u: "
+                                   "%x.%x != %x.%x\n", __func__, conn->type,
+                                   fsid->val[0], fsid->val[1],
+                                   conn->fsid.val[0], conn->fsid.val[1]);
+               err = -EXDEV;
+               goto out_err;
        }
 
        /* is mark the first mark? */
@@ -606,7 +634,7 @@ out_err:
  */
 int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
                             fsnotify_connp_t *connp, unsigned int type,
-                            int allow_dups)
+                            int allow_dups, __kernel_fsid_t *fsid)
 {
        struct fsnotify_group *group = mark->group;
        int ret = 0;
@@ -627,7 +655,7 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
        fsnotify_get_mark(mark); /* for g_list */
        spin_unlock(&mark->lock);
 
-       ret = fsnotify_add_mark_list(mark, connp, type, allow_dups);
+       ret = fsnotify_add_mark_list(mark, connp, type, allow_dups, fsid);
        if (ret)
                goto err;
 
@@ -648,13 +676,13 @@ err:
 }
 
 int fsnotify_add_mark(struct fsnotify_mark *mark, fsnotify_connp_t *connp,
-                     unsigned int type, int allow_dups)
+                     unsigned int type, int allow_dups, __kernel_fsid_t *fsid)
 {
        int ret;
        struct fsnotify_group *group = mark->group;
 
        mutex_lock(&group->mark_mutex);
-       ret = fsnotify_add_mark_locked(mark, connp, type, allow_dups);
+       ret = fsnotify_add_mark_locked(mark, connp, type, allow_dups, fsid);
        mutex_unlock(&group->mark_mutex);
        return ret;
 }
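
For backends that do not report fids the new argument is simply NULL, so conn->fsid stays zero and the mismatch check above never fires. A hypothetical call site (not part of this patch) might look like:

	/* Hypothetical backend attaching an inode mark without a cached fsid. */
	fsnotify_init_mark(mark, group);
	mark->mask = FS_MODIFY;
	ret = fsnotify_add_mark(mark, &inode->i_fsnotify_marks,
				FSNOTIFY_OBJ_TYPE_INODE, /* allow_dups */ 0,
				/* fsid */ NULL);
	if (ret)
		fsnotify_put_mark(mark);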
index 3c3e36745f591cf8dd6953f71390afeccac58e81..5f3a54d444b50b272d51ec11f9fac0b036bbeb2d 100644 (file)
@@ -71,7 +71,7 @@ void fsnotify_destroy_event(struct fsnotify_group *group,
                            struct fsnotify_event *event)
 {
        /* Overflow events are per-group and we don't want to free them */
-       if (!event || event->mask == FS_Q_OVERFLOW)
+       if (!event || event == group->overflow_event)
                return;
        /*
         * If the event is still queued, we have a problem... Do an unreliable
@@ -141,6 +141,18 @@ queue:
        return ret;
 }
 
+void fsnotify_remove_queued_event(struct fsnotify_group *group,
+                                 struct fsnotify_event *event)
+{
+       assert_spin_locked(&group->notification_lock);
+       /*
+        * We need to init the list head for the case of an overflow event so
+        * that the check in fsnotify_add_event() works
+        */
+       list_del_init(&event->list);
+       group->q_len--;
+}
+
 /*
  * Remove and return the first event from the notification list.  It is the
  * responsibility of the caller to destroy the obtained event
@@ -155,13 +167,7 @@ struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group)
 
        event = list_first_entry(&group->notification_list,
                                 struct fsnotify_event, list);
-       /*
-        * We need to init list head for the case of overflow event so that
-        * check in fsnotify_add_event() works
-        */
-       list_del_init(&event->list);
-       group->q_len--;
-
+       fsnotify_remove_queued_event(group, event);
        return event;
 }
 
@@ -194,23 +200,3 @@ void fsnotify_flush_notify(struct fsnotify_group *group)
        }
        spin_unlock(&group->notification_lock);
 }
-
-/*
- * fsnotify_create_event - Allocate a new event which will be sent to each
- * group's handle_event function if the group was interested in this
- * particular event.
- *
- * @inode the inode which is supposed to receive the event (sometimes a
- *     parent of the inode to which the event happened.
- * @mask what actually happened.
- * @data pointer to the object which was actually affected
- * @data_type flag indication if the data is a file, path, inode, nothing...
- * @name the filename, if available
- */
-void fsnotify_init_event(struct fsnotify_event *event, struct inode *inode,
-                        u32 mask)
-{
-       INIT_LIST_HEAD(&event->list);
-       event->inode = inode;
-       event->mask = mask;
-}
index 511b279ec69cdce7a3e434d9d9152fd95678bd36..5ab1849971b460bb7d5539f5c7a01a1af1e016c4 100644 (file)
@@ -140,9 +140,13 @@ struct pid_entry {
 #define REG(NAME, MODE, fops)                          \
        NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {})
 #define ONE(NAME, MODE, show)                          \
-       NOD(NAME, (S_IFREG|(MODE)),                     \
+       NOD(NAME, (S_IFREG|(MODE)),                     \
                NULL, &proc_single_file_operations,     \
                { .proc_show = show } )
+#define ATTR(LSM, NAME, MODE)                          \
+       NOD(NAME, (S_IFREG|(MODE)),                     \
+               NULL, &proc_pid_attr_operations,        \
+               { .lsm = LSM })
 
 /*
  * Count the number of hardlinks for the pid_entry table, excluding the .
@@ -1206,7 +1210,7 @@ static const struct file_operations proc_oom_score_adj_operations = {
        .llseek         = default_llseek,
 };
 
-#ifdef CONFIG_AUDITSYSCALL
+#ifdef CONFIG_AUDIT
 #define TMPBUFLEN 11
 static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
                                  size_t count, loff_t *ppos)
@@ -2521,7 +2525,7 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
        if (!task)
                return -ESRCH;
 
-       length = security_getprocattr(task,
+       length = security_getprocattr(task, PROC_I(inode)->op.lsm,
                                      (char*)file->f_path.dentry->d_name.name,
                                      &p);
        put_task_struct(task);
@@ -2570,7 +2574,9 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
        if (rv < 0)
                goto out_free;
 
-       rv = security_setprocattr(file->f_path.dentry->d_name.name, page, count);
+       rv = security_setprocattr(PROC_I(inode)->op.lsm,
+                                 file->f_path.dentry->d_name.name, page,
+                                 count);
        mutex_unlock(&current->signal->cred_guard_mutex);
 out_free:
        kfree(page);
@@ -2584,13 +2590,53 @@ static const struct file_operations proc_pid_attr_operations = {
        .llseek         = generic_file_llseek,
 };
 
+#define LSM_DIR_OPS(LSM) \
+static int proc_##LSM##_attr_dir_iterate(struct file *filp, \
+                            struct dir_context *ctx) \
+{ \
+       return proc_pident_readdir(filp, ctx, \
+                                  LSM##_attr_dir_stuff, \
+                                  ARRAY_SIZE(LSM##_attr_dir_stuff)); \
+} \
+\
+static const struct file_operations proc_##LSM##_attr_dir_ops = { \
+       .read           = generic_read_dir, \
+       .iterate        = proc_##LSM##_attr_dir_iterate, \
+       .llseek         = default_llseek, \
+}; \
+\
+static struct dentry *proc_##LSM##_attr_dir_lookup(struct inode *dir, \
+                               struct dentry *dentry, unsigned int flags) \
+{ \
+       return proc_pident_lookup(dir, dentry, \
+                                 LSM##_attr_dir_stuff, \
+                                 ARRAY_SIZE(LSM##_attr_dir_stuff)); \
+} \
+\
+static const struct inode_operations proc_##LSM##_attr_dir_inode_ops = { \
+       .lookup         = proc_##LSM##_attr_dir_lookup, \
+       .getattr        = pid_getattr, \
+       .setattr        = proc_setattr, \
+}
+
+#ifdef CONFIG_SECURITY_SMACK
+static const struct pid_entry smack_attr_dir_stuff[] = {
+       ATTR("smack", "current",        0666),
+};
+LSM_DIR_OPS(smack);
+#endif
+
 static const struct pid_entry attr_dir_stuff[] = {
-       REG("current",    S_IRUGO|S_IWUGO, proc_pid_attr_operations),
-       REG("prev",       S_IRUGO,         proc_pid_attr_operations),
-       REG("exec",       S_IRUGO|S_IWUGO, proc_pid_attr_operations),
-       REG("fscreate",   S_IRUGO|S_IWUGO, proc_pid_attr_operations),
-       REG("keycreate",  S_IRUGO|S_IWUGO, proc_pid_attr_operations),
-       REG("sockcreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
+       ATTR(NULL, "current",           0666),
+       ATTR(NULL, "prev",              0444),
+       ATTR(NULL, "exec",              0666),
+       ATTR(NULL, "fscreate",          0666),
+       ATTR(NULL, "keycreate",         0666),
+       ATTR(NULL, "sockcreate",        0666),
+#ifdef CONFIG_SECURITY_SMACK
+       DIR("smack",                    0555,
+           proc_smack_attr_dir_inode_ops, proc_smack_attr_dir_ops),
+#endif
 };
 
 static int proc_attr_dir_readdir(struct file *file, struct dir_context *ctx)
@@ -2998,7 +3044,7 @@ static const struct pid_entry tgid_base_stuff[] = {
        ONE("oom_score",  S_IRUGO, proc_oom_score),
        REG("oom_adj",    S_IRUGO|S_IWUSR, proc_oom_adj_operations),
        REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
-#ifdef CONFIG_AUDITSYSCALL
+#ifdef CONFIG_AUDIT
        REG("loginuid",   S_IWUSR|S_IRUGO, proc_loginuid_operations),
        REG("sessionid",  S_IRUGO, proc_sessionid_operations),
 #endif
@@ -3386,7 +3432,7 @@ static const struct pid_entry tid_base_stuff[] = {
        ONE("oom_score", S_IRUGO, proc_oom_score),
        REG("oom_adj",   S_IRUGO|S_IWUSR, proc_oom_adj_operations),
        REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
-#ifdef CONFIG_AUDITSYSCALL
+#ifdef CONFIG_AUDIT
        REG("loginuid",  S_IWUSR|S_IRUGO, proc_loginuid_operations),
        REG("sessionid",  S_IRUGO, proc_sessionid_operations),
 #endif
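
A short userspace sketch that exercises the generic attr interface converted to ATTR() above; with CONFIG_SECURITY_SMACK the per-LSM path /proc/self/attr/smack/current added by this hunk can be read the same way.

#include <stdio.h>

int main(void)
{
	char buf[256];
	FILE *f = fopen("/proc/self/attr/current", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	/* The file holds the current security context, if the LSM provides one. */
	if (fgets(buf, sizeof(buf), f))
		printf("current context: %s\n", buf);
	fclose(f);
	return 0;
}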
index 4fc5a9b68f76d8d1f9498aacef8920eb42ffc9b3..ea575375f210d393d43e94ff63ed0394285a7634 100644 (file)
@@ -82,6 +82,7 @@ union proc_op {
        int (*proc_show)(struct seq_file *m,
                struct pid_namespace *ns, struct pid *pid,
                struct task_struct *task);
+       const char *lsm;
 };
 
 struct proc_inode {
index f0216629621d6b52f21f19f9b4010181cfe45d4f..eea7af6f2f229e85f71e3ff3ac5d541e88956856 100644 (file)
@@ -67,6 +67,20 @@ static int statfs_by_dentry(struct dentry *dentry, struct kstatfs *buf)
        return retval;
 }
 
+int vfs_get_fsid(struct dentry *dentry, __kernel_fsid_t *fsid)
+{
+       struct kstatfs st;
+       int error;
+
+       error = statfs_by_dentry(dentry, &st);
+       if (error)
+               return error;
+
+       *fsid = st.f_fsid;
+       return 0;
+}
+EXPORT_SYMBOL(vfs_get_fsid);
+
 int vfs_statfs(const struct path *path, struct kstatfs *buf)
 {
        int error;
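
vfs_get_fsid() reports the same f_fsid that statfs(2) returns to userspace, so a FAN_REPORT_FID listener can match the fsid in an event's info record against a plain statfs() of the watched path. A minimal sketch:

#include <stdio.h>
#include <sys/vfs.h>

int main(int argc, char *argv[])
{
	struct statfs st;

	if (argc < 2 || statfs(argv[1], &st) != 0) {
		perror("statfs");
		return 1;
	}
	/* A zero fsid (e.g. tmpfs) is what makes fanotify_test_fid() fail. */
	printf("fsid: %x.%x\n", st.f_fsid.__val[0], st.f_fsid.__val[1]);
	return 0;
}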
index e3d684ea320303630f4a75135ed4e4bcc408484c..ffd8038ff7283fc0d83211a1f5653a1589f10c4b 100644 (file)
@@ -1474,6 +1474,17 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
        if (lvd->integritySeqExt.extLength)
                udf_load_logicalvolint(sb, leea_to_cpu(lvd->integritySeqExt));
        ret = 0;
+
+       if (!sbi->s_lvid_bh) {
+               /* We can't generate unique IDs without a valid LVID */
+               if (sb_rdonly(sb)) {
+                       UDF_SET_FLAG(sb, UDF_FLAG_RW_INCOMPAT);
+               } else {
+                       udf_warn(sb, "Damaged or missing LVID, forcing "
+                                    "readonly mount\n");
+                       ret = -EACCES;
+               }
+       }
 out_bh:
        brelse(bh);
        return ret;
@@ -1943,13 +1954,24 @@ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt,
        return 0;
 }
 
+static void udf_finalize_lvid(struct logicalVolIntegrityDesc *lvid)
+{
+       struct timespec64 ts;
+
+       ktime_get_real_ts64(&ts);
+       udf_time_to_disk_stamp(&lvid->recordingDateAndTime, ts);
+       lvid->descTag.descCRC = cpu_to_le16(
+               crc_itu_t(0, (char *)lvid + sizeof(struct tag),
+                       le16_to_cpu(lvid->descTag.descCRCLength)));
+       lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
+}
+
 static void udf_open_lvid(struct super_block *sb)
 {
        struct udf_sb_info *sbi = UDF_SB(sb);
        struct buffer_head *bh = sbi->s_lvid_bh;
        struct logicalVolIntegrityDesc *lvid;
        struct logicalVolIntegrityDescImpUse *lvidiu;
-       struct timespec64 ts;
 
        if (!bh)
                return;
@@ -1961,18 +1983,12 @@ static void udf_open_lvid(struct super_block *sb)
        mutex_lock(&sbi->s_alloc_mutex);
        lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
        lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
-       ktime_get_real_ts64(&ts);
-       udf_time_to_disk_stamp(&lvid->recordingDateAndTime, ts);
        if (le32_to_cpu(lvid->integrityType) == LVID_INTEGRITY_TYPE_CLOSE)
                lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN);
        else
                UDF_SET_FLAG(sb, UDF_FLAG_INCONSISTENT);
 
-       lvid->descTag.descCRC = cpu_to_le16(
-               crc_itu_t(0, (char *)lvid + sizeof(struct tag),
-                       le16_to_cpu(lvid->descTag.descCRCLength)));
-
-       lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
+       udf_finalize_lvid(lvid);
        mark_buffer_dirty(bh);
        sbi->s_lvid_dirty = 0;
        mutex_unlock(&sbi->s_alloc_mutex);
@@ -1986,7 +2002,6 @@ static void udf_close_lvid(struct super_block *sb)
        struct buffer_head *bh = sbi->s_lvid_bh;
        struct logicalVolIntegrityDesc *lvid;
        struct logicalVolIntegrityDescImpUse *lvidiu;
-       struct timespec64 ts;
 
        if (!bh)
                return;
@@ -1998,8 +2013,6 @@ static void udf_close_lvid(struct super_block *sb)
        mutex_lock(&sbi->s_alloc_mutex);
        lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
        lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
-       ktime_get_real_ts64(&ts);
-       udf_time_to_disk_stamp(&lvid->recordingDateAndTime, ts);
        if (UDF_MAX_WRITE_VERSION > le16_to_cpu(lvidiu->maxUDFWriteRev))
                lvidiu->maxUDFWriteRev = cpu_to_le16(UDF_MAX_WRITE_VERSION);
        if (sbi->s_udfrev > le16_to_cpu(lvidiu->minUDFReadRev))
@@ -2009,17 +2022,13 @@ static void udf_close_lvid(struct super_block *sb)
        if (!UDF_QUERY_FLAG(sb, UDF_FLAG_INCONSISTENT))
                lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_CLOSE);
 
-       lvid->descTag.descCRC = cpu_to_le16(
-                       crc_itu_t(0, (char *)lvid + sizeof(struct tag),
-                               le16_to_cpu(lvid->descTag.descCRCLength)));
-
-       lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
        /*
         * We set buffer uptodate unconditionally here to avoid spurious
         * warnings from mark_buffer_dirty() when previous EIO has marked
         * the buffer as !uptodate
         */
        set_buffer_uptodate(bh);
+       udf_finalize_lvid(lvid);
        mark_buffer_dirty(bh);
        sbi->s_lvid_dirty = 0;
        mutex_unlock(&sbi->s_alloc_mutex);
@@ -2048,8 +2057,8 @@ u64 lvid_get_unique_id(struct super_block *sb)
        if (!(++uniqueID & 0xFFFFFFFF))
                uniqueID += 16;
        lvhd->uniqueID = cpu_to_le64(uniqueID);
+       udf_updated_lvid(sb);
        mutex_unlock(&sbi->s_alloc_mutex);
-       mark_buffer_dirty(bh);
 
        return ret;
 }
@@ -2320,11 +2329,17 @@ static int udf_sync_fs(struct super_block *sb, int wait)
 
        mutex_lock(&sbi->s_alloc_mutex);
        if (sbi->s_lvid_dirty) {
+               struct buffer_head *bh = sbi->s_lvid_bh;
+               struct logicalVolIntegrityDesc *lvid;
+
+               lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
+               udf_finalize_lvid(lvid);
+
                /*
                 * Blockdevice will be synced later so we don't have to submit
                 * the buffer for IO
                 */
-               mark_buffer_dirty(sbi->s_lvid_bh);
+               mark_buffer_dirty(bh);
                sbi->s_lvid_dirty = 0;
        }
        mutex_unlock(&sbi->s_alloc_mutex);
index 999ad8d00d433b278554e7df259c076dec95f14e..1ef8acf35e7d8f33aea4ed771dd8e9aaf23eedd8 100644 (file)
@@ -339,14 +339,14 @@ xfs_ag_init_headers(
        { /* BNO root block */
                .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_BNO_BLOCK(mp)),
                .numblks = BTOBB(mp->m_sb.sb_blocksize),
-               .ops = &xfs_allocbt_buf_ops,
+               .ops = &xfs_bnobt_buf_ops,
                .work = &xfs_bnoroot_init,
                .need_init = true
        },
        { /* CNT root block */
                .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_CNT_BLOCK(mp)),
                .numblks = BTOBB(mp->m_sb.sb_blocksize),
-               .ops = &xfs_allocbt_buf_ops,
+               .ops = &xfs_cntbt_buf_ops,
                .work = &xfs_cntroot_init,
                .need_init = true
        },
@@ -361,7 +361,7 @@ xfs_ag_init_headers(
        { /* FINO root block */
                .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_FIBT_BLOCK(mp)),
                .numblks = BTOBB(mp->m_sb.sb_blocksize),
-               .ops = &xfs_inobt_buf_ops,
+               .ops = &xfs_finobt_buf_ops,
                .work = &xfs_btroot_init,
                .type = XFS_BTNUM_FINO,
                .need_init =  xfs_sb_version_hasfinobt(&mp->m_sb)
index e701ebc36c069f5696c5b6287474bf37fad4b05c..e2ba2a3b63b20a6378283e35e1c58c939f1d2476 100644 (file)
@@ -281,7 +281,7 @@ xfs_ag_resv_init(
                         */
                        ask = used = 0;
 
-                       mp->m_inotbt_nores = true;
+                       mp->m_finobt_nores = true;
 
                        error = xfs_refcountbt_calc_reserves(mp, tp, agno, &ask,
                                        &used);
index b715668886a4824adf8a1c4527236469afed89fe..bc3367b8b7bb0375d38a9462800916f083a5ff10 100644 (file)
@@ -568,9 +568,9 @@ xfs_agfl_verify(
        if (!xfs_sb_version_hascrc(&mp->m_sb))
                return NULL;
 
-       if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid))
+       if (!xfs_verify_magic(bp, agfl->agfl_magicnum))
                return __this_address;
-       if (be32_to_cpu(agfl->agfl_magicnum) != XFS_AGFL_MAGIC)
+       if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid))
                return __this_address;
        /*
         * during growfs operations, the perag is not fully initialised,
@@ -643,6 +643,7 @@ xfs_agfl_write_verify(
 
 const struct xfs_buf_ops xfs_agfl_buf_ops = {
        .name = "xfs_agfl",
+       .magic = { cpu_to_be32(XFS_AGFL_MAGIC), cpu_to_be32(XFS_AGFL_MAGIC) },
        .verify_read = xfs_agfl_read_verify,
        .verify_write = xfs_agfl_write_verify,
        .verify_struct = xfs_agfl_verify,
@@ -2587,8 +2588,10 @@ xfs_agf_verify(
                        return __this_address;
        }
 
-       if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
-             XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
+       if (!xfs_verify_magic(bp, agf->agf_magicnum))
+               return __this_address;
+
+       if (!(XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
              be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
              be32_to_cpu(agf->agf_flfirst) < xfs_agfl_size(mp) &&
              be32_to_cpu(agf->agf_fllast) < xfs_agfl_size(mp) &&
@@ -2670,6 +2673,7 @@ xfs_agf_write_verify(
 
 const struct xfs_buf_ops xfs_agf_buf_ops = {
        .name = "xfs_agf",
+       .magic = { cpu_to_be32(XFS_AGF_MAGIC), cpu_to_be32(XFS_AGF_MAGIC) },
        .verify_read = xfs_agf_read_verify,
        .verify_write = xfs_agf_write_verify,
        .verify_struct = xfs_agf_verify,
index 4e59cc8a280221973279f262e2ee613ba61a5b3d..9fe949f6055ec32e89e08d3cf01608cdbb678f42 100644 (file)
@@ -297,48 +297,34 @@ xfs_allocbt_verify(
        struct xfs_perag        *pag = bp->b_pag;
        xfs_failaddr_t          fa;
        unsigned int            level;
+       xfs_btnum_t             btnum = XFS_BTNUM_BNOi;
+
+       if (!xfs_verify_magic(bp, block->bb_magic))
+               return __this_address;
+
+       if (xfs_sb_version_hascrc(&mp->m_sb)) {
+               fa = xfs_btree_sblock_v5hdr_verify(bp);
+               if (fa)
+                       return fa;
+       }
 
        /*
-        * magic number and level verification
-        *
-        * During growfs operations, we can't verify the exact level or owner as
-        * the perag is not fully initialised and hence not attached to the
-        * buffer.  In this case, check against the maximum tree depth.
+        * The perag may not be attached during grow operations or fully
+        * initialized from the AGF during log recovery. Therefore we can only
+        * check against maximum tree depth from those contexts.
         *
-        * Similarly, during log recovery we will have a perag structure
-        * attached, but the agf information will not yet have been initialised
-        * from the on disk AGF. Again, we can only check against maximum limits
-        * in this case.
+        * Otherwise check against the per-tree limit. Peek at one of the
+        * verifier magic values to determine the type of tree we're verifying
+        * against.
         */
        level = be16_to_cpu(block->bb_level);
-       switch (block->bb_magic) {
-       case cpu_to_be32(XFS_ABTB_CRC_MAGIC):
-               fa = xfs_btree_sblock_v5hdr_verify(bp);
-               if (fa)
-                       return fa;
-               /* fall through */
-       case cpu_to_be32(XFS_ABTB_MAGIC):
-               if (pag && pag->pagf_init) {
-                       if (level >= pag->pagf_levels[XFS_BTNUM_BNOi])
-                               return __this_address;
-               } else if (level >= mp->m_ag_maxlevels)
+       if (bp->b_ops->magic[0] == cpu_to_be32(XFS_ABTC_MAGIC))
+               btnum = XFS_BTNUM_CNTi;
+       if (pag && pag->pagf_init) {
+               if (level >= pag->pagf_levels[btnum])
                        return __this_address;
-               break;
-       case cpu_to_be32(XFS_ABTC_CRC_MAGIC):
-               fa = xfs_btree_sblock_v5hdr_verify(bp);
-               if (fa)
-                       return fa;
-               /* fall through */
-       case cpu_to_be32(XFS_ABTC_MAGIC):
-               if (pag && pag->pagf_init) {
-                       if (level >= pag->pagf_levels[XFS_BTNUM_CNTi])
-                               return __this_address;
-               } else if (level >= mp->m_ag_maxlevels)
-                       return __this_address;
-               break;
-       default:
+       } else if (level >= mp->m_ag_maxlevels)
                return __this_address;
-       }
 
        return xfs_btree_sblock_verify(bp, mp->m_alloc_mxr[level != 0]);
 }
@@ -377,13 +363,23 @@ xfs_allocbt_write_verify(
 
 }
 
-const struct xfs_buf_ops xfs_allocbt_buf_ops = {
-       .name = "xfs_allocbt",
+const struct xfs_buf_ops xfs_bnobt_buf_ops = {
+       .name = "xfs_bnobt",
+       .magic = { cpu_to_be32(XFS_ABTB_MAGIC),
+                  cpu_to_be32(XFS_ABTB_CRC_MAGIC) },
        .verify_read = xfs_allocbt_read_verify,
        .verify_write = xfs_allocbt_write_verify,
        .verify_struct = xfs_allocbt_verify,
 };
 
+const struct xfs_buf_ops xfs_cntbt_buf_ops = {
+       .name = "xfs_cntbt",
+       .magic = { cpu_to_be32(XFS_ABTC_MAGIC),
+                  cpu_to_be32(XFS_ABTC_CRC_MAGIC) },
+       .verify_read = xfs_allocbt_read_verify,
+       .verify_write = xfs_allocbt_write_verify,
+       .verify_struct = xfs_allocbt_verify,
+};
 
 STATIC int
 xfs_bnobt_keys_inorder(
@@ -448,7 +444,7 @@ static const struct xfs_btree_ops xfs_bnobt_ops = {
        .init_rec_from_cur      = xfs_allocbt_init_rec_from_cur,
        .init_ptr_from_cur      = xfs_allocbt_init_ptr_from_cur,
        .key_diff               = xfs_bnobt_key_diff,
-       .buf_ops                = &xfs_allocbt_buf_ops,
+       .buf_ops                = &xfs_bnobt_buf_ops,
        .diff_two_keys          = xfs_bnobt_diff_two_keys,
        .keys_inorder           = xfs_bnobt_keys_inorder,
        .recs_inorder           = xfs_bnobt_recs_inorder,
@@ -470,7 +466,7 @@ static const struct xfs_btree_ops xfs_cntbt_ops = {
        .init_rec_from_cur      = xfs_allocbt_init_rec_from_cur,
        .init_ptr_from_cur      = xfs_allocbt_init_ptr_from_cur,
        .key_diff               = xfs_cntbt_key_diff,
-       .buf_ops                = &xfs_allocbt_buf_ops,
+       .buf_ops                = &xfs_cntbt_buf_ops,
        .diff_two_keys          = xfs_cntbt_diff_two_keys,
        .keys_inorder           = xfs_cntbt_keys_inorder,
        .recs_inorder           = xfs_cntbt_recs_inorder,
index 844ed87b190077115c760204659179bca1da8c43..2dd9ee2a2e08f2b99fedf66075fe44d65e3d16d2 100644 (file)
@@ -1336,3 +1336,20 @@ xfs_attr_node_get(xfs_da_args_t *args)
        xfs_da_state_free(state);
        return retval;
 }
+
+/* Returns true if the attribute entry name is valid. */
+bool
+xfs_attr_namecheck(
+       const void      *name,
+       size_t          length)
+{
+       /*
+        * MAXNAMELEN includes the trailing null, but (name/length) leave it
+        * out, so use >= for the length check.
+        */
+       if (length >= MAXNAMELEN)
+               return false;
+
+       /* There shouldn't be any nulls here */
+       return !memchr(name, 0, length);
+}
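
Callers elsewhere in the series are expected to use this helper to reject corrupt or oversized names before acting on them; a hypothetical call site (variable names assumed) would be:

	/* Hypothetical caller: bail out early on a corrupt on-disk attr name. */
	if (!xfs_attr_namecheck(name, namelen))
		return -EFSCORRUPTED;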
index bdf52a333f3f9a2d7d4492d54b0ecf9a919402a5..2297d84676669ff25aaad45f86686e6671a1d182 100644 (file)
@@ -145,6 +145,6 @@ int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
 int xfs_attr_remove_args(struct xfs_da_args *args);
 int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
                  int flags, struct attrlist_cursor_kern *cursor);
-
+bool xfs_attr_namecheck(const void *name, size_t length);
 
 #endif /* __XFS_ATTR_H__ */
index 2652d00842d6ba8c6479f816765c87dfc622d1cb..1f6e3965ff7425456ca64477a713573cb5e7943a 100644 (file)
@@ -245,25 +245,14 @@ xfs_attr3_leaf_verify(
        struct xfs_attr_leaf_entry      *entries;
        uint32_t                        end;    /* must be 32bit - see below */
        int                             i;
+       xfs_failaddr_t                  fa;
 
        xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
 
-       if (xfs_sb_version_hascrc(&mp->m_sb)) {
-               struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
-
-               if (ichdr.magic != XFS_ATTR3_LEAF_MAGIC)
-                       return __this_address;
+       fa = xfs_da3_blkinfo_verify(bp, bp->b_addr);
+       if (fa)
+               return fa;
 
-               if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid))
-                       return __this_address;
-               if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
-                       return __this_address;
-               if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn)))
-                       return __this_address;
-       } else {
-               if (ichdr.magic != XFS_ATTR_LEAF_MAGIC)
-                       return __this_address;
-       }
        /*
         * In recovery there is a transient state where count == 0 is valid
         * because we may have transitioned an empty shortform attr to a leaf
@@ -369,6 +358,8 @@ xfs_attr3_leaf_read_verify(
 
 const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = {
        .name = "xfs_attr3_leaf",
+       .magic16 = { cpu_to_be16(XFS_ATTR_LEAF_MAGIC),
+                    cpu_to_be16(XFS_ATTR3_LEAF_MAGIC) },
        .verify_read = xfs_attr3_leaf_read_verify,
        .verify_write = xfs_attr3_leaf_write_verify,
        .verify_struct = xfs_attr3_leaf_verify,
index d89363c6b5234d73cef58d4e9533a88f6de09c46..65ff600a8067875f3d898481e1ef2c271d55bdd9 100644 (file)
@@ -79,6 +79,7 @@ xfs_attr3_rmt_hdr_ok(
 static xfs_failaddr_t
 xfs_attr3_rmt_verify(
        struct xfs_mount        *mp,
+       struct xfs_buf          *bp,
        void                    *ptr,
        int                     fsbsize,
        xfs_daddr_t             bno)
@@ -87,7 +88,7 @@ xfs_attr3_rmt_verify(
 
        if (!xfs_sb_version_hascrc(&mp->m_sb))
                return __this_address;
-       if (rmt->rm_magic != cpu_to_be32(XFS_ATTR3_RMT_MAGIC))
+       if (!xfs_verify_magic(bp, rmt->rm_magic))
                return __this_address;
        if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_meta_uuid))
                return __this_address;
@@ -131,7 +132,7 @@ __xfs_attr3_rmt_read_verify(
                        *failaddr = __this_address;
                        return -EFSBADCRC;
                }
-               *failaddr = xfs_attr3_rmt_verify(mp, ptr, blksize, bno);
+               *failaddr = xfs_attr3_rmt_verify(mp, bp, ptr, blksize, bno);
                if (*failaddr)
                        return -EFSCORRUPTED;
                len -= blksize;
@@ -193,7 +194,7 @@ xfs_attr3_rmt_write_verify(
        while (len > 0) {
                struct xfs_attr3_rmt_hdr *rmt = (struct xfs_attr3_rmt_hdr *)ptr;
 
-               fa = xfs_attr3_rmt_verify(mp, ptr, blksize, bno);
+               fa = xfs_attr3_rmt_verify(mp, bp, ptr, blksize, bno);
                if (fa) {
                        xfs_verifier_error(bp, -EFSCORRUPTED, fa);
                        return;
@@ -220,6 +221,7 @@ xfs_attr3_rmt_write_verify(
 
 const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = {
        .name = "xfs_attr3_rmt",
+       .magic = { 0, cpu_to_be32(XFS_ATTR3_RMT_MAGIC) },
        .verify_read = xfs_attr3_rmt_read_verify,
        .verify_write = xfs_attr3_rmt_write_verify,
        .verify_struct = xfs_attr3_rmt_verify_struct,
index 332eefa2700ba7c86e480533b4d0de45339f26f8..48502cb9990f184a55b780372adaef3bda406509 100644 (file)
@@ -577,42 +577,44 @@ __xfs_bmap_add_free(
  */
 
 /*
- * Transform a btree format file with only one leaf node, where the
- * extents list will fit in the inode, into an extents format file.
- * Since the file extents are already in-core, all we have to do is
- * give up the space for the btree root and pitch the leaf block.
+ * Convert the inode format to extent format if it is currently in btree format,
+ * but the extent list is small enough that it fits into the extent format.
+ *
+ * Since the extents are already in-core, all we have to do is give up the space
+ * for the btree root and pitch the leaf block.
  */
 STATIC int                             /* error */
 xfs_bmap_btree_to_extents(
-       xfs_trans_t             *tp,    /* transaction pointer */
-       xfs_inode_t             *ip,    /* incore inode pointer */
-       xfs_btree_cur_t         *cur,   /* btree cursor */
+       struct xfs_trans        *tp,    /* transaction pointer */
+       struct xfs_inode        *ip,    /* incore inode pointer */
+       struct xfs_btree_cur    *cur,   /* btree cursor */
        int                     *logflagsp, /* inode logging flags */
        int                     whichfork)  /* data or attr fork */
 {
-       /* REFERENCED */
+       struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, whichfork);
+       struct xfs_mount        *mp = ip->i_mount;
+       struct xfs_btree_block  *rblock = ifp->if_broot;
        struct xfs_btree_block  *cblock;/* child btree block */
        xfs_fsblock_t           cbno;   /* child block number */
        xfs_buf_t               *cbp;   /* child block's buffer */
        int                     error;  /* error return value */
-       struct xfs_ifork        *ifp;   /* inode fork data */
-       xfs_mount_t             *mp;    /* mount point structure */
        __be64                  *pp;    /* ptr to block address */
-       struct xfs_btree_block  *rblock;/* root btree block */
        struct xfs_owner_info   oinfo;
 
-       mp = ip->i_mount;
-       ifp = XFS_IFORK_PTR(ip, whichfork);
+       /* check if we actually need the extent format first: */
+       if (!xfs_bmap_wants_extents(ip, whichfork))
+               return 0;
+
+       ASSERT(cur);
        ASSERT(whichfork != XFS_COW_FORK);
        ASSERT(ifp->if_flags & XFS_IFEXTENTS);
        ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
-       rblock = ifp->if_broot;
        ASSERT(be16_to_cpu(rblock->bb_level) == 1);
        ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1);
        ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1);
+
        pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes);
        cbno = be64_to_cpu(*pp);
-       *logflagsp = 0;
 #ifdef DEBUG
        XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
                        xfs_btree_check_lptr(cur, cbno, 1));
@@ -635,7 +637,7 @@ xfs_bmap_btree_to_extents(
        ASSERT(ifp->if_broot == NULL);
        ASSERT((ifp->if_flags & XFS_IFBROOT) == 0);
        XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
-       *logflagsp = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
+       *logflagsp |= XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
        return 0;
 }
 
@@ -2029,7 +2031,7 @@ done:
 /*
  * Convert an unwritten allocation to a real allocation or vice versa.
  */
-STATIC int                             /* error */
+int                                    /* error */
 xfs_bmap_add_extent_unwritten_real(
        struct xfs_trans        *tp,
        xfs_inode_t             *ip,    /* incore inode pointer */
@@ -3685,17 +3687,6 @@ xfs_trim_extent(
        }
 }
 
-/* trim extent to within eof */
-void
-xfs_trim_extent_eof(
-       struct xfs_bmbt_irec    *irec,
-       struct xfs_inode        *ip)
-
-{
-       xfs_trim_extent(irec, 0, XFS_B_TO_FSB(ip->i_mount,
-                                             i_size_read(VFS_I(ip))));
-}
-
 /*
  * Trim the returned map to the required bounds
  */
@@ -4203,6 +4194,44 @@ xfs_bmapi_convert_unwritten(
        return 0;
 }
 
+static inline xfs_extlen_t
+xfs_bmapi_minleft(
+       struct xfs_trans        *tp,
+       struct xfs_inode        *ip,
+       int                     fork)
+{
+       if (tp && tp->t_firstblock != NULLFSBLOCK)
+               return 0;
+       if (XFS_IFORK_FORMAT(ip, fork) != XFS_DINODE_FMT_BTREE)
+               return 1;
+       return be16_to_cpu(XFS_IFORK_PTR(ip, fork)->if_broot->bb_level) + 1;
+}
+
+/*
+ * Log whatever the flags say, even on error.  Otherwise we might miss detecting
+ * a case where the data is changed, there's an error, and it's not logged, so we
+ * don't shut down when we should.  Don't bother logging extents/btree changes if
+ * we converted to the other format.
+ */
+static void
+xfs_bmapi_finish(
+       struct xfs_bmalloca     *bma,
+       int                     whichfork,
+       int                     error)
+{
+       if ((bma->logflags & xfs_ilog_fext(whichfork)) &&
+           XFS_IFORK_FORMAT(bma->ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+               bma->logflags &= ~xfs_ilog_fext(whichfork);
+       else if ((bma->logflags & xfs_ilog_fbroot(whichfork)) &&
+                XFS_IFORK_FORMAT(bma->ip, whichfork) != XFS_DINODE_FMT_BTREE)
+               bma->logflags &= ~xfs_ilog_fbroot(whichfork);
+
+       if (bma->logflags)
+               xfs_trans_log_inode(bma->tp, bma->ip, bma->logflags);
+       if (bma->cur)
+               xfs_btree_del_cursor(bma->cur, error);
+}
+
 /*
  * Map file blocks to filesystem blocks, and allocate blocks or convert the
  * extent state if necessary.  Details behaviour is controlled by the flags
@@ -4247,9 +4276,7 @@ xfs_bmapi_write(
 
        ASSERT(*nmap >= 1);
        ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
-       ASSERT(tp != NULL ||
-              (flags & (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK)) ==
-                       (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK));
+       ASSERT(tp != NULL);
        ASSERT(len > 0);
        ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
@@ -4282,25 +4309,12 @@ xfs_bmapi_write(
 
        XFS_STATS_INC(mp, xs_blk_mapw);
 
-       if (!tp || tp->t_firstblock == NULLFSBLOCK) {
-               if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE)
-                       bma.minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1;
-               else
-                       bma.minleft = 1;
-       } else {
-               bma.minleft = 0;
-       }
-
        if (!(ifp->if_flags & XFS_IFEXTENTS)) {
                error = xfs_iread_extents(tp, ip, whichfork);
                if (error)
                        goto error0;
        }
 
-       n = 0;
-       end = bno + len;
-       obno = bno;
-
        if (!xfs_iext_lookup_extent(ip, ifp, bno, &bma.icur, &bma.got))
                eof = true;
        if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev))
@@ -4309,7 +4323,11 @@ xfs_bmapi_write(
        bma.ip = ip;
        bma.total = total;
        bma.datatype = 0;
+       bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork);
 
+       n = 0;
+       end = bno + len;
+       obno = bno;
        while (bno < end && n < *nmap) {
                bool                    need_alloc = false, wasdelay = false;
 
@@ -4323,26 +4341,7 @@ xfs_bmapi_write(
                        ASSERT(!((flags & XFS_BMAPI_CONVERT) &&
                                 (flags & XFS_BMAPI_COWFORK)));
 
-                       if (flags & XFS_BMAPI_DELALLOC) {
-                               /*
-                                * For the COW fork we can reasonably get a
-                                * request for converting an extent that races
-                                * with other threads already having converted
-                                * part of it, as there converting COW to
-                                * regular blocks is not protected using the
-                                * IOLOCK.
-                                */
-                               ASSERT(flags & XFS_BMAPI_COWFORK);
-                               if (!(flags & XFS_BMAPI_COWFORK)) {
-                                       error = -EIO;
-                                       goto error0;
-                               }
-
-                               if (eof || bno >= end)
-                                       break;
-                       } else {
-                               need_alloc = true;
-                       }
+                       need_alloc = true;
                } else if (isnullstartblock(bma.got.br_startblock)) {
                        wasdelay = true;
                }
@@ -4351,8 +4350,7 @@ xfs_bmapi_write(
                 * First, deal with the hole before the allocated space
                 * that we found, if any.
                 */
-               if ((need_alloc || wasdelay) &&
-                   !(flags & XFS_BMAPI_CONVERT_ONLY)) {
+               if (need_alloc || wasdelay) {
                        bma.eof = eof;
                        bma.conv = !!(flags & XFS_BMAPI_CONVERT);
                        bma.wasdel = wasdelay;
@@ -4420,49 +4418,130 @@ xfs_bmapi_write(
        }
        *nmap = n;
 
-       /*
-        * Transform from btree to extents, give it cur.
-        */
-       if (xfs_bmap_wants_extents(ip, whichfork)) {
-               int             tmp_logflags = 0;
-
-               ASSERT(bma.cur);
-               error = xfs_bmap_btree_to_extents(tp, ip, bma.cur,
-                       &tmp_logflags, whichfork);
-               bma.logflags |= tmp_logflags;
-               if (error)
-                       goto error0;
-       }
+       error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags,
+                       whichfork);
+       if (error)
+               goto error0;
 
        ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE ||
               XFS_IFORK_NEXTENTS(ip, whichfork) >
                XFS_IFORK_MAXEXT(ip, whichfork));
-       error = 0;
+       xfs_bmapi_finish(&bma, whichfork, 0);
+       xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval,
+               orig_nmap, *nmap);
+       return 0;
 error0:
+       xfs_bmapi_finish(&bma, whichfork, error);
+       return error;
+}
+
+/*
+ * Convert an existing delalloc extent to real blocks based on file offset. This
+ * attempts to allocate the entire delalloc extent and may require multiple
+ * invocations to allocate the target offset if a large enough physical extent
+ * is not available.
+ */
+int
+xfs_bmapi_convert_delalloc(
+       struct xfs_inode        *ip,
+       int                     whichfork,
+       xfs_fileoff_t           offset_fsb,
+       struct xfs_bmbt_irec    *imap,
+       unsigned int            *seq)
+{
+       struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, whichfork);
+       struct xfs_mount        *mp = ip->i_mount;
+       struct xfs_bmalloca     bma = { NULL };
+       struct xfs_trans        *tp;
+       int                     error;
+
        /*
-        * Log everything.  Do this after conversion, there's no point in
-        * logging the extent records if we've converted to btree format.
+        * Space for the extent and indirect blocks was reserved when the
+        * delalloc extent was created so there's no need to do so here.
         */
-       if ((bma.logflags & xfs_ilog_fext(whichfork)) &&
-           XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
-               bma.logflags &= ~xfs_ilog_fext(whichfork);
-       else if ((bma.logflags & xfs_ilog_fbroot(whichfork)) &&
-                XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
-               bma.logflags &= ~xfs_ilog_fbroot(whichfork);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0,
+                               XFS_TRANS_RESERVE, &tp);
+       if (error)
+               return error;
+
+       xfs_ilock(ip, XFS_ILOCK_EXCL);
+       xfs_trans_ijoin(tp, ip, 0);
+
+       if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &bma.icur, &bma.got) ||
+           bma.got.br_startoff > offset_fsb) {
+               /*
+                * No extent found in the range we are trying to convert.  This
+                * should only happen for the COW fork, where another thread
+                * might have moved the extent to the data fork in the meantime.
+                */
+               WARN_ON_ONCE(whichfork != XFS_COW_FORK);
+               error = -EAGAIN;
+               goto out_trans_cancel;
+       }
+
        /*
-        * Log whatever the flags say, even if error.  Otherwise we might miss
-        * detecting a case where the data is changed, there's an error,
-        * and it's not logged so we don't shutdown when we should.
+        * If we find a real extent here we raced with another thread converting
+        * the extent.  Just return the real extent at this offset.
         */
-       if (bma.logflags)
-               xfs_trans_log_inode(tp, ip, bma.logflags);
+       if (!isnullstartblock(bma.got.br_startblock)) {
+               *imap = bma.got;
+               *seq = READ_ONCE(ifp->if_seq);
+               goto out_trans_cancel;
+       }
+
+       bma.tp = tp;
+       bma.ip = ip;
+       bma.wasdel = true;
+       bma.offset = bma.got.br_startoff;
+       bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount, MAXEXTLEN);
+       bma.total = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK);
+       bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork);
+       if (whichfork == XFS_COW_FORK)
+               bma.flags = XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC;
 
-       if (bma.cur) {
-               xfs_btree_del_cursor(bma.cur, error);
+       if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev))
+               bma.prev.br_startoff = NULLFILEOFF;
+
+       error = xfs_bmapi_allocate(&bma);
+       if (error)
+               goto out_finish;
+
+       error = -ENOSPC;
+       if (WARN_ON_ONCE(bma.blkno == NULLFSBLOCK))
+               goto out_finish;
+       error = -EFSCORRUPTED;
+       if (WARN_ON_ONCE(!bma.got.br_startblock && !XFS_IS_REALTIME_INODE(ip)))
+               goto out_finish;
+
+       XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, bma.length));
+       XFS_STATS_INC(mp, xs_xstrat_quick);
+
+       ASSERT(!isnullstartblock(bma.got.br_startblock));
+       *imap = bma.got;
+       *seq = READ_ONCE(ifp->if_seq);
+
+       if (whichfork == XFS_COW_FORK) {
+               error = xfs_refcount_alloc_cow_extent(tp, bma.blkno,
+                               bma.length);
+               if (error)
+                       goto out_finish;
        }
-       if (!error)
-               xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval,
-                       orig_nmap, *nmap);
+
+       error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags,
+                       whichfork);
+       if (error)
+               goto out_finish;
+
+       xfs_bmapi_finish(&bma, whichfork, 0);
+       error = xfs_trans_commit(tp);
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+       return error;
+
+out_finish:
+       xfs_bmapi_finish(&bma, whichfork, error);
+out_trans_cancel:
+       xfs_trans_cancel(tp);
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
        return error;
 }
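The -EAGAIN return above implies a calling convention: a writeback caller that found a delalloc extent in the COW fork must be prepared to redo its fork lookup if another thread wins the conversion race. A minimal sketch of that retry shape follows; only xfs_bmapi_convert_delalloc() and its arguments come from this patch, and example_lookup_fork() is a hypothetical helper standing in for the caller's own extent lookup.

	/* Sketch only: retry the conversion after losing a COW fork race. */
	static int
	example_convert_blocks(
		struct xfs_inode	*ip,
		xfs_fileoff_t		offset_fsb,
		struct xfs_bmbt_irec	*imap,
		unsigned int		*seq)
	{
		int			whichfork = XFS_COW_FORK;
		int			error;

		for (;;) {
			error = xfs_bmapi_convert_delalloc(ip, whichfork,
					offset_fsb, imap, seq);
			if (error != -EAGAIN)
				return error;
			/*
			 * The delalloc extent moved to the data fork while we
			 * allocated the transaction.  Redo the fork selection
			 * (hypothetical helper) and convert what is there now.
			 */
			whichfork = example_lookup_fork(ip, offset_fsb, imap);
		}
	}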
 
@@ -4536,13 +4615,7 @@ xfs_bmapi_remap(
        if (error)
                goto error0;
 
-       if (xfs_bmap_wants_extents(ip, whichfork)) {
-               int             tmp_logflags = 0;
-
-               error = xfs_bmap_btree_to_extents(tp, ip, cur,
-                       &tmp_logflags, whichfork);
-               logflags |= tmp_logflags;
-       }
+       error = xfs_bmap_btree_to_extents(tp, ip, cur, &logflags, whichfork);
 
 error0:
        if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS)
@@ -5406,24 +5479,11 @@ nodelete:
                error = xfs_bmap_extents_to_btree(tp, ip, &cur, 0,
                                &tmp_logflags, whichfork);
                logflags |= tmp_logflags;
-               if (error)
-                       goto error0;
-       }
-       /*
-        * transform from btree to extents, give it cur
-        */
-       else if (xfs_bmap_wants_extents(ip, whichfork)) {
-               ASSERT(cur != NULL);
-               error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags,
+       } else {
+               error = xfs_bmap_btree_to_extents(tp, ip, cur, &logflags,
                        whichfork);
-               logflags |= tmp_logflags;
-               if (error)
-                       goto error0;
        }
-       /*
-        * transform from extents to local?
-        */
-       error = 0;
+
 error0:
        /*
         * Log everything.  Do this after conversion, there's no point in
index 09d3ea97cc15a207cd9b6445ddab1372fc012146..8f597f9abdbe952e8e92fb4fc12f568fa9886cba 100644 (file)
@@ -95,12 +95,6 @@ struct xfs_extent_free_item
 /* Map something in the CoW fork. */
 #define XFS_BMAPI_COWFORK      0x200
 
-/* Only convert delalloc space, don't allocate entirely new extents */
-#define XFS_BMAPI_DELALLOC     0x400
-
-/* Only convert unwritten extents, don't allocate new blocks */
-#define XFS_BMAPI_CONVERT_ONLY 0x800
-
 /* Skip online discard of freed extents */
 #define XFS_BMAPI_NODISCARD    0x1000
 
@@ -117,8 +111,6 @@ struct xfs_extent_free_item
        { XFS_BMAPI_ZERO,       "ZERO" }, \
        { XFS_BMAPI_REMAP,      "REMAP" }, \
        { XFS_BMAPI_COWFORK,    "COWFORK" }, \
-       { XFS_BMAPI_DELALLOC,   "DELALLOC" }, \
-       { XFS_BMAPI_CONVERT_ONLY, "CONVERT_ONLY" }, \
        { XFS_BMAPI_NODISCARD,  "NODISCARD" }, \
        { XFS_BMAPI_NORMAP,     "NORMAP" }
 
@@ -181,7 +173,6 @@ static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec)
 
 void   xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno,
                xfs_filblks_t len);
-void   xfs_trim_extent_eof(struct xfs_bmbt_irec *, struct xfs_inode *);
 int    xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
 int    xfs_bmap_set_attrforkoff(struct xfs_inode *ip, int size, int *version);
 void   xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
@@ -228,6 +219,13 @@ int        xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork,
                xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc,
                struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur,
                int eof);
+int    xfs_bmapi_convert_delalloc(struct xfs_inode *ip, int whichfork,
+               xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap,
+               unsigned int *seq);
+int    xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp,
+               struct xfs_inode *ip, int whichfork,
+               struct xfs_iext_cursor *icur, struct xfs_btree_cur **curp,
+               struct xfs_bmbt_irec *new, int *logflagsp);
 
 static inline void
 xfs_bmap_add_free(
index cdb74d2e2a435bc446778dc2e1c9f7a4d7318506..aff82ed112c93c26f43bed5ada5fd4b82e4e3711 100644 (file)
@@ -416,8 +416,10 @@ xfs_bmbt_verify(
        xfs_failaddr_t          fa;
        unsigned int            level;
 
-       switch (block->bb_magic) {
-       case cpu_to_be32(XFS_BMAP_CRC_MAGIC):
+       if (!xfs_verify_magic(bp, block->bb_magic))
+               return __this_address;
+
+       if (xfs_sb_version_hascrc(&mp->m_sb)) {
                /*
                 * XXX: need a better way of verifying the owner here. Right now
                 * just make sure there has been one set.
@@ -425,11 +427,6 @@ xfs_bmbt_verify(
                fa = xfs_btree_lblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
                if (fa)
                        return fa;
-               /* fall through */
-       case cpu_to_be32(XFS_BMAP_MAGIC):
-               break;
-       default:
-               return __this_address;
        }
 
        /*
@@ -481,6 +478,8 @@ xfs_bmbt_write_verify(
 
 const struct xfs_buf_ops xfs_bmbt_buf_ops = {
        .name = "xfs_bmbt",
+       .magic = { cpu_to_be32(XFS_BMAP_MAGIC),
+                  cpu_to_be32(XFS_BMAP_CRC_MAGIC) },
        .verify_read = xfs_bmbt_read_verify,
        .verify_write = xfs_bmbt_write_verify,
        .verify_struct = xfs_bmbt_verify,
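The verifiers in this series stop open-coding magic numbers and instead call xfs_verify_magic()/xfs_verify_magic16() against the new .magic/.magic16 tables in struct xfs_buf_ops: slot 0 holds the pre-CRC (v4) magic, slot 1 the CRC-enabled (v5) magic, a 0 entry means "not valid on this format" (see the rmapbt, refcountbt and symlink ops later in this patch), and single-magic structures repeat the same value in both slots. The helper itself is not part of these hunks; roughly, it is expected to look like the following sketch.

	/* Sketch only: index the per-ops magic table by CRC-ness of the fs. */
	bool
	xfs_verify_magic(
		struct xfs_buf		*bp,
		__be32			dmagic)
	{
		struct xfs_mount	*mp = bp->b_target->bt_mount;
		int			idx;

		idx = xfs_sb_version_hascrc(&mp->m_sb);
		if (WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx]))
			return false;
		return dmagic == bp->b_ops->magic[idx];
	}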
index 376bee94b5dd0b92ecc94cab2a92247a08adc88e..e2737e2ac2aeb5e31a997ee3ed5f3800bf5ecfa7 100644 (file)
@@ -116,6 +116,34 @@ xfs_da_state_free(xfs_da_state_t *state)
        kmem_zone_free(xfs_da_state_zone, state);
 }
 
+/*
+ * Verify an xfs_da3_blkinfo structure. Note that the da3 fields are only
+ * accessible on v5 filesystems. This header format is common across da node,
+ * attr leaf and dir leaf blocks.
+ */
+xfs_failaddr_t
+xfs_da3_blkinfo_verify(
+       struct xfs_buf          *bp,
+       struct xfs_da3_blkinfo  *hdr3)
+{
+       struct xfs_mount        *mp = bp->b_target->bt_mount;
+       struct xfs_da_blkinfo   *hdr = &hdr3->hdr;
+
+       if (!xfs_verify_magic16(bp, hdr->magic))
+               return __this_address;
+
+       if (xfs_sb_version_hascrc(&mp->m_sb)) {
+               if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
+                       return __this_address;
+               if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
+                       return __this_address;
+               if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
+                       return __this_address;
+       }
+
+       return NULL;
+}
+
 static xfs_failaddr_t
 xfs_da3_node_verify(
        struct xfs_buf          *bp)
@@ -124,27 +152,16 @@ xfs_da3_node_verify(
        struct xfs_da_intnode   *hdr = bp->b_addr;
        struct xfs_da3_icnode_hdr ichdr;
        const struct xfs_dir_ops *ops;
+       xfs_failaddr_t          fa;
 
        ops = xfs_dir_get_ops(mp, NULL);
 
        ops->node_hdr_from_disk(&ichdr, hdr);
 
-       if (xfs_sb_version_hascrc(&mp->m_sb)) {
-               struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
-
-               if (ichdr.magic != XFS_DA3_NODE_MAGIC)
-                       return __this_address;
+       fa = xfs_da3_blkinfo_verify(bp, bp->b_addr);
+       if (fa)
+               return fa;
 
-               if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid))
-                       return __this_address;
-               if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
-                       return __this_address;
-               if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn)))
-                       return __this_address;
-       } else {
-               if (ichdr.magic != XFS_DA_NODE_MAGIC)
-                       return __this_address;
-       }
        if (ichdr.level == 0)
                return __this_address;
        if (ichdr.level > XFS_DA_NODE_MAXDEPTH)
@@ -257,6 +274,8 @@ xfs_da3_node_verify_struct(
 
 const struct xfs_buf_ops xfs_da3_node_buf_ops = {
        .name = "xfs_da3_node",
+       .magic16 = { cpu_to_be16(XFS_DA_NODE_MAGIC),
+                    cpu_to_be16(XFS_DA3_NODE_MAGIC) },
        .verify_read = xfs_da3_node_read_verify,
        .verify_write = xfs_da3_node_write_verify,
        .verify_struct = xfs_da3_node_verify_struct,
index 5d5bf3bffc783a1f3711cdf9edca5c2e4ccc2e4b..ae654e06b2fb693627311c8e59235d1f04e79773 100644 (file)
@@ -869,4 +869,7 @@ static inline unsigned int xfs_dir2_dirblock_bytes(struct xfs_sb *sbp)
        return 1 << (sbp->sb_blocklog + sbp->sb_dirblklog);
 }
 
+xfs_failaddr_t xfs_da3_blkinfo_verify(struct xfs_buf *bp,
+                                     struct xfs_da3_blkinfo *hdr3);
+
 #endif /* __XFS_DA_FORMAT_H__ */
index 229152cd1a246f34bf9237572a4680bd13093ab7..156ce95c9c4545de6b03cd638463e23f7fa4746e 100644 (file)
@@ -703,3 +703,20 @@ xfs_dir2_shrink_inode(
        xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
        return 0;
 }
+
+/* Returns true if the directory entry name is valid. */
+bool
+xfs_dir2_namecheck(
+       const void      *name,
+       size_t          length)
+{
+       /*
+        * MAXNAMELEN includes the trailing null, but (name/length) leave it
+        * out, so use >= for the length check.
+        */
+       if (length >= MAXNAMELEN)
+               return false;
+
+       /* There shouldn't be any slashes or nulls here */
+       return !memchr(name, '/', length) && !memchr(name, 0, length);
+}
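xfs_dir2_namecheck() is later used by the directory scrubber (see the xchk_dir_actor() hunk below) to reject garbage entries before acting on them. A few illustrative inputs, following directly from the checks above:

	xfs_dir2_namecheck("foo", 3);			/* true */
	xfs_dir2_namecheck("a/b", 3);			/* false: embedded '/' */
	xfs_dir2_namecheck("a\0b", 3);			/* false: embedded NUL */
	xfs_dir2_namecheck(longname, MAXNAMELEN);	/* false: length >= MAXNAMELEN */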
index c3e3f6b813d869cb2f7a78bebc3b2a5765ab58ed..f542447794928e47c9eba683690866e105506aaa 100644 (file)
@@ -326,5 +326,6 @@ xfs_dir2_leaf_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_leaf *lp)
 unsigned char xfs_dir3_get_dtype(struct xfs_mount *mp, uint8_t filetype);
 void *xfs_dir3_data_endp(struct xfs_da_geometry *geo,
                struct xfs_dir2_data_hdr *hdr);
+bool xfs_dir2_namecheck(const void *name, size_t length);
 
 #endif /* __XFS_DIR2_H__ */
index 30ed5919da7235e8885afbe15adacffad9949a79..b7d6d78f4ce2f3ef263fd54d8523702045dc5c40 100644 (file)
@@ -53,18 +53,16 @@ xfs_dir3_block_verify(
        struct xfs_mount        *mp = bp->b_target->bt_mount;
        struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
 
+       if (!xfs_verify_magic(bp, hdr3->magic))
+               return __this_address;
+
        if (xfs_sb_version_hascrc(&mp->m_sb)) {
-               if (hdr3->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC))
-                       return __this_address;
                if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
                        return __this_address;
                if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
                        return __this_address;
                if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
                        return __this_address;
-       } else {
-               if (hdr3->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))
-                       return __this_address;
        }
        return __xfs_dir3_data_check(NULL, bp);
 }
@@ -112,6 +110,8 @@ xfs_dir3_block_write_verify(
 
 const struct xfs_buf_ops xfs_dir3_block_buf_ops = {
        .name = "xfs_dir3_block",
+       .magic = { cpu_to_be32(XFS_DIR2_BLOCK_MAGIC),
+                  cpu_to_be32(XFS_DIR3_BLOCK_MAGIC) },
        .verify_read = xfs_dir3_block_read_verify,
        .verify_write = xfs_dir3_block_write_verify,
        .verify_struct = xfs_dir3_block_verify,
index 01162c62ec8f8fe49ac05c68a9b8925b2b693946..b7b9ce002cb97838d2413ad579d499c582fda3d6 100644 (file)
@@ -252,18 +252,16 @@ xfs_dir3_data_verify(
        struct xfs_mount        *mp = bp->b_target->bt_mount;
        struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
 
+       if (!xfs_verify_magic(bp, hdr3->magic))
+               return __this_address;
+
        if (xfs_sb_version_hascrc(&mp->m_sb)) {
-               if (hdr3->magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC))
-                       return __this_address;
                if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
                        return __this_address;
                if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
                        return __this_address;
                if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
                        return __this_address;
-       } else {
-               if (hdr3->magic != cpu_to_be32(XFS_DIR2_DATA_MAGIC))
-                       return __this_address;
        }
        return __xfs_dir3_data_check(NULL, bp);
 }
@@ -339,6 +337,8 @@ xfs_dir3_data_write_verify(
 
 const struct xfs_buf_ops xfs_dir3_data_buf_ops = {
        .name = "xfs_dir3_data",
+       .magic = { cpu_to_be32(XFS_DIR2_DATA_MAGIC),
+                  cpu_to_be32(XFS_DIR3_DATA_MAGIC) },
        .verify_read = xfs_dir3_data_read_verify,
        .verify_write = xfs_dir3_data_write_verify,
        .verify_struct = xfs_dir3_data_verify,
@@ -346,6 +346,8 @@ const struct xfs_buf_ops xfs_dir3_data_buf_ops = {
 
 static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = {
        .name = "xfs_dir3_data_reada",
+       .magic = { cpu_to_be32(XFS_DIR2_DATA_MAGIC),
+                  cpu_to_be32(XFS_DIR3_DATA_MAGIC) },
        .verify_read = xfs_dir3_data_reada_verify,
        .verify_write = xfs_dir3_data_write_verify,
 };
index 1728a3e6f5cf7381460ffce134f166661e164dc3..9a3767818c507b61377434b971c7309b9e768050 100644 (file)
@@ -142,41 +142,22 @@ xfs_dir3_leaf_check_int(
  */
 static xfs_failaddr_t
 xfs_dir3_leaf_verify(
-       struct xfs_buf          *bp,
-       uint16_t                magic)
+       struct xfs_buf          *bp)
 {
        struct xfs_mount        *mp = bp->b_target->bt_mount;
        struct xfs_dir2_leaf    *leaf = bp->b_addr;
+       xfs_failaddr_t          fa;
 
-       ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC);
-
-       if (xfs_sb_version_hascrc(&mp->m_sb)) {
-               struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
-               uint16_t                magic3;
-
-               magic3 = (magic == XFS_DIR2_LEAF1_MAGIC) ? XFS_DIR3_LEAF1_MAGIC
-                                                        : XFS_DIR3_LEAFN_MAGIC;
-
-               if (leaf3->info.hdr.magic != cpu_to_be16(magic3))
-                       return __this_address;
-               if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_meta_uuid))
-                       return __this_address;
-               if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
-                       return __this_address;
-               if (!xfs_log_check_lsn(mp, be64_to_cpu(leaf3->info.lsn)))
-                       return __this_address;
-       } else {
-               if (leaf->hdr.info.magic != cpu_to_be16(magic))
-                       return __this_address;
-       }
+       fa = xfs_da3_blkinfo_verify(bp, bp->b_addr);
+       if (fa)
+               return fa;
 
        return xfs_dir3_leaf_check_int(mp, NULL, NULL, leaf);
 }
 
 static void
-__read_verify(
-       struct xfs_buf  *bp,
-       uint16_t        magic)
+xfs_dir3_leaf_read_verify(
+       struct xfs_buf  *bp)
 {
        struct xfs_mount        *mp = bp->b_target->bt_mount;
        xfs_failaddr_t          fa;
@@ -185,23 +166,22 @@ __read_verify(
             !xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF))
                xfs_verifier_error(bp, -EFSBADCRC, __this_address);
        else {
-               fa = xfs_dir3_leaf_verify(bp, magic);
+               fa = xfs_dir3_leaf_verify(bp);
                if (fa)
                        xfs_verifier_error(bp, -EFSCORRUPTED, fa);
        }
 }
 
 static void
-__write_verify(
-       struct xfs_buf  *bp,
-       uint16_t        magic)
+xfs_dir3_leaf_write_verify(
+       struct xfs_buf  *bp)
 {
        struct xfs_mount        *mp = bp->b_target->bt_mount;
        struct xfs_buf_log_item *bip = bp->b_log_item;
        struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr;
        xfs_failaddr_t          fa;
 
-       fa = xfs_dir3_leaf_verify(bp, magic);
+       fa = xfs_dir3_leaf_verify(bp);
        if (fa) {
                xfs_verifier_error(bp, -EFSCORRUPTED, fa);
                return;
@@ -216,60 +196,22 @@ __write_verify(
        xfs_buf_update_cksum(bp, XFS_DIR3_LEAF_CRC_OFF);
 }
 
-static xfs_failaddr_t
-xfs_dir3_leaf1_verify(
-       struct xfs_buf  *bp)
-{
-       return xfs_dir3_leaf_verify(bp, XFS_DIR2_LEAF1_MAGIC);
-}
-
-static void
-xfs_dir3_leaf1_read_verify(
-       struct xfs_buf  *bp)
-{
-       __read_verify(bp, XFS_DIR2_LEAF1_MAGIC);
-}
-
-static void
-xfs_dir3_leaf1_write_verify(
-       struct xfs_buf  *bp)
-{
-       __write_verify(bp, XFS_DIR2_LEAF1_MAGIC);
-}
-
-static xfs_failaddr_t
-xfs_dir3_leafn_verify(
-       struct xfs_buf  *bp)
-{
-       return xfs_dir3_leaf_verify(bp, XFS_DIR2_LEAFN_MAGIC);
-}
-
-static void
-xfs_dir3_leafn_read_verify(
-       struct xfs_buf  *bp)
-{
-       __read_verify(bp, XFS_DIR2_LEAFN_MAGIC);
-}
-
-static void
-xfs_dir3_leafn_write_verify(
-       struct xfs_buf  *bp)
-{
-       __write_verify(bp, XFS_DIR2_LEAFN_MAGIC);
-}
-
 const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops = {
        .name = "xfs_dir3_leaf1",
-       .verify_read = xfs_dir3_leaf1_read_verify,
-       .verify_write = xfs_dir3_leaf1_write_verify,
-       .verify_struct = xfs_dir3_leaf1_verify,
+       .magic16 = { cpu_to_be16(XFS_DIR2_LEAF1_MAGIC),
+                    cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) },
+       .verify_read = xfs_dir3_leaf_read_verify,
+       .verify_write = xfs_dir3_leaf_write_verify,
+       .verify_struct = xfs_dir3_leaf_verify,
 };
 
 const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = {
        .name = "xfs_dir3_leafn",
-       .verify_read = xfs_dir3_leafn_read_verify,
-       .verify_write = xfs_dir3_leafn_write_verify,
-       .verify_struct = xfs_dir3_leafn_verify,
+       .magic16 = { cpu_to_be16(XFS_DIR2_LEAFN_MAGIC),
+                    cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) },
+       .verify_read = xfs_dir3_leaf_read_verify,
+       .verify_write = xfs_dir3_leaf_write_verify,
+       .verify_struct = xfs_dir3_leaf_verify,
 };
 
 int
index f1bb3434f51c79d17fbc951b6e108d8c33b6865e..3b03703c5c3dbb3e3058356f76b0220bd37f300e 100644 (file)
@@ -87,20 +87,18 @@ xfs_dir3_free_verify(
        struct xfs_mount        *mp = bp->b_target->bt_mount;
        struct xfs_dir2_free_hdr *hdr = bp->b_addr;
 
+       if (!xfs_verify_magic(bp, hdr->magic))
+               return __this_address;
+
        if (xfs_sb_version_hascrc(&mp->m_sb)) {
                struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
 
-               if (hdr3->magic != cpu_to_be32(XFS_DIR3_FREE_MAGIC))
-                       return __this_address;
                if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
                        return __this_address;
                if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
                        return __this_address;
                if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
                        return __this_address;
-       } else {
-               if (hdr->magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC))
-                       return __this_address;
        }
 
        /* XXX: should bounds check the xfs_dir3_icfree_hdr here */
@@ -151,6 +149,8 @@ xfs_dir3_free_write_verify(
 
 const struct xfs_buf_ops xfs_dir3_free_buf_ops = {
        .name = "xfs_dir3_free",
+       .magic = { cpu_to_be32(XFS_DIR2_FREE_MAGIC),
+                  cpu_to_be32(XFS_DIR3_FREE_MAGIC) },
        .verify_read = xfs_dir3_free_read_verify,
        .verify_write = xfs_dir3_free_write_verify,
        .verify_struct = xfs_dir3_free_verify,
index d293f371dd54bc70583407a3d7638c64efbe586c..fb5bd9a804f6a863452a47e8cc2f3be50d5378ef 100644 (file)
@@ -277,6 +277,8 @@ xfs_dquot_buf_write_verify(
 
 const struct xfs_buf_ops xfs_dquot_buf_ops = {
        .name = "xfs_dquot",
+       .magic16 = { cpu_to_be16(XFS_DQUOT_MAGIC),
+                    cpu_to_be16(XFS_DQUOT_MAGIC) },
        .verify_read = xfs_dquot_buf_read_verify,
        .verify_write = xfs_dquot_buf_write_verify,
        .verify_struct = xfs_dquot_buf_verify_struct,
@@ -284,6 +286,8 @@ const struct xfs_buf_ops xfs_dquot_buf_ops = {
 
 const struct xfs_buf_ops xfs_dquot_buf_ra_ops = {
        .name = "xfs_dquot_ra",
+       .magic16 = { cpu_to_be16(XFS_DQUOT_MAGIC),
+                    cpu_to_be16(XFS_DQUOT_MAGIC) },
        .verify_read = xfs_dquot_buf_readahead_verify,
        .verify_write = xfs_dquot_buf_write_verify,
 };
index 66077a105cbb7408131a6b9d5580e6a0c12599ca..79e6c4fb1d8a8440ae744c2bc032658da0bb5da0 100644 (file)
@@ -54,7 +54,8 @@
 #define XFS_ERRTAG_BUF_LRU_REF                         31
 #define XFS_ERRTAG_FORCE_SCRUB_REPAIR                  32
 #define XFS_ERRTAG_FORCE_SUMMARY_RECALC                        33
-#define XFS_ERRTAG_MAX                                 34
+#define XFS_ERRTAG_IUNLINK_FALLBACK                    34
+#define XFS_ERRTAG_MAX                                 35
 
 /*
  * Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
@@ -93,5 +94,6 @@
 #define XFS_RANDOM_BUF_LRU_REF                         2
 #define XFS_RANDOM_FORCE_SCRUB_REPAIR                  1
 #define XFS_RANDOM_FORCE_SUMMARY_RECALC                        1
+#define XFS_RANDOM_IUNLINK_FALLBACK                    (XFS_RANDOM_DEFAULT/10)
 
 #endif /* __XFS_ERRORTAG_H_ */
index d32152fc8a6c56bae799c8999e1862210659cd0b..fe9898875097f5cd8506f9664f636a393cce2e2e 100644 (file)
@@ -2508,7 +2508,7 @@ xfs_agi_verify(
        /*
         * Validate the magic number of the agi block.
         */
-       if (agi->agi_magicnum != cpu_to_be32(XFS_AGI_MAGIC))
+       if (!xfs_verify_magic(bp, agi->agi_magicnum))
                return __this_address;
        if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)))
                return __this_address;
@@ -2582,6 +2582,7 @@ xfs_agi_write_verify(
 
 const struct xfs_buf_ops xfs_agi_buf_ops = {
        .name = "xfs_agi",
+       .magic = { cpu_to_be32(XFS_AGI_MAGIC), cpu_to_be32(XFS_AGI_MAGIC) },
        .verify_read = xfs_agi_read_verify,
        .verify_write = xfs_agi_write_verify,
        .verify_struct = xfs_agi_verify,
index 9b25e7a0df470b6e5552d7e841a74082681a8cf8..1080381ff243e68ee3b6c89fef1e2e78341d8994 100644 (file)
@@ -124,7 +124,7 @@ xfs_finobt_alloc_block(
        union xfs_btree_ptr     *new,
        int                     *stat)
 {
-       if (cur->bc_mp->m_inotbt_nores)
+       if (cur->bc_mp->m_finobt_nores)
                return xfs_inobt_alloc_block(cur, start, new, stat);
        return __xfs_inobt_alloc_block(cur, start, new, stat,
                        XFS_AG_RESV_METADATA);
@@ -154,7 +154,7 @@ xfs_finobt_free_block(
        struct xfs_btree_cur    *cur,
        struct xfs_buf          *bp)
 {
-       if (cur->bc_mp->m_inotbt_nores)
+       if (cur->bc_mp->m_finobt_nores)
                return xfs_inobt_free_block(cur, bp);
        return __xfs_inobt_free_block(cur, bp, XFS_AG_RESV_METADATA);
 }
@@ -260,6 +260,9 @@ xfs_inobt_verify(
        xfs_failaddr_t          fa;
        unsigned int            level;
 
+       if (!xfs_verify_magic(bp, block->bb_magic))
+               return __this_address;
+
        /*
         * During growfs operations, we can't verify the exact owner as the
         * perag is not fully initialised and hence not attached to the buffer.
@@ -270,18 +273,10 @@ xfs_inobt_verify(
         * but beware of the landmine (i.e. need to check pag->pagi_init) if we
         * ever do.
         */
-       switch (block->bb_magic) {
-       case cpu_to_be32(XFS_IBT_CRC_MAGIC):
-       case cpu_to_be32(XFS_FIBT_CRC_MAGIC):
+       if (xfs_sb_version_hascrc(&mp->m_sb)) {
                fa = xfs_btree_sblock_v5hdr_verify(bp);
                if (fa)
                        return fa;
-               /* fall through */
-       case cpu_to_be32(XFS_IBT_MAGIC):
-       case cpu_to_be32(XFS_FIBT_MAGIC):
-               break;
-       default:
-               return __this_address;
        }
 
        /* level verification */
@@ -328,6 +323,16 @@ xfs_inobt_write_verify(
 
 const struct xfs_buf_ops xfs_inobt_buf_ops = {
        .name = "xfs_inobt",
+       .magic = { cpu_to_be32(XFS_IBT_MAGIC), cpu_to_be32(XFS_IBT_CRC_MAGIC) },
+       .verify_read = xfs_inobt_read_verify,
+       .verify_write = xfs_inobt_write_verify,
+       .verify_struct = xfs_inobt_verify,
+};
+
+const struct xfs_buf_ops xfs_finobt_buf_ops = {
+       .name = "xfs_finobt",
+       .magic = { cpu_to_be32(XFS_FIBT_MAGIC),
+                  cpu_to_be32(XFS_FIBT_CRC_MAGIC) },
        .verify_read = xfs_inobt_read_verify,
        .verify_write = xfs_inobt_write_verify,
        .verify_struct = xfs_inobt_verify,
@@ -389,7 +394,7 @@ static const struct xfs_btree_ops xfs_finobt_ops = {
        .init_rec_from_cur      = xfs_inobt_init_rec_from_cur,
        .init_ptr_from_cur      = xfs_finobt_init_ptr_from_cur,
        .key_diff               = xfs_inobt_key_diff,
-       .buf_ops                = &xfs_inobt_buf_ops,
+       .buf_ops                = &xfs_finobt_buf_ops,
        .diff_two_keys          = xfs_inobt_diff_two_keys,
        .keys_inorder           = xfs_inobt_keys_inorder,
        .recs_inorder           = xfs_inobt_recs_inorder,
index 771dd072015d50bd68901f8ac5a13d847534c5d2..bc690f2409faab3135fc1cf857263fab99faf2a8 100644 (file)
@@ -614,16 +614,15 @@ xfs_iext_realloc_root(
 }
 
 /*
- * Increment the sequence counter if we are on a COW fork.  This allows
- * the writeback code to skip looking for a COW extent if the COW fork
- * hasn't changed.  We use WRITE_ONCE here to ensure the update to the
- * sequence counter is seen before the modifications to the extent
- * tree itself take effect.
+ * Increment the sequence counter on extent tree changes. If we are on a COW
+ * fork, this allows the writeback code to skip looking for a COW extent if the
+ * COW fork hasn't changed. We use WRITE_ONCE here to ensure the update to the
+ * sequence counter is seen before the modifications to the extent tree itself
+ * take effect.
  */
 static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp, int state)
 {
-       if (state & BMAP_COWFORK)
-               WRITE_ONCE(ifp->if_seq, READ_ONCE(ifp->if_seq) + 1);
+       WRITE_ONCE(ifp->if_seq, READ_ONCE(ifp->if_seq) + 1);
 }
 
 void
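With if_seq now bumped on every extent tree change, the counter becomes a cheap revalidation token for any cached mapping, not just COW fork state. A minimal reader-side sketch, assuming the usual sample/revalidate pattern (the writer side is xfs_iext_inc_seq() above; the revalidation site itself is not part of this hunk):

	unsigned int	seq;

	seq = READ_ONCE(ifp->if_seq);	/* sample while the mapping is valid */
	/* ... cache the mapping, drop the ILOCK, set up the I/O ... */
	if (seq != READ_ONCE(ifp->if_seq)) {
		/* extent tree changed underneath us: redo the lookup */
	}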
index 09d9c8cfa4a09f933a55f1122879809ecb3010af..e021d5133ccb42d7b51f916180420bb41421aa09 100644 (file)
@@ -97,10 +97,9 @@ xfs_inode_buf_verify(
 
                dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog));
                unlinked_ino = be32_to_cpu(dip->di_next_unlinked);
-               di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
+               di_ok = xfs_verify_magic16(bp, dip->di_magic) &&
                        xfs_dinode_good_version(mp, dip->di_version) &&
-                       (unlinked_ino == NULLAGINO ||
-                        xfs_verify_agino(mp, agno, unlinked_ino));
+                       xfs_verify_agino_or_null(mp, agno, unlinked_ino);
                if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
                                                XFS_ERRTAG_ITOBP_INOTOBP))) {
                        if (readahead) {
@@ -147,12 +146,16 @@ xfs_inode_buf_write_verify(
 
 const struct xfs_buf_ops xfs_inode_buf_ops = {
        .name = "xfs_inode",
+       .magic16 = { cpu_to_be16(XFS_DINODE_MAGIC),
+                    cpu_to_be16(XFS_DINODE_MAGIC) },
        .verify_read = xfs_inode_buf_read_verify,
        .verify_write = xfs_inode_buf_write_verify,
 };
 
 const struct xfs_buf_ops xfs_inode_buf_ra_ops = {
-       .name = "xxfs_inode_ra",
+       .name = "xfs_inode_ra",
+       .magic16 = { cpu_to_be16(XFS_DINODE_MAGIC),
+                    cpu_to_be16(XFS_DINODE_MAGIC) },
        .verify_read = xfs_inode_buf_readahead_verify,
        .verify_write = xfs_inode_buf_write_verify,
 };
index 60361d2d74a182808333aa65be8c05fb9f817bb5..00c62ce170d0eb55db2e5046f89d842beee05bf2 100644 (file)
@@ -14,7 +14,7 @@ struct xfs_dinode;
  */
 struct xfs_ifork {
        int                     if_bytes;       /* bytes in if_u1 */
-       unsigned int            if_seq;         /* cow fork mod counter */
+       unsigned int            if_seq;         /* fork mod counter */
        struct xfs_btree_block  *if_broot;      /* file's incore btree root */
        short                   if_broot_bytes; /* bytes allocated for root */
        unsigned char           if_flags;       /* per-fork flags */
index d9eab657b63e4212ee6e731771b32244f65d7876..6f47ab876d90f229713d2cc8b5c8fc54aee06c5b 100644 (file)
@@ -209,7 +209,7 @@ xfs_refcountbt_verify(
        xfs_failaddr_t          fa;
        unsigned int            level;
 
-       if (block->bb_magic != cpu_to_be32(XFS_REFC_CRC_MAGIC))
+       if (!xfs_verify_magic(bp, block->bb_magic))
                return __this_address;
 
        if (!xfs_sb_version_hasreflink(&mp->m_sb))
@@ -264,6 +264,7 @@ xfs_refcountbt_write_verify(
 
 const struct xfs_buf_ops xfs_refcountbt_buf_ops = {
        .name                   = "xfs_refcountbt",
+       .magic                  = { 0, cpu_to_be32(XFS_REFC_CRC_MAGIC) },
        .verify_read            = xfs_refcountbt_read_verify,
        .verify_write           = xfs_refcountbt_write_verify,
        .verify_struct          = xfs_refcountbt_verify,
index f79cf040d7450fe0c61a3f27bedbe89f4660d5cb..5738e11055e6bbeaf9602b353555450b24908600 100644 (file)
@@ -310,7 +310,7 @@ xfs_rmapbt_verify(
         * from the on disk AGF. Again, we can only check against maximum limits
         * in this case.
         */
-       if (block->bb_magic != cpu_to_be32(XFS_RMAP_CRC_MAGIC))
+       if (!xfs_verify_magic(bp, block->bb_magic))
                return __this_address;
 
        if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
@@ -365,6 +365,7 @@ xfs_rmapbt_write_verify(
 
 const struct xfs_buf_ops xfs_rmapbt_buf_ops = {
        .name                   = "xfs_rmapbt",
+       .magic                  = { 0, cpu_to_be32(XFS_RMAP_CRC_MAGIC) },
        .verify_read            = xfs_rmapbt_read_verify,
        .verify_write           = xfs_rmapbt_write_verify,
        .verify_struct          = xfs_rmapbt_verify,
index b5a82acd7dfe01d9225c345bbd15740fb4995e83..77a3a4085de3b7e56d3747b6b8f64df0fb13dba2 100644 (file)
@@ -225,10 +225,11 @@ xfs_validate_sb_common(
        struct xfs_buf          *bp,
        struct xfs_sb           *sbp)
 {
+       struct xfs_dsb          *dsb = XFS_BUF_TO_SBP(bp);
        uint32_t                agcount = 0;
        uint32_t                rem;
 
-       if (sbp->sb_magicnum != XFS_SB_MAGIC) {
+       if (!xfs_verify_magic(bp, dsb->sb_magicnum)) {
                xfs_warn(mp, "bad magic number");
                return -EWRONGFS;
        }
@@ -781,12 +782,14 @@ out_error:
 
 const struct xfs_buf_ops xfs_sb_buf_ops = {
        .name = "xfs_sb",
+       .magic = { cpu_to_be32(XFS_SB_MAGIC), cpu_to_be32(XFS_SB_MAGIC) },
        .verify_read = xfs_sb_read_verify,
        .verify_write = xfs_sb_write_verify,
 };
 
 const struct xfs_buf_ops xfs_sb_quiet_buf_ops = {
        .name = "xfs_sb_quiet",
+       .magic = { cpu_to_be32(XFS_SB_MAGIC), cpu_to_be32(XFS_SB_MAGIC) },
        .verify_read = xfs_sb_quiet_read_verify,
        .verify_write = xfs_sb_write_verify,
 };
@@ -874,7 +877,7 @@ xfs_initialize_perag_data(
        uint64_t        bfreelst = 0;
        uint64_t        btree = 0;
        uint64_t        fdblocks;
-       int             error;
+       int             error = 0;
 
        for (index = 0; index < agcount; index++) {
                /*
index 1c5debe748f0aca5431fa45daf7fbc078270378e..4e909791aeac48a9ca82c6eb5564ca8e2cc7cadc 100644 (file)
@@ -25,7 +25,8 @@ extern const struct xfs_buf_ops xfs_agf_buf_ops;
 extern const struct xfs_buf_ops xfs_agi_buf_ops;
 extern const struct xfs_buf_ops xfs_agf_buf_ops;
 extern const struct xfs_buf_ops xfs_agfl_buf_ops;
-extern const struct xfs_buf_ops xfs_allocbt_buf_ops;
+extern const struct xfs_buf_ops xfs_bnobt_buf_ops;
+extern const struct xfs_buf_ops xfs_cntbt_buf_ops;
 extern const struct xfs_buf_ops xfs_rmapbt_buf_ops;
 extern const struct xfs_buf_ops xfs_refcountbt_buf_ops;
 extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops;
@@ -36,6 +37,7 @@ extern const struct xfs_buf_ops xfs_dquot_buf_ops;
 extern const struct xfs_buf_ops xfs_symlink_buf_ops;
 extern const struct xfs_buf_ops xfs_agi_buf_ops;
 extern const struct xfs_buf_ops xfs_inobt_buf_ops;
+extern const struct xfs_buf_ops xfs_finobt_buf_ops;
 extern const struct xfs_buf_ops xfs_inode_buf_ops;
 extern const struct xfs_buf_ops xfs_inode_buf_ra_ops;
 extern const struct xfs_buf_ops xfs_dquot_buf_ops;
index 77d80106f989a74f26390f950763509c47cabca4..a0ccc253c43d0a4c5733c28086c2475c7be5a67b 100644 (file)
@@ -95,7 +95,7 @@ xfs_symlink_verify(
 
        if (!xfs_sb_version_hascrc(&mp->m_sb))
                return __this_address;
-       if (dsl->sl_magic != cpu_to_be32(XFS_SYMLINK_MAGIC))
+       if (!xfs_verify_magic(bp, dsl->sl_magic))
                return __this_address;
        if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid))
                return __this_address;
@@ -159,6 +159,7 @@ xfs_symlink_write_verify(
 
 const struct xfs_buf_ops xfs_symlink_buf_ops = {
        .name = "xfs_symlink",
+       .magic = { 0, cpu_to_be32(XFS_SYMLINK_MAGIC) },
        .verify_read = xfs_symlink_read_verify,
        .verify_write = xfs_symlink_write_verify,
        .verify_struct = xfs_symlink_verify,
index 3306fc42cfad40b004bce74114deb2eae4164620..de310712dd6d12946b9ae0771194caf03c5a39ab 100644 (file)
@@ -115,6 +115,19 @@ xfs_verify_agino(
        return agino >= first && agino <= last;
 }
 
+/*
+ * Verify that an AG inode number pointer neither points outside the AG
+ * nor points at static metadata, or is NULLAGINO.
+ */
+bool
+xfs_verify_agino_or_null(
+       struct xfs_mount        *mp,
+       xfs_agnumber_t          agno,
+       xfs_agino_t             agino)
+{
+       return agino == NULLAGINO || xfs_verify_agino(mp, agno, agino);
+}
+
 /*
  * Verify that an FS inode number pointer neither points outside the
  * filesystem nor points at static AG metadata.
@@ -204,3 +217,14 @@ xfs_verify_icount(
        xfs_icount_range(mp, &min, &max);
        return icount >= min && icount <= max;
 }
+
+/* Sanity-checking of dir/attr block offsets. */
+bool
+xfs_verify_dablk(
+       struct xfs_mount        *mp,
+       xfs_fileoff_t           dabno)
+{
+       xfs_dablk_t             max_dablk = -1U;
+
+       return dabno <= max_dablk;
+}
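xfs_dablk_t is a 32-bit type, so max_dablk above is 0xffffffff and the helper simply rejects any dir/attr block offset that cannot be expressed as a dablk. For illustration:

	xfs_verify_dablk(mp, 0xffffffffULL);	/* true: largest addressable dablk */
	xfs_verify_dablk(mp, 0x100000000ULL);	/* false: beyond 2^32 - 1 */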
index 8f02855a019a41c5d1c481e044af88102792308c..c5a25403b4db40cd624b1431785f2b2c9bf32e6d 100644 (file)
@@ -183,10 +183,13 @@ void xfs_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno,
                xfs_agino_t *first, xfs_agino_t *last);
 bool xfs_verify_agino(struct xfs_mount *mp, xfs_agnumber_t agno,
                xfs_agino_t agino);
+bool xfs_verify_agino_or_null(struct xfs_mount *mp, xfs_agnumber_t agno,
+               xfs_agino_t agino);
 bool xfs_verify_ino(struct xfs_mount *mp, xfs_ino_t ino);
 bool xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino);
 bool xfs_verify_dir_ino(struct xfs_mount *mp, xfs_ino_t ino);
 bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno);
 bool xfs_verify_icount(struct xfs_mount *mp, unsigned long long icount);
+bool xfs_verify_dablk(struct xfs_mount *mp, xfs_fileoff_t off);
 
 #endif /* __XFS_TYPES_H__ */
index 90955ab1e89599303e64effcfa237cb8f25e5e73..ddf06bfaa29d6c189673093f8f3c480895557bc5 100644 (file)
@@ -399,7 +399,7 @@ xchk_agf_xref_cntbt(
        if (!xchk_should_check_xref(sc, &error, &sc->sa.cnt_cur))
                return;
        if (!have) {
-               if (agf->agf_freeblks != be32_to_cpu(0))
+               if (agf->agf_freeblks != cpu_to_be32(0))
                        xchk_block_xref_set_corrupt(sc, sc->sa.agf_bp);
                return;
        }
@@ -864,19 +864,17 @@ xchk_agi(
 
        /* Check inode pointers */
        agino = be32_to_cpu(agi->agi_newino);
-       if (agino != NULLAGINO && !xfs_verify_agino(mp, agno, agino))
+       if (!xfs_verify_agino_or_null(mp, agno, agino))
                xchk_block_set_corrupt(sc, sc->sa.agi_bp);
 
        agino = be32_to_cpu(agi->agi_dirino);
-       if (agino != NULLAGINO && !xfs_verify_agino(mp, agno, agino))
+       if (!xfs_verify_agino_or_null(mp, agno, agino))
                xchk_block_set_corrupt(sc, sc->sa.agi_bp);
 
        /* Check unlinked inode buckets */
        for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
                agino = be32_to_cpu(agi->agi_unlinked[i]);
-               if (agino == NULLAGINO)
-                       continue;
-               if (!xfs_verify_agino(mp, agno, agino))
+               if (!xfs_verify_agino_or_null(mp, agno, agino))
                        xchk_block_set_corrupt(sc, sc->sa.agi_bp);
        }
 
index 03d1e15ccebaa3364226a9985955d224cf9724a7..64e31f87d4907ada7d775ef3e3d6d729bdceeffb 100644 (file)
@@ -341,23 +341,19 @@ xrep_agf(
        struct xrep_find_ag_btree       fab[XREP_AGF_MAX] = {
                [XREP_AGF_BNOBT] = {
                        .rmap_owner = XFS_RMAP_OWN_AG,
-                       .buf_ops = &xfs_allocbt_buf_ops,
-                       .magic = XFS_ABTB_CRC_MAGIC,
+                       .buf_ops = &xfs_bnobt_buf_ops,
                },
                [XREP_AGF_CNTBT] = {
                        .rmap_owner = XFS_RMAP_OWN_AG,
-                       .buf_ops = &xfs_allocbt_buf_ops,
-                       .magic = XFS_ABTC_CRC_MAGIC,
+                       .buf_ops = &xfs_cntbt_buf_ops,
                },
                [XREP_AGF_RMAPBT] = {
                        .rmap_owner = XFS_RMAP_OWN_AG,
                        .buf_ops = &xfs_rmapbt_buf_ops,
-                       .magic = XFS_RMAP_CRC_MAGIC,
                },
                [XREP_AGF_REFCOUNTBT] = {
                        .rmap_owner = XFS_RMAP_OWN_REFC,
                        .buf_ops = &xfs_refcountbt_buf_ops,
-                       .magic = XFS_REFC_CRC_MAGIC,
                },
                [XREP_AGF_END] = {
                        .buf_ops = NULL,
@@ -875,12 +871,10 @@ xrep_agi(
                [XREP_AGI_INOBT] = {
                        .rmap_owner = XFS_RMAP_OWN_INOBT,
                        .buf_ops = &xfs_inobt_buf_ops,
-                       .magic = XFS_IBT_CRC_MAGIC,
                },
                [XREP_AGI_FINOBT] = {
                        .rmap_owner = XFS_RMAP_OWN_INOBT,
-                       .buf_ops = &xfs_inobt_buf_ops,
-                       .magic = XFS_FIBT_CRC_MAGIC,
+                       .buf_ops = &xfs_finobt_buf_ops,
                },
                [XREP_AGI_END] = {
                        .buf_ops = NULL
index 81d5e90547a1602a3c39fd4dfff6416b10637501..dce74ec570389a21204e40ddd14d4e1f619bebf9 100644 (file)
@@ -82,12 +82,23 @@ xchk_xattr_listent(
 
        sx = container_of(context, struct xchk_xattr, context);
 
+       if (xchk_should_terminate(sx->sc, &error)) {
+               context->seen_enough = 1;
+               return;
+       }
+
        if (flags & XFS_ATTR_INCOMPLETE) {
                /* Incomplete attr key, just mark the inode for preening. */
                xchk_ino_set_preen(sx->sc, context->dp->i_ino);
                return;
        }
 
+       /* Does this name make sense? */
+       if (!xfs_attr_namecheck(name, namelen)) {
+               xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno);
+               return;
+       }
+
        args.flags = ATTR_KERNOTIME;
        if (flags & XFS_ATTR_ROOT)
                args.flags |= ATTR_ROOT;
index e1d11f3223e360d72bf7c3c175da8be56c3838e1..a703cd58a90e678854ac220f5661b9fb55b9ee8f 100644 (file)
@@ -281,6 +281,31 @@ xchk_bmap_extent_xref(
        xchk_ag_free(info->sc, &info->sc->sa);
 }
 
+/*
+ * Directories and attr forks should never have blocks that can't be addressed
+ * by a xfs_dablk_t.
+ */
+STATIC void
+xchk_bmap_dirattr_extent(
+       struct xfs_inode        *ip,
+       struct xchk_bmap_info   *info,
+       struct xfs_bmbt_irec    *irec)
+{
+       struct xfs_mount        *mp = ip->i_mount;
+       xfs_fileoff_t           off;
+
+       if (!S_ISDIR(VFS_I(ip)->i_mode) && info->whichfork != XFS_ATTR_FORK)
+               return;
+
+       if (!xfs_verify_dablk(mp, irec->br_startoff))
+               xchk_fblock_set_corrupt(info->sc, info->whichfork,
+                               irec->br_startoff);
+
+       off = irec->br_startoff + irec->br_blockcount - 1;
+       if (!xfs_verify_dablk(mp, off))
+               xchk_fblock_set_corrupt(info->sc, info->whichfork, off);
+}
+
 /* Scrub a single extent record. */
 STATIC int
 xchk_bmap_extent(
@@ -305,6 +330,8 @@ xchk_bmap_extent(
                xchk_fblock_set_corrupt(info->sc, info->whichfork,
                                irec->br_startoff);
 
+       xchk_bmap_dirattr_extent(ip, info, irec);
+
        /* There should never be a "hole" extent in either extent list. */
        if (irec->br_startblock == HOLESTARTBLOCK)
                xchk_fblock_set_corrupt(info->sc, info->whichfork,
index cd3e4d768a18ce2d6466c973dfe891174cc523cf..a38a22785a1a28e6a7a50b533c1103f5caf2ebd0 100644 (file)
@@ -129,6 +129,12 @@ xchk_dir_actor(
                goto out;
        }
 
+       /* Does this name make sense? */
+       if (!xfs_dir2_namecheck(name, namelen)) {
+               xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset);
+               goto out;
+       }
+
        if (!strncmp(".", name, namelen)) {
                /* If this is "." then check that the inum matches the dir. */
                if (xfs_sb_version_hasftype(&mp->m_sb) && type != DT_DIR)
index 882dc56c5c21e5c0f8d02ea705669eced2132664..700114f79a7d3085fbcea5cbdf90f76b3e74fce6 100644 (file)
@@ -47,6 +47,12 @@ xchk_setup_ag_iallocbt(
 struct xchk_iallocbt {
        /* Number of inodes we see while scanning inobt. */
        unsigned long long      inodes;
+
+       /* Expected next startino, for big block filesystems. */
+       xfs_agino_t             next_startino;
+
+       /* Expected end of the current inode cluster. */
+       xfs_agino_t             next_cluster_ino;
 };
 
 /*
@@ -128,41 +134,57 @@ xchk_iallocbt_freecount(
        return hweight64(freemask);
 }
 
-/* Check a particular inode with ir_free. */
+/*
+ * Check that an inode's allocation status matches ir_free in the inobt
+ * record.  First we try querying the in-core inode state, and if the inode
+ * isn't loaded we examine the on-disk inode directly.
+ *
+ * Since there can be 1:M and M:1 mappings between inobt records and inode
+ * clusters, we pass in the inode location information as an inobt record;
+ * the index of an inode cluster within the inobt record (as well as the
+ * cluster buffer itself); and the index of the inode within the cluster.
+ *
+ * @irec is the inobt record.
+ * @irec_ino is the inode offset from the start of the record.
+ * @dip is the on-disk inode.
+ */
 STATIC int
-xchk_iallocbt_check_cluster_freemask(
+xchk_iallocbt_check_cluster_ifree(
        struct xchk_btree               *bs,
-       xfs_ino_t                       fsino,
-       xfs_agino_t                     chunkino,
-       xfs_agino_t                     clusterino,
        struct xfs_inobt_rec_incore     *irec,
-       struct xfs_buf                  *bp)
+       unsigned int                    irec_ino,
+       struct xfs_dinode               *dip)
 {
-       struct xfs_dinode               *dip;
        struct xfs_mount                *mp = bs->cur->bc_mp;
-       bool                            inode_is_free = false;
+       xfs_ino_t                       fsino;
+       xfs_agino_t                     agino;
+       bool                            irec_free;
+       bool                            ino_inuse;
        bool                            freemask_ok;
-       bool                            inuse;
        int                             error = 0;
 
        if (xchk_should_terminate(bs->sc, &error))
                return error;
 
-       dip = xfs_buf_offset(bp, clusterino * mp->m_sb.sb_inodesize);
+       /*
+        * Given an inobt record and the offset of an inode from the start of
+        * the record, compute which fs inode we're talking about.
+        */
+       agino = irec->ir_startino + irec_ino;
+       fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_private.a.agno, agino);
+       irec_free = (irec->ir_free & XFS_INOBT_MASK(irec_ino));
+
        if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC ||
-           (dip->di_version >= 3 &&
-            be64_to_cpu(dip->di_ino) != fsino + clusterino)) {
+           (dip->di_version >= 3 && be64_to_cpu(dip->di_ino) != fsino)) {
                xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
                goto out;
        }
 
-       if (irec->ir_free & XFS_INOBT_MASK(chunkino + clusterino))
-               inode_is_free = true;
-       error = xfs_icache_inode_is_allocated(mp, bs->cur->bc_tp,
-                       fsino + clusterino, &inuse);
+       error = xfs_icache_inode_is_allocated(mp, bs->cur->bc_tp, fsino,
+                       &ino_inuse);
        if (error == -ENODATA) {
                /* Not cached, just read the disk buffer */
-               freemask_ok = inode_is_free ^ !!(dip->di_mode);
+               freemask_ok = irec_free ^ !!(dip->di_mode);
                if (!bs->sc->try_harder && !freemask_ok)
                        return -EDEADLOCK;
        } else if (error < 0) {
@@ -174,7 +196,7 @@ xchk_iallocbt_check_cluster_freemask(
                goto out;
        } else {
                /* Inode is all there. */
-               freemask_ok = inode_is_free ^ inuse;
+               freemask_ok = irec_free ^ ino_inuse;
        }
        if (!freemask_ok)
                xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
@@ -182,86 +204,221 @@ out:
        return 0;
 }
 
-/* Make sure the free mask is consistent with what the inodes think. */
+/*
+ * Check that the holemask and freemask of a hypothetical inode cluster match
+ * what's actually on disk.  If sparse inodes are enabled, the cluster does
+ * not actually have to map to inodes if the corresponding holemask bit is set.
+ *
+ * @cluster_base is the first inode in the cluster within the @irec.
+ */
 STATIC int
-xchk_iallocbt_check_freemask(
+xchk_iallocbt_check_cluster(
        struct xchk_btree               *bs,
-       struct xfs_inobt_rec_incore     *irec)
+       struct xfs_inobt_rec_incore     *irec,
+       unsigned int                    cluster_base)
 {
        struct xfs_imap                 imap;
        struct xfs_mount                *mp = bs->cur->bc_mp;
        struct xfs_dinode               *dip;
-       struct xfs_buf                  *bp;
-       xfs_ino_t                       fsino;
-       xfs_agino_t                     nr_inodes;
-       xfs_agino_t                     agino;
-       xfs_agino_t                     chunkino;
-       xfs_agino_t                     clusterino;
+       struct xfs_buf                  *cluster_bp;
+       unsigned int                    nr_inodes;
+       xfs_agnumber_t                  agno = bs->cur->bc_private.a.agno;
        xfs_agblock_t                   agbno;
-       uint16_t                        holemask;
+       unsigned int                    cluster_index;
+       uint16_t                        cluster_mask = 0;
        uint16_t                        ir_holemask;
        int                             error = 0;
 
-       /* Make sure the freemask matches the inode records. */
-       nr_inodes = mp->m_inodes_per_cluster;
-
-       for (agino = irec->ir_startino;
-            agino < irec->ir_startino + XFS_INODES_PER_CHUNK;
-            agino += mp->m_inodes_per_cluster) {
-               fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_private.a.agno, agino);
-               chunkino = agino - irec->ir_startino;
-               agbno = XFS_AGINO_TO_AGBNO(mp, agino);
-
-               /* Compute the holemask mask for this cluster. */
-               for (clusterino = 0, holemask = 0; clusterino < nr_inodes;
-                    clusterino += XFS_INODES_PER_HOLEMASK_BIT)
-                       holemask |= XFS_INOBT_MASK((chunkino + clusterino) /
-                                       XFS_INODES_PER_HOLEMASK_BIT);
-
-               /* The whole cluster must be a hole or not a hole. */
-               ir_holemask = (irec->ir_holemask & holemask);
-               if (ir_holemask != holemask && ir_holemask != 0) {
+       nr_inodes = min_t(unsigned int, XFS_INODES_PER_CHUNK,
+                       mp->m_inodes_per_cluster);
+
+       /* Map this inode cluster */
+       agbno = XFS_AGINO_TO_AGBNO(mp, irec->ir_startino + cluster_base);
+
+       /* Compute a bitmask for this cluster that can be used for holemask. */
+       for (cluster_index = 0;
+            cluster_index < nr_inodes;
+            cluster_index += XFS_INODES_PER_HOLEMASK_BIT)
+               cluster_mask |= XFS_INOBT_MASK((cluster_base + cluster_index) /
+                               XFS_INODES_PER_HOLEMASK_BIT);
+
+       /*
+        * Map the first inode of this cluster to a buffer and offset.
+        * Be careful about inobt records that don't align with the start of
+        * the inode buffer when block sizes are large enough to hold multiple
+        * inode chunks.  When this happens, cluster_base will be zero but
+        * ir_startino can be large enough to make im_boffset nonzero.
+        */
+       ir_holemask = (irec->ir_holemask & cluster_mask);
+       imap.im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno);
+       imap.im_len = XFS_FSB_TO_BB(mp, mp->m_blocks_per_cluster);
+       imap.im_boffset = XFS_INO_TO_OFFSET(mp, irec->ir_startino);
+
+       if (imap.im_boffset != 0 && cluster_base != 0) {
+               ASSERT(imap.im_boffset == 0 || cluster_base == 0);
+               xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+               return 0;
+       }
+
+       trace_xchk_iallocbt_check_cluster(mp, agno, irec->ir_startino,
+                       imap.im_blkno, imap.im_len, cluster_base, nr_inodes,
+                       cluster_mask, ir_holemask,
+                       XFS_INO_TO_OFFSET(mp, irec->ir_startino +
+                                         cluster_base));
+
+       /* The whole cluster must be a hole or not a hole. */
+       if (ir_holemask != cluster_mask && ir_holemask != 0) {
+               xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+               return 0;
+       }
+
+       /* If any part of this is a hole, skip it. */
+       if (ir_holemask) {
+               xchk_xref_is_not_owned_by(bs->sc, agbno,
+                               mp->m_blocks_per_cluster,
+                               &XFS_RMAP_OINFO_INODES);
+               return 0;
+       }
+
+       xchk_xref_is_owned_by(bs->sc, agbno, mp->m_blocks_per_cluster,
+                       &XFS_RMAP_OINFO_INODES);
+
+       /* Grab the inode cluster buffer. */
+       error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap, &dip, &cluster_bp,
+                       0, 0);
+       if (!xchk_btree_xref_process_error(bs->sc, bs->cur, 0, &error))
+               return error;
+
+       /* Check free status of each inode within this cluster. */
+       for (cluster_index = 0; cluster_index < nr_inodes; cluster_index++) {
+               struct xfs_dinode       *dip;
+
+               if (imap.im_boffset >= BBTOB(cluster_bp->b_length)) {
                        xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
-                       continue;
+                       break;
                }
 
-               /* If any part of this is a hole, skip it. */
-               if (ir_holemask) {
-                       xchk_xref_is_not_owned_by(bs->sc, agbno,
-                                       mp->m_blocks_per_cluster,
-                                       &XFS_RMAP_OINFO_INODES);
-                       continue;
+               dip = xfs_buf_offset(cluster_bp, imap.im_boffset);
+               error = xchk_iallocbt_check_cluster_ifree(bs, irec,
+                               cluster_base + cluster_index, dip);
+               if (error)
+                       break;
+               imap.im_boffset += mp->m_sb.sb_inodesize;
+       }
+
+       xfs_trans_brelse(bs->cur->bc_tp, cluster_bp);
+       return error;
+}
+
+/*
+ * For all the inode clusters that could map to this inobt record, make sure
+ * that the holemask makes sense and that the allocation status of each inode
+ * matches the freemask.
+ */
+STATIC int
+xchk_iallocbt_check_clusters(
+       struct xchk_btree               *bs,
+       struct xfs_inobt_rec_incore     *irec)
+{
+       unsigned int                    cluster_base;
+       int                             error = 0;
+
+       /*
+        * For the common case where this inobt record maps to multiple inode
+        * clusters this will call _check_cluster for each cluster.
+        *
+        * For the case that multiple inobt records map to a single cluster,
+        * this will call _check_cluster once.
+        */
+       for (cluster_base = 0;
+            cluster_base < XFS_INODES_PER_CHUNK;
+            cluster_base += bs->sc->mp->m_inodes_per_cluster) {
+               error = xchk_iallocbt_check_cluster(bs, irec, cluster_base);
+               if (error)
+                       break;
+       }
+
+       return error;
+}
+
+/*
+ * Make sure this inode btree record is aligned properly.  Because a fs block
+ * contains multiple inodes, we check that the inobt record is aligned to the
+ * correct inode, not just the correct block on disk.  This results in a finer
+ * grained corruption check.
+ */
+STATIC void
+xchk_iallocbt_rec_alignment(
+       struct xchk_btree               *bs,
+       struct xfs_inobt_rec_incore     *irec)
+{
+       struct xfs_mount                *mp = bs->sc->mp;
+       struct xchk_iallocbt            *iabt = bs->private;
+
+       /*
+        * finobt records have different positioning requirements than inobt
+        * records: each finobt record must have a corresponding inobt record.
+        * That is checked in the xref function, so for now we only catch the
+        * obvious case where the record isn't at all aligned properly.
+        *
+        * Note that if a fs block contains more than a single chunk of inodes,
+        * we will have finobt records only for those chunks containing free
+        * inodes, and therefore expect chunk alignment of finobt records.
+        * Otherwise, we expect that the finobt record is aligned to the
+        * cluster alignment as told by the superblock.
+        */
+       if (bs->cur->bc_btnum == XFS_BTNUM_FINO) {
+               unsigned int    imask;
+
+               imask = min_t(unsigned int, XFS_INODES_PER_CHUNK,
+                               mp->m_cluster_align_inodes) - 1;
+               if (irec->ir_startino & imask)
+                       xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+               return;
+       }
+
+       if (iabt->next_startino != NULLAGINO) {
+               /*
+                * We're midway through a cluster of inodes that is mapped by
+                * multiple inobt records.  Did we get the record for the next
+                * irec in the sequence?
+                */
+               if (irec->ir_startino != iabt->next_startino) {
+                       xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+                       return;
                }
 
-               xchk_xref_is_owned_by(bs->sc, agbno, mp->m_blocks_per_cluster,
-                               &XFS_RMAP_OINFO_INODES);
+               iabt->next_startino += XFS_INODES_PER_CHUNK;
 
-               /* Grab the inode cluster buffer. */
-               imap.im_blkno = XFS_AGB_TO_DADDR(mp, bs->cur->bc_private.a.agno,
-                               agbno);
-               imap.im_len = XFS_FSB_TO_BB(mp, mp->m_blocks_per_cluster);
-               imap.im_boffset = 0;
-
-               error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap,
-                               &dip, &bp, 0, 0);
-               if (!xchk_btree_xref_process_error(bs->sc, bs->cur, 0,
-                               &error))
-                       continue;
-
-               /* Which inodes are free? */
-               for (clusterino = 0; clusterino < nr_inodes; clusterino++) {
-                       error = xchk_iallocbt_check_cluster_freemask(bs,
-                                       fsino, chunkino, clusterino, irec, bp);
-                       if (error) {
-                               xfs_trans_brelse(bs->cur->bc_tp, bp);
-                               return error;
-                       }
+               /* Are we done with the cluster? */
+               if (iabt->next_startino >= iabt->next_cluster_ino) {
+                       iabt->next_startino = NULLAGINO;
+                       iabt->next_cluster_ino = NULLAGINO;
                }
+               return;
+       }
+
+       /* inobt records must be aligned to cluster and inode alignment size. */
+       if (irec->ir_startino & (mp->m_cluster_align_inodes - 1)) {
+               xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+               return;
+       }
 
-               xfs_trans_brelse(bs->cur->bc_tp, bp);
+       if (irec->ir_startino & (mp->m_inodes_per_cluster - 1)) {
+               xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+               return;
        }
 
-       return error;
+       if (mp->m_inodes_per_cluster <= XFS_INODES_PER_CHUNK)
+               return;
+
+       /*
+        * If this is the start of an inode cluster that can be mapped by
+        * multiple inobt records, the next inobt record must follow exactly
+        * after this one.
+        */
+       iabt->next_startino = irec->ir_startino + XFS_INODES_PER_CHUNK;
+       iabt->next_cluster_ino = irec->ir_startino + mp->m_inodes_per_cluster;
 }
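
The finobt branch above boils down to a power-of-two mask test on ir_startino. A minimal userspace sketch of that mask derivation follows; the geometry values and the program itself are illustrative only and not part of the patch:

    #include <stdio.h>

    #define XFS_INODES_PER_CHUNK	64	/* fixed by the on-disk inobt record format */

    int main(void)
    {
    	unsigned int cluster_align_inodes = 16;	/* assumed m_cluster_align_inodes */
    	unsigned int startino = 48;		/* hypothetical finobt ir_startino */
    	unsigned int imask;

    	/* Same min()-style mask that xchk_iallocbt_rec_alignment() builds. */
    	imask = (cluster_align_inodes < XFS_INODES_PER_CHUNK ?
    			cluster_align_inodes : XFS_INODES_PER_CHUNK) - 1;

    	/* A finobt record is suspect if its start inode isn't mask-aligned. */
    	printf("startino %u is %saligned (mask 0x%x)\n",
    			startino, (startino & imask) ? "NOT " : "", imask);
    	return 0;
    }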
 
 /* Scrub an inobt/finobt record. */
@@ -276,7 +433,6 @@ xchk_iallocbt_rec(
        uint64_t                        holes;
        xfs_agnumber_t                  agno = bs->cur->bc_private.a.agno;
        xfs_agino_t                     agino;
-       xfs_agblock_t                   agbno;
        xfs_extlen_t                    len;
        int                             holecount;
        int                             i;
@@ -303,11 +459,9 @@ xchk_iallocbt_rec(
                goto out;
        }
 
-       /* Make sure this record is aligned to cluster and inoalignmnt size. */
-       agbno = XFS_AGINO_TO_AGBNO(mp, irec.ir_startino);
-       if ((agbno & (mp->m_cluster_align - 1)) ||
-           (agbno & (mp->m_blocks_per_cluster - 1)))
-               xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+       xchk_iallocbt_rec_alignment(bs, &irec);
+       if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+               goto out;
 
        iabt->inodes += irec.ir_count;
 
@@ -320,7 +474,7 @@ xchk_iallocbt_rec(
 
                if (!xchk_iallocbt_chunk(bs, &irec, agino, len))
                        goto out;
-               goto check_freemask;
+               goto check_clusters;
        }
 
        /* Check each chunk of a sparse inode cluster. */
@@ -346,8 +500,8 @@ xchk_iallocbt_rec(
            holecount + irec.ir_count != XFS_INODES_PER_CHUNK)
                xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
 
-check_freemask:
-       error = xchk_iallocbt_check_freemask(bs, &irec);
+check_clusters:
+       error = xchk_iallocbt_check_clusters(bs, &irec);
        if (error)
                goto out;
 
@@ -429,6 +583,8 @@ xchk_iallocbt(
        struct xfs_btree_cur    *cur;
        struct xchk_iallocbt    iabt = {
                .inodes         = 0,
+               .next_startino  = NULLAGINO,
+               .next_cluster_ino = NULLAGINO,
        };
        int                     error;
 
index 6acf1bfa0bfee57e6d5f105e47797c4126d9d88b..f28f4bad317b6792f73f45ac7ac118a1c40f1060 100644 (file)
@@ -743,7 +743,8 @@ xrep_findroot_block(
 
        /* Ensure the block magic matches the btree type we're looking for. */
        btblock = XFS_BUF_TO_BLOCK(bp);
-       if (be32_to_cpu(btblock->bb_magic) != fab->magic)
+       ASSERT(fab->buf_ops->magic[1] != 0);
+       if (btblock->bb_magic != fab->buf_ops->magic[1])
                goto out;
 
        /*
index f2fc18bb760520003dde1b3530dca6c42368619c..d990314eb08b2b0b9a271ee07c2923c27d921b67 100644 (file)
@@ -42,9 +42,6 @@ struct xrep_find_ag_btree {
        /* in: buffer ops */
        const struct xfs_buf_ops        *buf_ops;
 
-       /* in: magic number of the btree */
-       uint32_t                        magic;
-
        /* out: the highest btree block found and the tree height */
        xfs_agblock_t                   root;
        unsigned int                    height;
index 665d4bbb17cc8c3b9718a4f1f24f100e31414412..dbe115b075f714007aef48b16e8b765629f6c284 100644 (file)
@@ -141,9 +141,8 @@ xchk_xref_is_used_rt_space(
        startext = fsbno;
        endext = fsbno + len - 1;
        do_div(startext, sc->mp->m_sb.sb_rextsize);
-       if (do_div(endext, sc->mp->m_sb.sb_rextsize))
-               endext++;
-       extcount = endext - startext;
+       do_div(endext, sc->mp->m_sb.sb_rextsize);
+       extcount = endext - startext + 1;
        xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
        error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, startext, extcount,
                        &is_free);
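
The realtime extent-count fix above is easiest to see with concrete numbers. A tiny illustrative calculation (values made up, plain division standing in for do_div):

    #include <stdio.h>

    int main(void)
    {
    	/* Illustrative geometry: 4 rt blocks per rt extent, range covers blocks 5..8. */
    	unsigned long long rextsize = 4, fsbno = 5, len = 4;

    	unsigned long long startext = fsbno / rextsize;			/* 1 */
    	unsigned long long endext = (fsbno + len - 1) / rextsize;	/* 2 */
    	unsigned long long extcount = endext - startext + 1;		/* 2 extents */

    	/* The previous round-up-then-subtract form computed 1 extent for this same range. */
    	printf("rt extents to check: %llu\n", extcount);
    	return 0;
    }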
index 8344b14031efa9ab0db697699cfbfabdcdace0a6..3c83e8b3b39c17e5b1e52110ed5bfa966c383e3a 100644 (file)
@@ -545,6 +545,51 @@ TRACE_EVENT(xchk_xref_error,
                  __entry->ret_ip)
 );
 
+TRACE_EVENT(xchk_iallocbt_check_cluster,
+       TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+                xfs_agino_t startino, xfs_daddr_t map_daddr,
+                unsigned short map_len, unsigned int chunk_ino,
+                unsigned int nr_inodes, uint16_t cluster_mask,
+                uint16_t holemask, unsigned int cluster_ino),
+       TP_ARGS(mp, agno, startino, map_daddr, map_len, chunk_ino, nr_inodes,
+               cluster_mask, holemask, cluster_ino),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_agnumber_t, agno)
+               __field(xfs_agino_t, startino)
+               __field(xfs_daddr_t, map_daddr)
+               __field(unsigned short, map_len)
+               __field(unsigned int, chunk_ino)
+               __field(unsigned int, nr_inodes)
+               __field(unsigned int, cluster_ino)
+               __field(uint16_t, cluster_mask)
+               __field(uint16_t, holemask)
+       ),
+       TP_fast_assign(
+               __entry->dev = mp->m_super->s_dev;
+               __entry->agno = agno;
+               __entry->startino = startino;
+               __entry->map_daddr = map_daddr;
+               __entry->map_len = map_len;
+               __entry->chunk_ino = chunk_ino;
+               __entry->nr_inodes = nr_inodes;
+               __entry->cluster_mask = cluster_mask;
+               __entry->holemask = holemask;
+               __entry->cluster_ino = cluster_ino;
+       ),
+       TP_printk("dev %d:%d agno %d startino %u daddr 0x%llx len %d chunkino %u nr_inodes %u cluster_mask 0x%x holemask 0x%x cluster_ino %u",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->agno,
+                 __entry->startino,
+                 __entry->map_daddr,
+                 __entry->map_len,
+                 __entry->chunk_ino,
+                 __entry->nr_inodes,
+                 __entry->cluster_mask,
+                 __entry->holemask,
+                 __entry->cluster_ino)
+)
+
 /* repair tracepoints */
 #if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR)
 
index d9048bcea49c5203c6d89186637d63fb33f69c37..7b8bb6bde981028ad692fa07c87b0e4911bf3436 100644 (file)
@@ -28,7 +28,8 @@
  */
 struct xfs_writepage_ctx {
        struct xfs_bmbt_irec    imap;
-       unsigned int            io_type;
+       int                     fork;
+       unsigned int            data_seq;
        unsigned int            cow_seq;
        struct xfs_ioend        *ioend;
 };
@@ -255,30 +256,20 @@ xfs_end_io(
         */
        error = blk_status_to_errno(ioend->io_bio->bi_status);
        if (unlikely(error)) {
-               switch (ioend->io_type) {
-               case XFS_IO_COW:
+               if (ioend->io_fork == XFS_COW_FORK)
                        xfs_reflink_cancel_cow_range(ip, offset, size, true);
-                       break;
-               }
-
                goto done;
        }
 
        /*
-        * Success:  commit the COW or unwritten blocks if needed.
+        * Success: commit the COW or unwritten blocks if needed.
         */
-       switch (ioend->io_type) {
-       case XFS_IO_COW:
+       if (ioend->io_fork == XFS_COW_FORK)
                error = xfs_reflink_end_cow(ip, offset, size);
-               break;
-       case XFS_IO_UNWRITTEN:
-               /* writeback should never update isize */
+       else if (ioend->io_state == XFS_EXT_UNWRITTEN)
                error = xfs_iomap_write_unwritten(ip, offset, size, false);
-               break;
-       default:
+       else
                ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans);
-               break;
-       }
 
 done:
        if (ioend->io_append_trans)
@@ -293,7 +284,8 @@ xfs_end_bio(
        struct xfs_ioend        *ioend = bio->bi_private;
        struct xfs_mount        *mp = XFS_I(ioend->io_inode)->i_mount;
 
-       if (ioend->io_type == XFS_IO_UNWRITTEN || ioend->io_type == XFS_IO_COW)
+       if (ioend->io_fork == XFS_COW_FORK ||
+           ioend->io_state == XFS_EXT_UNWRITTEN)
                queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
        else if (ioend->io_append_trans)
                queue_work(mp->m_data_workqueue, &ioend->io_work);
@@ -301,6 +293,75 @@ xfs_end_bio(
                xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status));
 }
 
+/*
+ * Fast revalidation of the cached writeback mapping. Return true if the current
+ * mapping is valid, false otherwise.
+ */
+static bool
+xfs_imap_valid(
+       struct xfs_writepage_ctx        *wpc,
+       struct xfs_inode                *ip,
+       xfs_fileoff_t                   offset_fsb)
+{
+       if (offset_fsb < wpc->imap.br_startoff ||
+           offset_fsb >= wpc->imap.br_startoff + wpc->imap.br_blockcount)
+               return false;
+       /*
+        * If this is a COW mapping, it is sufficient to check that the mapping
+        * covers the offset. Be careful to check this first because the caller
+        * can revalidate a COW mapping without updating the data seqno.
+        */
+       if (wpc->fork == XFS_COW_FORK)
+               return true;
+
+       /*
+        * This is not a COW mapping. Check the sequence number of the data fork
+        * because concurrent changes could have invalidated the extent. Check
+        * the COW fork because concurrent changes since the last time we
+        * checked (and found nothing at this offset) could have added
+        * overlapping blocks.
+        */
+       if (wpc->data_seq != READ_ONCE(ip->i_df.if_seq))
+               return false;
+       if (xfs_inode_has_cow_data(ip) &&
+           wpc->cow_seq != READ_ONCE(ip->i_cowfp->if_seq))
+               return false;
+       return true;
+}
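
xfs_imap_valid() relies on the classic generation-number pattern: the writeback context remembers the fork sequence number it sampled along with the mapping, any later fork modification bumps that counter, and a mismatch means the cached mapping must be looked up again. A generic standalone sketch of the pattern (names and values are illustrative, not taken from XFS):

    #include <stdbool.h>
    #include <stdio.h>

    struct extent_cache {
    	int		lo, hi;		/* cached mapping range */
    	unsigned int	seen_seq;	/* generation sampled with the mapping */
    };

    static bool cache_valid(const struct extent_cache *c, int off, unsigned int cur_seq)
    {
    	if (off < c->lo || off >= c->hi)
    		return false;
    	/* Stale if the owner modified the fork since we sampled it. */
    	return c->seen_seq == cur_seq;
    }

    int main(void)
    {
    	struct extent_cache c = { .lo = 0, .hi = 16, .seen_seq = 4 };

    	printf("%d\n", cache_valid(&c, 8, 4));	/* 1: still valid */
    	printf("%d\n", cache_valid(&c, 8, 5));	/* 0: revalidate */
    	return 0;
    }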
+
+/*
+ * Pass in a delalloc extent and convert it to real extents, returning the real
+ * extent that maps offset_fsb in wpc->imap.
+ *
+ * The current page is held locked so nothing could have removed the block
+ * backing offset_fsb, although it could have moved from the COW to the data
+ * fork by another thread.
+ */
+static int
+xfs_convert_blocks(
+       struct xfs_writepage_ctx *wpc,
+       struct xfs_inode        *ip,
+       xfs_fileoff_t           offset_fsb)
+{
+       int                     error;
+
+       /*
+        * Attempt to allocate whatever delalloc extent currently backs
+        * offset_fsb and put the result into wpc->imap.  Allocate in a loop
+        * because it may take several attempts to allocate real blocks for a
+        * contiguous delalloc extent if free space is sufficiently fragmented.
+        */
+       do {
+               error = xfs_bmapi_convert_delalloc(ip, wpc->fork, offset_fsb,
+                               &wpc->imap, wpc->fork == XFS_COW_FORK ?
+                                       &wpc->cow_seq : &wpc->data_seq);
+               if (error)
+                       return error;
+       } while (wpc->imap.br_startoff + wpc->imap.br_blockcount <= offset_fsb);
+
+       return 0;
+}
+
 STATIC int
 xfs_map_blocks(
        struct xfs_writepage_ctx *wpc,
@@ -310,26 +371,16 @@ xfs_map_blocks(
        struct xfs_inode        *ip = XFS_I(inode);
        struct xfs_mount        *mp = ip->i_mount;
        ssize_t                 count = i_blocksize(inode);
-       xfs_fileoff_t           offset_fsb = XFS_B_TO_FSBT(mp, offset), end_fsb;
+       xfs_fileoff_t           offset_fsb = XFS_B_TO_FSBT(mp, offset);
+       xfs_fileoff_t           end_fsb = XFS_B_TO_FSB(mp, offset + count);
        xfs_fileoff_t           cow_fsb = NULLFILEOFF;
        struct xfs_bmbt_irec    imap;
-       int                     whichfork = XFS_DATA_FORK;
        struct xfs_iext_cursor  icur;
-       bool                    imap_valid;
+       int                     retries = 0;
        int                     error = 0;
 
-       /*
-        * We have to make sure the cached mapping is within EOF to protect
-        * against eofblocks trimming on file release leaving us with a stale
-        * mapping. Otherwise, a page for a subsequent file extending buffered
-        * write could get picked up by this writeback cycle and written to the
-        * wrong blocks.
-        *
-        * Note that what we really want here is a generic mapping invalidation
-        * mechanism to protect us from arbitrary extent modifying contexts, not
-        * just eofblocks.
-        */
-       xfs_trim_extent_eof(&wpc->imap, ip);
+       if (XFS_FORCED_SHUTDOWN(mp))
+               return -EIO;
 
        /*
         * COW fork blocks can overlap data fork blocks even if the blocks
@@ -346,31 +397,19 @@ xfs_map_blocks(
         * against concurrent updates and provides a memory barrier on the way
         * out that ensures that we always see the current value.
         */
-       imap_valid = offset_fsb >= wpc->imap.br_startoff &&
-                    offset_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount;
-       if (imap_valid &&
-           (!xfs_inode_has_cow_data(ip) ||
-            wpc->io_type == XFS_IO_COW ||
-            wpc->cow_seq == READ_ONCE(ip->i_cowfp->if_seq)))
+       if (xfs_imap_valid(wpc, ip, offset_fsb))
                return 0;
 
-       if (XFS_FORCED_SHUTDOWN(mp))
-               return -EIO;
-
        /*
         * If we don't have a valid map, now it's time to get a new one for this
         * offset.  This will convert delayed allocations (including COW ones)
         * into real extents.  If we return without a valid map, it means we
         * landed in a hole and we skip the block.
         */
+retry:
        xfs_ilock(ip, XFS_ILOCK_SHARED);
        ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
               (ip->i_df.if_flags & XFS_IFEXTENTS));
-       ASSERT(offset <= mp->m_super->s_maxbytes);
-
-       if (offset > mp->m_super->s_maxbytes - count)
-               count = mp->m_super->s_maxbytes - offset;
-       end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
 
        /*
         * Check if this offset is covered by a COW extent, and if so use
@@ -382,30 +421,16 @@ xfs_map_blocks(
        if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
                wpc->cow_seq = READ_ONCE(ip->i_cowfp->if_seq);
                xfs_iunlock(ip, XFS_ILOCK_SHARED);
-               /*
-                * Truncate can race with writeback since writeback doesn't
-                * take the iolock and truncate decreases the file size before
-                * it starts truncating the pages between new_size and old_size.
-                * Therefore, we can end up in the situation where writeback
-                * gets a CoW fork mapping but the truncate makes the mapping
-                * invalid and we end up in here trying to get a new mapping.
-                * bail out here so that we simply never get a valid mapping
-                * and so we drop the write altogether.  The page truncation
-                * will kill the contents anyway.
-                */
-               if (offset > i_size_read(inode)) {
-                       wpc->io_type = XFS_IO_HOLE;
-                       return 0;
-               }
-               whichfork = XFS_COW_FORK;
-               wpc->io_type = XFS_IO_COW;
+
+               wpc->fork = XFS_COW_FORK;
                goto allocate_blocks;
        }
 
        /*
-        * Map valid and no COW extent in the way?  We're done.
+        * No COW extent overlap. Revalidate now that we may have updated
+        * ->cow_seq. If the data mapping is still valid, we're done.
         */
-       if (imap_valid) {
+       if (xfs_imap_valid(wpc, ip, offset_fsb)) {
                xfs_iunlock(ip, XFS_ILOCK_SHARED);
                return 0;
        }
@@ -417,51 +442,65 @@ xfs_map_blocks(
         */
        if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap))
                imap.br_startoff = end_fsb;     /* fake a hole past EOF */
+       wpc->data_seq = READ_ONCE(ip->i_df.if_seq);
        xfs_iunlock(ip, XFS_ILOCK_SHARED);
 
+       wpc->fork = XFS_DATA_FORK;
+
+       /* landed in a hole or beyond EOF? */
        if (imap.br_startoff > offset_fsb) {
-               /* landed in a hole or beyond EOF */
                imap.br_blockcount = imap.br_startoff - offset_fsb;
                imap.br_startoff = offset_fsb;
                imap.br_startblock = HOLESTARTBLOCK;
-               wpc->io_type = XFS_IO_HOLE;
-       } else {
-               /*
-                * Truncate to the next COW extent if there is one.  This is the
-                * only opportunity to do this because we can skip COW fork
-                * lookups for the subsequent blocks in the mapping; however,
-                * the requirement to treat the COW range separately remains.
-                */
-               if (cow_fsb != NULLFILEOFF &&
-                   cow_fsb < imap.br_startoff + imap.br_blockcount)
-                       imap.br_blockcount = cow_fsb - imap.br_startoff;
-
-               if (isnullstartblock(imap.br_startblock)) {
-                       /* got a delalloc extent */
-                       wpc->io_type = XFS_IO_DELALLOC;
-                       goto allocate_blocks;
-               }
-
-               if (imap.br_state == XFS_EXT_UNWRITTEN)
-                       wpc->io_type = XFS_IO_UNWRITTEN;
-               else
-                       wpc->io_type = XFS_IO_OVERWRITE;
+               imap.br_state = XFS_EXT_NORM;
        }
 
+       /*
+        * Truncate to the next COW extent if there is one.  This is the only
+        * opportunity to do this because we can skip COW fork lookups for the
+        * subsequent blocks in the mapping; however, the requirement to treat
+        * the COW range separately remains.
+        */
+       if (cow_fsb != NULLFILEOFF &&
+           cow_fsb < imap.br_startoff + imap.br_blockcount)
+               imap.br_blockcount = cow_fsb - imap.br_startoff;
+
+       /* got a delalloc extent? */
+       if (imap.br_startblock != HOLESTARTBLOCK &&
+           isnullstartblock(imap.br_startblock))
+               goto allocate_blocks;
+
        wpc->imap = imap;
-       xfs_trim_extent_eof(&wpc->imap, ip);
-       trace_xfs_map_blocks_found(ip, offset, count, wpc->io_type, &imap);
+       trace_xfs_map_blocks_found(ip, offset, count, wpc->fork, &imap);
        return 0;
 allocate_blocks:
-       error = xfs_iomap_write_allocate(ip, whichfork, offset, &imap,
-                       &wpc->cow_seq);
-       if (error)
+       error = xfs_convert_blocks(wpc, ip, offset_fsb);
+       if (error) {
+               /*
+                * If we failed to find the extent in the COW fork we might have
+                * raced with a COW to data fork conversion or truncate.
+                * Restart the lookup to catch the extent in the data fork for
+                * the former case, but prevent additional retries to avoid
+                * looping forever for the latter case.
+                */
+               if (error == -EAGAIN && wpc->fork == XFS_COW_FORK && !retries++)
+                       goto retry;
+               ASSERT(error != -EAGAIN);
                return error;
-       ASSERT(whichfork == XFS_COW_FORK || cow_fsb == NULLFILEOFF ||
-              imap.br_startoff + imap.br_blockcount <= cow_fsb);
-       wpc->imap = imap;
-       xfs_trim_extent_eof(&wpc->imap, ip);
-       trace_xfs_map_blocks_alloc(ip, offset, count, wpc->io_type, &imap);
+       }
+
+       /*
+        * Due to merging, the returned real extent might be larger than the
+        * original delalloc one.  Trim the returned extent to the next COW
+        * boundary again to force a re-lookup.
+        */
+       if (wpc->fork != XFS_COW_FORK && cow_fsb != NULLFILEOFF &&
+           cow_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount)
+               wpc->imap.br_blockcount = cow_fsb - wpc->imap.br_startoff;
+
+       ASSERT(wpc->imap.br_startoff <= offset_fsb);
+       ASSERT(wpc->imap.br_startoff + wpc->imap.br_blockcount > offset_fsb);
+       trace_xfs_map_blocks_alloc(ip, offset, count, wpc->fork, &imap);
        return 0;
 }
 
@@ -486,7 +525,7 @@ xfs_submit_ioend(
        int                     status)
 {
        /* Convert CoW extents to regular */
-       if (!status && ioend->io_type == XFS_IO_COW) {
+       if (!status && ioend->io_fork == XFS_COW_FORK) {
                /*
                 * Yuk. This can do memory allocation, but is not a
                 * transactional operation so everything is done in GFP_KERNEL
@@ -504,7 +543,8 @@ xfs_submit_ioend(
 
        /* Reserve log space if we might write beyond the on-disk inode size. */
        if (!status &&
-           ioend->io_type != XFS_IO_UNWRITTEN &&
+           (ioend->io_fork == XFS_COW_FORK ||
+            ioend->io_state != XFS_EXT_UNWRITTEN) &&
            xfs_ioend_is_append(ioend) &&
            !ioend->io_append_trans)
                status = xfs_setfilesize_trans_alloc(ioend);
@@ -533,7 +573,8 @@ xfs_submit_ioend(
 static struct xfs_ioend *
 xfs_alloc_ioend(
        struct inode            *inode,
-       unsigned int            type,
+       int                     fork,
+       xfs_exntst_t            state,
        xfs_off_t               offset,
        struct block_device     *bdev,
        sector_t                sector)
@@ -547,7 +588,8 @@ xfs_alloc_ioend(
 
        ioend = container_of(bio, struct xfs_ioend, io_inline_bio);
        INIT_LIST_HEAD(&ioend->io_list);
-       ioend->io_type = type;
+       ioend->io_fork = fork;
+       ioend->io_state = state;
        ioend->io_inode = inode;
        ioend->io_size = 0;
        ioend->io_offset = offset;
@@ -608,13 +650,15 @@ xfs_add_to_ioend(
        sector = xfs_fsb_to_db(ip, wpc->imap.br_startblock) +
                ((offset - XFS_FSB_TO_B(mp, wpc->imap.br_startoff)) >> 9);
 
-       if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type ||
+       if (!wpc->ioend ||
+           wpc->fork != wpc->ioend->io_fork ||
+           wpc->imap.br_state != wpc->ioend->io_state ||
            sector != bio_end_sector(wpc->ioend->io_bio) ||
            offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
                if (wpc->ioend)
                        list_add(&wpc->ioend->io_list, iolist);
-               wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset,
-                               bdev, sector);
+               wpc->ioend = xfs_alloc_ioend(inode, wpc->fork,
+                               wpc->imap.br_state, offset, bdev, sector);
        }
 
        if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff)) {
@@ -723,7 +767,7 @@ xfs_writepage_map(
                error = xfs_map_blocks(wpc, inode, file_offset);
                if (error)
                        break;
-               if (wpc->io_type == XFS_IO_HOLE)
+               if (wpc->imap.br_startblock == HOLESTARTBLOCK)
                        continue;
                xfs_add_to_ioend(inode, file_offset, page, iop, wpc, wbc,
                                 &submit_list);
@@ -918,9 +962,7 @@ xfs_vm_writepage(
        struct page             *page,
        struct writeback_control *wbc)
 {
-       struct xfs_writepage_ctx wpc = {
-               .io_type = XFS_IO_HOLE,
-       };
+       struct xfs_writepage_ctx wpc = { };
        int                     ret;
 
        ret = xfs_do_writepage(page, wbc, &wpc);
@@ -934,9 +976,7 @@ xfs_vm_writepages(
        struct address_space    *mapping,
        struct writeback_control *wbc)
 {
-       struct xfs_writepage_ctx wpc = {
-               .io_type = XFS_IO_HOLE,
-       };
+       struct xfs_writepage_ctx wpc = { };
        int                     ret;
 
        xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
@@ -983,7 +1023,7 @@ xfs_vm_bmap(
         * Since we don't pass back blockdev info, we can't return bmap
         * information for rt files either.
         */
-       if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip))
+       if (xfs_is_cow_inode(ip) || XFS_IS_REALTIME_INODE(ip))
                return 0;
        return iomap_bmap(mapping, block, &xfs_iomap_ops);
 }
index e5c23948a8ab3dde1007113eeaab24edcb48e4c5..6c2615b83c5d863ea8db349ea4ffa8be72d66e61 100644 (file)
@@ -8,33 +8,13 @@
 
 extern struct bio_set xfs_ioend_bioset;
 
-/*
- * Types of I/O for bmap clustering and I/O completion tracking.
- *
- * This enum is used in string mapping in xfs_trace.h; please keep the
- * TRACE_DEFINE_ENUMs for it up to date.
- */
-enum {
-       XFS_IO_HOLE,            /* covers region without any block allocation */
-       XFS_IO_DELALLOC,        /* covers delalloc region */
-       XFS_IO_UNWRITTEN,       /* covers allocated but uninitialized data */
-       XFS_IO_OVERWRITE,       /* covers already allocated extent */
-       XFS_IO_COW,             /* covers copy-on-write extent */
-};
-
-#define XFS_IO_TYPES \
-       { XFS_IO_HOLE,                  "hole" },       \
-       { XFS_IO_DELALLOC,              "delalloc" },   \
-       { XFS_IO_UNWRITTEN,             "unwritten" },  \
-       { XFS_IO_OVERWRITE,             "overwrite" },  \
-       { XFS_IO_COW,                   "CoW" }
-
 /*
  * Structure for buffered I/O completions.
  */
 struct xfs_ioend {
        struct list_head        io_list;        /* next ioend in chain */
-       unsigned int            io_type;        /* delalloc / unwritten */
+       int                     io_fork;        /* inode fork written back */
+       xfs_exntst_t            io_state;       /* extent state */
        struct inode            *io_inode;      /* file being written to */
        size_t                  io_size;        /* size of the extent */
        xfs_off_t               io_offset;      /* offset in the file */
index a58034049995b4c6a7db190164ea886ec5113dff..3d213a7394c5b747dfb5cffc17dfb3d44d66cf03 100644 (file)
@@ -555,6 +555,7 @@ xfs_attr_put_listent(
        attrlist_ent_t *aep;
        int arraytop;
 
+       ASSERT(!context->seen_enough);
        ASSERT(!(context->flags & ATTR_KERNOVAL));
        ASSERT(context->count >= 0);
        ASSERT(context->count < (ATTR_MAX_VALUELEN/8));
index 1ee8c5539fa4f2e999808acc021d63be0b4963b0..2db43ff4f8b59d303b70a0dbf537a45bbf897706 100644 (file)
@@ -1162,16 +1162,13 @@ xfs_zero_file_space(
         * by virtue of the hole punch.
         */
        error = xfs_free_file_space(ip, offset, len);
-       if (error)
-               goto out;
+       if (error || xfs_is_always_cow_inode(ip))
+               return error;
 
-       error = xfs_alloc_file_space(ip, round_down(offset, blksize),
+       return xfs_alloc_file_space(ip, round_down(offset, blksize),
                                     round_up(offset + len, blksize) -
                                     round_down(offset, blksize),
                                     XFS_BMAPI_PREALLOC);
-out:
-       return error;
-
 }
 
 static int
index 4f5f2ff3f70f944130f94a674f09f464d2b6c970..548344e2512833bbb82f141fe34aefed88a6729e 100644 (file)
@@ -776,29 +776,24 @@ _xfs_buf_read(
 }
 
 /*
- * Set buffer ops on an unchecked buffer and validate it, if possible.
+ * Reverify a buffer found in cache without an attached ->b_ops.
  *
- * If the caller passed in an ops structure and the buffer doesn't have ops
- * assigned, set the ops and use them to verify the contents.  If the contents
- * cannot be verified, we'll clear XBF_DONE.  We assume the buffer has no
- * recorded errors and is already in XBF_DONE state.
+ * If the caller passed an ops structure and the buffer doesn't have ops
+ * assigned, set the ops and use it to verify the contents. If verification
+ * fails, clear XBF_DONE. We assume the buffer has no recorded errors and is
+ * already in XBF_DONE state on entry.
  *
- * Under normal operations, every in-core buffer must have buffer ops assigned
- * to them when the buffer is read in from disk so that we can validate the
- * metadata.
- *
- * However, there are two scenarios where one can encounter in-core buffers
- * that don't have buffer ops.  The first is during log recovery of buffers on
- * a V4 filesystem, though these buffers are purged at the end of recovery.
- *
- * The other is online repair, which tries to match arbitrary metadata blocks
- * with btree types in order to find the root.  If online repair doesn't match
- * the buffer with /any/ btree type, the buffer remains in memory in DONE state
- * with no ops, and a subsequent read_buf call from elsewhere will not set the
- * ops.  This function helps us fix this situation.
+ * Under normal operations, every in-core buffer is verified on read I/O
+ * completion. There are two scenarios that can lead to in-core buffers without
+ * an assigned ->b_ops. The first is during log recovery of buffers on a V4
+ * filesystem, though these buffers are purged at the end of recovery. The
+ * other is online repair, which intentionally reads with a NULL buffer ops to
+ * run several verifiers across an in-core buffer in order to establish buffer
+ * type.  If repair can't establish that, the buffer will be left in memory
+ * with NULL buffer ops.
  */
 int
-xfs_buf_ensure_ops(
+xfs_buf_reverify(
        struct xfs_buf          *bp,
        const struct xfs_buf_ops *ops)
 {
@@ -840,7 +835,7 @@ xfs_buf_read_map(
                return bp;
        }
 
-       xfs_buf_ensure_ops(bp, ops);
+       xfs_buf_reverify(bp, ops);
 
        if (flags & XBF_ASYNC) {
                /*
@@ -2209,3 +2204,40 @@ void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
 
        atomic_set(&bp->b_lru_ref, lru_ref);
 }
+
+/*
+ * Verify an on-disk magic value against the magic value specified in the
+ * verifier structure. The verifier magic is in disk byte order so the caller is
+ * expected to pass the value directly from disk.
+ */
+bool
+xfs_verify_magic(
+       struct xfs_buf          *bp,
+       __be32                  dmagic)
+{
+       struct xfs_mount        *mp = bp->b_target->bt_mount;
+       int                     idx;
+
+       idx = xfs_sb_version_hascrc(&mp->m_sb);
+       if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx])))
+               return false;
+       return dmagic == bp->b_ops->magic[idx];
+}
+
+/*
+ * Verify an on-disk magic value against the magic value specified in the
+ * verifier structure. The verifier magic is in disk byte order so the caller is
+ * expected to pass the value directly from disk.
+ */
+bool
+xfs_verify_magic16(
+       struct xfs_buf          *bp,
+       __be16                  dmagic)
+{
+       struct xfs_mount        *mp = bp->b_target->bt_mount;
+       int                     idx;
+
+       idx = xfs_sb_version_hascrc(&mp->m_sb);
+       if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx])))
+               return false;
+       return dmagic == bp->b_ops->magic16[idx];
+}
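
With the magic[] pair now carried in struct xfs_buf_ops, a verifier only has to fill in both the v4 and v5 values and call xfs_verify_magic() from its read path. The fragment below is a hedged sketch, not code from this series: the ops name, the magic constants, and the use of xfs_verifier_error()/__this_address are assumptions chosen for illustration.

    static void
    xfs_example_read_verify(
    	struct xfs_buf		*bp)
    {
    	/* Reject the buffer if neither on-disk magic value matches. */
    	if (!xfs_verify_magic(bp, XFS_BUF_TO_BLOCK(bp)->bb_magic)) {
    		xfs_verifier_error(bp, -EFSCORRUPTED, __this_address);
    		return;
    	}
    	/* ... structural checks would follow here ... */
    }

    const struct xfs_buf_ops xfs_example_buf_ops = {
    	.name		= "xfs_example",
    	.magic		= { cpu_to_be32(0x58455831),	/* made-up v4 magic */
    			    cpu_to_be32(0x58455832) },	/* made-up v5 magic */
    	.verify_read	= xfs_example_read_verify,
    };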
index b9f5511ea998a22927f141ecf446f63e3c99f60c..d0b96e071cec197a39ea7cf4c67f777f1bebb046 100644 (file)
@@ -125,6 +125,10 @@ struct xfs_buf_map {
 
 struct xfs_buf_ops {
        char *name;
+       union {
+               __be32 magic[2];        /* v4 and v5 on disk magic values */
+               __be16 magic16[2];      /* v4 and v5 on disk magic values */
+       };
        void (*verify_read)(struct xfs_buf *);
        void (*verify_write)(struct xfs_buf *);
        xfs_failaddr_t (*verify_struct)(struct xfs_buf *bp);
@@ -385,6 +389,8 @@ extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int);
 #define xfs_getsize_buftarg(buftarg)   block_size((buftarg)->bt_bdev)
 #define xfs_readonly_buftarg(buftarg)  bdev_read_only((buftarg)->bt_bdev)
 
-int xfs_buf_ensure_ops(struct xfs_buf *bp, const struct xfs_buf_ops *ops);
+int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops);
+bool xfs_verify_magic(struct xfs_buf *bp, __be32 dmagic);
+bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic);
 
 #endif /* __XFS_BUF_H__ */
index 9866f542e77b18f20689531821856be972c39bdf..a1e177f66404d28184fd99a5711d2c6f0b5005c2 100644 (file)
@@ -51,6 +51,7 @@ static unsigned int xfs_errortag_random_default[] = {
        XFS_RANDOM_BUF_LRU_REF,
        XFS_RANDOM_FORCE_SCRUB_REPAIR,
        XFS_RANDOM_FORCE_SUMMARY_RECALC,
+       XFS_RANDOM_IUNLINK_FALLBACK,
 };
 
 struct xfs_errortag_attr {
@@ -159,6 +160,7 @@ XFS_ERRORTAG_ATTR_RW(log_item_pin,  XFS_ERRTAG_LOG_ITEM_PIN);
 XFS_ERRORTAG_ATTR_RW(buf_lru_ref,      XFS_ERRTAG_BUF_LRU_REF);
 XFS_ERRORTAG_ATTR_RW(force_repair,     XFS_ERRTAG_FORCE_SCRUB_REPAIR);
 XFS_ERRORTAG_ATTR_RW(bad_summary,      XFS_ERRTAG_FORCE_SUMMARY_RECALC);
+XFS_ERRORTAG_ATTR_RW(iunlink_fallback, XFS_ERRTAG_IUNLINK_FALLBACK);
 
 static struct attribute *xfs_errortag_attrs[] = {
        XFS_ERRORTAG_ATTR_LIST(noerror),
@@ -195,6 +197,7 @@ static struct attribute *xfs_errortag_attrs[] = {
        XFS_ERRORTAG_ATTR_LIST(buf_lru_ref),
        XFS_ERRORTAG_ATTR_LIST(force_repair),
        XFS_ERRORTAG_ATTR_LIST(bad_summary),
+       XFS_ERRORTAG_ATTR_LIST(iunlink_fallback),
        NULL,
 };
 
@@ -357,7 +360,8 @@ xfs_buf_verifier_error(
        fa = failaddr ? failaddr : __return_address;
        __xfs_buf_ioerror(bp, error, fa);
 
-       xfs_alert(mp, "Metadata %s detected at %pS, %s block 0x%llx %s",
+       xfs_alert_tag(mp, XFS_PTAG_VERIFIER_ERROR,
+                 "Metadata %s detected at %pS, %s block 0x%llx %s",
                  bp->b_error == -EFSBADCRC ? "CRC error" : "corruption",
                  fa, bp->b_ops->name, bp->b_bn, name);
 
index 246d3e989c6c92770ac0dff3af1bcb5dbdcb23c0..602aa7d62b66e09a8133a385805571fe20cf68a5 100644 (file)
@@ -98,5 +98,6 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp);
 #define                XFS_PTAG_SHUTDOWN_IOERROR       0x00000020
 #define                XFS_PTAG_SHUTDOWN_LOGERROR      0x00000040
 #define                XFS_PTAG_FSBLOCK_ZERO           0x00000080
+#define                XFS_PTAG_VERIFIER_ERROR         0x00000100
 
 #endif /* __XFS_ERROR_H__ */
index e47425071e654473f4b34e7899015cecce19ef5e..770cc2edf777f4bb3ef6089986d5d49f75788ee4 100644 (file)
@@ -507,7 +507,7 @@ xfs_file_dio_aio_write(
                 * We can't properly handle unaligned direct I/O to reflink
                 * files yet, as we can't unshare a partial block.
                 */
-               if (xfs_is_reflink_inode(ip)) {
+               if (xfs_is_cow_inode(ip)) {
                        trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
                        return -EREMCHG;
                }
@@ -872,14 +872,27 @@ xfs_file_fallocate(
                                goto out_unlock;
                }
 
-               if (mode & FALLOC_FL_ZERO_RANGE)
+               if (mode & FALLOC_FL_ZERO_RANGE) {
                        error = xfs_zero_file_space(ip, offset, len);
-               else {
-                       if (mode & FALLOC_FL_UNSHARE_RANGE) {
-                               error = xfs_reflink_unshare(ip, offset, len);
-                               if (error)
-                                       goto out_unlock;
+               } else if (mode & FALLOC_FL_UNSHARE_RANGE) {
+                       error = xfs_reflink_unshare(ip, offset, len);
+                       if (error)
+                               goto out_unlock;
+
+                       if (!xfs_is_always_cow_inode(ip)) {
+                               error = xfs_alloc_file_space(ip, offset, len,
+                                               XFS_BMAPI_PREALLOC);
                        }
+               } else {
+                       /*
+                        * In always_cow mode we can't use preallocations and
+                        * thus should not create them.
+                        */
+                       if (xfs_is_always_cow_inode(ip)) {
+                               error = -EOPNOTSUPP;
+                               goto out_unlock;
+                       }
+
                        error = xfs_alloc_file_space(ip, offset, len,
                                                     XFS_BMAPI_PREALLOC);
                }
@@ -1068,10 +1081,10 @@ xfs_file_llseek(
        default:
                return generic_file_llseek(file, offset, whence);
        case SEEK_HOLE:
-               offset = iomap_seek_hole(inode, offset, &xfs_iomap_ops);
+               offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
                break;
        case SEEK_DATA:
-               offset = iomap_seek_data(inode, offset, &xfs_iomap_ops);
+               offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
                break;
        }
 
index f3ef70c542e1bc3392b391fefe904b787a0920a5..584648582ba717be54e61f7829085025acbc2ea3 100644 (file)
@@ -533,6 +533,7 @@ xfs_fs_reserve_ag_blocks(
        int                     error = 0;
        int                     err2;
 
+       mp->m_finobt_nores = false;
        for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
                pag = xfs_perag_get(mp, agno);
                err2 = xfs_ag_resv_init(pag, NULL);
index 5169e84ae38255a9438d2f7425c534b4954b9c9f..d0d37738412009355957661fe7e47a18c7522de4 100644 (file)
@@ -16,7 +16,7 @@ xfs_param_t xfs_params = {
                          /*    MIN             DFLT            MAX     */
        .sgid_inherit   = {     0,              0,              1       },
        .symlink_mode   = {     0,              0,              1       },
-       .panic_mask     = {     0,              0,              255     },
+       .panic_mask     = {     0,              0,              256     },
        .error_level    = {     0,              3,              11      },
        .syncd_timer    = {     1*100,          30*100,         7200*100},
        .stats_clear    = {     0,              0,              1       },
index ae667ba74a1c3a4975197aad3e0a2ef4073aa7fa..f643a92951794e3f73951a0abc4b9a3d4b16bdbc 100644 (file)
@@ -1332,7 +1332,7 @@ xfs_create_tmpfile(
        if (error)
                goto out_trans_cancel;
 
-       error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, prid, &ip);
+       error = xfs_dir_ialloc(&tp, dp, mode, 0, 0, prid, &ip);
        if (error)
                goto out_trans_cancel;
 
@@ -1754,7 +1754,7 @@ xfs_inactive_ifree(
         * now remains allocated and sits on the unlinked list until the fs is
         * repaired.
         */
-       if (unlikely(mp->m_inotbt_nores)) {
+       if (unlikely(mp->m_finobt_nores)) {
                error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
                                XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE,
                                &tp);
@@ -1907,86 +1907,510 @@ xfs_inactive(
 }
 
 /*
- * This is called when the inode's link count goes to 0 or we are creating a
- * tmpfile via O_TMPFILE. In the case of a tmpfile, @ignore_linkcount will be
- * set to true as the link count is dropped to zero by the VFS after we've
- * created the file successfully, so we have to add it to the unlinked list
- * while the link count is non-zero.
+ * In-Core Unlinked List Lookups
+ * =============================
+ *
+ * Every inode is supposed to be reachable from some other piece of metadata
+ * with the exception of the root directory.  Inodes with a connection to a
+ * file descriptor but not linked from anywhere in the on-disk directory tree
+ * are collectively known as unlinked inodes, though the filesystem itself
+ * maintains links to these inodes so that on-disk metadata are consistent.
+ *
+ * XFS implements a per-AG on-disk hash table of unlinked inodes.  The AGI
+ * header contains a number of buckets that point to an inode, and each inode
+ * record has a pointer to the next inode in the hash chain.  This
+ * singly-linked list causes scaling problems in the iunlink remove function
+ * because we must walk that list to find the inode that points to the inode
+ * being removed from the unlinked hash bucket list.
+ *
+ * What if we modelled the unlinked list as a collection of records capturing
+ * "X.next_unlinked = Y" relations?  If we indexed those records on Y, we'd
+ * have a fast way to look up unlinked list predecessors, which avoids the
+ * slow list walk.  That's exactly what we do here (in-core) with a per-AG
+ * rhashtable.
+ *
+ * Because this is a backref cache, we ignore operational failures since the
+ * iunlink code can fall back to the slow bucket walk.  The only errors that
+ * should bubble out are for obviously incorrect situations.
+ *
+ * All users of the backref cache MUST hold the AGI buffer lock to serialize
+ * access or have otherwise provided for concurrency control.
+ */
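
To make the cached relation concrete: suppose an AGI bucket holds the hypothetical chain 103 -> 57 -> 12. The cache then contains the records "103.next_unlinked = 57" and "57.next_unlinked = 12", keyed on the right-hand side, so asking "which inode points at 12?" is a single lookup instead of a bucket walk. The standalone sketch below uses a flat array in place of the per-AG rhashtable purely to show the shape of the mapping:

    #include <stdio.h>

    #define NULLAGINO	(~0U)
    #define MAXINO	128

    /* prev[Y] == X stands in for the cached record "X.next_unlinked = Y". */
    static unsigned int prev[MAXINO];

    int main(void)
    {
    	unsigned int i;

    	for (i = 0; i < MAXINO; i++)
    		prev[i] = NULLAGINO;

    	/* Hypothetical AGI bucket chain: 103 -> 57 -> 12 */
    	prev[57] = 103;
    	prev[12] = 57;

    	/* Unlinking inode 12 needs its predecessor; no bucket walk required. */
    	printf("predecessor of 12: %u\n", prev[12]);
    	return 0;
    }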
+
+/* Capture a "X.next_unlinked = Y" relationship. */
+struct xfs_iunlink {
+       struct rhash_head       iu_rhash_head;
+       xfs_agino_t             iu_agino;               /* X */
+       xfs_agino_t             iu_next_unlinked;       /* Y */
+};
+
+/* Unlinked list predecessor lookup hashtable construction */
+static int
+xfs_iunlink_obj_cmpfn(
+       struct rhashtable_compare_arg   *arg,
+       const void                      *obj)
+{
+       const xfs_agino_t               *key = arg->key;
+       const struct xfs_iunlink        *iu = obj;
+
+       if (iu->iu_next_unlinked != *key)
+               return 1;
+       return 0;
+}
+
+static const struct rhashtable_params xfs_iunlink_hash_params = {
+       .min_size               = XFS_AGI_UNLINKED_BUCKETS,
+       .key_len                = sizeof(xfs_agino_t),
+       .key_offset             = offsetof(struct xfs_iunlink,
+                                          iu_next_unlinked),
+       .head_offset            = offsetof(struct xfs_iunlink, iu_rhash_head),
+       .automatic_shrinking    = true,
+       .obj_cmpfn              = xfs_iunlink_obj_cmpfn,
+};
+
+/*
+ * Return X, where X.next_unlinked == @agino.  Returns NULLAGINO if no such
+ * relation is found.
+ */
+static xfs_agino_t
+xfs_iunlink_lookup_backref(
+       struct xfs_perag        *pag,
+       xfs_agino_t             agino)
+{
+       struct xfs_iunlink      *iu;
+
+       iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
+                       xfs_iunlink_hash_params);
+       return iu ? iu->iu_agino : NULLAGINO;
+}
+
+/*
+ * Take ownership of an iunlink cache entry and insert it into the hash table.
+ * If successful, the entry will be owned by the cache; if not, it is freed.
+ * Either way, the caller does not own @iu after this call.
+ */
+static int
+xfs_iunlink_insert_backref(
+       struct xfs_perag        *pag,
+       struct xfs_iunlink      *iu)
+{
+       int                     error;
+
+       error = rhashtable_insert_fast(&pag->pagi_unlinked_hash,
+                       &iu->iu_rhash_head, xfs_iunlink_hash_params);
+       /*
+        * Fail loudly if there already was an entry because that's a sign of
+        * corruption of in-memory data.  Also fail loudly if we see an error
+        * code we didn't anticipate from the rhashtable code.  Currently we
+        * only anticipate ENOMEM.
+        */
+       if (error) {
+               WARN(error != -ENOMEM, "iunlink cache insert error %d", error);
+               kmem_free(iu);
+       }
+       /*
+        * Absorb any runtime errors that aren't a result of corruption because
+        * this is a cache and we can always fall back to bucket list scanning.
+        */
+       if (error != 0 && error != -EEXIST)
+               error = 0;
+       return error;
+}
+
+/* Remember that @prev_agino.next_unlinked = @this_agino. */
+static int
+xfs_iunlink_add_backref(
+       struct xfs_perag        *pag,
+       xfs_agino_t             prev_agino,
+       xfs_agino_t             this_agino)
+{
+       struct xfs_iunlink      *iu;
+
+       if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK))
+               return 0;
+
+       iu = kmem_zalloc(sizeof(*iu), KM_SLEEP | KM_NOFS);
+       iu->iu_agino = prev_agino;
+       iu->iu_next_unlinked = this_agino;
+
+       return xfs_iunlink_insert_backref(pag, iu);
+}
+
+/*
+ * Replace X.next_unlinked = @agino with X.next_unlinked = @next_unlinked.
+ * If @next_unlinked is NULLAGINO, we drop the backref and exit.  If there
+ * wasn't any such entry then we don't bother.
+ */
+static int
+xfs_iunlink_change_backref(
+       struct xfs_perag        *pag,
+       xfs_agino_t             agino,
+       xfs_agino_t             next_unlinked)
+{
+       struct xfs_iunlink      *iu;
+       int                     error;
+
+       /* Look up the old entry; if there wasn't one then exit. */
+       iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
+                       xfs_iunlink_hash_params);
+       if (!iu)
+               return 0;
+
+       /*
+        * Remove the entry.  This shouldn't ever return an error, but if we
+        * couldn't remove the old entry we don't want to add it again to the
+        * hash table, and if the entry disappeared on us then someone's
+        * violated the locking rules and we need to fail loudly.  Either way
+        * we cannot remove the inode because internal state is or would have
+        * been corrupt.
+        */
+       error = rhashtable_remove_fast(&pag->pagi_unlinked_hash,
+                       &iu->iu_rhash_head, xfs_iunlink_hash_params);
+       if (error)
+               return error;
+
+       /* If there is no new next entry just free our item and return. */
+       if (next_unlinked == NULLAGINO) {
+               kmem_free(iu);
+               return 0;
+       }
+
+       /* Update the entry and re-add it to the hash table. */
+       iu->iu_next_unlinked = next_unlinked;
+       return xfs_iunlink_insert_backref(pag, iu);
+}
+
+/* Set up the in-core predecessor structures. */
+int
+xfs_iunlink_init(
+       struct xfs_perag        *pag)
+{
+       return rhashtable_init(&pag->pagi_unlinked_hash,
+                       &xfs_iunlink_hash_params);
+}
+
+/* Free the in-core predecessor structures. */
+static void
+xfs_iunlink_free_item(
+       void                    *ptr,
+       void                    *arg)
+{
+       struct xfs_iunlink      *iu = ptr;
+       bool                    *freed_anything = arg;
+
+       *freed_anything = true;
+       kmem_free(iu);
+}
+
+void
+xfs_iunlink_destroy(
+       struct xfs_perag        *pag)
+{
+       bool                    freed_anything = false;
+
+       rhashtable_free_and_destroy(&pag->pagi_unlinked_hash,
+                       xfs_iunlink_free_item, &freed_anything);
+
+       ASSERT(freed_anything == false || XFS_FORCED_SHUTDOWN(pag->pag_mount));
+}
+
+/*
+ * Point the AGI unlinked bucket at an inode and log the results.  The caller
+ * is responsible for validating the old value.
+ */
+STATIC int
+xfs_iunlink_update_bucket(
+       struct xfs_trans        *tp,
+       xfs_agnumber_t          agno,
+       struct xfs_buf          *agibp,
+       unsigned int            bucket_index,
+       xfs_agino_t             new_agino)
+{
+       struct xfs_agi          *agi = XFS_BUF_TO_AGI(agibp);
+       xfs_agino_t             old_value;
+       int                     offset;
+
+       ASSERT(xfs_verify_agino_or_null(tp->t_mountp, agno, new_agino));
+
+       old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+       trace_xfs_iunlink_update_bucket(tp->t_mountp, agno, bucket_index,
+                       old_value, new_agino);
+
+       /*
+        * We should never find the head of the list already set to the value
+        * passed in because either we're adding or removing ourselves from the
+        * head of the list.
+        */
+       if (old_value == new_agino)
+               return -EFSCORRUPTED;
+
+       agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino);
+       offset = offsetof(struct xfs_agi, agi_unlinked) +
+                       (sizeof(xfs_agino_t) * bucket_index);
+       xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1);
+       return 0;
+}
+
+/* Set an on-disk inode's next_unlinked pointer. */
+STATIC void
+xfs_iunlink_update_dinode(
+       struct xfs_trans        *tp,
+       xfs_agnumber_t          agno,
+       xfs_agino_t             agino,
+       struct xfs_buf          *ibp,
+       struct xfs_dinode       *dip,
+       struct xfs_imap         *imap,
+       xfs_agino_t             next_agino)
+{
+       struct xfs_mount        *mp = tp->t_mountp;
+       int                     offset;
+
+       ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino));
+
+       trace_xfs_iunlink_update_dinode(mp, agno, agino,
+                       be32_to_cpu(dip->di_next_unlinked), next_agino);
+
+       dip->di_next_unlinked = cpu_to_be32(next_agino);
+       offset = imap->im_boffset +
+                       offsetof(struct xfs_dinode, di_next_unlinked);
+
+       /* need to recalc the inode CRC if appropriate */
+       xfs_dinode_calc_crc(mp, dip);
+       xfs_trans_inode_buf(tp, ibp);
+       xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1);
+       xfs_inobp_check(mp, ibp);
+}
+
+/* Set an in-core inode's unlinked pointer and return the old value. */
+STATIC int
+xfs_iunlink_update_inode(
+       struct xfs_trans        *tp,
+       struct xfs_inode        *ip,
+       xfs_agnumber_t          agno,
+       xfs_agino_t             next_agino,
+       xfs_agino_t             *old_next_agino)
+{
+       struct xfs_mount        *mp = tp->t_mountp;
+       struct xfs_dinode       *dip;
+       struct xfs_buf          *ibp;
+       xfs_agino_t             old_value;
+       int                     error;
+
+       ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino));
+
+       error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, 0, 0);
+       if (error)
+               return error;
+
+       /* Make sure the old pointer isn't garbage. */
+       old_value = be32_to_cpu(dip->di_next_unlinked);
+       if (!xfs_verify_agino_or_null(mp, agno, old_value)) {
+               error = -EFSCORRUPTED;
+               goto out;
+       }
+
+       /*
+        * Since we're updating a linked list, we should never find that the
+        * current pointer is the same as the new value, unless we're
+        * terminating the list.
+        */
+       *old_next_agino = old_value;
+       if (old_value == next_agino) {
+               if (next_agino != NULLAGINO)
+                       error = -EFSCORRUPTED;
+               goto out;
+       }
+
+       /* Ok, update the new pointer. */
+       xfs_iunlink_update_dinode(tp, agno, XFS_INO_TO_AGINO(mp, ip->i_ino),
+                       ibp, dip, &ip->i_imap, next_agino);
+       return 0;
+out:
+       xfs_trans_brelse(tp, ibp);
+       return error;
+}
+
+/*
+ * This is called when the inode's link count has gone to 0 or we are creating
+ * a tmpfile via O_TMPFILE.  The inode @ip must have nlink == 0.
  *
  * We place the on-disk inode on a list in the AGI.  It will be pulled from this
  * list when the inode is freed.
  */
 STATIC int
 xfs_iunlink(
-       struct xfs_trans *tp,
-       struct xfs_inode *ip)
+       struct xfs_trans        *tp,
+       struct xfs_inode        *ip)
 {
-       xfs_mount_t     *mp = tp->t_mountp;
-       xfs_agi_t       *agi;
-       xfs_dinode_t    *dip;
-       xfs_buf_t       *agibp;
-       xfs_buf_t       *ibp;
-       xfs_agino_t     agino;
-       short           bucket_index;
-       int             offset;
-       int             error;
+       struct xfs_mount        *mp = tp->t_mountp;
+       struct xfs_agi          *agi;
+       struct xfs_buf          *agibp;
+       xfs_agino_t             next_agino;
+       xfs_agnumber_t          agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+       xfs_agino_t             agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+       short                   bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
+       int                     error;
 
+       ASSERT(VFS_I(ip)->i_nlink == 0);
        ASSERT(VFS_I(ip)->i_mode != 0);
+       trace_xfs_iunlink(ip);
 
-       /*
-        * Get the agi buffer first.  It ensures lock ordering
-        * on the list.
-        */
-       error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp);
+       /* Get the agi buffer first.  It ensures lock ordering on the list. */
+       error = xfs_read_agi(mp, tp, agno, &agibp);
        if (error)
                return error;
        agi = XFS_BUF_TO_AGI(agibp);
 
        /*
-        * Get the index into the agi hash table for the
-        * list this inode will go on.
+        * Get the index into the agi hash table for the list this inode will
+        * go on.  Make sure the pointer isn't garbage and that this inode
+        * isn't already on the list.
         */
-       agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
-       ASSERT(agino != 0);
-       bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
-       ASSERT(agi->agi_unlinked[bucket_index]);
-       ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino);
+       next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+       if (next_agino == agino ||
+           !xfs_verify_agino_or_null(mp, agno, next_agino))
+               return -EFSCORRUPTED;
+
+       if (next_agino != NULLAGINO) {
+               struct xfs_perag        *pag;
+               xfs_agino_t             old_agino;
+
+               /*
+                * There is already another inode in the bucket, so point this
+                * inode to the current head of the list.
+                */
+               error = xfs_iunlink_update_inode(tp, ip, agno, next_agino,
+                               &old_agino);
+               if (error)
+                       return error;
+               ASSERT(old_agino == NULLAGINO);
 
-       if (agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO)) {
                /*
-                * There is already another inode in the bucket we need
-                * to add ourselves to.  Add us at the front of the list.
-                * Here we put the head pointer into our next pointer,
-                * and then we fall through to point the head at us.
+                * agino has been unlinked, add a backref from the next inode
+                * back to agino.
                 */
-               error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
-                                      0, 0);
+               pag = xfs_perag_get(mp, agno);
+               error = xfs_iunlink_add_backref(pag, agino, next_agino);
+               xfs_perag_put(pag);
                if (error)
                        return error;
+       }
+
+       /* Point the head of the list to point to this inode. */
+       return xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, agino);
+}
 
-               ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO));
-               dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
-               offset = ip->i_imap.im_boffset +
-                       offsetof(xfs_dinode_t, di_next_unlinked);
+/* Return the imap, dinode pointer, and buffer for an inode. */
+STATIC int
+xfs_iunlink_map_ino(
+       struct xfs_trans        *tp,
+       xfs_agnumber_t          agno,
+       xfs_agino_t             agino,
+       struct xfs_imap         *imap,
+       struct xfs_dinode       **dipp,
+       struct xfs_buf          **bpp)
+{
+       struct xfs_mount        *mp = tp->t_mountp;
+       int                     error;
 
-               /* need to recalc the inode CRC if appropriate */
-               xfs_dinode_calc_crc(mp, dip);
+       imap->im_blkno = 0;
+       error = xfs_imap(mp, tp, XFS_AGINO_TO_INO(mp, agno, agino), imap, 0);
+       if (error) {
+               xfs_warn(mp, "%s: xfs_imap returned error %d.",
+                               __func__, error);
+               return error;
+       }
 
-               xfs_trans_inode_buf(tp, ibp);
-               xfs_trans_log_buf(tp, ibp, offset,
-                                 (offset + sizeof(xfs_agino_t) - 1));
-               xfs_inobp_check(mp, ibp);
+       error = xfs_imap_to_bp(mp, tp, imap, dipp, bpp, 0, 0);
+       if (error) {
+               xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
+                               __func__, error);
+               return error;
+       }
+
+       return 0;
+}
+
+/*
+ * Walk the unlinked chain from @head_agino until we find the inode that
+ * points to @target_agino.  Return the inode number, map, dinode pointer,
+ * and inode cluster buffer of that inode as @agino, @imap, @dipp, and @bpp.
+ *
+ * @tp, @agno, @head_agino, @target_agino, and @pag are input parameters.
+ * @agino, @imap, @dipp, and @bpp are all output parameters.
+ *
+ * Do not call this function if @target_agino is the head of the list.
+ */
+STATIC int
+xfs_iunlink_map_prev(
+       struct xfs_trans        *tp,
+       xfs_agnumber_t          agno,
+       xfs_agino_t             head_agino,
+       xfs_agino_t             target_agino,
+       xfs_agino_t             *agino,
+       struct xfs_imap         *imap,
+       struct xfs_dinode       **dipp,
+       struct xfs_buf          **bpp,
+       struct xfs_perag        *pag)
+{
+       struct xfs_mount        *mp = tp->t_mountp;
+       xfs_agino_t             next_agino;
+       int                     error;
+
+       ASSERT(head_agino != target_agino);
+       *bpp = NULL;
+
+       /* See if our backref cache can find it faster. */
+       *agino = xfs_iunlink_lookup_backref(pag, target_agino);
+       if (*agino != NULLAGINO) {
+               error = xfs_iunlink_map_ino(tp, agno, *agino, imap, dipp, bpp);
+               if (error)
+                       return error;
+
+               if (be32_to_cpu((*dipp)->di_next_unlinked) == target_agino)
+                       return 0;
+
+               /*
+                * If we get here the cache contents were corrupt, so drop the
+                * buffer and fall back to walking the bucket list.
+                */
+               xfs_trans_brelse(tp, *bpp);
+               *bpp = NULL;
+               WARN_ON_ONCE(1);
+       }
+
+       trace_xfs_iunlink_map_prev_fallback(mp, agno);
+
+       /* Otherwise, walk the entire bucket until we find it. */
+       next_agino = head_agino;
+       while (next_agino != target_agino) {
+               xfs_agino_t     unlinked_agino;
+
+               if (*bpp)
+                       xfs_trans_brelse(tp, *bpp);
+
+               *agino = next_agino;
+               error = xfs_iunlink_map_ino(tp, agno, next_agino, imap, dipp,
+                               bpp);
+               if (error)
+                       return error;
+
+               unlinked_agino = be32_to_cpu((*dipp)->di_next_unlinked);
+               /*
+                * Make sure this pointer is valid and doesn't point back at
+                * itself (an obvious infinite loop).
+                */
+               if (!xfs_verify_agino(mp, agno, unlinked_agino) ||
+                   next_agino == unlinked_agino) {
+                       XFS_CORRUPTION_ERROR(__func__,
+                                       XFS_ERRLEVEL_LOW, mp,
+                                       *dipp, sizeof(**dipp));
+                       error = -EFSCORRUPTED;
+                       return error;
+               }
+               next_agino = unlinked_agino;
        }
 
-       /*
-        * Point the bucket head pointer at the inode being inserted.
-        */
-       ASSERT(agino != 0);
-       agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
-       offset = offsetof(xfs_agi_t, agi_unlinked) +
-               (sizeof(xfs_agino_t) * bucket_index);
-       xfs_trans_log_buf(tp, agibp, offset,
-                         (offset + sizeof(xfs_agino_t) - 1));
        return 0;
 }
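
In effect each AGI bucket is a singly linked list threaded through the inodes' di_next_unlinked fields, head-inserted by xfs_iunlink() and spliced by xfs_iunlink_remove(), and the per-AG backref cache remembers which inode points at a given list member so that removing an inode from the middle of a long bucket no longer requires walking from the head. A minimal userspace sketch of that pattern, with hypothetical names and a flat array standing in for the rhashtable-backed cache (not the kernel code), could look like:

/* Illustrative only: a head-inserted singly linked list plus a cache of
 * "who points at me", mirroring the iunlink bucket + backref idea.  All
 * names are hypothetical; this is not the kernel implementation. */
#include <stdio.h>

#define NIL     0xFFFFFFFFu             /* stands in for NULLAGINO */
#define NINODES 16

static unsigned int next_ptr[NINODES];  /* di_next_unlinked analogue */
static unsigned int prev_of[NINODES];   /* backref cache: my predecessor */
static unsigned int head = NIL;         /* agi_unlinked[bucket] analogue */

static void iunlink_insert(unsigned int ino)
{
	next_ptr[ino] = head;           /* point new inode at the old head */
	if (head != NIL)
		prev_of[head] = ino;    /* old head's predecessor is now ino */
	head = ino;                     /* bucket head points at ino */
}

static void iunlink_remove(unsigned int ino)
{
	unsigned int next = next_ptr[ino];

	if (head == ino) {
		head = next;                    /* removing the list head */
		if (next != NIL)
			prev_of[next] = NIL;
	} else {
		next_ptr[prev_of[ino]] = next;  /* splice around us, O(1) */
		if (next != NIL)
			prev_of[next] = prev_of[ino];
	}
	next_ptr[ino] = NIL;
	prev_of[ino] = NIL;
}

int main(void)
{
	for (unsigned int i = 0; i < NINODES; i++)
		next_ptr[i] = prev_of[i] = NIL;

	iunlink_insert(3);
	iunlink_insert(7);
	iunlink_insert(9);              /* list: 9 -> 7 -> 3 */
	iunlink_remove(7);              /* no walk from the head needed */

	for (unsigned int i = head; i != NIL; i = next_ptr[i])
		printf("%u ", i);
	printf("\n");                   /* prints: 9 3 */
	return 0;
}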
 
@@ -1995,181 +2419,106 @@ xfs_iunlink(
  */
 STATIC int
 xfs_iunlink_remove(
-       xfs_trans_t     *tp,
-       xfs_inode_t     *ip)
+       struct xfs_trans        *tp,
+       struct xfs_inode        *ip)
 {
-       xfs_ino_t       next_ino;
-       xfs_mount_t     *mp;
-       xfs_agi_t       *agi;
-       xfs_dinode_t    *dip;
-       xfs_buf_t       *agibp;
-       xfs_buf_t       *ibp;
-       xfs_agnumber_t  agno;
-       xfs_agino_t     agino;
-       xfs_agino_t     next_agino;
-       xfs_buf_t       *last_ibp;
-       xfs_dinode_t    *last_dip = NULL;
-       short           bucket_index;
-       int             offset, last_offset = 0;
-       int             error;
+       struct xfs_mount        *mp = tp->t_mountp;
+       struct xfs_agi          *agi;
+       struct xfs_buf          *agibp;
+       struct xfs_buf          *last_ibp;
+       struct xfs_dinode       *last_dip = NULL;
+       struct xfs_perag        *pag = NULL;
+       xfs_agnumber_t          agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+       xfs_agino_t             agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+       xfs_agino_t             next_agino;
+       xfs_agino_t             head_agino;
+       short                   bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
+       int                     error;
 
-       mp = tp->t_mountp;
-       agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+       trace_xfs_iunlink_remove(ip);
 
-       /*
-        * Get the agi buffer first.  It ensures lock ordering
-        * on the list.
-        */
+       /* Get the agi buffer first.  It ensures lock ordering on the list. */
        error = xfs_read_agi(mp, tp, agno, &agibp);
        if (error)
                return error;
-
        agi = XFS_BUF_TO_AGI(agibp);
 
        /*
-        * Get the index into the agi hash table for the
-        * list this inode will go on.
+        * Get the index into the agi hash table for the list this inode will
+        * go on.  Make sure the head pointer isn't garbage.
         */
-       agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
-       if (!xfs_verify_agino(mp, agno, agino))
-               return -EFSCORRUPTED;
-       bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
-       if (!xfs_verify_agino(mp, agno,
-                       be32_to_cpu(agi->agi_unlinked[bucket_index]))) {
+       head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+       if (!xfs_verify_agino(mp, agno, head_agino)) {
                XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
                                agi, sizeof(*agi));
                return -EFSCORRUPTED;
        }
 
-       if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) {
-               /*
-                * We're at the head of the list.  Get the inode's on-disk
-                * buffer to see if there is anyone after us on the list.
-                * Only modify our next pointer if it is not already NULLAGINO.
-                * This saves us the overhead of dealing with the buffer when
-                * there is no need to change it.
-                */
-               error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
-                                      0, 0);
-               if (error) {
-                       xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
-                               __func__, error);
-                       return error;
-               }
-               next_agino = be32_to_cpu(dip->di_next_unlinked);
-               ASSERT(next_agino != 0);
-               if (next_agino != NULLAGINO) {
-                       dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
-                       offset = ip->i_imap.im_boffset +
-                               offsetof(xfs_dinode_t, di_next_unlinked);
-
-                       /* need to recalc the inode CRC if appropriate */
-                       xfs_dinode_calc_crc(mp, dip);
-
-                       xfs_trans_inode_buf(tp, ibp);
-                       xfs_trans_log_buf(tp, ibp, offset,
-                                         (offset + sizeof(xfs_agino_t) - 1));
-                       xfs_inobp_check(mp, ibp);
-               } else {
-                       xfs_trans_brelse(tp, ibp);
-               }
-               /*
-                * Point the bucket head pointer at the next inode.
-                */
-               ASSERT(next_agino != 0);
-               ASSERT(next_agino != agino);
-               agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino);
-               offset = offsetof(xfs_agi_t, agi_unlinked) +
-                       (sizeof(xfs_agino_t) * bucket_index);
-               xfs_trans_log_buf(tp, agibp, offset,
-                                 (offset + sizeof(xfs_agino_t) - 1));
-       } else {
-               /*
-                * We need to search the list for the inode being freed.
-                */
-               next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
-               last_ibp = NULL;
-               while (next_agino != agino) {
-                       struct xfs_imap imap;
+       /*
+        * Set our inode's next_unlinked pointer to NULL and then return
+        * the old pointer value so that we can update whatever was previous
+        * to us in the list to point to whatever was next in the list.
+        */
+       error = xfs_iunlink_update_inode(tp, ip, agno, NULLAGINO, &next_agino);
+       if (error)
+               return error;
 
-                       if (last_ibp)
-                               xfs_trans_brelse(tp, last_ibp);
+       /*
+        * If there was a backref pointing from the next inode back to this
+        * one, remove it because we've removed this inode from the list.
+        *
+        * Later, if this inode was in the middle of the list, we'll update
+        * this inode's backref to point from the next inode.
+        */
+       if (next_agino != NULLAGINO) {
+               pag = xfs_perag_get(mp, agno);
+               error = xfs_iunlink_change_backref(pag, next_agino,
+                               NULLAGINO);
+               if (error)
+                       goto out;
+       }
 
-                       imap.im_blkno = 0;
-                       next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);
+       if (head_agino == agino) {
+               /* Point the head of the list to the next unlinked inode. */
+               error = xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index,
+                               next_agino);
+               if (error)
+                       goto out;
+       } else {
+               struct xfs_imap imap;
+               xfs_agino_t     prev_agino;
 
-                       error = xfs_imap(mp, tp, next_ino, &imap, 0);
-                       if (error) {
-                               xfs_warn(mp,
-       "%s: xfs_imap returned error %d.",
-                                        __func__, error);
-                               return error;
-                       }
+               if (!pag)
+                       pag = xfs_perag_get(mp, agno);
 
-                       error = xfs_imap_to_bp(mp, tp, &imap, &last_dip,
-                                              &last_ibp, 0, 0);
-                       if (error) {
-                               xfs_warn(mp,
-       "%s: xfs_imap_to_bp returned error %d.",
-                                       __func__, error);
-                               return error;
-                       }
+               /* We need to search the list for the inode being freed. */
+               error = xfs_iunlink_map_prev(tp, agno, head_agino, agino,
+                               &prev_agino, &imap, &last_dip, &last_ibp,
+                               pag);
+               if (error)
+                       goto out;
 
-                       last_offset = imap.im_boffset;
-                       next_agino = be32_to_cpu(last_dip->di_next_unlinked);
-                       if (!xfs_verify_agino(mp, agno, next_agino)) {
-                               XFS_CORRUPTION_ERROR(__func__,
-                                               XFS_ERRLEVEL_LOW, mp,
-                                               last_dip, sizeof(*last_dip));
-                               return -EFSCORRUPTED;
-                       }
-               }
+               /* Point the previous inode on the list to the next inode. */
+               xfs_iunlink_update_dinode(tp, agno, prev_agino, last_ibp,
+                               last_dip, &imap, next_agino);
 
                /*
-                * Now last_ibp points to the buffer previous to us on the
-                * unlinked list.  Pull us from the list.
+                * Now we deal with the backref for this inode.  If this inode
+                * pointed at a real inode, change the backref that pointed to
+                * us to point to our old next.  If this inode was the end of
+                * the list, delete the backref that pointed to us.  Note that
+                * change_backref takes care of deleting the backref if
+                * next_agino is NULLAGINO.
                 */
-               error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
-                                      0, 0);
-               if (error) {
-                       xfs_warn(mp, "%s: xfs_imap_to_bp(2) returned error %d.",
-                               __func__, error);
-                       return error;
-               }
-               next_agino = be32_to_cpu(dip->di_next_unlinked);
-               ASSERT(next_agino != 0);
-               ASSERT(next_agino != agino);
-               if (next_agino != NULLAGINO) {
-                       dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
-                       offset = ip->i_imap.im_boffset +
-                               offsetof(xfs_dinode_t, di_next_unlinked);
-
-                       /* need to recalc the inode CRC if appropriate */
-                       xfs_dinode_calc_crc(mp, dip);
-
-                       xfs_trans_inode_buf(tp, ibp);
-                       xfs_trans_log_buf(tp, ibp, offset,
-                                         (offset + sizeof(xfs_agino_t) - 1));
-                       xfs_inobp_check(mp, ibp);
-               } else {
-                       xfs_trans_brelse(tp, ibp);
-               }
-               /*
-                * Point the previous inode on the list to the next inode.
-                */
-               last_dip->di_next_unlinked = cpu_to_be32(next_agino);
-               ASSERT(next_agino != 0);
-               offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
-
-               /* need to recalc the inode CRC if appropriate */
-               xfs_dinode_calc_crc(mp, last_dip);
-
-               xfs_trans_inode_buf(tp, last_ibp);
-               xfs_trans_log_buf(tp, last_ibp, offset,
-                                 (offset + sizeof(xfs_agino_t) - 1));
-               xfs_inobp_check(mp, last_ibp);
+               error = xfs_iunlink_change_backref(pag, agino, next_agino);
+               if (error)
+                       goto out;
        }
-       return 0;
+
+out:
+       if (pag)
+               xfs_perag_put(pag);
+       return error;
 }
 
 /*
@@ -2833,11 +3182,9 @@ xfs_rename_alloc_whiteout(
 
        /*
         * Prepare the tmpfile inode as if it were created through the VFS.
-        * Otherwise, the link increment paths will complain about nlink 0->1.
-        * Drop the link count as done by d_tmpfile(), complete the inode setup
-        * and flag it as linkable.
+        * Complete the inode setup and flag it as linkable.  nlink is already
+        * zero, so we can skip the drop_nlink.
         */
-       drop_nlink(VFS_I(tmpfile));
        xfs_setup_iops(tmpfile);
        xfs_finish_inode_setup(tmpfile);
        VFS_I(tmpfile)->i_state |= I_LINKABLE;
index be201452015582e7728aaf66c43e7076502594db..e62074a5257ce3a5e4932743129b7fdedafc6b6a 100644 (file)
@@ -500,4 +500,7 @@ extern struct kmem_zone     *xfs_inode_zone;
 
 bool xfs_inode_verify_forks(struct xfs_inode *ip);
 
+int xfs_iunlink_init(struct xfs_perag *pag);
+void xfs_iunlink_destroy(struct xfs_perag *pag);
+
 #endif /* __XFS_INODE_H__ */
index 27c93b5f029df92b17c22456c0bfe7a5a3fa085c..63d323916bba9e42dc3f37d81359b16a6821784b 100644 (file)
 #define XFS_WRITEIO_ALIGN(mp,off)      (((off) >> mp->m_writeio_log) \
                                                << mp->m_writeio_log)
 
-void
+static int
+xfs_alert_fsblock_zero(
+       xfs_inode_t     *ip,
+       xfs_bmbt_irec_t *imap)
+{
+       xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
+                       "Access to block zero in inode %llu "
+                       "start_block: %llx start_off: %llx "
+                       "blkcnt: %llx extent-state: %x",
+               (unsigned long long)ip->i_ino,
+               (unsigned long long)imap->br_startblock,
+               (unsigned long long)imap->br_startoff,
+               (unsigned long long)imap->br_blockcount,
+               imap->br_state);
+       return -EFSCORRUPTED;
+}
+
+int
 xfs_bmbt_to_iomap(
        struct xfs_inode        *ip,
        struct iomap            *iomap,
-       struct xfs_bmbt_irec    *imap)
+       struct xfs_bmbt_irec    *imap,
+       bool                    shared)
 {
        struct xfs_mount        *mp = ip->i_mount;
 
+       if (unlikely(!imap->br_startblock && !XFS_IS_REALTIME_INODE(ip)))
+               return xfs_alert_fsblock_zero(ip, imap);
+
        if (imap->br_startblock == HOLESTARTBLOCK) {
                iomap->addr = IOMAP_NULL_ADDR;
                iomap->type = IOMAP_HOLE;
-       } else if (imap->br_startblock == DELAYSTARTBLOCK) {
+       } else if (imap->br_startblock == DELAYSTARTBLOCK ||
+                  isnullstartblock(imap->br_startblock)) {
                iomap->addr = IOMAP_NULL_ADDR;
                iomap->type = IOMAP_DELALLOC;
        } else {
@@ -60,6 +82,13 @@ xfs_bmbt_to_iomap(
        iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
        iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
        iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip));
+
+       if (xfs_ipincount(ip) &&
+           (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
+               iomap->flags |= IOMAP_F_DIRTY;
+       if (shared)
+               iomap->flags |= IOMAP_F_SHARED;
+       return 0;
 }
 
 static void
@@ -138,23 +167,6 @@ xfs_iomap_eof_align_last_fsb(
        return 0;
 }
 
-STATIC int
-xfs_alert_fsblock_zero(
-       xfs_inode_t     *ip,
-       xfs_bmbt_irec_t *imap)
-{
-       xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
-                       "Access to block zero in inode %llu "
-                       "start_block: %llx start_off: %llx "
-                       "blkcnt: %llx extent-state: %x",
-               (unsigned long long)ip->i_ino,
-               (unsigned long long)imap->br_startblock,
-               (unsigned long long)imap->br_startoff,
-               (unsigned long long)imap->br_blockcount,
-               imap->br_state);
-       return -EFSCORRUPTED;
-}
-
 int
 xfs_iomap_write_direct(
        xfs_inode_t     *ip,
@@ -383,12 +395,13 @@ xfs_quota_calc_throttle(
 STATIC xfs_fsblock_t
 xfs_iomap_prealloc_size(
        struct xfs_inode        *ip,
+       int                     whichfork,
        loff_t                  offset,
        loff_t                  count,
        struct xfs_iext_cursor  *icur)
 {
        struct xfs_mount        *mp = ip->i_mount;
-       struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+       struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, whichfork);
        xfs_fileoff_t           offset_fsb = XFS_B_TO_FSBT(mp, offset);
        struct xfs_bmbt_irec    prev;
        int                     shift = 0;
@@ -522,15 +535,16 @@ xfs_file_iomap_begin_delay(
 {
        struct xfs_inode        *ip = XFS_I(inode);
        struct xfs_mount        *mp = ip->i_mount;
-       struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
        xfs_fileoff_t           offset_fsb = XFS_B_TO_FSBT(mp, offset);
        xfs_fileoff_t           maxbytes_fsb =
                XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
        xfs_fileoff_t           end_fsb;
-       int                     error = 0, eof = 0;
-       struct xfs_bmbt_irec    got;
-       struct xfs_iext_cursor  icur;
+       struct xfs_bmbt_irec    imap, cmap;
+       struct xfs_iext_cursor  icur, ccur;
        xfs_fsblock_t           prealloc_blocks = 0;
+       bool                    eof = false, cow_eof = false, shared = false;
+       int                     whichfork = XFS_DATA_FORK;
+       int                     error = 0;
 
        ASSERT(!XFS_IS_REALTIME_INODE(ip));
        ASSERT(!xfs_get_extsz_hint(ip));
@@ -548,7 +562,7 @@ xfs_file_iomap_begin_delay(
 
        XFS_STATS_INC(mp, xs_blk_mapw);
 
-       if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+       if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) {
                error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
                if (error)
                        goto out_unlock;
@@ -556,53 +570,101 @@ xfs_file_iomap_begin_delay(
 
        end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
 
-       eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got);
+       /*
+        * Search the data fork first to look up our source mapping.  We
+        * always need the data fork map, as we have to return it to the
+        * iomap code so that the higher level write code can read data in to
+        * perform read-modify-write cycles for unaligned writes.
+        */
+       eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap);
        if (eof)
-               got.br_startoff = end_fsb; /* fake hole until the end */
+               imap.br_startoff = end_fsb; /* fake hole until the end */
+
+       /* We never need to allocate blocks for zeroing a hole. */
+       if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) {
+               xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff);
+               goto out_unlock;
+       }
 
-       if (got.br_startoff <= offset_fsb) {
+       /*
+        * Search the COW fork extent list even if we did not find a data fork
+        * extent.  This serves two purposes: first, it implements the
+        * speculative preallocation using cowextsize, so that we also unshare
+        * blocks adjacent to shared blocks instead of just the shared blocks
+        * themselves.  Second, the lookup in the extent list is generally faster
+        * than going out to the shared extent tree.
+        */
+       if (xfs_is_cow_inode(ip)) {
+               if (!ip->i_cowfp) {
+                       ASSERT(!xfs_is_reflink_inode(ip));
+                       xfs_ifork_init_cow(ip);
+               }
+               cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb,
+                               &ccur, &cmap);
+               if (!cow_eof && cmap.br_startoff <= offset_fsb) {
+                       trace_xfs_reflink_cow_found(ip, &cmap);
+                       whichfork = XFS_COW_FORK;
+                       goto done;
+               }
+       }
+
+       if (imap.br_startoff <= offset_fsb) {
                /*
                 * For reflink files we may need a delalloc reservation when
                 * overwriting shared extents.   This includes zeroing of
                 * existing extents that contain data.
                 */
-               if (xfs_is_reflink_inode(ip) &&
-                   ((flags & IOMAP_WRITE) ||
-                    got.br_state != XFS_EXT_UNWRITTEN)) {
-                       xfs_trim_extent(&got, offset_fsb, end_fsb - offset_fsb);
-                       error = xfs_reflink_reserve_cow(ip, &got);
-                       if (error)
-                               goto out_unlock;
+               if (!xfs_is_cow_inode(ip) ||
+                   ((flags & IOMAP_ZERO) && imap.br_state != XFS_EXT_NORM)) {
+                       trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
+                                       &imap);
+                       goto done;
                }
 
-               trace_xfs_iomap_found(ip, offset, count, 0, &got);
-               goto done;
-       }
+               xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb);
 
-       if (flags & IOMAP_ZERO) {
-               xfs_hole_to_iomap(ip, iomap, offset_fsb, got.br_startoff);
-               goto out_unlock;
+               /* Trim the mapping to the nearest shared extent boundary. */
+               error = xfs_inode_need_cow(ip, &imap, &shared);
+               if (error)
+                       goto out_unlock;
+
+               /* Not shared?  Just report the (potentially capped) extent. */
+               if (!shared) {
+                       trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
+                                       &imap);
+                       goto done;
+               }
+
+               /*
+                * Fork all the shared blocks from our write offset until the
+                * end of the extent.
+                */
+               whichfork = XFS_COW_FORK;
+               end_fsb = imap.br_startoff + imap.br_blockcount;
+       } else {
+               /*
+                * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
+                * pages to keep the chunks of work we do here somewhat
+                * symmetric with the work writeback does.  This is a completely
+                * arbitrary number pulled out of thin air.
+                *
+                * Note that the value needs to be less than 32 bits wide until
+                * the lower level functions are updated.
+                */
+               count = min_t(loff_t, count, 1024 * PAGE_SIZE);
+               end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
+
+               if (xfs_is_always_cow_inode(ip))
+                       whichfork = XFS_COW_FORK;
        }
 
        error = xfs_qm_dqattach_locked(ip, false);
        if (error)
                goto out_unlock;
 
-       /*
-        * We cap the maximum length we map here to MAX_WRITEBACK_PAGES pages
-        * to keep the chunks of work done where somewhat symmetric with the
-        * work writeback does. This is a completely arbitrary number pulled
-        * out of thin air as a best guess for initial testing.
-        *
-        * Note that the values needs to be less than 32-bits wide until
-        * the lower level functions are updated.
-        */
-       count = min_t(loff_t, count, 1024 * PAGE_SIZE);
-       end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
-
        if (eof) {
-               prealloc_blocks = xfs_iomap_prealloc_size(ip, offset, count,
-                               &icur);
+               prealloc_blocks = xfs_iomap_prealloc_size(ip, whichfork, offset,
+                               count, &icur);
                if (prealloc_blocks) {
                        xfs_extlen_t    align;
                        xfs_off_t       end_offset;
@@ -623,9 +685,11 @@ xfs_file_iomap_begin_delay(
        }
 
 retry:
-       error = xfs_bmapi_reserve_delalloc(ip, XFS_DATA_FORK, offset_fsb,
-                       end_fsb - offset_fsb, prealloc_blocks, &got, &icur,
-                       eof);
+       error = xfs_bmapi_reserve_delalloc(ip, whichfork, offset_fsb,
+                       end_fsb - offset_fsb, prealloc_blocks,
+                       whichfork == XFS_DATA_FORK ? &imap : &cmap,
+                       whichfork == XFS_DATA_FORK ? &icur : &ccur,
+                       whichfork == XFS_DATA_FORK ? eof : cow_eof);
        switch (error) {
        case 0:
                break;
@@ -647,186 +711,22 @@ retry:
         * them out if the write happens to fail.
         */
        iomap->flags |= IOMAP_F_NEW;
-       trace_xfs_iomap_alloc(ip, offset, count, 0, &got);
+       trace_xfs_iomap_alloc(ip, offset, count, whichfork,
+                       whichfork == XFS_DATA_FORK ? &imap : &cmap);
 done:
-       if (isnullstartblock(got.br_startblock))
-               got.br_startblock = DELAYSTARTBLOCK;
-
-       if (!got.br_startblock) {
-               error = xfs_alert_fsblock_zero(ip, &got);
-               if (error)
+       if (whichfork == XFS_COW_FORK) {
+               if (imap.br_startoff > offset_fsb) {
+                       xfs_trim_extent(&cmap, offset_fsb,
+                                       imap.br_startoff - offset_fsb);
+                       error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true);
                        goto out_unlock;
-       }
-
-       xfs_bmbt_to_iomap(ip, iomap, &got);
-
-out_unlock:
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-       return error;
-}
-
-/*
- * Pass in a delayed allocate extent, convert it to real extents;
- * return to the caller the extent we create which maps on top of
- * the originating callers request.
- *
- * Called without a lock on the inode.
- *
- * We no longer bother to look at the incoming map - all we have to
- * guarantee is that whatever we allocate fills the required range.
- */
-int
-xfs_iomap_write_allocate(
-       xfs_inode_t     *ip,
-       int             whichfork,
-       xfs_off_t       offset,
-       xfs_bmbt_irec_t *imap,
-       unsigned int    *cow_seq)
-{
-       xfs_mount_t     *mp = ip->i_mount;
-       struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
-       xfs_fileoff_t   offset_fsb, last_block;
-       xfs_fileoff_t   end_fsb, map_start_fsb;
-       xfs_filblks_t   count_fsb;
-       xfs_trans_t     *tp;
-       int             nimaps;
-       int             error = 0;
-       int             flags = XFS_BMAPI_DELALLOC;
-       int             nres;
-
-       if (whichfork == XFS_COW_FORK)
-               flags |= XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC;
-
-       /*
-        * Make sure that the dquots are there.
-        */
-       error = xfs_qm_dqattach(ip);
-       if (error)
-               return error;
-
-       offset_fsb = XFS_B_TO_FSBT(mp, offset);
-       count_fsb = imap->br_blockcount;
-       map_start_fsb = imap->br_startoff;
-
-       XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb));
-
-       while (count_fsb != 0) {
-               /*
-                * Set up a transaction with which to allocate the
-                * backing store for the file.  Do allocations in a
-                * loop until we get some space in the range we are
-                * interested in.  The other space that might be allocated
-                * is in the delayed allocation extent on which we sit
-                * but before our buffer starts.
-                */
-               nimaps = 0;
-               while (nimaps == 0) {
-                       nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
-                       /*
-                        * We have already reserved space for the extent and any
-                        * indirect blocks when creating the delalloc extent,
-                        * there is no need to reserve space in this transaction
-                        * again.
-                        */
-                       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0,
-                                       0, XFS_TRANS_RESERVE, &tp);
-                       if (error)
-                               return error;
-
-                       xfs_ilock(ip, XFS_ILOCK_EXCL);
-                       xfs_trans_ijoin(tp, ip, 0);
-
-                       /*
-                        * it is possible that the extents have changed since
-                        * we did the read call as we dropped the ilock for a
-                        * while. We have to be careful about truncates or hole
-                        * punchs here - we are not allowed to allocate
-                        * non-delalloc blocks here.
-                        *
-                        * The only protection against truncation is the pages
-                        * for the range we are being asked to convert are
-                        * locked and hence a truncate will block on them
-                        * first.
-                        *
-                        * As a result, if we go beyond the range we really
-                        * need and hit an delalloc extent boundary followed by
-                        * a hole while we have excess blocks in the map, we
-                        * will fill the hole incorrectly and overrun the
-                        * transaction reservation.
-                        *
-                        * Using a single map prevents this as we are forced to
-                        * check each map we look for overlap with the desired
-                        * range and abort as soon as we find it. Also, given
-                        * that we only return a single map, having one beyond
-                        * what we can return is probably a bit silly.
-                        *
-                        * We also need to check that we don't go beyond EOF;
-                        * this is a truncate optimisation as a truncate sets
-                        * the new file size before block on the pages we
-                        * currently have locked under writeback. Because they
-                        * are about to be tossed, we don't need to write them
-                        * back....
-                        */
-                       nimaps = 1;
-                       end_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
-                       error = xfs_bmap_last_offset(ip, &last_block,
-                                                       XFS_DATA_FORK);
-                       if (error)
-                               goto trans_cancel;
-
-                       last_block = XFS_FILEOFF_MAX(last_block, end_fsb);
-                       if ((map_start_fsb + count_fsb) > last_block) {
-                               count_fsb = last_block - map_start_fsb;
-                               if (count_fsb == 0) {
-                                       error = -EAGAIN;
-                                       goto trans_cancel;
-                               }
-                       }
-
-                       /*
-                        * From this point onwards we overwrite the imap
-                        * pointer that the caller gave to us.
-                        */
-                       error = xfs_bmapi_write(tp, ip, map_start_fsb,
-                                               count_fsb, flags, nres, imap,
-                                               &nimaps);
-                       if (error)
-                               goto trans_cancel;
-
-                       error = xfs_trans_commit(tp);
-                       if (error)
-                               goto error0;
-
-                       if (whichfork == XFS_COW_FORK)
-                               *cow_seq = READ_ONCE(ifp->if_seq);
-                       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-               }
-
-               /*
-                * See if we were able to allocate an extent that
-                * covers at least part of the callers request
-                */
-               if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip)))
-                       return xfs_alert_fsblock_zero(ip, imap);
-
-               if ((offset_fsb >= imap->br_startoff) &&
-                   (offset_fsb < (imap->br_startoff +
-                                  imap->br_blockcount))) {
-                       XFS_STATS_INC(mp, xs_xstrat_quick);
-                       return 0;
                }
-
-               /*
-                * So far we have not mapped the requested part of the
-                * file, just surrounding data, try again.
-                */
-               count_fsb -= imap->br_blockcount;
-               map_start_fsb = imap->br_startoff + imap->br_blockcount;
+               /* ensure we only report blocks we have a reservation for */
+               xfs_trim_extent(&imap, cmap.br_startoff, cmap.br_blockcount);
+               shared = true;
        }
-
-trans_cancel:
-       xfs_trans_cancel(tp);
-error0:
+       error = xfs_bmbt_to_iomap(ip, iomap, &imap, shared);
+out_unlock:
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
        return error;
 }
@@ -975,7 +875,7 @@ xfs_ilock_for_iomap(
         * COW writes may allocate delalloc space or convert unwritten COW
         * extents, so we need to make sure to take the lock exclusively here.
         */
-       if (xfs_is_reflink_inode(ip) && is_write) {
+       if (xfs_is_cow_inode(ip) && is_write) {
                /*
                 * FIXME: It could still overwrite on unshared extents and not
                 * need allocation.
@@ -1009,7 +909,7 @@ relock:
         * check, so if we got ILOCK_SHARED for a write but we're now a
         * reflink inode we have to switch to ILOCK_EXCL and relock.
         */
-       if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_reflink_inode(ip)) {
+       if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_cow_inode(ip)) {
                xfs_iunlock(ip, mode);
                mode = XFS_ILOCK_EXCL;
                goto relock;
@@ -1081,23 +981,33 @@ xfs_file_iomap_begin(
         * Break shared extents if necessary. Checks for non-blocking IO have
         * been done up front, so we don't need to do them here.
         */
-       if (xfs_is_reflink_inode(ip)) {
+       if (xfs_is_cow_inode(ip)) {
+               struct xfs_bmbt_irec    cmap;
+               bool                    directio = (flags & IOMAP_DIRECT);
+
                /* if zeroing doesn't need COW allocation, then we are done. */
                if ((flags & IOMAP_ZERO) &&
                    !needs_cow_for_zeroing(&imap, nimaps))
                        goto out_found;
 
-               if (flags & IOMAP_DIRECT) {
-                       /* may drop and re-acquire the ilock */
-                       error = xfs_reflink_allocate_cow(ip, &imap, &shared,
-                                       &lockmode);
-                       if (error)
-                               goto out_unlock;
-               } else {
-                       error = xfs_reflink_reserve_cow(ip, &imap);
-                       if (error)
-                               goto out_unlock;
-               }
+               /* may drop and re-acquire the ilock */
+               cmap = imap;
+               error = xfs_reflink_allocate_cow(ip, &cmap, &shared, &lockmode,
+                               directio);
+               if (error)
+                       goto out_unlock;
+
+               /*
+                * For buffered writes we need to report the address of the
+                * previous block (if there was any) so that the higher level
+                * write code can perform read-modify-write operations; we
+                * won't need the CoW fork mapping until writeback.  For direct
+                * I/O, which must be block aligned, we need to report the
+                * newly allocated address.  If the data fork has a hole, copy
+                * the COW fork mapping to avoid allocating to the data fork.
+                */
+               if (directio || imap.br_startblock == HOLESTARTBLOCK)
+                       imap = cmap;
 
                end_fsb = imap.br_startoff + imap.br_blockcount;
                length = XFS_FSB_TO_B(mp, end_fsb) - offset;
@@ -1139,23 +1049,15 @@ xfs_file_iomap_begin(
                return error;
 
        iomap->flags |= IOMAP_F_NEW;
-       trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);
+       trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap);
 
 out_finish:
-       if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields
-                               & ~XFS_ILOG_TIMESTAMP))
-               iomap->flags |= IOMAP_F_DIRTY;
-
-       xfs_bmbt_to_iomap(ip, iomap, &imap);
-
-       if (shared)
-               iomap->flags |= IOMAP_F_SHARED;
-       return 0;
+       return xfs_bmbt_to_iomap(ip, iomap, &imap, shared);
 
 out_found:
        ASSERT(nimaps);
        xfs_iunlock(ip, lockmode);
-       trace_xfs_iomap_found(ip, offset, length, 0, &imap);
+       trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
        goto out_finish;
 
 out_unlock:
@@ -1240,6 +1142,92 @@ const struct iomap_ops xfs_iomap_ops = {
        .iomap_end              = xfs_file_iomap_end,
 };
 
+static int
+xfs_seek_iomap_begin(
+       struct inode            *inode,
+       loff_t                  offset,
+       loff_t                  length,
+       unsigned                flags,
+       struct iomap            *iomap)
+{
+       struct xfs_inode        *ip = XFS_I(inode);
+       struct xfs_mount        *mp = ip->i_mount;
+       xfs_fileoff_t           offset_fsb = XFS_B_TO_FSBT(mp, offset);
+       xfs_fileoff_t           end_fsb = XFS_B_TO_FSB(mp, offset + length);
+       xfs_fileoff_t           cow_fsb = NULLFILEOFF, data_fsb = NULLFILEOFF;
+       struct xfs_iext_cursor  icur;
+       struct xfs_bmbt_irec    imap, cmap;
+       int                     error = 0;
+       unsigned                lockmode;
+
+       if (XFS_FORCED_SHUTDOWN(mp))
+               return -EIO;
+
+       lockmode = xfs_ilock_data_map_shared(ip);
+       if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) {
+               error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+               if (error)
+                       goto out_unlock;
+       }
+
+       if (xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap)) {
+               /*
+                * If we found a data extent we are done.
+                */
+               if (imap.br_startoff <= offset_fsb)
+                       goto done;
+               data_fsb = imap.br_startoff;
+       } else {
+               /*
+                * Fake a hole until the end of the file.
+                */
+               data_fsb = min(XFS_B_TO_FSB(mp, offset + length),
+                              XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes));
+       }
+
+       /*
+        * If a COW fork extent covers the hole, report it - capped to the next
+        * data fork extent:
+        */
+       if (xfs_inode_has_cow_data(ip) &&
+           xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap))
+               cow_fsb = cmap.br_startoff;
+       if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
+               if (data_fsb < cow_fsb + cmap.br_blockcount)
+                       end_fsb = min(end_fsb, data_fsb);
+               xfs_trim_extent(&cmap, offset_fsb, end_fsb);
+               error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true);
+               /*
+                * This is a COW extent, so we must probe the page cache
+                * because there could be dirty page cache being backed
+                * by this extent.
+                */
+               iomap->type = IOMAP_UNWRITTEN;
+               goto out_unlock;
+       }
+
+       /*
+        * Else report a hole, capped to the next found data or COW extent.
+        */
+       if (cow_fsb != NULLFILEOFF && cow_fsb < data_fsb)
+               imap.br_blockcount = cow_fsb - offset_fsb;
+       else
+               imap.br_blockcount = data_fsb - offset_fsb;
+       imap.br_startoff = offset_fsb;
+       imap.br_startblock = HOLESTARTBLOCK;
+       imap.br_state = XFS_EXT_NORM;
+done:
+       xfs_trim_extent(&imap, offset_fsb, end_fsb);
+       error = xfs_bmbt_to_iomap(ip, iomap, &imap, false);
+out_unlock:
+       xfs_iunlock(ip, lockmode);
+       return error;
+}
+
+const struct iomap_ops xfs_seek_iomap_ops = {
+       .iomap_begin            = xfs_seek_iomap_begin,
+};
+
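The seek ops above back the llseek(SEEK_HOLE/SEEK_DATA) path, which is why dirty page cache over unwritten COW extents is reported as IOMAP_UNWRITTEN so the generic code probes the page cache. As a rough sketch of the consumer side, a hole/data walker built only on the standard lseek(2) flags (nothing XFS-specific; the file name is made up) might be:

/* Sketch: enumerate the data segments of a file with standard lseek(2)
 * SEEK_DATA/SEEK_HOLE; the file name is illustrative. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : "sparse-file";
	int fd = open(path, O_RDONLY);

	if (fd < 0)
		return 1;

	off_t end = lseek(fd, 0, SEEK_END);
	off_t pos = 0;

	while (pos < end) {
		off_t data = lseek(fd, pos, SEEK_DATA);
		if (data < 0)
			break;          /* only a trailing hole remains */
		off_t hole = lseek(fd, data, SEEK_HOLE);
		printf("data: [%lld, %lld)\n",
		       (long long)data, (long long)hole);
		pos = hole;
	}
	close(fd);
	return 0;
}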
 static int
 xfs_xattr_iomap_begin(
        struct inode            *inode,
@@ -1273,12 +1261,10 @@ xfs_xattr_iomap_begin(
 out_unlock:
        xfs_iunlock(ip, lockmode);
 
-       if (!error) {
-               ASSERT(nimaps);
-               xfs_bmbt_to_iomap(ip, iomap, &imap);
-       }
-
-       return error;
+       if (error)
+               return error;
+       ASSERT(nimaps);
+       return xfs_bmbt_to_iomap(ip, iomap, &imap, false);
 }
 
 const struct iomap_ops xfs_xattr_iomap_ops = {
index c6170548831bec4da8b7cea7a1847e60dcdc7e82..5c2f6aa6d78ffa810bdaeae1ed06cb85f465d1b6 100644 (file)
@@ -13,12 +13,10 @@ struct xfs_bmbt_irec;
 
 int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
                        struct xfs_bmbt_irec *, int);
-int xfs_iomap_write_allocate(struct xfs_inode *, int, xfs_off_t,
-                       struct xfs_bmbt_irec *, unsigned int *);
 int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool);
 
-void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
-               struct xfs_bmbt_irec *);
+int xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
+               struct xfs_bmbt_irec *, bool shared);
 xfs_extlen_t xfs_eof_alignment(struct xfs_inode *ip, xfs_extlen_t extsize);
 
 static inline xfs_filblks_t
@@ -42,6 +40,7 @@ xfs_aligned_fsb_count(
 }
 
 extern const struct iomap_ops xfs_iomap_ops;
+extern const struct iomap_ops xfs_seek_iomap_ops;
 extern const struct iomap_ops xfs_xattr_iomap_ops;
 
 #endif /* __XFS_IOMAP_H__*/
index f48ffd7a8d3e491d76defe66961194a635276115..74047bd0c1aeb44709ceae3ef779921778c4be0e 100644 (file)
@@ -191,9 +191,18 @@ xfs_generic_create(
 
        xfs_setup_iops(ip);
 
-       if (tmpfile)
+       if (tmpfile) {
+               /*
+                * The VFS requires that any inode fed to d_tmpfile must have
+                * nlink == 1 so that it can decrement the nlink in d_tmpfile.
+                * However, we created the temp file with nlink == 0 because
+                * we're not allowed to put an inode with nlink > 0 on the
+                * unlinked list.  Therefore we have to set nlink to 1 so that
+                * d_tmpfile can immediately set it back to zero.
+                */
+               set_nlink(inode, 1);
                d_tmpfile(dentry, inode);
-       else
+       } else
                d_instantiate(dentry, inode);
 
        xfs_finish_inode_setup(ip);
@@ -522,6 +531,10 @@ xfs_vn_getattr(
                }
        }
 
+       /*
+        * Note: If you add another clause to set an attribute flag, please
+        * update attributes_mask below.
+        */
        if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE)
                stat->attributes |= STATX_ATTR_IMMUTABLE;
        if (ip->i_d.di_flags & XFS_DIFLAG_APPEND)
@@ -529,6 +542,10 @@ xfs_vn_getattr(
        if (ip->i_d.di_flags & XFS_DIFLAG_NODUMP)
                stat->attributes |= STATX_ATTR_NODUMP;
 
+       stat->attributes_mask |= (STATX_ATTR_IMMUTABLE |
+                                 STATX_ATTR_APPEND |
+                                 STATX_ATTR_NODUMP);
+
        switch (inode->i_mode & S_IFMT) {
        case S_IFBLK:
        case S_IFCHR:
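
Reporting attributes_mask lets userspace tell "flag clear" apart from "flag not supported by this filesystem". A small sketch of that check from the statx(2) side (assuming the glibc 2.28+ wrapper; the path handling is illustrative):

/* Sketch: query the immutable flag with statx(2), honoring the mask so a
 * clear bit is not misread on filesystems that never report the flag.
 * Assumes the glibc 2.28+ statx() wrapper; the path is illustrative. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : ".";
	struct statx stx;

	if (statx(AT_FDCWD, path, 0, STATX_BASIC_STATS, &stx) != 0)
		return 1;

	if (!(stx.stx_attributes_mask & STATX_ATTR_IMMUTABLE))
		printf("%s: immutable flag not reported by this filesystem\n", path);
	else if (stx.stx_attributes & STATX_ATTR_IMMUTABLE)
		printf("%s: immutable\n", path);
	else
		printf("%s: not immutable\n", path);
	return 0;
}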
index 9fe88d125f0a2b78b9c433618df44be9fcecfa98..3371d1ff27c444d1a0eecfe26b519d6514682fa2 100644 (file)
@@ -2439,17 +2439,21 @@ xlog_recover_validate_buf_type(
        case XFS_BLFT_BTREE_BUF:
                switch (magic32) {
                case XFS_ABTB_CRC_MAGIC:
-               case XFS_ABTC_CRC_MAGIC:
                case XFS_ABTB_MAGIC:
+                       bp->b_ops = &xfs_bnobt_buf_ops;
+                       break;
+               case XFS_ABTC_CRC_MAGIC:
                case XFS_ABTC_MAGIC:
-                       bp->b_ops = &xfs_allocbt_buf_ops;
+                       bp->b_ops = &xfs_cntbt_buf_ops;
                        break;
                case XFS_IBT_CRC_MAGIC:
-               case XFS_FIBT_CRC_MAGIC:
                case XFS_IBT_MAGIC:
-               case XFS_FIBT_MAGIC:
                        bp->b_ops = &xfs_inobt_buf_ops;
                        break;
+               case XFS_FIBT_CRC_MAGIC:
+               case XFS_FIBT_MAGIC:
+                       bp->b_ops = &xfs_finobt_buf_ops;
+                       break;
                case XFS_BMAP_CRC_MAGIC:
                case XFS_BMAP_MAGIC:
                        bp->b_ops = &xfs_bmbt_buf_ops;
@@ -3045,7 +3049,7 @@ xlog_recover_inode_pass2(
         * Make sure the place we're flushing out to really looks
         * like an inode!
         */
-       if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) {
+       if (unlikely(!xfs_verify_magic16(bp, dip->di_magic))) {
                xfs_alert(mp,
        "%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %Ld",
                        __func__, dip, bp, in_f->ilf_ino);
index b4d8c318be3cef2c8c7750942f87883b4d5a00ba..fd63b0b1307c502ab45db45b37dd5da4cfa1d7f7 100644 (file)
@@ -149,6 +149,7 @@ xfs_free_perag(
                spin_unlock(&mp->m_perag_lock);
                ASSERT(pag);
                ASSERT(atomic_read(&pag->pag_ref) == 0);
+               xfs_iunlink_destroy(pag);
                xfs_buf_hash_destroy(pag);
                mutex_destroy(&pag->pag_ici_reclaim_lock);
                call_rcu(&pag->rcu_head, __xfs_free_perag);
@@ -227,6 +228,9 @@ xfs_initialize_perag(
                /* first new pag is fully initialized */
                if (first_initialised == NULLAGNUMBER)
                        first_initialised = index;
+               error = xfs_iunlink_init(pag);
+               if (error)
+                       goto out_hash_destroy;
        }
 
        index = xfs_set_inode_alloc(mp, agcount);
@@ -249,6 +253,7 @@ out_unwind_new_pags:
                if (!pag)
                        break;
                xfs_buf_hash_destroy(pag);
+               xfs_iunlink_destroy(pag);
                mutex_destroy(&pag->pag_ici_reclaim_lock);
                kmem_free(pag);
        }
index 7daafe064af84daeec805b971ad1705cee67653a..110f927cf943dbc9cdb9858c37cb51ad92c7d07d 100644 (file)
@@ -138,7 +138,7 @@ typedef struct xfs_mount {
        struct mutex            m_growlock;     /* growfs mutex */
        int                     m_fixedfsid[2]; /* unchanged for life of FS */
        uint64_t                m_flags;        /* global mount flags */
-       bool                    m_inotbt_nores; /* no per-AG finobt resv. */
+       bool                    m_finobt_nores; /* no per-AG finobt resv. */
        int                     m_ialloc_inos;  /* inodes in inode allocation */
        int                     m_ialloc_blks;  /* blocks in inode allocation */
        int                     m_ialloc_min_blks;/* min blocks in sparse inode
@@ -194,6 +194,7 @@ typedef struct xfs_mount {
         */
        uint32_t                m_generation;
 
+       bool                    m_always_cow;
        bool                    m_fail_unmount;
 #ifdef DEBUG
        /*
@@ -396,6 +397,13 @@ typedef struct xfs_perag {
 
        /* reference count */
        uint8_t                 pagf_refcount_level;
+
+       /*
+        * Unlinked inode information.  This incore information reflects
+        * data stored in the AGI, so callers must hold the AGI buffer lock
+        * or have some other means to control concurrency.
+        */
+       struct rhashtable       pagi_unlinked_hash;
 } xfs_perag_t;
 
 static inline struct xfs_ag_resv *
index d3e04d20d8d45007279ee85ab5b04833d3f9355a..c8ba98fae30aefa7013ebfa168fed652a955f3e7 100644 (file)
@@ -125,6 +125,27 @@ xfs_check_ondisk_structs(void)
        XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format,      56);
        XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat,        20);
        XFS_CHECK_STRUCT_SIZE(struct xfs_trans_header,          16);
+
+       /*
+        * The v5 superblock format extended several v4 header structures with
+        * additional data. While new fields are only accessible on v5
+        * superblocks, it's important that the v5 structures place original v4
+        * fields/headers in the correct location on-disk. For example, we must
+        * be able to find magic values at the same location in certain blocks
+        * regardless of superblock version.
+        *
+        * The following checks ensure that various v5 data structures place the
+        * subset of v4 metadata associated with the same type of block at the
+        * start of the on-disk block. If there is no data structure definition
+        * for certain types of v4 blocks, traverse down to the first field of
+        * common metadata (e.g., magic value) and make sure it is at offset
+        * zero.
+        */
+       XFS_CHECK_OFFSET(struct xfs_dir3_leaf, hdr.info.hdr,    0);
+       XFS_CHECK_OFFSET(struct xfs_da3_intnode, hdr.info.hdr,  0);
+       XFS_CHECK_OFFSET(struct xfs_dir3_data_hdr, hdr.magic,   0);
+       XFS_CHECK_OFFSET(struct xfs_dir3_free, hdr.hdr.magic,   0);
+       XFS_CHECK_OFFSET(struct xfs_attr3_leafblock, hdr.info.hdr, 0);
 }
 
 #endif /* __XFS_ONDISK_H */
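
The XFS_CHECK_STRUCT_SIZE/XFS_CHECK_OFFSET calls are build-time layout assertions; a kernel tree that moved one of these fields would fail to compile rather than write a misplaced magic value. The same technique in plain C11, against a made-up structure rather than a real XFS on-disk format, looks roughly like:

/* Sketch of the same compile-time layout-check technique in plain C11:
 * offsetof() plus _Static_assert() make the build fail if a field moves.
 * The structure is made up; it is not an XFS on-disk format. */
#include <stddef.h>
#include <stdint.h>

struct demo_block_hdr {
	uint32_t magic;                 /* must stay at offset 0 on disk */
	uint32_t crc;
	uint64_t owner;
};

_Static_assert(offsetof(struct demo_block_hdr, magic) == 0,
	       "magic must be the first field of the on-disk header");
_Static_assert(sizeof(struct demo_block_hdr) == 16,
	       "on-disk header size is part of the format");

int main(void) { return 0; }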
index f44c3599527d07441fc6eb689c9d442e29add600..bde2c9f56a46ab883fdfd5cb932d958838c867eb 100644 (file)
@@ -185,7 +185,7 @@ xfs_fs_map_blocks(
        }
        xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 
-       xfs_bmbt_to_iomap(ip, iomap, &imap);
+       error = xfs_bmbt_to_iomap(ip, iomap, &imap, false);
        *device_generation = mp->m_generation;
        return error;
 out_unlock:
index c5b4fa004ca4fd6ac3d25b7926284b67e7cf9282..680ae7662a78ef260fd4897b244b69898a239c5a 100644 (file)
@@ -192,7 +192,7 @@ xfs_reflink_trim_around_shared(
        int                     error = 0;
 
        /* Holes, unwritten, and delalloc extents cannot be shared */
-       if (!xfs_is_reflink_inode(ip) || !xfs_bmap_is_real_extent(irec)) {
+       if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_real_extent(irec)) {
                *shared = false;
                return 0;
        }
@@ -234,93 +234,59 @@ xfs_reflink_trim_around_shared(
        }
 }
 
-/*
- * Trim the passed in imap to the next shared/unshared extent boundary, and
- * if imap->br_startoff points to a shared extent reserve space for it in the
- * COW fork.
- *
- * Note that imap will always contain the block numbers for the existing blocks
- * in the data fork, as the upper layers need them for read-modify-write
- * operations.
- */
-int
-xfs_reflink_reserve_cow(
+bool
+xfs_inode_need_cow(
        struct xfs_inode        *ip,
-       struct xfs_bmbt_irec    *imap)
+       struct xfs_bmbt_irec    *imap,
+       bool                    *shared)
 {
-       struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
-       struct xfs_bmbt_irec    got;
-       int                     error = 0;
-       bool                    eof = false;
-       struct xfs_iext_cursor  icur;
-       bool                    shared;
-
-       /*
-        * Search the COW fork extent list first.  This serves two purposes:
-        * first this implement the speculative preallocation using cowextisze,
-        * so that we also unshared block adjacent to shared blocks instead
-        * of just the shared blocks themselves.  Second the lookup in the
-        * extent list is generally faster than going out to the shared extent
-        * tree.
-        */
-
-       if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &icur, &got))
-               eof = true;
-       if (!eof && got.br_startoff <= imap->br_startoff) {
-               trace_xfs_reflink_cow_found(ip, imap);
-               xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
+       /* We can't update any real extents in always COW mode. */
+       if (xfs_is_always_cow_inode(ip) &&
+           !isnullstartblock(imap->br_startblock)) {
+               *shared = true;
                return 0;
        }
 
        /* Trim the mapping to the nearest shared extent boundary. */
-       error = xfs_reflink_trim_around_shared(ip, imap, &shared);
-       if (error)
-               return error;
-
-       /* Not shared?  Just report the (potentially capped) extent. */
-       if (!shared)
-               return 0;
-
-       /*
-        * Fork all the shared blocks from our write offset until the end of
-        * the extent.
-        */
-       error = xfs_qm_dqattach_locked(ip, false);
-       if (error)
-               return error;
-
-       error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff,
-                       imap->br_blockcount, 0, &got, &icur, eof);
-       if (error == -ENOSPC || error == -EDQUOT)
-               trace_xfs_reflink_cow_enospc(ip, imap);
-       if (error)
-               return error;
-
-       xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
-       trace_xfs_reflink_cow_alloc(ip, &got);
-       return 0;
+       return xfs_reflink_trim_around_shared(ip, imap, shared);
 }
 
-/* Convert part of an unwritten CoW extent to a real one. */
-STATIC int
-xfs_reflink_convert_cow_extent(
-       struct xfs_inode                *ip,
-       struct xfs_bmbt_irec            *imap,
-       xfs_fileoff_t                   offset_fsb,
-       xfs_filblks_t                   count_fsb)
+static int
+xfs_reflink_convert_cow_locked(
+       struct xfs_inode        *ip,
+       xfs_fileoff_t           offset_fsb,
+       xfs_filblks_t           count_fsb)
 {
-       int                             nimaps = 1;
+       struct xfs_iext_cursor  icur;
+       struct xfs_bmbt_irec    got;
+       struct xfs_btree_cur    *dummy_cur = NULL;
+       int                     dummy_logflags;
+       int                     error = 0;
 
-       if (imap->br_state == XFS_EXT_NORM)
+       if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got))
                return 0;
 
-       xfs_trim_extent(imap, offset_fsb, count_fsb);
-       trace_xfs_reflink_convert_cow(ip, imap);
-       if (imap->br_blockcount == 0)
-               return 0;
-       return xfs_bmapi_write(NULL, ip, imap->br_startoff, imap->br_blockcount,
-                       XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT, 0, imap,
-                       &nimaps);
+       do {
+               if (got.br_startoff >= offset_fsb + count_fsb)
+                       break;
+               if (got.br_state == XFS_EXT_NORM)
+                       continue;
+               if (WARN_ON_ONCE(isnullstartblock(got.br_startblock)))
+                       return -EIO;
+
+               xfs_trim_extent(&got, offset_fsb, count_fsb);
+               if (!got.br_blockcount)
+                       continue;
+
+               got.br_state = XFS_EXT_NORM;
+               error = xfs_bmap_add_extent_unwritten_real(NULL, ip,
+                               XFS_COW_FORK, &icur, &dummy_cur, &got,
+                               &dummy_logflags);
+               if (error)
+                       return error;
+       } while (xfs_iext_next_extent(ip->i_cowfp, &icur, &got));
+
+       return error;
 }
 
 /* Convert all of the unwritten CoW extents in a file's range to real ones. */
@@ -334,15 +300,12 @@ xfs_reflink_convert_cow(
        xfs_fileoff_t           offset_fsb = XFS_B_TO_FSBT(mp, offset);
        xfs_fileoff_t           end_fsb = XFS_B_TO_FSB(mp, offset + count);
        xfs_filblks_t           count_fsb = end_fsb - offset_fsb;
-       struct xfs_bmbt_irec    imap;
-       int                     nimaps = 1, error = 0;
+       int                     error;
 
        ASSERT(count != 0);
 
        xfs_ilock(ip, XFS_ILOCK_EXCL);
-       error = xfs_bmapi_write(NULL, ip, offset_fsb, count_fsb,
-                       XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT |
-                       XFS_BMAPI_CONVERT_ONLY, 0, &imap, &nimaps);
+       error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
        return error;
 }
@@ -375,7 +338,7 @@ xfs_find_trim_cow_extent(
        if (got.br_startoff > offset_fsb) {
                xfs_trim_extent(imap, imap->br_startoff,
                                got.br_startoff - imap->br_startoff);
-               return xfs_reflink_trim_around_shared(ip, imap, shared);
+               return xfs_inode_need_cow(ip, imap, shared);
        }
 
        *shared = true;
@@ -397,7 +360,8 @@ xfs_reflink_allocate_cow(
        struct xfs_inode        *ip,
        struct xfs_bmbt_irec    *imap,
        bool                    *shared,
-       uint                    *lockmode)
+       uint                    *lockmode,
+       bool                    convert_now)
 {
        struct xfs_mount        *mp = ip->i_mount;
        xfs_fileoff_t           offset_fsb = imap->br_startoff;
@@ -409,7 +373,10 @@ xfs_reflink_allocate_cow(
        xfs_extlen_t            resblks = 0;
 
        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-       ASSERT(xfs_is_reflink_inode(ip));
+       if (!ip->i_cowfp) {
+               ASSERT(!xfs_is_reflink_inode(ip));
+               xfs_ifork_init_cow(ip);
+       }
 
        error = xfs_find_trim_cow_extent(ip, imap, shared, &found);
        if (error || !*shared)
@@ -471,7 +438,16 @@ xfs_reflink_allocate_cow(
        if (nimaps == 0)
                return -ENOSPC;
 convert:
-       return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb);
+       xfs_trim_extent(imap, offset_fsb, count_fsb);
+       /*
+        * COW fork extents are supposed to remain unwritten until we're ready
+        * to initiate a disk write.  For direct I/O we are going to write the
+        * data and need the conversion, but for buffered writes we're done.
+        */
+       if (!convert_now || imap->br_state == XFS_EXT_NORM)
+               return 0;
+       trace_xfs_reflink_convert_cow(ip, imap);
+       return xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
 
 out_unreserve:
        xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0,
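
The new convert_now argument lets the caller decide whether the freshly reserved COW extent should be converted to written state immediately. As a rough illustration only (the function name and is_direct flag below are invented, this is not the actual xfs_iomap.c caller), a write path could drive it like this:

        /*
         * Hypothetical caller: direct I/O writes the data right away and wants
         * the conversion now; buffered writes leave the COW extent unwritten
         * until writeback.
         */
        static int example_reserve_cow_for_write(struct xfs_inode *ip,
                        struct xfs_bmbt_irec *imap, uint *lockmode, bool is_direct)
        {
                bool                    shared = false;

                return xfs_reflink_allocate_cow(ip, imap, &shared, lockmode,
                                is_direct /* convert_now */);
        }
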
@@ -586,7 +562,7 @@ xfs_reflink_cancel_cow_range(
        int                     error;
 
        trace_xfs_reflink_cancel_cow_range(ip, offset, count);
-       ASSERT(xfs_is_reflink_inode(ip));
+       ASSERT(ip->i_cowfp);
 
        offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
        if (count == NULLFILEOFF)
@@ -1192,7 +1168,7 @@ xfs_reflink_remap_blocks(
                        break;
                ASSERT(nimaps == 1);
 
-               trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_IO_OVERWRITE,
+               trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_DATA_FORK,
                                &imap);
 
                /* Translate imap into the destination file. */
index 6d73daef1f132398d0b2ee02ed319c067bd02b6f..28a43b7f581d00fa5623e2b5f0b0730ddb6b0e1e 100644 (file)
@@ -6,16 +6,28 @@
 #ifndef __XFS_REFLINK_H
 #define __XFS_REFLINK_H 1
 
+static inline bool xfs_is_always_cow_inode(struct xfs_inode *ip)
+{
+       return ip->i_mount->m_always_cow &&
+               xfs_sb_version_hasreflink(&ip->i_mount->m_sb);
+}
+
+static inline bool xfs_is_cow_inode(struct xfs_inode *ip)
+{
+       return xfs_is_reflink_inode(ip) || xfs_is_always_cow_inode(ip);
+}
+
 extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp,
                xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t aglen,
                xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_maximal);
 extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
                struct xfs_bmbt_irec *irec, bool *shared);
+int xfs_inode_need_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap,
+               bool *shared);
 
-extern int xfs_reflink_reserve_cow(struct xfs_inode *ip,
-               struct xfs_bmbt_irec *imap);
 extern int xfs_reflink_allocate_cow(struct xfs_inode *ip,
-               struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode);
+               struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode,
+               bool convert_now);
 extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset,
                xfs_off_t count);
 
index c9097cb0b955ea9219185a97551808befc7da495..f093ea244849eb96d31eeaea0f292796f893c778 100644 (file)
@@ -1594,6 +1594,13 @@ xfs_mount_alloc(
        INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker);
        INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker);
        mp->m_kobj.kobject.kset = xfs_kset;
+       /*
+        * We don't create the finobt per-ag space reservation until after log
+        * recovery, so we must set this to true so that an ifree transaction
+        * started during log recovery will not depend on space reservations
+        * for finobt expansion.
+        */
+       mp->m_finobt_nores = true;
        return mp;
 }
 
@@ -1729,11 +1736,18 @@ xfs_fs_fill_super(
                }
        }
 
-       if (xfs_sb_version_hasreflink(&mp->m_sb) && mp->m_sb.sb_rblocks) {
-               xfs_alert(mp,
+       if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+               if (mp->m_sb.sb_rblocks) {
+                       xfs_alert(mp,
        "reflink not compatible with realtime device!");
-               error = -EINVAL;
-               goto out_filestream_unmount;
+                       error = -EINVAL;
+                       goto out_filestream_unmount;
+               }
+
+               if (xfs_globals.always_cow) {
+                       xfs_info(mp, "using DEBUG-only always_cow mode.");
+                       mp->m_always_cow = true;
+               }
        }
 
        if (xfs_sb_version_hasrmapbt(&mp->m_sb) && mp->m_sb.sb_rblocks) {
index 168488130a1906e34e1181da624891e20b73436a..ad7f9be130872c9e0664780115ba496bfd49e885 100644 (file)
@@ -85,6 +85,7 @@ struct xfs_globals {
        int     log_recovery_delay;     /* log recovery delay (secs) */
        int     mount_delay;            /* mount setup delay (secs) */
        bool    bug_on_assert;          /* BUG() the kernel on assert failure */
+       bool    always_cow;             /* use COW fork for all overwrites */
 };
 extern struct xfs_globals      xfs_globals;
 
index cd6a994a72500ac48755549db33d5510e3c36fdc..cabda13f3c64168a7a33d01e37bf895f9e4a07a4 100644 (file)
@@ -183,10 +183,34 @@ mount_delay_show(
 }
 XFS_SYSFS_ATTR_RW(mount_delay);
 
+static ssize_t
+always_cow_store(
+       struct kobject  *kobject,
+       const char      *buf,
+       size_t          count)
+{
+       ssize_t         ret;
+
+       ret = kstrtobool(buf, &xfs_globals.always_cow);
+       if (ret < 0)
+               return ret;
+       return count;
+}
+
+static ssize_t
+always_cow_show(
+       struct kobject  *kobject,
+       char            *buf)
+{
+       return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.always_cow);
+}
+XFS_SYSFS_ATTR_RW(always_cow);
+
 static struct attribute *xfs_dbg_attrs[] = {
        ATTR_LIST(bug_on_assert),
        ATTR_LIST(log_recovery_delay),
        ATTR_LIST(mount_delay),
+       ATTR_LIST(always_cow),
        NULL,
 };
 
index 6fcc893dfc91358e7da174e4bf4126112cb2a1e2..47fb07d86efdc05c705ea1379abf1eb259500d99 100644 (file)
@@ -1218,23 +1218,17 @@ DEFINE_EVENT(xfs_readpage_class, name,  \
 DEFINE_READPAGE_EVENT(xfs_vm_readpage);
 DEFINE_READPAGE_EVENT(xfs_vm_readpages);
 
-TRACE_DEFINE_ENUM(XFS_IO_HOLE);
-TRACE_DEFINE_ENUM(XFS_IO_DELALLOC);
-TRACE_DEFINE_ENUM(XFS_IO_UNWRITTEN);
-TRACE_DEFINE_ENUM(XFS_IO_OVERWRITE);
-TRACE_DEFINE_ENUM(XFS_IO_COW);
-
 DECLARE_EVENT_CLASS(xfs_imap_class,
        TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
-                int type, struct xfs_bmbt_irec *irec),
-       TP_ARGS(ip, offset, count, type, irec),
+                int whichfork, struct xfs_bmbt_irec *irec),
+       TP_ARGS(ip, offset, count, whichfork, irec),
        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(xfs_ino_t, ino)
                __field(loff_t, size)
                __field(loff_t, offset)
                __field(size_t, count)
-               __field(int, type)
+               __field(int, whichfork)
                __field(xfs_fileoff_t, startoff)
                __field(xfs_fsblock_t, startblock)
                __field(xfs_filblks_t, blockcount)
@@ -1245,33 +1239,33 @@ DECLARE_EVENT_CLASS(xfs_imap_class,
                __entry->size = ip->i_d.di_size;
                __entry->offset = offset;
                __entry->count = count;
-               __entry->type = type;
+               __entry->whichfork = whichfork;
                __entry->startoff = irec ? irec->br_startoff : 0;
                __entry->startblock = irec ? irec->br_startblock : 0;
                __entry->blockcount = irec ? irec->br_blockcount : 0;
        ),
        TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count %zd "
-                 "type %s startoff 0x%llx startblock %lld blockcount 0x%llx",
+                 "fork %s startoff 0x%llx startblock %lld blockcount 0x%llx",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->ino,
                  __entry->size,
                  __entry->offset,
                  __entry->count,
-                 __print_symbolic(__entry->type, XFS_IO_TYPES),
+                 __entry->whichfork == XFS_COW_FORK ? "cow" : "data",
                  __entry->startoff,
                  (int64_t)__entry->startblock,
                  __entry->blockcount)
 )
 
-#define DEFINE_IOMAP_EVENT(name)       \
+#define DEFINE_IMAP_EVENT(name)        \
 DEFINE_EVENT(xfs_imap_class, name,     \
        TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \
-                int type, struct xfs_bmbt_irec *irec),         \
-       TP_ARGS(ip, offset, count, type, irec))
-DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
-DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
-DEFINE_IOMAP_EVENT(xfs_iomap_alloc);
-DEFINE_IOMAP_EVENT(xfs_iomap_found);
+                int whichfork, struct xfs_bmbt_irec *irec),            \
+       TP_ARGS(ip, offset, count, whichfork, irec))
+DEFINE_IMAP_EVENT(xfs_map_blocks_found);
+DEFINE_IMAP_EVENT(xfs_map_blocks_alloc);
+DEFINE_IMAP_EVENT(xfs_iomap_alloc);
+DEFINE_IMAP_EVENT(xfs_iomap_found);
 
 DECLARE_EVENT_CLASS(xfs_simple_io_class,
        TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
@@ -3078,7 +3072,7 @@ DEFINE_EVENT(xfs_inode_irec_class, name, \
 DEFINE_INODE_EVENT(xfs_reflink_set_inode_flag);
 DEFINE_INODE_EVENT(xfs_reflink_unset_inode_flag);
 DEFINE_ITRUNC_EVENT(xfs_reflink_update_inode_size);
-DEFINE_IOMAP_EVENT(xfs_reflink_remap_imap);
+DEFINE_IMAP_EVENT(xfs_reflink_remap_imap);
 TRACE_EVENT(xfs_reflink_remap_blocks_loop,
        TP_PROTO(struct xfs_inode *src, xfs_fileoff_t soffset,
                 xfs_filblks_t len, struct xfs_inode *dest,
@@ -3202,13 +3196,10 @@ DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error);
 
 /* copy on write */
 DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared);
-DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc);
 DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found);
 DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc);
 DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow);
 
-DEFINE_RW_EVENT(xfs_reflink_reserve_cow);
-
 DEFINE_SIMPLE_IO_EVENT(xfs_reflink_bounce_dio_write);
 
 DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
@@ -3371,6 +3362,84 @@ DEFINE_TRANS_EVENT(xfs_trans_roll);
 DEFINE_TRANS_EVENT(xfs_trans_add_item);
 DEFINE_TRANS_EVENT(xfs_trans_free_items);
 
+TRACE_EVENT(xfs_iunlink_update_bucket,
+       TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int bucket,
+                xfs_agino_t old_ptr, xfs_agino_t new_ptr),
+       TP_ARGS(mp, agno, bucket, old_ptr, new_ptr),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_agnumber_t, agno)
+               __field(unsigned int, bucket)
+               __field(xfs_agino_t, old_ptr)
+               __field(xfs_agino_t, new_ptr)
+       ),
+       TP_fast_assign(
+               __entry->dev = mp->m_super->s_dev;
+               __entry->agno = agno;
+               __entry->bucket = bucket;
+               __entry->old_ptr = old_ptr;
+               __entry->new_ptr = new_ptr;
+       ),
+       TP_printk("dev %d:%d agno %u bucket %u old 0x%x new 0x%x",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->agno,
+                 __entry->bucket,
+                 __entry->old_ptr,
+                 __entry->new_ptr)
+);
+
+TRACE_EVENT(xfs_iunlink_update_dinode,
+       TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino,
+                xfs_agino_t old_ptr, xfs_agino_t new_ptr),
+       TP_ARGS(mp, agno, agino, old_ptr, new_ptr),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_agnumber_t, agno)
+               __field(xfs_agino_t, agino)
+               __field(xfs_agino_t, old_ptr)
+               __field(xfs_agino_t, new_ptr)
+       ),
+       TP_fast_assign(
+               __entry->dev = mp->m_super->s_dev;
+               __entry->agno = agno;
+               __entry->agino = agino;
+               __entry->old_ptr = old_ptr;
+               __entry->new_ptr = new_ptr;
+       ),
+       TP_printk("dev %d:%d agno %u agino 0x%x old 0x%x new 0x%x",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->agno,
+                 __entry->agino,
+                 __entry->old_ptr,
+                 __entry->new_ptr)
+);
+
+DECLARE_EVENT_CLASS(xfs_ag_inode_class,
+       TP_PROTO(struct xfs_inode *ip),
+       TP_ARGS(ip),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_agnumber_t, agno)
+               __field(xfs_agino_t, agino)
+       ),
+       TP_fast_assign(
+               __entry->dev = VFS_I(ip)->i_sb->s_dev;
+               __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino);
+               __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino);
+       ),
+       TP_printk("dev %d:%d agno %u agino %u",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->agno, __entry->agino)
+)
+
+#define DEFINE_AGINODE_EVENT(name) \
+DEFINE_EVENT(xfs_ag_inode_class, name, \
+       TP_PROTO(struct xfs_inode *ip), \
+       TP_ARGS(ip))
+DEFINE_AGINODE_EVENT(xfs_iunlink);
+DEFINE_AGINODE_EVENT(xfs_iunlink_remove);
+DEFINE_AG_EVENT(xfs_iunlink_map_prev_fallback);
+
 #endif /* _TRACE_XFS_H */
 
 #undef TRACE_INCLUDE_PATH
index 11cff449d055ce2c943046616bbc0bd96231980f..e1c7d55b32c37b1db0b15c131937eda73c89396d 100644 (file)
@@ -17,7 +17,6 @@
 #include "xfs_alloc.h"
 #include "xfs_bmap.h"
 #include "xfs_inode.h"
-#include "xfs_defer.h"
 
 /*
  * This routine is called to allocate a "bmap update done"
index 629f1479c9d234492d3a7d431dd21bb30f4db393..7d65ebf1e847a9c07c0fbb8178b26892ad3390e2 100644 (file)
@@ -277,7 +277,7 @@ xfs_trans_read_buf_map(
                 * release this buffer when it kills the tranaction.
                 */
                ASSERT(bp->b_ops != NULL);
-               error = xfs_buf_ensure_ops(bp, ops);
+               error = xfs_buf_reverify(bp, ops);
                if (error) {
                        xfs_buf_ioerror_alert(bp, __func__);
 
index 0710434eb24004db47b3b45863096b5ad7d1ed04..8ee7a3f8bb20bca0504adcda1eb0776bf4abce42 100644 (file)
@@ -18,7 +18,6 @@
 #include "xfs_alloc.h"
 #include "xfs_bmap.h"
 #include "xfs_trace.h"
-#include "xfs_defer.h"
 
 /*
  * This routine is called to allocate an "extent free done"
index 6c947ff4faf6ecb022fcb0aad0496226372eb977..8d734728dd1be9f6de8662e79b118692802bb12a 100644 (file)
@@ -16,7 +16,6 @@
 #include "xfs_refcount_item.h"
 #include "xfs_alloc.h"
 #include "xfs_refcount.h"
-#include "xfs_defer.h"
 
 /*
  * This routine is called to allocate a "refcount update done"
index a42890931ecd4a2690d3e52eb52e9843e63717d8..5c7936b1be13d3c3f52663438d522faebdec60ca 100644 (file)
@@ -16,7 +16,6 @@
 #include "xfs_rmap_item.h"
 #include "xfs_alloc.h"
 #include "xfs_rmap.h"
-#include "xfs_defer.h"
 
 /* Set the map extent flags for this reverse mapping. */
 static void
index 63ee1d5bf1d77a33d7f760f0f10d722266488cb1..9a63016009a1394f41beaff8323a5568b6ceab22 100644 (file)
@@ -129,6 +129,9 @@ __xfs_xattr_put_listent(
        char *offset;
        int arraytop;
 
+       if (context->count < 0 || context->seen_enough)
+               return;
+
        if (!context->alist)
                goto compute_size;
 
index a625c29a2ea2aa3047cbb22ac919d82446d88aaa..1e69d9fe16da6542b02290743ebcb83ce239ef77 100644 (file)
@@ -25,6 +25,7 @@
 
 #include <linux/sched.h>
 #include <linux/ptrace.h>
+#include <linux/namei.h>  /* LOOKUP_* */
 #include <uapi/linux/audit.h>
 
 #define AUDIT_INO_UNSET ((unsigned long)-1)
@@ -159,6 +160,18 @@ extern int             audit_update_lsm_rules(void);
 extern int audit_rule_change(int type, int seq, void *data, size_t datasz);
 extern int audit_list_rules_send(struct sk_buff *request_skb, int seq);
 
+extern int audit_set_loginuid(kuid_t loginuid);
+
+static inline kuid_t audit_get_loginuid(struct task_struct *tsk)
+{
+       return tsk->loginuid;
+}
+
+static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
+{
+       return tsk->sessionid;
+}
+
 extern u32 audit_enabled;
 #else /* CONFIG_AUDIT */
 static inline __printf(4, 5)
@@ -201,6 +214,17 @@ static inline int audit_log_task_context(struct audit_buffer *ab)
 }
 static inline void audit_log_task_info(struct audit_buffer *ab)
 { }
+
+static inline kuid_t audit_get_loginuid(struct task_struct *tsk)
+{
+       return INVALID_UID;
+}
+
+static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
+{
+       return AUDIT_SID_UNSET;
+}
+
 #define audit_enabled AUDIT_OFF
 #endif /* CONFIG_AUDIT */
 
@@ -225,6 +249,7 @@ extern void __audit_getname(struct filename *name);
 
 #define AUDIT_INODE_PARENT     1       /* dentry represents the parent */
 #define AUDIT_INODE_HIDDEN     2       /* audit record should be hidden */
+#define AUDIT_INODE_NOEVAL     4       /* audit record incomplete */
 extern void __audit_inode(struct filename *name, const struct dentry *dentry,
                                unsigned int flags);
 extern void __audit_file(const struct file *);
@@ -285,12 +310,15 @@ static inline void audit_getname(struct filename *name)
 }
 static inline void audit_inode(struct filename *name,
                                const struct dentry *dentry,
-                               unsigned int parent) {
+                               unsigned int flags) {
        if (unlikely(!audit_dummy_context())) {
-               unsigned int flags = 0;
-               if (parent)
-                       flags |= AUDIT_INODE_PARENT;
-               __audit_inode(name, dentry, flags);
+               unsigned int aflags = 0;
+
+               if (flags & LOOKUP_PARENT)
+                       aflags |= AUDIT_INODE_PARENT;
+               if (flags & LOOKUP_NO_EVAL)
+                       aflags |= AUDIT_INODE_NOEVAL;
+               __audit_inode(name, dentry, aflags);
        }
 }
 static inline void audit_file(struct file *file)
@@ -320,21 +348,6 @@ static inline void audit_ptrace(struct task_struct *t)
 }
 
                                /* Private API (for audit.c only) */
-extern unsigned int audit_serial(void);
-extern int auditsc_get_stamp(struct audit_context *ctx,
-                             struct timespec64 *t, unsigned int *serial);
-extern int audit_set_loginuid(kuid_t loginuid);
-
-static inline kuid_t audit_get_loginuid(struct task_struct *tsk)
-{
-       return tsk->loginuid;
-}
-
-static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
-{
-       return tsk->sessionid;
-}
-
 extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp);
 extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode);
 extern void __audit_bprm(struct linux_binprm *bprm);
@@ -514,19 +527,6 @@ static inline void audit_seccomp(unsigned long syscall, long signr, int code)
 static inline void audit_seccomp_actions_logged(const char *names,
                                                const char *old_names, int res)
 { }
-static inline int auditsc_get_stamp(struct audit_context *ctx,
-                             struct timespec64 *t, unsigned int *serial)
-{
-       return 0;
-}
-static inline kuid_t audit_get_loginuid(struct task_struct *tsk)
-{
-       return INVALID_UID;
-}
-static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
-{
-       return AUDIT_SID_UNSET;
-}
 static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp)
 { }
 static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid,
index f640dcbc880c0c291cdbd2a5780f4fd21be4b7b3..ecce0f43c73acde0bfce88d459d838aa12398d9b 100644 (file)
@@ -14,7 +14,7 @@
 #define _LINUX_CAPABILITY_H
 
 #include <uapi/linux/capability.h>
-
+#include <linux/uidgid.h>
 
 #define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3
 #define _KERNEL_CAPABILITY_U32S    _LINUX_CAPABILITY_U32S_3
@@ -25,11 +25,12 @@ typedef struct kernel_cap_struct {
        __u32 cap[_KERNEL_CAPABILITY_U32S];
 } kernel_cap_t;
 
-/* exact same as vfs_cap_data but in cpu endian and always filled completely */
+/* same as vfs_ns_cap_data but in cpu endian and always filled completely */
 struct cpu_vfs_cap_data {
        __u32 magic_etc;
        kernel_cap_t permitted;
        kernel_cap_t inheritable;
+       kuid_t rootid;
 };
 
 #define _USER_CAP_HEADER_SIZE  (sizeof(struct __user_cap_header_struct))
@@ -209,6 +210,7 @@ extern bool has_ns_capability_noaudit(struct task_struct *t,
 extern bool capable(int cap);
 extern bool ns_capable(struct user_namespace *ns, int cap);
 extern bool ns_capable_noaudit(struct user_namespace *ns, int cap);
+extern bool ns_capable_setid(struct user_namespace *ns, int cap);
 #else
 static inline bool has_capability(struct task_struct *t, int cap)
 {
@@ -240,6 +242,10 @@ static inline bool ns_capable_noaudit(struct user_namespace *ns, int cap)
 {
        return true;
 }
+static inline bool ns_capable_setid(struct user_namespace *ns, int cap)
+{
+       return true;
+}
 #endif /* CONFIG_MULTIUSER */
 extern bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct inode *inode);
 extern bool capable_wrt_inode_uidgid(const struct inode *inode, int cap);
index aad3babef007c37345b9c56bbfd4b238114bc266..1c70803e9f77056873e18aad6e1f3ce7195a25a1 100644 (file)
@@ -606,7 +606,7 @@ struct cgroup_subsys {
        void (*cancel_fork)(struct task_struct *task);
        void (*fork)(struct task_struct *task);
        void (*exit)(struct task_struct *task);
-       void (*free)(struct task_struct *task);
+       void (*release)(struct task_struct *task);
        void (*bind)(struct cgroup_subsys_state *root_css);
 
        bool early_init:1;
index 9968332cceed0e64e5fc9bdb814507b0bf67451b..81f58b4a5418da9bf57d1c6f78a2782aeba61377 100644 (file)
@@ -121,6 +121,7 @@ extern int cgroup_can_fork(struct task_struct *p);
 extern void cgroup_cancel_fork(struct task_struct *p);
 extern void cgroup_post_fork(struct task_struct *p);
 void cgroup_exit(struct task_struct *p);
+void cgroup_release(struct task_struct *p);
 void cgroup_free(struct task_struct *p);
 
 int cgroup_init_early(void);
@@ -697,6 +698,7 @@ static inline int cgroup_can_fork(struct task_struct *p) { return 0; }
 static inline void cgroup_cancel_fork(struct task_struct *p) {}
 static inline void cgroup_post_fork(struct task_struct *p) {}
 static inline void cgroup_exit(struct task_struct *p) {}
+static inline void cgroup_release(struct task_struct *p) {}
 static inline void cgroup_free(struct task_struct *p) {}
 
 static inline int cgroup_init_early(void) { return 0; }
index 4907c9df86b32ff7f645ced3f687bce85472a18f..ddd45bb74887426419927144a889634d3e5ae998 100644 (file)
@@ -15,7 +15,6 @@
 #include <linux/capability.h>
 #include <linux/init.h>
 #include <linux/key.h>
-#include <linux/selinux.h>
 #include <linux/atomic.h>
 #include <linux/uidgid.h>
 #include <linux/sched.h>
index 9e21427953351274892c5190056e250517ffcc20..b79fa9bb7359531f294dafb7550bd9fc0208987c 100644 (file)
@@ -19,7 +19,7 @@
                                 FAN_CLASS_PRE_CONTENT)
 
 #define FANOTIFY_INIT_FLAGS    (FANOTIFY_CLASS_BITS | \
-                                FAN_REPORT_TID | \
+                                FAN_REPORT_TID | FAN_REPORT_FID | \
                                 FAN_CLOEXEC | FAN_NONBLOCK | \
                                 FAN_UNLIMITED_QUEUE | FAN_UNLIMITED_MARKS)
 
                                 FAN_MARK_IGNORED_SURV_MODIFY | \
                                 FAN_MARK_FLUSH)
 
-/* Events that user can request to be notified on */
-#define FANOTIFY_EVENTS                (FAN_ACCESS | FAN_MODIFY | \
+/*
+ * Events that can be reported with data type FSNOTIFY_EVENT_PATH.
+ * Note that FAN_MODIFY can also be reported with data type
+ * FSNOTIFY_EVENT_INODE.
+ */
+#define FANOTIFY_PATH_EVENTS   (FAN_ACCESS | FAN_MODIFY | \
                                 FAN_CLOSE | FAN_OPEN | FAN_OPEN_EXEC)
 
+/*
+ * Directory entry modification events - reported only to directory
+ * where entry is modified and not to a watching parent.
+ */
+#define FANOTIFY_DIRENT_EVENTS (FAN_MOVE | FAN_CREATE | FAN_DELETE)
+
+/* Events that can only be reported with data type FSNOTIFY_EVENT_INODE */
+#define FANOTIFY_INODE_EVENTS  (FANOTIFY_DIRENT_EVENTS | \
+                                FAN_ATTRIB | FAN_MOVE_SELF | FAN_DELETE_SELF)
+
+/* Events that user can request to be notified on */
+#define FANOTIFY_EVENTS                (FANOTIFY_PATH_EVENTS | \
+                                FANOTIFY_INODE_EVENTS)
+
 /* Events that require a permission response from user */
 #define FANOTIFY_PERM_EVENTS   (FAN_OPEN_PERM | FAN_ACCESS_PERM | \
                                 FAN_OPEN_EXEC_PERM)
@@ -49,7 +67,7 @@
 /* Events that may be reported to user */
 #define FANOTIFY_OUTGOING_EVENTS       (FANOTIFY_EVENTS | \
                                         FANOTIFY_PERM_EVENTS | \
-                                        FAN_Q_OVERFLOW)
+                                        FAN_Q_OVERFLOW | FAN_ONDIR)
 
 #define ALL_FANOTIFY_EVENT_BITS                (FANOTIFY_OUTGOING_EVENTS | \
                                         FANOTIFY_EVENT_FLAGS)
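
As a rough illustration of how the regrouped masks are meant to be read (the helper below is invented for this example and is not part of fanotify):

        /*
         * Events in FANOTIFY_INODE_EVENTS carry no open file or path, so a
         * listener generally needs an identifier-based group (FAN_REPORT_FID)
         * to make sense of them; FANOTIFY_PATH_EVENTS can still be resolved
         * through the file descriptor delivered with the event.
         */
        static bool example_mask_requires_fid(__u64 mask)
        {
                return (mask & FANOTIFY_INODE_EVENTS) != 0;
        }
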
index 1a775aa3e349313b824c7554665fa976966a1d6e..2cc540805a02e6d99ca50dd0a6607a0240737b28 100644 (file)
@@ -37,6 +37,7 @@
 #include <linux/uuid.h>
 #include <linux/errseq.h>
 #include <linux/ioprio.h>
+#include <linux/fs_types.h>
 #include <linux/build_bug.h>
 #include <linux/stddef.h>
 
@@ -1708,22 +1709,6 @@ int fiemap_fill_next_extent(struct fiemap_extent_info *info, u64 logical,
                            u64 phys, u64 len, u32 flags);
 int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags);
 
-/*
- * File types
- *
- * NOTE! These match bits 12..15 of stat.st_mode
- * (ie "(i_mode >> 12) & 15").
- */
-#define DT_UNKNOWN     0
-#define DT_FIFO                1
-#define DT_CHR         2
-#define DT_DIR         4
-#define DT_BLK         6
-#define DT_REG         8
-#define DT_LNK         10
-#define DT_SOCK                12
-#define DT_WHT         14
-
 /*
  * This is the "filldir" function type, used by readdir() to let
  * the kernel specify what kind of dirent layout it wants to have.
diff --git a/include/linux/fs_types.h b/include/linux/fs_types.h
new file mode 100644 (file)
index 0000000..5481679
--- /dev/null
@@ -0,0 +1,75 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_FS_TYPES_H
+#define _LINUX_FS_TYPES_H
+
+/*
+ * This is a header for the common implementation of dirent
+ * to fs on-disk file type conversion.  Although the fs on-disk
+ * bits are specific to every file system, in practice, many
+ * file systems use the exact same on-disk format to describe
+ * the lower 3 file type bits that represent the 7 POSIX file
+ * types.
+ *
+ * It is important to note that the definitions in this
+ * header MUST NOT change. This would break both the
+ * userspace ABI and the on-disk format of filesystems
+ * using this code.
+ *
+ * All those file systems can use this generic code for the
+ * conversions.
+ */
+
+/*
+ * struct dirent file types
+ * exposed to user via getdents(2), readdir(3)
+ *
+ * These match bits 12..15 of stat.st_mode
+ * (ie "(i_mode >> 12) & 15").
+ */
+#define S_DT_SHIFT     12
+#define S_DT(mode)     (((mode) & S_IFMT) >> S_DT_SHIFT)
+#define S_DT_MASK      (S_IFMT >> S_DT_SHIFT)
+
+/* these are defined by POSIX and also present in glibc's dirent.h */
+#define DT_UNKNOWN     0
+#define DT_FIFO                1
+#define DT_CHR         2
+#define DT_DIR         4
+#define DT_BLK         6
+#define DT_REG         8
+#define DT_LNK         10
+#define DT_SOCK                12
+#define DT_WHT         14
+
+#define DT_MAX         (S_DT_MASK + 1) /* 16 */
+
+/*
+ * fs on-disk file types.
+ * Only the low 3 bits are used for the POSIX file types.
+ * Other bits are reserved for fs private use.
+ * These definitions are shared and used by multiple filesystems,
+ * and MUST NOT change under any circumstances.
+ *
+ * Note that no fs currently stores the whiteout type on-disk,
+ * so whiteout dirents are exposed to user as DT_CHR.
+ */
+#define FT_UNKNOWN     0
+#define FT_REG_FILE    1
+#define FT_DIR         2
+#define FT_CHRDEV      3
+#define FT_BLKDEV      4
+#define FT_FIFO                5
+#define FT_SOCK                6
+#define FT_SYMLINK     7
+
+#define FT_MAX         8
+
+/*
+ * declarations for helper functions, accompanying implementation
+ * is in fs/fs_types.c
+ */
+extern unsigned char fs_ftype_to_dtype(unsigned int filetype);
+extern unsigned char fs_umode_to_ftype(umode_t mode);
+extern unsigned char fs_umode_to_dtype(umode_t mode);
+
+#endif
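
To make the intended use concrete, here is a hedged sketch of the table-driven conversion the accompanying fs/fs_types.c helpers perform; the table and function names below are illustrative, not the actual implementation:

        /* Illustrative mapping from on-disk FT_* values to dirent DT_* values. */
        static const unsigned char example_dtype_by_ftype[FT_MAX] = {
                [FT_UNKNOWN]    = DT_UNKNOWN,
                [FT_REG_FILE]   = DT_REG,
                [FT_DIR]        = DT_DIR,
                [FT_CHRDEV]     = DT_CHR,
                [FT_BLKDEV]     = DT_BLK,
                [FT_FIFO]       = DT_FIFO,
                [FT_SOCK]       = DT_SOCK,
                [FT_SYMLINK]    = DT_LNK,
        };

        static unsigned char example_ftype_to_dtype(unsigned int filetype)
        {
                /* Out-of-range or fs-private bits degrade to DT_UNKNOWN. */
                if (filetype >= FT_MAX)
                        return DT_UNKNOWN;
                return example_dtype_by_ftype[filetype];
        }
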
index 2ccb08cb5d6a3ca97ec133f0fabc502035d5dc8b..09587e2860b53e5db5d280925eaebf8f2f421431 100644 (file)
 #include <linux/slab.h>
 #include <linux/bug.h>
 
+/*
+ * Notify this @dir inode about a change in the directory entry @dentry.
+ *
+ * Unlike fsnotify_parent(), the event will be reported regardless of the
+ * FS_EVENT_ON_CHILD mask on the parent inode.
+ */
+static inline int fsnotify_dirent(struct inode *dir, struct dentry *dentry,
+                                 __u32 mask)
+{
+       return fsnotify(dir, mask, d_inode(dentry), FSNOTIFY_EVENT_INODE,
+                       dentry->d_name.name, 0);
+}
+
 /* Notify this dentry's parent about a child's events. */
-static inline int fsnotify_parent(const struct path *path, struct dentry *dentry, __u32 mask)
+static inline int fsnotify_parent(const struct path *path,
+                                 struct dentry *dentry, __u32 mask)
 {
        if (!dentry)
                dentry = path->dentry;
@@ -65,6 +79,9 @@ static inline int fsnotify_perm(struct file *file, int mask)
                fsnotify_mask = FS_ACCESS_PERM;
        }
 
+       if (S_ISDIR(inode->i_mode))
+               fsnotify_mask |= FS_ISDIR;
+
        return fsnotify_path(inode, path, fsnotify_mask);
 }
 
@@ -73,7 +90,12 @@ static inline int fsnotify_perm(struct file *file, int mask)
  */
 static inline void fsnotify_link_count(struct inode *inode)
 {
-       fsnotify(inode, FS_ATTRIB, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
+       __u32 mask = FS_ATTRIB;
+
+       if (S_ISDIR(inode->i_mode))
+               mask |= FS_ISDIR;
+
+       fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
 }
 
 /*
@@ -81,12 +103,14 @@ static inline void fsnotify_link_count(struct inode *inode)
  */
 static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
                                 const unsigned char *old_name,
-                                int isdir, struct inode *target, struct dentry *moved)
+                                int isdir, struct inode *target,
+                                struct dentry *moved)
 {
        struct inode *source = moved->d_inode;
        u32 fs_cookie = fsnotify_get_cookie();
-       __u32 old_dir_mask = (FS_EVENT_ON_CHILD | FS_MOVED_FROM);
-       __u32 new_dir_mask = (FS_EVENT_ON_CHILD | FS_MOVED_TO);
+       __u32 old_dir_mask = FS_MOVED_FROM;
+       __u32 new_dir_mask = FS_MOVED_TO;
+       __u32 mask = FS_MOVE_SELF;
        const unsigned char *new_name = moved->d_name.name;
 
        if (old_dir == new_dir)
@@ -95,6 +119,7 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
        if (isdir) {
                old_dir_mask |= FS_ISDIR;
                new_dir_mask |= FS_ISDIR;
+               mask |= FS_ISDIR;
        }
 
        fsnotify(old_dir, old_dir_mask, source, FSNOTIFY_EVENT_INODE, old_name,
@@ -106,7 +131,7 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
                fsnotify_link_count(target);
 
        if (source)
-               fsnotify(source, FS_MOVE_SELF, moved->d_inode, FSNOTIFY_EVENT_INODE, NULL, 0);
+               fsnotify(source, mask, source, FSNOTIFY_EVENT_INODE, NULL, 0);
        audit_inode_child(new_dir, moved, AUDIT_TYPE_CHILD_CREATE);
 }
 
@@ -128,15 +153,35 @@ static inline void fsnotify_vfsmount_delete(struct vfsmount *mnt)
 
 /*
  * fsnotify_nameremove - a filename was removed from a directory
+ *
+ * This is mostly called under parent vfs inode lock so name and
+ * dentry->d_parent should be stable. However there are some corner cases where
+ * inode lock is not held. So to be on the safe side and be resilient to future
+ * callers and out of tree users of d_delete(), we do not assume that d_parent
+ * and d_name are stable and we use dget_parent() and
+ * take_dentry_name_snapshot() to grab stable references.
  */
 static inline void fsnotify_nameremove(struct dentry *dentry, int isdir)
 {
+       struct dentry *parent;
+       struct name_snapshot name;
        __u32 mask = FS_DELETE;
 
+       /* d_delete() of pseudo inode? (e.g. __ns_get_path() playing tricks) */
+       if (IS_ROOT(dentry))
+               return;
+
        if (isdir)
                mask |= FS_ISDIR;
 
-       fsnotify_parent(NULL, dentry, mask);
+       parent = dget_parent(dentry);
+       take_dentry_name_snapshot(&name, dentry);
+
+       fsnotify(d_inode(parent), mask, d_inode(dentry), FSNOTIFY_EVENT_INODE,
+                name.name, 0);
+
+       release_dentry_name_snapshot(&name);
+       dput(parent);
 }
 
 /*
@@ -144,7 +189,12 @@ static inline void fsnotify_nameremove(struct dentry *dentry, int isdir)
  */
 static inline void fsnotify_inoderemove(struct inode *inode)
 {
-       fsnotify(inode, FS_DELETE_SELF, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
+       __u32 mask = FS_DELETE_SELF;
+
+       if (S_ISDIR(inode->i_mode))
+               mask |= FS_ISDIR;
+
+       fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
        __fsnotify_inode_delete(inode);
 }
 
@@ -155,7 +205,7 @@ static inline void fsnotify_create(struct inode *inode, struct dentry *dentry)
 {
        audit_inode_child(inode, dentry, AUDIT_TYPE_CHILD_CREATE);
 
-       fsnotify(inode, FS_CREATE, dentry->d_inode, FSNOTIFY_EVENT_INODE, dentry->d_name.name, 0);
+       fsnotify_dirent(inode, dentry, FS_CREATE);
 }
 
 /*
@@ -176,12 +226,9 @@ static inline void fsnotify_link(struct inode *dir, struct inode *inode, struct
  */
 static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry)
 {
-       __u32 mask = (FS_CREATE | FS_ISDIR);
-       struct inode *d_inode = dentry->d_inode;
-
        audit_inode_child(inode, dentry, AUDIT_TYPE_CHILD_CREATE);
 
-       fsnotify(inode, mask, d_inode, FSNOTIFY_EVENT_INODE, dentry->d_name.name, 0);
+       fsnotify_dirent(inode, dentry, FS_CREATE | FS_ISDIR);
 }
 
 /*
index 7639774e7475d590ba39aef7327e4db37abdb754..dfc28fcb4de8c1268d14b4cfceac4900bf8bf578 100644 (file)
  * dnotify and inotify. */
 #define FS_EVENT_ON_CHILD      0x08000000
 
-/* This is a list of all events that may get sent to a parernt based on fs event
- * happening to inodes inside that directory */
-#define FS_EVENTS_POSS_ON_CHILD   (FS_ACCESS | FS_MODIFY | FS_ATTRIB |\
-                                  FS_CLOSE_WRITE | FS_CLOSE_NOWRITE | FS_OPEN |\
-                                  FS_MOVED_FROM | FS_MOVED_TO | FS_CREATE |\
-                                  FS_DELETE | FS_OPEN_PERM | FS_ACCESS_PERM | \
-                                  FS_OPEN_EXEC | FS_OPEN_EXEC_PERM)
-
 #define FS_MOVE                        (FS_MOVED_FROM | FS_MOVED_TO)
 
+/*
+ * Directory entry modification events - reported only to directory
+ * where entry is modified and not to a watching parent.
+ * The watching parent may get an FS_ATTRIB|FS_EVENT_ON_CHILD event
+ * when a directory entry inside a child subdir changes.
+ */
+#define ALL_FSNOTIFY_DIRENT_EVENTS     (FS_CREATE | FS_DELETE | FS_MOVE)
+
 #define ALL_FSNOTIFY_PERM_EVENTS (FS_OPEN_PERM | FS_ACCESS_PERM | \
                                  FS_OPEN_EXEC_PERM)
 
+/*
+ * This is a list of all events that may get sent to a parent based on fs event
+ * happening to inodes inside that directory.
+ */
+#define FS_EVENTS_POSS_ON_CHILD   (ALL_FSNOTIFY_PERM_EVENTS | \
+                                  FS_ACCESS | FS_MODIFY | FS_ATTRIB | \
+                                  FS_CLOSE_WRITE | FS_CLOSE_NOWRITE | \
+                                  FS_OPEN | FS_OPEN_EXEC)
+
 /* Events that can be reported to backends */
-#define ALL_FSNOTIFY_EVENTS (FS_ACCESS | FS_MODIFY | FS_ATTRIB | \
-                            FS_CLOSE_WRITE | FS_CLOSE_NOWRITE | FS_OPEN | \
-                            FS_MOVED_FROM | FS_MOVED_TO | FS_CREATE | \
-                            FS_DELETE | FS_DELETE_SELF | FS_MOVE_SELF | \
-                            FS_UNMOUNT | FS_Q_OVERFLOW | FS_IN_IGNORED | \
-                            FS_OPEN_PERM | FS_ACCESS_PERM | FS_DN_RENAME | \
-                            FS_OPEN_EXEC | FS_OPEN_EXEC_PERM)
+#define ALL_FSNOTIFY_EVENTS (ALL_FSNOTIFY_DIRENT_EVENTS | \
+                            FS_EVENTS_POSS_ON_CHILD | \
+                            FS_DELETE_SELF | FS_MOVE_SELF | FS_DN_RENAME | \
+                            FS_UNMOUNT | FS_Q_OVERFLOW | FS_IN_IGNORED)
 
 /* Extra flags that may be reported with event or control handling of events */
 #define ALL_FSNOTIFY_FLAGS  (FS_EXCL_UNLINK | FS_ISDIR | FS_IN_ONESHOT | \
@@ -129,7 +135,6 @@ struct fsnotify_event {
        struct list_head list;
        /* inode may ONLY be dereferenced during handle_event(). */
        struct inode *inode;    /* either the inode the event happened to or its parent */
-       u32 mask;               /* the type of access, bitwise OR for FS_* event types */
 };
 
 /*
@@ -288,6 +293,7 @@ typedef struct fsnotify_mark_connector __rcu *fsnotify_connp_t;
 struct fsnotify_mark_connector {
        spinlock_t lock;
        unsigned int type;      /* Type of object [lock] */
+       __kernel_fsid_t fsid;   /* fsid of filesystem containing object */
        union {
                /* Object pointer [lock] */
                fsnotify_connp_t *obj;
@@ -416,6 +422,9 @@ extern bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group);
 extern struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group);
 /* return AND dequeue the first event on the notification queue */
 extern struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group);
+/* Remove event queued in the notification list */
+extern void fsnotify_remove_queued_event(struct fsnotify_group *group,
+                                        struct fsnotify_event *event);
 
 /* functions used to manipulate the marks attached to inodes */
 
@@ -428,28 +437,35 @@ extern void fsnotify_init_mark(struct fsnotify_mark *mark,
 /* Find mark belonging to given group in the list of marks */
 extern struct fsnotify_mark *fsnotify_find_mark(fsnotify_connp_t *connp,
                                                struct fsnotify_group *group);
+/* Get cached fsid of filesystem containing object */
+extern int fsnotify_get_conn_fsid(const struct fsnotify_mark_connector *conn,
+                                 __kernel_fsid_t *fsid);
 /* attach the mark to the object */
 extern int fsnotify_add_mark(struct fsnotify_mark *mark,
                             fsnotify_connp_t *connp, unsigned int type,
-                            int allow_dups);
+                            int allow_dups, __kernel_fsid_t *fsid);
 extern int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
-                                   fsnotify_connp_t *connp, unsigned int type,
-                                   int allow_dups);
+                                   fsnotify_connp_t *connp,
+                                   unsigned int type, int allow_dups,
+                                   __kernel_fsid_t *fsid);
+
 /* attach the mark to the inode */
 static inline int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
                                          struct inode *inode,
                                          int allow_dups)
 {
        return fsnotify_add_mark(mark, &inode->i_fsnotify_marks,
-                                FSNOTIFY_OBJ_TYPE_INODE, allow_dups);
+                                FSNOTIFY_OBJ_TYPE_INODE, allow_dups, NULL);
 }
 static inline int fsnotify_add_inode_mark_locked(struct fsnotify_mark *mark,
                                                 struct inode *inode,
                                                 int allow_dups)
 {
        return fsnotify_add_mark_locked(mark, &inode->i_fsnotify_marks,
-                                       FSNOTIFY_OBJ_TYPE_INODE, allow_dups);
+                                       FSNOTIFY_OBJ_TYPE_INODE, allow_dups,
+                                       NULL);
 }
+
 /* given a group and a mark, flag mark to be freed when all references are dropped */
 extern void fsnotify_destroy_mark(struct fsnotify_mark *mark,
                                  struct fsnotify_group *group);
@@ -479,9 +495,12 @@ extern void fsnotify_put_mark(struct fsnotify_mark *mark);
 extern void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info);
 extern bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info);
 
-/* put here because inotify does some weird stuff when destroying watches */
-extern void fsnotify_init_event(struct fsnotify_event *event,
-                               struct inode *to_tell, u32 mask);
+static inline void fsnotify_init_event(struct fsnotify_event *event,
+                                      struct inode *inode)
+{
+       INIT_LIST_HEAD(&event->list);
+       event->inode = inode;
+}
 
 #else
 
index e07e91daaacc5c8d476ebe010d37ec6513bb648a..201f0f2683f25bd382267042f9f7dbec8460f093 100644 (file)
@@ -442,6 +442,11 @@ static inline int enable_kprobe(struct kprobe *kp)
 {
        return -ENOSYS;
 }
+
+static inline bool within_kprobe_blacklist(unsigned long addr)
+{
+       return true;
+}
 #endif /* CONFIG_KPROBES */
 static inline int disable_kretprobe(struct kretprobe *rp)
 {
index 9a0bdf91e6467bc1c4dae2629b9608a4047533ef..85a301632cf14402d6ba2d331096700b5320f13a 100644 (file)
  *     @cred contains the credentials to use.
  *     @ns contains the user namespace we want the capability in
  *     @cap contains the capability <include/linux/capability.h>.
- *     @audit contains whether to write an audit message or not
+ *     @opts contains options for the capable check <include/linux/security.h>
  *     Return 0 if the capability is granted for @tsk.
  * @syslog:
  *     Check permission before accessing the kernel message ring or changing
  *     @field contains the field which relates to current LSM.
  *     @op contains the operator that will be used for matching.
  *     @rule points to the audit rule that will be checked against.
- *     @actx points to the audit context associated with the check.
  *     Return 1 if secid matches the rule, 0 if it does not, -ERRNO on failure.
  *
  * @audit_rule_free:
@@ -1446,8 +1445,10 @@ union security_list_options {
                        const kernel_cap_t *effective,
                        const kernel_cap_t *inheritable,
                        const kernel_cap_t *permitted);
-       int (*capable)(const struct cred *cred, struct user_namespace *ns,
-                       int cap, int audit);
+       int (*capable)(const struct cred *cred,
+                       struct user_namespace *ns,
+                       int cap,
+                       unsigned int opts);
        int (*quotactl)(int cmds, int type, int id, struct super_block *sb);
        int (*quota_on)(struct dentry *dentry);
        int (*syslog)(int type);
@@ -1764,8 +1765,7 @@ union security_list_options {
        int (*audit_rule_init)(u32 field, u32 op, char *rulestr,
                                void **lsmrule);
        int (*audit_rule_known)(struct audit_krule *krule);
-       int (*audit_rule_match)(u32 secid, u32 field, u32 op, void *lsmrule,
-                               struct audit_context *actx);
+       int (*audit_rule_match)(u32 secid, u32 field, u32 op, void *lsmrule);
        void (*audit_rule_free)(void *lsmrule);
 #endif /* CONFIG_AUDIT */
 
@@ -2027,6 +2027,18 @@ struct security_hook_list {
        char                            *lsm;
 } __randomize_layout;
 
+/*
+ * Security blob size or offset data.
+ */
+struct lsm_blob_sizes {
+       int     lbs_cred;
+       int     lbs_file;
+       int     lbs_inode;
+       int     lbs_ipc;
+       int     lbs_msg_msg;
+       int     lbs_task;
+};
+
 /*
  * Initializing a security_hook_list structure takes
  * up a lot of space in a source file. This macro takes
@@ -2042,9 +2054,21 @@ extern char *lsm_names;
 extern void security_add_hooks(struct security_hook_list *hooks, int count,
                                char *lsm);
 
+#define LSM_FLAG_LEGACY_MAJOR  BIT(0)
+#define LSM_FLAG_EXCLUSIVE     BIT(1)
+
+enum lsm_order {
+       LSM_ORDER_FIRST = -1,   /* This is only for capabilities. */
+       LSM_ORDER_MUTABLE = 0,
+};
+
 struct lsm_info {
        const char *name;       /* Required. */
+       enum lsm_order order;   /* Optional: default is LSM_ORDER_MUTABLE */
+       unsigned long flags;    /* Optional: flags describing LSM */
+       int *enabled;           /* Optional: controlled by CONFIG_LSM */
        int (*init)(void);      /* Required. */
+       struct lsm_blob_sizes *blobs; /* Optional: for blob sharing. */
 };
 
 extern struct lsm_info __start_lsm_info[], __end_lsm_info[];
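
For context, the new fields slot into an LSM's registration record roughly as follows. This is an illustrative sketch only: the "example" LSM, its blob structs, and its init function are invented here, and the section-placement/registration macro is omitted.

        /* Hypothetical per-object blobs for an invented "example" LSM. */
        struct example_cred_blob  { u32 sid; };
        struct example_inode_blob { u32 sid; };

        static struct lsm_blob_sizes example_blob_sizes __lsm_ro_after_init = {
                .lbs_cred  = sizeof(struct example_cred_blob),
                .lbs_inode = sizeof(struct example_inode_blob),
        };

        static int __init example_lsm_init(void)
        {
                /* hooks would be registered here with security_add_hooks() */
                return 0;
        }

        static struct lsm_info example_lsm __initdata = {
                .name   = "example",
                .order  = LSM_ORDER_MUTABLE,    /* optional; this is the default */
                .flags  = 0,                    /* not a legacy major, not exclusive */
                .init   = example_lsm_init,
                .blobs  = &example_blob_sizes,
        };
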
@@ -2084,17 +2108,6 @@ static inline void security_delete_hooks(struct security_hook_list *hooks,
 #define __lsm_ro_after_init    __ro_after_init
 #endif /* CONFIG_SECURITY_WRITABLE_HOOKS */
 
-extern int __init security_module_enable(const char *module);
-extern void __init capability_add_hooks(void);
-#ifdef CONFIG_SECURITY_YAMA
-extern void __init yama_add_hooks(void);
-#else
-static inline void __init yama_add_hooks(void) { }
-#endif
-#ifdef CONFIG_SECURITY_LOADPIN
-void __init loadpin_add_hooks(void);
-#else
-static inline void loadpin_add_hooks(void) { };
-#endif
+extern int lsm_inode_alloc(struct inode *inode);
 
 #endif /* ! __LINUX_LSM_HOOKS_H */
index a78606e8e3df7c155410da7a366490354fb9dcb6..9138b4471dbfc250d0811615f87b78b58053bdc1 100644 (file)
@@ -24,6 +24,8 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND};
  *  - internal "there are more path components" flag
  *  - dentry cache is untrusted; force a real lookup
  *  - suppress terminal automount
+ *  - skip revalidation
+ *  - don't fetch xattrs on audit_inode
  */
 #define LOOKUP_FOLLOW          0x0001
 #define LOOKUP_DIRECTORY       0x0002
@@ -33,6 +35,7 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND};
 #define LOOKUP_REVAL           0x0020
 #define LOOKUP_RCU             0x0040
 #define LOOKUP_NO_REVAL                0x0080
+#define LOOKUP_NO_EVAL         0x0100
 
 /*
  * Intent data
index f073bd59df32c9214f70df5a9a2190bafdd064b8..1549584a15388a21b6a3d02938da2cfa50c8ccce 100644 (file)
@@ -872,8 +872,10 @@ struct task_struct {
 
        struct callback_head            *task_works;
 
-       struct audit_context            *audit_context;
+#ifdef CONFIG_AUDIT
 #ifdef CONFIG_AUDITSYSCALL
+       struct audit_context            *audit_context;
+#endif
        kuid_t                          loginuid;
        unsigned int                    sessionid;
 #endif
index dbfb5a66babb5ae8ed99b7e13095fc085fba01c8..2b35a43d11d63b1f8c917150216c51e7991390f3 100644 (file)
@@ -54,9 +54,12 @@ struct xattr;
 struct xfrm_sec_ctx;
 struct mm_struct;
 
+/* Default (no) options for the capable function */
+#define CAP_OPT_NONE 0x0
 /* If capable should audit the security request */
-#define SECURITY_CAP_NOAUDIT 0
-#define SECURITY_CAP_AUDIT 1
+#define CAP_OPT_NOAUDIT BIT(1)
+/* If capable is being called by a setid function */
+#define CAP_OPT_INSETID BIT(2)
 
 /* LSM Agnostic defines for sb_set_mnt_opts */
 #define SECURITY_LSM_NATIVE_LABELS     1
@@ -72,7 +75,7 @@ enum lsm_event {
 
 /* These functions are in security/commoncap.c */
 extern int cap_capable(const struct cred *cred, struct user_namespace *ns,
-                      int cap, int audit);
+                      int cap, unsigned int opts);
 extern int cap_settime(const struct timespec64 *ts, const struct timezone *tz);
 extern int cap_ptrace_access_check(struct task_struct *child, unsigned int mode);
 extern int cap_ptrace_traceme(struct task_struct *parent);
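
For illustration, the opts bits replace the old audit int roughly like this; the wrapper below is a sketch, not the kernel/capability.c implementation:

        /*
         * Hypothetical wrapper: a capability check that still audits, but
         * tells the LSM the request comes from a set*id() path.
         */
        static bool example_ns_capable_setid(struct user_namespace *ns, int cap)
        {
                return security_capable(current_cred(), ns, cap,
                                        CAP_OPT_INSETID) == 0;
        }
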
@@ -207,10 +210,10 @@ int security_capset(struct cred *new, const struct cred *old,
                    const kernel_cap_t *effective,
                    const kernel_cap_t *inheritable,
                    const kernel_cap_t *permitted);
-int security_capable(const struct cred *cred, struct user_namespace *ns,
-                       int cap);
-int security_capable_noaudit(const struct cred *cred, struct user_namespace *ns,
-                            int cap);
+int security_capable(const struct cred *cred,
+                      struct user_namespace *ns,
+                      int cap,
+                      unsigned int opts);
 int security_quotactl(int cmds, int type, int id, struct super_block *sb);
 int security_quota_on(struct dentry *dentry);
 int security_syslog(int type);
@@ -366,8 +369,10 @@ int security_sem_semctl(struct kern_ipc_perm *sma, int cmd);
 int security_sem_semop(struct kern_ipc_perm *sma, struct sembuf *sops,
                        unsigned nsops, int alter);
 void security_d_instantiate(struct dentry *dentry, struct inode *inode);
-int security_getprocattr(struct task_struct *p, char *name, char **value);
-int security_setprocattr(const char *name, void *value, size_t size);
+int security_getprocattr(struct task_struct *p, const char *lsm, char *name,
+                        char **value);
+int security_setprocattr(const char *lsm, const char *name, void *value,
+                        size_t size);
 int security_netlink_send(struct sock *sk, struct sk_buff *skb);
 int security_ismaclabel(const char *name);
 int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen);
@@ -462,14 +467,11 @@ static inline int security_capset(struct cred *new,
 }
 
 static inline int security_capable(const struct cred *cred,
-                                  struct user_namespace *ns, int cap)
+                                  struct user_namespace *ns,
+                                  int cap,
+                                  unsigned int opts)
 {
-       return cap_capable(cred, ns, cap, SECURITY_CAP_AUDIT);
-}
-
-static inline int security_capable_noaudit(const struct cred *cred,
-                                          struct user_namespace *ns, int cap) {
-       return cap_capable(cred, ns, cap, SECURITY_CAP_NOAUDIT);
+       return cap_capable(cred, ns, cap, opts);
 }
 
 static inline int security_quotactl(int cmds, int type, int id,
@@ -1112,15 +1114,18 @@ static inline int security_sem_semop(struct kern_ipc_perm *sma,
        return 0;
 }
 
-static inline void security_d_instantiate(struct dentry *dentry, struct inode *inode)
+static inline void security_d_instantiate(struct dentry *dentry,
+                                         struct inode *inode)
 { }
 
-static inline int security_getprocattr(struct task_struct *p, char *name, char **value)
+static inline int security_getprocattr(struct task_struct *p, const char *lsm,
+                                      char *name, char **value)
 {
        return -EINVAL;
 }
 
-static inline int security_setprocattr(char *name, void *value, size_t size)
+static inline int security_setprocattr(const char *lsm, char *name,
+                                      void *value, size_t size)
 {
        return -EINVAL;
 }
@@ -1674,8 +1679,7 @@ static inline int security_key_getsecurity(struct key *key, char **_buffer)
 #ifdef CONFIG_SECURITY
 int security_audit_rule_init(u32 field, u32 op, char *rulestr, void **lsmrule);
 int security_audit_rule_known(struct audit_krule *krule);
-int security_audit_rule_match(u32 secid, u32 field, u32 op, void *lsmrule,
-                             struct audit_context *actx);
+int security_audit_rule_match(u32 secid, u32 field, u32 op, void *lsmrule);
 void security_audit_rule_free(void *lsmrule);
 
 #else
@@ -1692,7 +1696,7 @@ static inline int security_audit_rule_known(struct audit_krule *krule)
 }
 
 static inline int security_audit_rule_match(u32 secid, u32 field, u32 op,
-                                  void *lsmrule, struct audit_context *actx)
+                                           void *lsmrule)
 {
        return 0;
 }
diff --git a/include/linux/selinux.h b/include/linux/selinux.h
deleted file mode 100644 (file)
index 44f4596..0000000
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * SELinux services exported to the rest of the kernel.
- *
- * Author: James Morris <jmorris@redhat.com>
- *
- * Copyright (C) 2005 Red Hat, Inc., James Morris <jmorris@redhat.com>
- * Copyright (C) 2006 Trusted Computer Solutions, Inc. <dgoeddel@trustedcs.com>
- * Copyright (C) 2006 IBM Corporation, Timothy R. Chavez <tinytim@us.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2,
- * as published by the Free Software Foundation.
- */
-#ifndef _LINUX_SELINUX_H
-#define _LINUX_SELINUX_H
-
-struct selinux_audit_rule;
-struct audit_context;
-struct kern_ipc_perm;
-
-#ifdef CONFIG_SECURITY_SELINUX
-
-/**
- * selinux_is_enabled - is SELinux enabled?
- */
-bool selinux_is_enabled(void);
-#else
-
-static inline bool selinux_is_enabled(void)
-{
-       return false;
-}
-#endif /* CONFIG_SECURITY_SELINUX */
-
-#endif /* _LINUX_SELINUX_H */
index 3142e98546ac9e43b1c35e3649796a77279086b1..9bc69edb8f188fcc63c468c1dd2bd02983a86365 100644 (file)
@@ -41,4 +41,7 @@ struct kstatfs {
 #define ST_NODIRATIME  0x0800  /* do not update directory access times */
 #define ST_RELATIME    0x1000  /* update atime relative to mtime/ctime */
 
+struct dentry;
+extern int vfs_get_fsid(struct dentry *dentry, __kernel_fsid_t *fsid);
+
 #endif
index 7c007ed7505f947eab77d85722ba79200c55afcc..54254388899e589431f80b29ab7f387f44822fe6 100644 (file)
@@ -60,9 +60,6 @@ extern void swiotlb_tbl_sync_single(struct device *hwdev,
                                    size_t size, enum dma_data_direction dir,
                                    enum dma_sync_target target);
 
-extern int
-swiotlb_dma_supported(struct device *hwdev, u64 mask);
-
 #ifdef CONFIG_SWIOTLB
 extern enum swiotlb_force swiotlb_force;
 extern phys_addr_t io_tlb_start, io_tlb_end;
index 2887503e4d128fd3ab9caaea8f55cfc16bf07006..ab1cc33adbac6ddb145ea8d698c1b2084dde0cdc 100644 (file)
@@ -1051,6 +1051,7 @@ TRACE_EVENT(btrfs_trigger_flush,
                { FLUSH_DELAYED_REFS_NR,        "FLUSH_DELAYED_REFS_NR"},       \
                { FLUSH_DELAYED_REFS,           "FLUSH_ELAYED_REFS"},           \
                { ALLOC_CHUNK,                  "ALLOC_CHUNK"},                 \
+               { ALLOC_CHUNK_FORCE,            "ALLOC_CHUNK_FORCE"},           \
                { COMMIT_TRANS,                 "COMMIT_TRANS"})
 
 TRACE_EVENT(btrfs_flush_space,
@@ -1512,35 +1513,6 @@ DEFINE_EVENT(btrfs__qgroup_rsv_data, btrfs_qgroup_release_data,
        TP_ARGS(inode, start, len, reserved, op)
 );
 
-DECLARE_EVENT_CLASS(btrfs__qgroup_delayed_ref,
-
-       TP_PROTO(const struct btrfs_fs_info *fs_info,
-                u64 ref_root, u64 reserved),
-
-       TP_ARGS(fs_info, ref_root, reserved),
-
-       TP_STRUCT__entry_btrfs(
-               __field(        u64,            ref_root        )
-               __field(        u64,            reserved        )
-       ),
-
-       TP_fast_assign_btrfs(fs_info,
-               __entry->ref_root       = ref_root;
-               __entry->reserved       = reserved;
-       ),
-
-       TP_printk_btrfs("root=%llu reserved=%llu op=free",
-                 __entry->ref_root, __entry->reserved)
-);
-
-DEFINE_EVENT(btrfs__qgroup_delayed_ref, btrfs_qgroup_free_delayed_ref,
-
-       TP_PROTO(const struct btrfs_fs_info *fs_info,
-                u64 ref_root, u64 reserved),
-
-       TP_ARGS(fs_info, ref_root, reserved)
-);
-
 DECLARE_EVENT_CLASS(btrfs_qgroup_extent,
        TP_PROTO(const struct btrfs_fs_info *fs_info,
                 const struct btrfs_qgroup_extent_record *rec),
index e0763bc4158e1a514f4647f4238226caeea642ef..c195896d478f295ee80b667597a6c400696ac0dc 100644 (file)
@@ -837,6 +837,8 @@ enum btrfs_err_code {
                                   struct btrfs_ioctl_vol_args)
 #define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \
                                   struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_FORGET_DEV _IOW(BTRFS_IOCTL_MAGIC, 5, \
+                                  struct btrfs_ioctl_vol_args)
 /* trans start and trans end are dangerous, and only for
  * use by applications that know how to avoid the
  * resulting deadlocks
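
BTRFS_IOC_FORGET_DEV gives user space a way to make the kernel drop its cached record of scanned but unmounted btrfs devices. A minimal userspace sketch, assuming uapi headers new enough to carry the definition and assuming the ioctl is driven through the /dev/btrfs-control node the same way BTRFS_IOC_SCAN_DEV is, with the device path placed in the vol_args name field:

/* Illustrative only: ask btrfs to forget one scanned device. */
#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/btrfs.h>

static int btrfs_forget_device(const char *dev_path)
{
	struct btrfs_ioctl_vol_args args = { 0 };
	int fd, ret;

	strncpy(args.name, dev_path, sizeof(args.name) - 1);

	fd = open("/dev/btrfs-control", O_RDWR);
	if (fd < 0)
		return -1;

	ret = ioctl(fd, BTRFS_IOC_FORGET_DEV, &args);
	close(fd);
	return ret;
}
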
index 909c98fcace2e48fdc12d7c7716314b85a5dcfd1..b9effa6f8503767a43ba167acd62bce7e94c2eb8 100644 (file)
@@ -7,9 +7,16 @@
 /* the following events that user-space can register for */
 #define FAN_ACCESS             0x00000001      /* File was accessed */
 #define FAN_MODIFY             0x00000002      /* File was modified */
+#define FAN_ATTRIB             0x00000004      /* Metadata changed */
 #define FAN_CLOSE_WRITE                0x00000008      /* Writtable file closed */
 #define FAN_CLOSE_NOWRITE      0x00000010      /* Unwrittable file closed */
 #define FAN_OPEN               0x00000020      /* File was opened */
+#define FAN_MOVED_FROM         0x00000040      /* File was moved from X */
+#define FAN_MOVED_TO           0x00000080      /* File was moved to Y */
+#define FAN_CREATE             0x00000100      /* Subfile was created */
+#define FAN_DELETE             0x00000200      /* Subfile was deleted */
+#define FAN_DELETE_SELF                0x00000400      /* Self was deleted */
+#define FAN_MOVE_SELF          0x00000800      /* Self was moved */
 #define FAN_OPEN_EXEC          0x00001000      /* File was opened for exec */
 
 #define FAN_Q_OVERFLOW         0x00004000      /* Event queued overflowed */
@@ -24,6 +31,7 @@
 
 /* helper events */
 #define FAN_CLOSE              (FAN_CLOSE_WRITE | FAN_CLOSE_NOWRITE) /* close */
+#define FAN_MOVE               (FAN_MOVED_FROM | FAN_MOVED_TO) /* moves */
 
 /* flags used for fanotify_init() */
 #define FAN_CLOEXEC            0x00000001
@@ -44,6 +52,7 @@
 
 /* Flags to determine fanotify event format */
 #define FAN_REPORT_TID         0x00000100      /* event->pid is thread id */
+#define FAN_REPORT_FID         0x00000200      /* Report unique file id */
 
 /* Deprecated - do not use this in programs and do not add new flags here! */
 #define FAN_ALL_INIT_FLAGS     (FAN_CLOEXEC | FAN_NONBLOCK | \
@@ -106,6 +115,26 @@ struct fanotify_event_metadata {
        __s32 pid;
 };
 
+#define FAN_EVENT_INFO_TYPE_FID                1
+
+/* Variable length info record following event metadata */
+struct fanotify_event_info_header {
+       __u8 info_type;
+       __u8 pad;
+       __u16 len;
+};
+
+/* Unique file identifier info record */
+struct fanotify_event_info_fid {
+       struct fanotify_event_info_header hdr;
+       __kernel_fsid_t fsid;
+       /*
+        * Following is an opaque struct file_handle that can be passed as
+        * an argument to open_by_handle_at(2).
+        */
+       unsigned char handle[0];
+};
+
 struct fanotify_response {
        __s32 fd;
        __u32 response;
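
FAN_REPORT_FID changes the event format: instead of an open file descriptor, each event carries a variable-length info record with the filesystem id and an opaque file handle, which is also what makes the new directory-entry events (FAN_CREATE, FAN_DELETE, FAN_MOVE and friends) reportable. A rough userspace sketch, assuming libc headers new enough to carry these definitions, where mount_fd is any descriptor on the watched filesystem and all error handling is omitted:

#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/fanotify.h>
#include <unistd.h>

static void watch_dir(const char *dir, int mount_fd)
{
	char buf[4096];
	int fd = fanotify_init(FAN_CLASS_NOTIF | FAN_REPORT_FID, 0);

	fanotify_mark(fd, FAN_MARK_ADD, FAN_CREATE | FAN_DELETE | FAN_MOVE,
		      AT_FDCWD, dir);

	for (;;) {
		ssize_t len = read(fd, buf, sizeof(buf));
		struct fanotify_event_metadata *md = (void *)buf;

		for (; FAN_EVENT_OK(md, len); md = FAN_EVENT_NEXT(md, len)) {
			struct fanotify_event_info_fid *fid = (void *)(md + 1);
			struct file_handle *fh = (struct file_handle *)fid->handle;

			if (fid->hdr.info_type != FAN_EVENT_INFO_TYPE_FID)
				continue;

			/* resolve the reported handle back to an fd on demand */
			int evfd = open_by_handle_at(mount_fd, fh, O_RDONLY);
			if (evfd >= 0)
				close(evfd);
		}
	}
}
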
index df0257c5928c17f31823ebf9bf4eee7e287ff383..c70ef656d0f40cf4fc603d0a7dbb70ba1820af73 100644 (file)
@@ -122,7 +122,7 @@ struct task_struct init_task
        .thread_pid     = &init_struct_pid,
        .thread_group   = LIST_HEAD_INIT(init_task.thread_group),
        .thread_node    = LIST_HEAD_INIT(init_signals.thread_head),
-#ifdef CONFIG_AUDITSYSCALL
+#ifdef CONFIG_AUDIT
        .loginuid       = INVALID_UID,
        .sessionid      = AUDIT_SID_UNSET,
 #endif
index 632d360595560b8d2a0d0a1305054d5b9432cfa6..c89ea48c70a6ef70adef41122dfc2bda62cebc36 100644 (file)
@@ -396,10 +396,10 @@ static int audit_log_config_change(char *function_name, u32 new, u32 old,
        struct audit_buffer *ab;
        int rc = 0;
 
-       ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
+       ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_CONFIG_CHANGE);
        if (unlikely(!ab))
                return rc;
-       audit_log_format(ab, "%s=%u old=%u ", function_name, new, old);
+       audit_log_format(ab, "op=set %s=%u old=%u ", function_name, new, old);
        audit_log_session_info(ab);
        rc = audit_log_task_context(ab);
        if (rc)
@@ -1053,7 +1053,8 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
        return err;
 }
 
-static void audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type)
+static void audit_log_common_recv_msg(struct audit_context *context,
+                                       struct audit_buffer **ab, u16 msg_type)
 {
        uid_t uid = from_kuid(&init_user_ns, current_uid());
        pid_t pid = task_tgid_nr(current);
@@ -1063,7 +1064,7 @@ static void audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type)
                return;
        }
 
-       *ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
+       *ab = audit_log_start(context, GFP_KERNEL, msg_type);
        if (unlikely(!*ab))
                return;
        audit_log_format(*ab, "pid=%d uid=%u ", pid, uid);
@@ -1071,6 +1072,12 @@ static void audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type)
        audit_log_task_context(*ab);
 }
 
+static inline void audit_log_user_recv_msg(struct audit_buffer **ab,
+                                          u16 msg_type)
+{
+       audit_log_common_recv_msg(NULL, ab, msg_type);
+}
+
 int is_audit_feature_set(int i)
 {
        return af.features & AUDIT_FEATURE_TO_MASK(i);
@@ -1338,7 +1345,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
                                if (err)
                                        break;
                        }
-                       audit_log_common_recv_msg(&ab, msg_type);
+                       audit_log_user_recv_msg(&ab, msg_type);
                        if (msg_type != AUDIT_USER_TTY)
                                audit_log_format(ab, " msg='%.*s'",
                                                 AUDIT_MESSAGE_TEXT_MAX,
@@ -1361,8 +1368,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
                if (nlmsg_len(nlh) < sizeof(struct audit_rule_data))
                        return -EINVAL;
                if (audit_enabled == AUDIT_LOCKED) {
-                       audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
-                       audit_log_format(ab, " audit_enabled=%d res=0", audit_enabled);
+                       audit_log_common_recv_msg(audit_context(), &ab,
+                                                 AUDIT_CONFIG_CHANGE);
+                       audit_log_format(ab, " op=%s audit_enabled=%d res=0",
+                                        msg_type == AUDIT_ADD_RULE ?
+                                               "add_rule" : "remove_rule",
+                                        audit_enabled);
                        audit_log_end(ab);
                        return -EPERM;
                }
@@ -1373,7 +1384,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
                break;
        case AUDIT_TRIM:
                audit_trim_trees();
-               audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
+               audit_log_common_recv_msg(audit_context(), &ab,
+                                         AUDIT_CONFIG_CHANGE);
                audit_log_format(ab, " op=trim res=1");
                audit_log_end(ab);
                break;
@@ -1403,8 +1415,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
                /* OK, here comes... */
                err = audit_tag_tree(old, new);
 
-               audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
-
+               audit_log_common_recv_msg(audit_context(), &ab,
+                                         AUDIT_CONFIG_CHANGE);
                audit_log_format(ab, " op=make_equiv old=");
                audit_log_untrustedstring(ab, old);
                audit_log_format(ab, " new=");
@@ -1471,7 +1483,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
                old.enabled = t & AUDIT_TTY_ENABLE;
                old.log_passwd = !!(t & AUDIT_TTY_LOG_PASSWD);
 
-               audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
+               audit_log_common_recv_msg(audit_context(), &ab,
+                                         AUDIT_CONFIG_CHANGE);
                audit_log_format(ab, " op=tty_set old-enabled=%d new-enabled=%d"
                                 " old-log_passwd=%d new-log_passwd=%d res=%d",
                                 old.enabled, s.enabled, old.log_passwd,
@@ -2054,153 +2067,6 @@ void audit_log_key(struct audit_buffer *ab, char *key)
                audit_log_format(ab, "(null)");
 }
 
-void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap)
-{
-       int i;
-
-       if (cap_isclear(*cap)) {
-               audit_log_format(ab, " %s=0", prefix);
-               return;
-       }
-       audit_log_format(ab, " %s=", prefix);
-       CAP_FOR_EACH_U32(i)
-               audit_log_format(ab, "%08x", cap->cap[CAP_LAST_U32 - i]);
-}
-
-static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
-{
-       audit_log_cap(ab, "cap_fp", &name->fcap.permitted);
-       audit_log_cap(ab, "cap_fi", &name->fcap.inheritable);
-       audit_log_format(ab, " cap_fe=%d cap_fver=%x",
-                        name->fcap.fE, name->fcap_ver);
-}
-
-static inline int audit_copy_fcaps(struct audit_names *name,
-                                  const struct dentry *dentry)
-{
-       struct cpu_vfs_cap_data caps;
-       int rc;
-
-       if (!dentry)
-               return 0;
-
-       rc = get_vfs_caps_from_disk(dentry, &caps);
-       if (rc)
-               return rc;
-
-       name->fcap.permitted = caps.permitted;
-       name->fcap.inheritable = caps.inheritable;
-       name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
-       name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >>
-                               VFS_CAP_REVISION_SHIFT;
-
-       return 0;
-}
-
-/* Copy inode data into an audit_names. */
-void audit_copy_inode(struct audit_names *name, const struct dentry *dentry,
-                     struct inode *inode)
-{
-       name->ino   = inode->i_ino;
-       name->dev   = inode->i_sb->s_dev;
-       name->mode  = inode->i_mode;
-       name->uid   = inode->i_uid;
-       name->gid   = inode->i_gid;
-       name->rdev  = inode->i_rdev;
-       security_inode_getsecid(inode, &name->osid);
-       audit_copy_fcaps(name, dentry);
-}
-
-/**
- * audit_log_name - produce AUDIT_PATH record from struct audit_names
- * @context: audit_context for the task
- * @n: audit_names structure with reportable details
- * @path: optional path to report instead of audit_names->name
- * @record_num: record number to report when handling a list of names
- * @call_panic: optional pointer to int that will be updated if secid fails
- */
-void audit_log_name(struct audit_context *context, struct audit_names *n,
-                   const struct path *path, int record_num, int *call_panic)
-{
-       struct audit_buffer *ab;
-       ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
-       if (!ab)
-               return;
-
-       audit_log_format(ab, "item=%d", record_num);
-
-       if (path)
-               audit_log_d_path(ab, " name=", path);
-       else if (n->name) {
-               switch (n->name_len) {
-               case AUDIT_NAME_FULL:
-                       /* log the full path */
-                       audit_log_format(ab, " name=");
-                       audit_log_untrustedstring(ab, n->name->name);
-                       break;
-               case 0:
-                       /* name was specified as a relative path and the
-                        * directory component is the cwd */
-                       audit_log_d_path(ab, " name=", &context->pwd);
-                       break;
-               default:
-                       /* log the name's directory component */
-                       audit_log_format(ab, " name=");
-                       audit_log_n_untrustedstring(ab, n->name->name,
-                                                   n->name_len);
-               }
-       } else
-               audit_log_format(ab, " name=(null)");
-
-       if (n->ino != AUDIT_INO_UNSET)
-               audit_log_format(ab, " inode=%lu"
-                                " dev=%02x:%02x mode=%#ho"
-                                " ouid=%u ogid=%u rdev=%02x:%02x",
-                                n->ino,
-                                MAJOR(n->dev),
-                                MINOR(n->dev),
-                                n->mode,
-                                from_kuid(&init_user_ns, n->uid),
-                                from_kgid(&init_user_ns, n->gid),
-                                MAJOR(n->rdev),
-                                MINOR(n->rdev));
-       if (n->osid != 0) {
-               char *ctx = NULL;
-               u32 len;
-               if (security_secid_to_secctx(
-                       n->osid, &ctx, &len)) {
-                       audit_log_format(ab, " osid=%u", n->osid);
-                       if (call_panic)
-                               *call_panic = 2;
-               } else {
-                       audit_log_format(ab, " obj=%s", ctx);
-                       security_release_secctx(ctx, len);
-               }
-       }
-
-       /* log the audit_names record type */
-       switch(n->type) {
-       case AUDIT_TYPE_NORMAL:
-               audit_log_format(ab, " nametype=NORMAL");
-               break;
-       case AUDIT_TYPE_PARENT:
-               audit_log_format(ab, " nametype=PARENT");
-               break;
-       case AUDIT_TYPE_CHILD_DELETE:
-               audit_log_format(ab, " nametype=DELETE");
-               break;
-       case AUDIT_TYPE_CHILD_CREATE:
-               audit_log_format(ab, " nametype=CREATE");
-               break;
-       default:
-               audit_log_format(ab, " nametype=UNKNOWN");
-               break;
-       }
-
-       audit_log_fcaps(ab, n);
-       audit_log_end(ab);
-}
-
 int audit_log_task_context(struct audit_buffer *ab)
 {
        char *ctx = NULL;
@@ -2322,6 +2188,91 @@ void audit_log_link_denied(const char *operation)
        audit_log_end(ab);
 }
 
+/* global counter which is incremented every time something logs in */
+static atomic_t session_id = ATOMIC_INIT(0);
+
+static int audit_set_loginuid_perm(kuid_t loginuid)
+{
+       /* if we are unset, we don't need privs */
+       if (!audit_loginuid_set(current))
+               return 0;
+       /* if AUDIT_FEATURE_LOGINUID_IMMUTABLE means never ever allow a change*/
+       if (is_audit_feature_set(AUDIT_FEATURE_LOGINUID_IMMUTABLE))
+               return -EPERM;
+       /* it is set, you need permission */
+       if (!capable(CAP_AUDIT_CONTROL))
+               return -EPERM;
+       /* reject if this is not an unset and we don't allow that */
+       if (is_audit_feature_set(AUDIT_FEATURE_ONLY_UNSET_LOGINUID)
+                                && uid_valid(loginuid))
+               return -EPERM;
+       return 0;
+}
+
+static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
+                                  unsigned int oldsessionid,
+                                  unsigned int sessionid, int rc)
+{
+       struct audit_buffer *ab;
+       uid_t uid, oldloginuid, loginuid;
+       struct tty_struct *tty;
+
+       if (!audit_enabled)
+               return;
+
+       ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
+       if (!ab)
+               return;
+
+       uid = from_kuid(&init_user_ns, task_uid(current));
+       oldloginuid = from_kuid(&init_user_ns, koldloginuid);
+       loginuid = from_kuid(&init_user_ns, kloginuid),
+       tty = audit_get_tty();
+
+       audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid);
+       audit_log_task_context(ab);
+       audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d",
+                        oldloginuid, loginuid, tty ? tty_name(tty) : "(none)",
+                        oldsessionid, sessionid, !rc);
+       audit_put_tty(tty);
+       audit_log_end(ab);
+}
+
+/**
+ * audit_set_loginuid - set current task's loginuid
+ * @loginuid: loginuid value
+ *
+ * Returns 0.
+ *
+ * Called (set) from fs/proc/base.c::proc_loginuid_write().
+ */
+int audit_set_loginuid(kuid_t loginuid)
+{
+       unsigned int oldsessionid, sessionid = AUDIT_SID_UNSET;
+       kuid_t oldloginuid;
+       int rc;
+
+       oldloginuid = audit_get_loginuid(current);
+       oldsessionid = audit_get_sessionid(current);
+
+       rc = audit_set_loginuid_perm(loginuid);
+       if (rc)
+               goto out;
+
+       /* are we setting or clearing? */
+       if (uid_valid(loginuid)) {
+               sessionid = (unsigned int)atomic_inc_return(&session_id);
+               if (unlikely(sessionid == AUDIT_SID_UNSET))
+                       sessionid = (unsigned int)atomic_inc_return(&session_id);
+       }
+
+       current->sessionid = sessionid;
+       current->loginuid = loginuid;
+out:
+       audit_log_set_loginuid(oldloginuid, loginuid, oldsessionid, sessionid, rc);
+       return rc;
+}
+
 /**
  * audit_log_end - end one audit record
  * @ab: the audit_buffer
index 91421679a16889d75706d08dc7367ed5eee8812f..958d5b8fc1b3c8e2c43affb346b2e89e8a188f5c 100644 (file)
@@ -69,6 +69,7 @@ struct audit_cap_data {
                kernel_cap_t    effective;      /* effective set of process */
        };
        kernel_cap_t            ambient;
+       kuid_t                  rootid;
 };
 
 /* When fs/namei.c:getname() is called, we store the pointer in name and bump
@@ -212,15 +213,6 @@ extern bool audit_ever_enabled;
 
 extern void audit_log_session_info(struct audit_buffer *ab);
 
-extern void audit_copy_inode(struct audit_names *name,
-                            const struct dentry *dentry,
-                            struct inode *inode);
-extern void audit_log_cap(struct audit_buffer *ab, char *prefix,
-                         kernel_cap_t *cap);
-extern void audit_log_name(struct audit_context *context,
-                          struct audit_names *n, const struct path *path,
-                          int record_num, int *call_panic);
-
 extern int auditd_test_task(struct task_struct *task);
 
 #define AUDIT_INODE_BUCKETS    32
@@ -267,25 +259,52 @@ extern void audit_log_d_path_exe(struct audit_buffer *ab,
 extern struct tty_struct *audit_get_tty(void);
 extern void audit_put_tty(struct tty_struct *tty);
 
-/* audit watch functions */
+/* audit watch/mark/tree functions */
 #ifdef CONFIG_AUDITSYSCALL
+extern unsigned int audit_serial(void);
+extern int auditsc_get_stamp(struct audit_context *ctx,
+                             struct timespec64 *t, unsigned int *serial);
+
 extern void audit_put_watch(struct audit_watch *watch);
 extern void audit_get_watch(struct audit_watch *watch);
-extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op);
+extern int audit_to_watch(struct audit_krule *krule, char *path, int len,
+                         u32 op);
 extern int audit_add_watch(struct audit_krule *krule, struct list_head **list);
 extern void audit_remove_watch_rule(struct audit_krule *krule);
 extern char *audit_watch_path(struct audit_watch *watch);
-extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev);
+extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino,
+                              dev_t dev);
 
-extern struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pathname, int len);
+extern struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule,
+                                                   char *pathname, int len);
 extern char *audit_mark_path(struct audit_fsnotify_mark *mark);
 extern void audit_remove_mark(struct audit_fsnotify_mark *audit_mark);
 extern void audit_remove_mark_rule(struct audit_krule *krule);
-extern int audit_mark_compare(struct audit_fsnotify_mark *mark, unsigned long ino, dev_t dev);
+extern int audit_mark_compare(struct audit_fsnotify_mark *mark,
+                             unsigned long ino, dev_t dev);
 extern int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old);
-extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark);
+extern int audit_exe_compare(struct task_struct *tsk,
+                            struct audit_fsnotify_mark *mark);
+
+extern struct audit_chunk *audit_tree_lookup(const struct inode *inode);
+extern void audit_put_chunk(struct audit_chunk *chunk);
+extern bool audit_tree_match(struct audit_chunk *chunk,
+                            struct audit_tree *tree);
+extern int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op);
+extern int audit_add_tree_rule(struct audit_krule *rule);
+extern int audit_remove_tree_rule(struct audit_krule *rule);
+extern void audit_trim_trees(void);
+extern int audit_tag_tree(char *old, char *new);
+extern const char *audit_tree_path(struct audit_tree *tree);
+extern void audit_put_tree(struct audit_tree *tree);
+extern void audit_kill_trees(struct audit_context *context);
 
-#else
+extern int audit_signal_info(int sig, struct task_struct *t);
+extern void audit_filter_inodes(struct task_struct *tsk,
+                               struct audit_context *ctx);
+extern struct list_head *audit_killed_trees(void);
+#else /* CONFIG_AUDITSYSCALL */
+#define auditsc_get_stamp(c, t, s) 0
 #define audit_put_watch(w) {}
 #define audit_get_watch(w) {}
 #define audit_to_watch(k, p, l, o) (-EINVAL)
@@ -301,21 +320,7 @@ extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark
 #define audit_mark_compare(m, i, d) 0
 #define audit_exe_compare(t, m) (-EINVAL)
 #define audit_dupe_exe(n, o) (-EINVAL)
-#endif /* CONFIG_AUDITSYSCALL */
 
-#ifdef CONFIG_AUDITSYSCALL
-extern struct audit_chunk *audit_tree_lookup(const struct inode *inode);
-extern void audit_put_chunk(struct audit_chunk *chunk);
-extern bool audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree);
-extern int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op);
-extern int audit_add_tree_rule(struct audit_krule *rule);
-extern int audit_remove_tree_rule(struct audit_krule *rule);
-extern void audit_trim_trees(void);
-extern int audit_tag_tree(char *old, char *new);
-extern const char *audit_tree_path(struct audit_tree *tree);
-extern void audit_put_tree(struct audit_tree *tree);
-extern void audit_kill_trees(struct list_head *list);
-#else
 #define audit_remove_tree_rule(rule) BUG()
 #define audit_add_tree_rule(rule) -EINVAL
 #define audit_make_tree(rule, str, op) -EINVAL
@@ -323,8 +328,11 @@ extern void audit_kill_trees(struct list_head *list);
 #define audit_put_tree(tree) (void)0
 #define audit_tag_tree(old, new) -EINVAL
 #define audit_tree_path(rule) ""       /* never called */
-#define audit_kill_trees(list) BUG()
-#endif
+#define audit_kill_trees(context) BUG()
+
+#define audit_signal_info(s, t) AUDIT_DISABLED
+#define audit_filter_inodes(t, c) AUDIT_DISABLED
+#endif /* CONFIG_AUDITSYSCALL */
 
 extern char *audit_unpack_string(void **bufp, size_t *remain, size_t len);
 
@@ -334,14 +342,5 @@ extern u32 audit_sig_sid;
 
 extern int audit_filter(int msgtype, unsigned int listtype);
 
-#ifdef CONFIG_AUDITSYSCALL
-extern int audit_signal_info(int sig, struct task_struct *t);
-extern void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx);
-extern struct list_head *audit_killed_trees(void);
-#else
-#define audit_signal_info(s,t) AUDIT_DISABLED
-#define audit_filter_inodes(t,c) AUDIT_DISABLED
-#endif
-
 extern void audit_ctl_lock(void);
 extern void audit_ctl_unlock(void);
index cf4512a33675834b0389519b69db6d3c7d6ed0ad..37ae95cfb7f45aa8ce2a8eb92cb15f15dc4eddc6 100644 (file)
@@ -127,7 +127,7 @@ static void audit_mark_log_rule_change(struct audit_fsnotify_mark *audit_mark, c
 
        if (!audit_enabled)
                return;
-       ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
+       ab = audit_log_start(audit_context(), GFP_NOFS, AUDIT_CONFIG_CHANGE);
        if (unlikely(!ab))
                return;
        audit_log_session_info(ab);
index d4af4d97f847a84b8303188ec66c7299a56af2e4..abfb112f26aa41e8efe8fe1824bd1f7785057999 100644 (file)
@@ -524,13 +524,14 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
        return 0;
 }
 
-static void audit_tree_log_remove_rule(struct audit_krule *rule)
+static void audit_tree_log_remove_rule(struct audit_context *context,
+                                      struct audit_krule *rule)
 {
        struct audit_buffer *ab;
 
        if (!audit_enabled)
                return;
-       ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
+       ab = audit_log_start(context, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
        if (unlikely(!ab))
                return;
        audit_log_format(ab, "op=remove_rule dir=");
@@ -540,7 +541,7 @@ static void audit_tree_log_remove_rule(struct audit_krule *rule)
        audit_log_end(ab);
 }
 
-static void kill_rules(struct audit_tree *tree)
+static void kill_rules(struct audit_context *context, struct audit_tree *tree)
 {
        struct audit_krule *rule, *next;
        struct audit_entry *entry;
@@ -551,7 +552,7 @@ static void kill_rules(struct audit_tree *tree)
                list_del_init(&rule->rlist);
                if (rule->tree) {
                        /* not a half-baked one */
-                       audit_tree_log_remove_rule(rule);
+                       audit_tree_log_remove_rule(context, rule);
                        if (entry->rule.exe)
                                audit_remove_mark(entry->rule.exe);
                        rule->tree = NULL;
@@ -633,7 +634,7 @@ static void trim_marked(struct audit_tree *tree)
                tree->goner = 1;
                spin_unlock(&hash_lock);
                mutex_lock(&audit_filter_mutex);
-               kill_rules(tree);
+               kill_rules(audit_context(), tree);
                list_del_init(&tree->list);
                mutex_unlock(&audit_filter_mutex);
                prune_one(tree);
@@ -973,8 +974,10 @@ static void audit_schedule_prune(void)
  * ... and that one is done if evict_chunk() decides to delay until the end
  * of syscall.  Runs synchronously.
  */
-void audit_kill_trees(struct list_head *list)
+void audit_kill_trees(struct audit_context *context)
 {
+       struct list_head *list = &context->killed_trees;
+
        audit_ctl_lock();
        mutex_lock(&audit_filter_mutex);
 
@@ -982,7 +985,7 @@ void audit_kill_trees(struct list_head *list)
                struct audit_tree *victim;
 
                victim = list_entry(list->next, struct audit_tree, list);
-               kill_rules(victim);
+               kill_rules(context, victim);
                list_del_init(&victim->list);
 
                mutex_unlock(&audit_filter_mutex);
@@ -1017,7 +1020,7 @@ static void evict_chunk(struct audit_chunk *chunk)
                list_del_init(&owner->same_root);
                spin_unlock(&hash_lock);
                if (!postponed) {
-                       kill_rules(owner);
+                       kill_rules(audit_context(), owner);
                        list_move(&owner->list, &prune_list);
                        need_prune = 1;
                } else {
index 20ef9ba134b0e7402e2fdca5639f66ce9a1c8359..e8d1adeb22230b1df4a33762fb17f0f7f014668e 100644 (file)
@@ -242,7 +242,7 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc
 
        if (!audit_enabled)
                return;
-       ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
+       ab = audit_log_start(audit_context(), GFP_NOFS, AUDIT_CONFIG_CHANGE);
        if (!ab)
                return;
        audit_log_session_info(ab);
index bf309f2592c461a93dc74423e9d94ff248408e85..63f8b3f26fab452b171fd5ba662d3e1b801999c3 100644 (file)
@@ -670,7 +670,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
                                data->values[i] = AUDIT_UID_UNSET;
                                break;
                        }
-                       /* fallthrough if set */
+                       /* fall through - if set */
                default:
                        data->values[i] = f->val;
                }
@@ -1091,7 +1091,7 @@ static void audit_log_rule_change(char *action, struct audit_krule *rule, int re
        if (!audit_enabled)
                return;
 
-       ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
+       ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_CONFIG_CHANGE);
        if (!ab)
                return;
        audit_log_session_info(ab);
@@ -1355,7 +1355,7 @@ int audit_filter(int msgtype, unsigned int listtype)
                                if (f->lsm_rule) {
                                        security_task_getsecid(current, &sid);
                                        result = security_audit_rule_match(sid,
-                                                       f->type, f->op, f->lsm_rule, NULL);
+                                                  f->type, f->op, f->lsm_rule);
                                }
                                break;
                        case AUDIT_EXE:
index 6593a5207fb03f28a24bf1dfd6a52bcc77412024..d1eab1d4a930e6f509fcddbf10ede58d1a94db6f 100644 (file)
@@ -631,9 +631,8 @@ static int audit_filter_rules(struct task_struct *tsk,
                                        need_sid = 0;
                                }
                                result = security_audit_rule_match(sid, f->type,
-                                                                 f->op,
-                                                                 f->lsm_rule,
-                                                                 ctx);
+                                                                  f->op,
+                                                                  f->lsm_rule);
                        }
                        break;
                case AUDIT_OBJ_USER:
@@ -647,13 +646,17 @@ static int audit_filter_rules(struct task_struct *tsk,
                                /* Find files that match */
                                if (name) {
                                        result = security_audit_rule_match(
-                                                  name->osid, f->type, f->op,
-                                                  f->lsm_rule, ctx);
+                                                               name->osid,
+                                                               f->type,
+                                                               f->op,
+                                                               f->lsm_rule);
                                } else if (ctx) {
                                        list_for_each_entry(n, &ctx->names_list, list) {
-                                               if (security_audit_rule_match(n->osid, f->type,
-                                                                             f->op, f->lsm_rule,
-                                                                             ctx)) {
+                                               if (security_audit_rule_match(
+                                                               n->osid,
+                                                               f->type,
+                                                               f->op,
+                                                               f->lsm_rule)) {
                                                        ++result;
                                                        break;
                                                }
@@ -664,7 +667,7 @@ static int audit_filter_rules(struct task_struct *tsk,
                                        break;
                                if (security_audit_rule_match(ctx->ipc.osid,
                                                              f->type, f->op,
-                                                             f->lsm_rule, ctx))
+                                                             f->lsm_rule))
                                        ++result;
                        }
                        break;
@@ -1136,6 +1139,32 @@ out:
        kfree(buf_head);
 }
 
+void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap)
+{
+       int i;
+
+       if (cap_isclear(*cap)) {
+               audit_log_format(ab, " %s=0", prefix);
+               return;
+       }
+       audit_log_format(ab, " %s=", prefix);
+       CAP_FOR_EACH_U32(i)
+               audit_log_format(ab, "%08x", cap->cap[CAP_LAST_U32 - i]);
+}
+
+static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
+{
+       if (name->fcap_ver == -1) {
+               audit_log_format(ab, " cap_fe=? cap_fver=? cap_fp=? cap_fi=?");
+               return;
+       }
+       audit_log_cap(ab, "cap_fp", &name->fcap.permitted);
+       audit_log_cap(ab, "cap_fi", &name->fcap.inheritable);
+       audit_log_format(ab, " cap_fe=%d cap_fver=%x cap_frootid=%d",
+                        name->fcap.fE, name->fcap_ver,
+                        from_kuid(&init_user_ns, name->fcap.rootid));
+}
+
 static void show_special(struct audit_context *context, int *call_panic)
 {
        struct audit_buffer *ab;
@@ -1258,6 +1287,97 @@ static inline int audit_proctitle_rtrim(char *proctitle, int len)
        return len;
 }
 
+/*
+ * audit_log_name - produce AUDIT_PATH record from struct audit_names
+ * @context: audit_context for the task
+ * @n: audit_names structure with reportable details
+ * @path: optional path to report instead of audit_names->name
+ * @record_num: record number to report when handling a list of names
+ * @call_panic: optional pointer to int that will be updated if secid fails
+ */
+static void audit_log_name(struct audit_context *context, struct audit_names *n,
+                   const struct path *path, int record_num, int *call_panic)
+{
+       struct audit_buffer *ab;
+
+       ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
+       if (!ab)
+               return;
+
+       audit_log_format(ab, "item=%d", record_num);
+
+       if (path)
+               audit_log_d_path(ab, " name=", path);
+       else if (n->name) {
+               switch (n->name_len) {
+               case AUDIT_NAME_FULL:
+                       /* log the full path */
+                       audit_log_format(ab, " name=");
+                       audit_log_untrustedstring(ab, n->name->name);
+                       break;
+               case 0:
+                       /* name was specified as a relative path and the
+                        * directory component is the cwd
+                        */
+                       audit_log_d_path(ab, " name=", &context->pwd);
+                       break;
+               default:
+                       /* log the name's directory component */
+                       audit_log_format(ab, " name=");
+                       audit_log_n_untrustedstring(ab, n->name->name,
+                                                   n->name_len);
+               }
+       } else
+               audit_log_format(ab, " name=(null)");
+
+       if (n->ino != AUDIT_INO_UNSET)
+               audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#ho ouid=%u ogid=%u rdev=%02x:%02x",
+                                n->ino,
+                                MAJOR(n->dev),
+                                MINOR(n->dev),
+                                n->mode,
+                                from_kuid(&init_user_ns, n->uid),
+                                from_kgid(&init_user_ns, n->gid),
+                                MAJOR(n->rdev),
+                                MINOR(n->rdev));
+       if (n->osid != 0) {
+               char *ctx = NULL;
+               u32 len;
+
+               if (security_secid_to_secctx(
+                       n->osid, &ctx, &len)) {
+                       audit_log_format(ab, " osid=%u", n->osid);
+                       if (call_panic)
+                               *call_panic = 2;
+               } else {
+                       audit_log_format(ab, " obj=%s", ctx);
+                       security_release_secctx(ctx, len);
+               }
+       }
+
+       /* log the audit_names record type */
+       switch (n->type) {
+       case AUDIT_TYPE_NORMAL:
+               audit_log_format(ab, " nametype=NORMAL");
+               break;
+       case AUDIT_TYPE_PARENT:
+               audit_log_format(ab, " nametype=PARENT");
+               break;
+       case AUDIT_TYPE_CHILD_DELETE:
+               audit_log_format(ab, " nametype=DELETE");
+               break;
+       case AUDIT_TYPE_CHILD_CREATE:
+               audit_log_format(ab, " nametype=CREATE");
+               break;
+       default:
+               audit_log_format(ab, " nametype=UNKNOWN");
+               break;
+       }
+
+       audit_log_fcaps(ab, n);
+       audit_log_end(ab);
+}
+
 static void audit_log_proctitle(void)
 {
        int res;
@@ -1358,6 +1478,9 @@ static void audit_log_exit(void)
                        audit_log_cap(ab, "pi", &axs->new_pcap.inheritable);
                        audit_log_cap(ab, "pe", &axs->new_pcap.effective);
                        audit_log_cap(ab, "pa", &axs->new_pcap.ambient);
+                       audit_log_format(ab, " frootid=%d",
+                                        from_kuid(&init_user_ns,
+                                                  axs->fcap.rootid));
                        break; }
 
                }
@@ -1444,6 +1567,9 @@ void __audit_free(struct task_struct *tsk)
        if (!context)
                return;
 
+       if (!list_empty(&context->killed_trees))
+               audit_kill_trees(context);
+
        /* We are called either by do_exit() or the fork() error handling code;
         * in the former case tsk == current and in the latter tsk is a
         * random task_struct that doesn't doesn't have any meaningful data we
@@ -1460,9 +1586,6 @@ void __audit_free(struct task_struct *tsk)
                        audit_log_exit();
        }
 
-       if (!list_empty(&context->killed_trees))
-               audit_kill_trees(&context->killed_trees);
-
        audit_set_context(tsk, NULL);
        audit_free_context(context);
 }
@@ -1537,6 +1660,9 @@ void __audit_syscall_exit(int success, long return_code)
        if (!context)
                return;
 
+       if (!list_empty(&context->killed_trees))
+               audit_kill_trees(context);
+
        if (!context->dummy && context->in_syscall) {
                if (success)
                        context->return_valid = AUDITSC_SUCCESS;
@@ -1571,9 +1697,6 @@ void __audit_syscall_exit(int success, long return_code)
        context->in_syscall = 0;
        context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
 
-       if (!list_empty(&context->killed_trees))
-               audit_kill_trees(&context->killed_trees);
-
        audit_free_names(context);
        unroll_tree_refs(context, NULL, 0);
        audit_free_aux(context);
@@ -1750,6 +1873,47 @@ void __audit_getname(struct filename *name)
                get_fs_pwd(current->fs, &context->pwd);
 }
 
+static inline int audit_copy_fcaps(struct audit_names *name,
+                                  const struct dentry *dentry)
+{
+       struct cpu_vfs_cap_data caps;
+       int rc;
+
+       if (!dentry)
+               return 0;
+
+       rc = get_vfs_caps_from_disk(dentry, &caps);
+       if (rc)
+               return rc;
+
+       name->fcap.permitted = caps.permitted;
+       name->fcap.inheritable = caps.inheritable;
+       name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
+       name->fcap.rootid = caps.rootid;
+       name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >>
+                               VFS_CAP_REVISION_SHIFT;
+
+       return 0;
+}
+
+/* Copy inode data into an audit_names. */
+void audit_copy_inode(struct audit_names *name, const struct dentry *dentry,
+                     struct inode *inode, unsigned int flags)
+{
+       name->ino   = inode->i_ino;
+       name->dev   = inode->i_sb->s_dev;
+       name->mode  = inode->i_mode;
+       name->uid   = inode->i_uid;
+       name->gid   = inode->i_gid;
+       name->rdev  = inode->i_rdev;
+       security_inode_getsecid(inode, &name->osid);
+       if (flags & AUDIT_INODE_NOEVAL) {
+               name->fcap_ver = -1;
+               return;
+       }
+       audit_copy_fcaps(name, dentry);
+}
+
 /**
  * __audit_inode - store the inode and device from a lookup
  * @name: name being audited
@@ -1763,10 +1927,31 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
        struct inode *inode = d_backing_inode(dentry);
        struct audit_names *n;
        bool parent = flags & AUDIT_INODE_PARENT;
+       struct audit_entry *e;
+       struct list_head *list = &audit_filter_list[AUDIT_FILTER_FS];
+       int i;
 
        if (!context->in_syscall)
                return;
 
+       rcu_read_lock();
+       if (!list_empty(list)) {
+               list_for_each_entry_rcu(e, list, list) {
+                       for (i = 0; i < e->rule.field_count; i++) {
+                               struct audit_field *f = &e->rule.fields[i];
+
+                               if (f->type == AUDIT_FSTYPE
+                                   && audit_comparator(inode->i_sb->s_magic,
+                                                       f->op, f->val)
+                                   && e->rule.action == AUDIT_NEVER) {
+                                       rcu_read_unlock();
+                                       return;
+                               }
+                       }
+               }
+       }
+       rcu_read_unlock();
+
        if (!name)
                goto out_alloc;
 
@@ -1832,7 +2017,7 @@ out:
                n->type = AUDIT_TYPE_NORMAL;
        }
        handle_path(dentry);
-       audit_copy_inode(n, dentry, inode);
+       audit_copy_inode(n, dentry, inode, flags & AUDIT_INODE_NOEVAL);
 }
 
 void __audit_file(const struct file *file)
@@ -1875,14 +2060,12 @@ void __audit_inode_child(struct inode *parent,
                        for (i = 0; i < e->rule.field_count; i++) {
                                struct audit_field *f = &e->rule.fields[i];
 
-                               if (f->type == AUDIT_FSTYPE) {
-                                       if (audit_comparator(parent->i_sb->s_magic,
-                                           f->op, f->val)) {
-                                               if (e->rule.action == AUDIT_NEVER) {
-                                                       rcu_read_unlock();
-                                                       return;
-                                               }
-                                       }
+                               if (f->type == AUDIT_FSTYPE
+                                   && audit_comparator(parent->i_sb->s_magic,
+                                                       f->op, f->val)
+                                   && e->rule.action == AUDIT_NEVER) {
+                                       rcu_read_unlock();
+                                       return;
                                }
                        }
                }
@@ -1933,7 +2116,7 @@ void __audit_inode_child(struct inode *parent,
                n = audit_alloc_name(context, AUDIT_TYPE_PARENT);
                if (!n)
                        return;
-               audit_copy_inode(n, NULL, parent);
+               audit_copy_inode(n, NULL, parent, 0);
        }
 
        if (!found_child) {
@@ -1952,7 +2135,7 @@ void __audit_inode_child(struct inode *parent,
        }
 
        if (inode)
-               audit_copy_inode(found_child, dentry, inode);
+               audit_copy_inode(found_child, dentry, inode, 0);
        else
                found_child->ino = AUDIT_INO_UNSET;
 }
@@ -1983,90 +2166,6 @@ int auditsc_get_stamp(struct audit_context *ctx,
        return 1;
 }
 
-/* global counter which is incremented every time something logs in */
-static atomic_t session_id = ATOMIC_INIT(0);
-
-static int audit_set_loginuid_perm(kuid_t loginuid)
-{
-       /* if we are unset, we don't need privs */
-       if (!audit_loginuid_set(current))
-               return 0;
-       /* if AUDIT_FEATURE_LOGINUID_IMMUTABLE means never ever allow a change*/
-       if (is_audit_feature_set(AUDIT_FEATURE_LOGINUID_IMMUTABLE))
-               return -EPERM;
-       /* it is set, you need permission */
-       if (!capable(CAP_AUDIT_CONTROL))
-               return -EPERM;
-       /* reject if this is not an unset and we don't allow that */
-       if (is_audit_feature_set(AUDIT_FEATURE_ONLY_UNSET_LOGINUID) && uid_valid(loginuid))
-               return -EPERM;
-       return 0;
-}
-
-static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
-                                  unsigned int oldsessionid, unsigned int sessionid,
-                                  int rc)
-{
-       struct audit_buffer *ab;
-       uid_t uid, oldloginuid, loginuid;
-       struct tty_struct *tty;
-
-       if (!audit_enabled)
-               return;
-
-       ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
-       if (!ab)
-               return;
-
-       uid = from_kuid(&init_user_ns, task_uid(current));
-       oldloginuid = from_kuid(&init_user_ns, koldloginuid);
-       loginuid = from_kuid(&init_user_ns, kloginuid),
-       tty = audit_get_tty();
-
-       audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid);
-       audit_log_task_context(ab);
-       audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d",
-                        oldloginuid, loginuid, tty ? tty_name(tty) : "(none)",
-                        oldsessionid, sessionid, !rc);
-       audit_put_tty(tty);
-       audit_log_end(ab);
-}
-
-/**
- * audit_set_loginuid - set current task's audit_context loginuid
- * @loginuid: loginuid value
- *
- * Returns 0.
- *
- * Called (set) from fs/proc/base.c::proc_loginuid_write().
- */
-int audit_set_loginuid(kuid_t loginuid)
-{
-       unsigned int oldsessionid, sessionid = AUDIT_SID_UNSET;
-       kuid_t oldloginuid;
-       int rc;
-
-       oldloginuid = audit_get_loginuid(current);
-       oldsessionid = audit_get_sessionid(current);
-
-       rc = audit_set_loginuid_perm(loginuid);
-       if (rc)
-               goto out;
-
-       /* are we setting or clearing? */
-       if (uid_valid(loginuid)) {
-               sessionid = (unsigned int)atomic_inc_return(&session_id);
-               if (unlikely(sessionid == AUDIT_SID_UNSET))
-                       sessionid = (unsigned int)atomic_inc_return(&session_id);
-       }
-
-       current->sessionid = sessionid;
-       current->loginuid = loginuid;
-out:
-       audit_log_set_loginuid(oldloginuid, loginuid, oldsessionid, sessionid, rc);
-       return rc;
-}
-
 /**
  * __audit_mq_open - record audit data for a POSIX MQ open
  * @oflag: open flag
@@ -2355,6 +2454,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
        ax->fcap.permitted = vcaps.permitted;
        ax->fcap.inheritable = vcaps.inheritable;
        ax->fcap.fE = !!(vcaps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
+       ax->fcap.rootid = vcaps.rootid;
        ax->fcap_ver = (vcaps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT;
 
        ax->old_pcap.permitted   = old->cap_permitted;
index 1e1c0236f55b5b05b54caa49dca62c1b7e2d5819..1444f3954d750ba685b9423e94522e0243175f90 100644 (file)
@@ -93,9 +93,7 @@ static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy)
                break;
        case _LINUX_CAPABILITY_VERSION_2:
                warn_deprecated_v2();
-               /*
-                * fall through - v3 is otherwise equivalent to v2.
-                */
+               /* fall through - v3 is otherwise equivalent to v2. */
        case _LINUX_CAPABILITY_VERSION_3:
                *tocopy = _LINUX_CAPABILITY_U32S_3;
                break;
@@ -299,7 +297,7 @@ bool has_ns_capability(struct task_struct *t,
        int ret;
 
        rcu_read_lock();
-       ret = security_capable(__task_cred(t), ns, cap);
+       ret = security_capable(__task_cred(t), ns, cap, CAP_OPT_NONE);
        rcu_read_unlock();
 
        return (ret == 0);
@@ -340,7 +338,7 @@ bool has_ns_capability_noaudit(struct task_struct *t,
        int ret;
 
        rcu_read_lock();
-       ret = security_capable_noaudit(__task_cred(t), ns, cap);
+       ret = security_capable(__task_cred(t), ns, cap, CAP_OPT_NOAUDIT);
        rcu_read_unlock();
 
        return (ret == 0);
@@ -363,7 +361,9 @@ bool has_capability_noaudit(struct task_struct *t, int cap)
        return has_ns_capability_noaudit(t, &init_user_ns, cap);
 }
 
-static bool ns_capable_common(struct user_namespace *ns, int cap, bool audit)
+static bool ns_capable_common(struct user_namespace *ns,
+                             int cap,
+                             unsigned int opts)
 {
        int capable;
 
@@ -372,8 +372,7 @@ static bool ns_capable_common(struct user_namespace *ns, int cap, bool audit)
                BUG();
        }
 
-       capable = audit ? security_capable(current_cred(), ns, cap) :
-                         security_capable_noaudit(current_cred(), ns, cap);
+       capable = security_capable(current_cred(), ns, cap, opts);
        if (capable == 0) {
                current->flags |= PF_SUPERPRIV;
                return true;
@@ -394,7 +393,7 @@ static bool ns_capable_common(struct user_namespace *ns, int cap, bool audit)
  */
 bool ns_capable(struct user_namespace *ns, int cap)
 {
-       return ns_capable_common(ns, cap, true);
+       return ns_capable_common(ns, cap, CAP_OPT_NONE);
 }
 EXPORT_SYMBOL(ns_capable);
 
@@ -412,10 +411,29 @@ EXPORT_SYMBOL(ns_capable);
  */
 bool ns_capable_noaudit(struct user_namespace *ns, int cap)
 {
-       return ns_capable_common(ns, cap, false);
+       return ns_capable_common(ns, cap, CAP_OPT_NOAUDIT);
 }
 EXPORT_SYMBOL(ns_capable_noaudit);
 
+/**
+ * ns_capable_setid - Determine if the current task has a superior capability
+ * in effect, while signalling that this check is being done from within a
+ * setid syscall.
+ * @ns:  The usernamespace we want the capability in
+ * @cap: The capability to be tested for
+ *
+ * Return true if the current task has the given superior capability currently
+ * available for use, false if not.
+ *
+ * This sets PF_SUPERPRIV on the task if the capability is available on the
+ * assumption that it's about to be used.
+ */
+bool ns_capable_setid(struct user_namespace *ns, int cap)
+{
+       return ns_capable_common(ns, cap, CAP_OPT_INSETID);
+}
+EXPORT_SYMBOL(ns_capable_setid);
+
 /**
  * capable - Determine if the current task has a superior capability in effect
  * @cap: The capability to be tested for
@@ -448,10 +466,11 @@ EXPORT_SYMBOL(capable);
 bool file_ns_capable(const struct file *file, struct user_namespace *ns,
                     int cap)
 {
+
        if (WARN_ON_ONCE(!cap_valid(cap)))
                return false;
 
-       if (security_capable(file->f_cred, ns, cap) == 0)
+       if (security_capable(file->f_cred, ns, cap, CAP_OPT_NONE) == 0)
                return true;
 
        return false;
@@ -500,10 +519,12 @@ bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns)
 {
        int ret = 0;  /* An absent tracer adds no restrictions */
        const struct cred *cred;
+
        rcu_read_lock();
        cred = rcu_dereference(tsk->ptracer_cred);
        if (cred)
-               ret = security_capable_noaudit(cred, ns, CAP_SYS_PTRACE);
+               ret = security_capable(cred, ns, CAP_SYS_PTRACE,
+                                      CAP_OPT_NOAUDIT);
        rcu_read_unlock();
        return (ret == 0);
 }
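
With the boolean audit argument folded into an opts word, extra context can now be passed down to security_capable(); CAP_OPT_INSETID, together with the new ns_capable_setid() wrapper, lets a security module tell capability checks made on behalf of set*id syscalls apart from ordinary ones. A hedged sketch of how a set*id permission check might use the wrapper; the function and field names below are placeholders, not the actual kernel/sys.c code:

/* Illustrative only: allow a ruid change if it stays within the current
 * credentials, otherwise require CAP_SETUID checked with CAP_OPT_INSETID. */
static bool may_change_ruid(const struct cred *old, kuid_t new_ruid)
{
	if (uid_eq(new_ruid, old->uid) || uid_eq(new_ruid, old->euid))
		return true;

	return ns_capable_setid(old->user_ns, CAP_SETUID);
}
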
index 17828333f7c3e015aa3491c6807030e1210fe031..eef24a25bda7a2e42c6a7031225b9018290b7e32 100644 (file)
@@ -197,7 +197,7 @@ static u64 css_serial_nr_next = 1;
  */
 static u16 have_fork_callback __read_mostly;
 static u16 have_exit_callback __read_mostly;
-static u16 have_free_callback __read_mostly;
+static u16 have_release_callback __read_mostly;
 static u16 have_canfork_callback __read_mostly;
 
 /* cgroup namespace for init task */
@@ -5326,7 +5326,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
 
        have_fork_callback |= (bool)ss->fork << ss->id;
        have_exit_callback |= (bool)ss->exit << ss->id;
-       have_free_callback |= (bool)ss->free << ss->id;
+       have_release_callback |= (bool)ss->release << ss->id;
        have_canfork_callback |= (bool)ss->can_fork << ss->id;
 
        /* At system boot, before all subsystems have been
@@ -5762,16 +5762,19 @@ void cgroup_exit(struct task_struct *tsk)
        } while_each_subsys_mask();
 }
 
-void cgroup_free(struct task_struct *task)
+void cgroup_release(struct task_struct *task)
 {
-       struct css_set *cset = task_css_set(task);
        struct cgroup_subsys *ss;
        int ssid;
 
-       do_each_subsys_mask(ss, ssid, have_free_callback) {
-               ss->free(task);
+       do_each_subsys_mask(ss, ssid, have_release_callback) {
+               ss->release(task);
        } while_each_subsys_mask();
+}
 
+void cgroup_free(struct task_struct *task)
+{
+       struct css_set *cset = task_css_set(task);
        put_css_set(cset);
 }
 
index 479743db6c37093f0f6d4e6eebb9d127ab011050..72afd55f70c64fa936208b5d92db01334965e0a9 100644 (file)
@@ -203,19 +203,6 @@ static inline struct cpuset *parent_cs(struct cpuset *cs)
        return css_cs(cs->css.parent);
 }
 
-#ifdef CONFIG_NUMA
-static inline bool task_has_mempolicy(struct task_struct *task)
-{
-       return task->mempolicy;
-}
-#else
-static inline bool task_has_mempolicy(struct task_struct *task)
-{
-       return false;
-}
-#endif
-
-
 /* bits in struct cpuset flags field */
 typedef enum {
        CS_ONLINE,
index 9829c67ebc0ae0236af31913b2ce858b0a2c33b2..c9960baaa14f23f7b85a47b8f6f067da083c21cc 100644 (file)
@@ -247,7 +247,7 @@ static void pids_cancel_fork(struct task_struct *task)
        pids_uncharge(pids, 1);
 }
 
-static void pids_free(struct task_struct *task)
+static void pids_release(struct task_struct *task)
 {
        struct pids_cgroup *pids = css_pids(task_css(task, pids_cgrp_id));
 
@@ -342,7 +342,7 @@ struct cgroup_subsys pids_cgrp_subsys = {
        .cancel_attach  = pids_cancel_attach,
        .can_fork       = pids_can_fork,
        .cancel_fork    = pids_cancel_fork,
-       .free           = pids_free,
+       .release        = pids_release,
        .legacy_cftypes = pids_files,
        .dfl_cftypes    = pids_files,
        .threaded       = true,
index d503d1a9007c9e1d01928c8e7effe5c9f0511753..bb95a35e8c2d0785ec7ce2acfb2c64a74e5935a2 100644 (file)
@@ -87,7 +87,6 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
                                                   struct cgroup *root, int cpu)
 {
        struct cgroup_rstat_cpu *rstatc;
-       struct cgroup *parent;
 
        if (pos == root)
                return NULL;
@@ -115,8 +114,8 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
         * However, due to the way we traverse, @pos will be the first
         * child in most cases. The only exception is @root.
         */
-       parent = cgroup_parent(pos);
-       if (parent && rstatc->updated_next) {
+       if (rstatc->updated_next) {
+               struct cgroup *parent = cgroup_parent(pos);
                struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu);
                struct cgroup_rstat_cpu *nrstatc;
                struct cgroup **nextp;
@@ -140,9 +139,12 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
                 * updated stat.
                 */
                smp_mb();
+
+               return pos;
        }
 
-       return pos;
+       /* only happens for @root */
+       return NULL;
 }
 
 /* see cgroup_rstat_flush() */
index 21f4a97085b4041bcbe9b23568aec7bed935f21f..45d77284aed0a91383e71e53bfadf6dd5017f7e0 100644 (file)
@@ -760,19 +760,6 @@ bool creds_are_invalid(const struct cred *cred)
 {
        if (cred->magic != CRED_MAGIC)
                return true;
-#ifdef CONFIG_SECURITY_SELINUX
-       /*
-        * cred->security == NULL if security_cred_alloc_blank() or
-        * security_prepare_creds() returned an error.
-        */
-       if (selinux_is_enabled() && cred->security) {
-               if ((unsigned long) cred->security < PAGE_SIZE)
-                       return true;
-               if ((*(u32 *)cred->security & 0xffffff00) ==
-                   (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8))
-                       return true;
-       }
-#endif
        return false;
 }
 EXPORT_SYMBOL(creds_are_invalid);
index ca88b867e7fea00bec11fb43c9448c367a17adfe..0711d18645de3d666ee4d52a8b4411b33593a92f 100644 (file)
@@ -16,6 +16,9 @@ config ARCH_DMA_ADDR_T_64BIT
 config ARCH_HAS_DMA_COHERENCE_H
        bool
 
+config ARCH_HAS_DMA_SET_MASK
+       bool
+
 config HAVE_GENERIC_DMA_COHERENT
        bool
 
index 355d16acee6dd17aa7998d1ea55a6abcb3e01e92..d5bb51cf27c6543102265123e9887e42722e4fa1 100644 (file)
@@ -132,8 +132,7 @@ again:
                        goto again;
                }
 
-               if (IS_ENABLED(CONFIG_ZONE_DMA) &&
-                   phys_mask < DMA_BIT_MASK(32) && !(gfp & GFP_DMA)) {
+               if (IS_ENABLED(CONFIG_ZONE_DMA) && !(gfp & GFP_DMA)) {
                        gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
                        goto again;
                }
index a11006b6d8e87d5f64db77e777da08355a32ad8d..ef2aba503467efcf91fcc60c124893c308d03e75 100644 (file)
@@ -207,7 +207,6 @@ int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma,
 }
 EXPORT_SYMBOL(dma_mmap_attrs);
 
-#ifndef ARCH_HAS_DMA_GET_REQUIRED_MASK
 static u64 dma_default_get_required_mask(struct device *dev)
 {
        u32 low_totalram = ((max_pfn - 1) << PAGE_SHIFT);
@@ -238,7 +237,6 @@ u64 dma_get_required_mask(struct device *dev)
        return dma_default_get_required_mask(dev);
 }
 EXPORT_SYMBOL_GPL(dma_get_required_mask);
-#endif
 
 #ifndef arch_dma_alloc_attrs
 #define arch_dma_alloc_attrs(dev)      (true)
@@ -318,18 +316,23 @@ int dma_supported(struct device *dev, u64 mask)
 }
 EXPORT_SYMBOL(dma_supported);
 
-#ifndef HAVE_ARCH_DMA_SET_MASK
+#ifdef CONFIG_ARCH_HAS_DMA_SET_MASK
+void arch_dma_set_mask(struct device *dev, u64 mask);
+#else
+#define arch_dma_set_mask(dev, mask)   do { } while (0)
+#endif
+
 int dma_set_mask(struct device *dev, u64 mask)
 {
        if (!dev->dma_mask || !dma_supported(dev, mask))
                return -EIO;
 
+       arch_dma_set_mask(dev, mask);
        dma_check_mask(dev, mask);
        *dev->dma_mask = mask;
        return 0;
 }
 EXPORT_SYMBOL(dma_set_mask);
-#endif
 
 #ifndef CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK
 int dma_set_coherent_mask(struct device *dev, u64 mask)
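
dma_set_mask() also stops being wholesale replaceable by architectures (the old HAVE_ARCH_DMA_SET_MASK override); an architecture that needs a hook now selects ARCH_HAS_DMA_SET_MASK and provides arch_dma_set_mask(), which the common code calls just before recording the mask. A hedged sketch of what an opting-in architecture supplies; the body is illustrative and not taken from any real port:

    /* arch/<arch>/Kconfig would additionally: select ARCH_HAS_DMA_SET_MASK */

    #include <linux/device.h>
    #include <linux/dma-mapping.h>

    void arch_dma_set_mask(struct device *dev, u64 mask)
    {
            /* e.g. reprogram a bus-level address filter for this device */
            dev_dbg(dev, "DMA mask set to %#llx\n", (unsigned long long)mask);
    }
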
index 1fb6fd68b9c7e80c969072642a4064f4e9608008..6d0236bd3929c4e6f7092d902c92e9fc893526a0 100644 (file)
@@ -650,15 +650,3 @@ bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr,
 
        return true;
 }
-
-/*
- * Return whether the given device DMA address mask can be supported
- * properly.  For example, if your device can only drive the low 24-bits
- * during bus mastering, then you would pass 0x00ffffff as the mask to
- * this function.
- */
-int
-swiotlb_dma_supported(struct device *hwdev, u64 mask)
-{
-       return __phys_to_dma(hwdev, io_tlb_end - 1) <= mask;
-}
index 2639a30a8aa5dd9054bc0af5951397efb1083713..2166c2d92ddc0c8a0af6e5a1dd833fe0243fd06a 100644 (file)
@@ -219,6 +219,7 @@ repeat:
        }
 
        write_unlock_irq(&tasklist_lock);
+       cgroup_release(p);
        release_thread(p);
        call_rcu(&p->rcu, delayed_put_task_struct);
 
index 915c02e8e5dd28ff36e7555be769b6e5fa77e309..e81b17b53fa53aa025ccb8f12c236158cb2e61b2 100644 (file)
@@ -448,8 +448,6 @@ int walk_mem_res(u64 start, u64 end, void *arg,
                                     arg, func);
 }
 
-#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY)
-
 /*
  * This function calls the @func callback against all memory ranges of type
  * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY.
@@ -481,8 +479,6 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
        return ret;
 }
 
-#endif
-
 static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg)
 {
        return 1;
index a43c601ac252a7591c0464c9ef58162c94285ae0..54a0347ca8128f09cdbbcc83e2e8f8eea633a7ab 100644 (file)
@@ -445,8 +445,8 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
         * behavior of privileged children.
         */
        if (!task_no_new_privs(current) &&
-           security_capable_noaudit(current_cred(), current_user_ns(),
-                                    CAP_SYS_ADMIN) != 0)
+           security_capable(current_cred(), current_user_ns(),
+                                    CAP_SYS_ADMIN, CAP_OPT_NOAUDIT) != 0)
                return ERR_PTR(-EACCES);
 
        /* Allocate a new seccomp_filter */
index dc5d9e636d48575b308eba8ffc4c5a108a4e1c38..12df0e5434b845abe7c5a6d98beb7627503af792 100644 (file)
@@ -516,7 +516,7 @@ long __sys_setreuid(uid_t ruid, uid_t euid)
                new->uid = kruid;
                if (!uid_eq(old->uid, kruid) &&
                    !uid_eq(old->euid, kruid) &&
-                   !ns_capable(old->user_ns, CAP_SETUID))
+                   !ns_capable_setid(old->user_ns, CAP_SETUID))
                        goto error;
        }
 
@@ -525,7 +525,7 @@ long __sys_setreuid(uid_t ruid, uid_t euid)
                if (!uid_eq(old->uid, keuid) &&
                    !uid_eq(old->euid, keuid) &&
                    !uid_eq(old->suid, keuid) &&
-                   !ns_capable(old->user_ns, CAP_SETUID))
+                   !ns_capable_setid(old->user_ns, CAP_SETUID))
                        goto error;
        }
 
@@ -584,7 +584,7 @@ long __sys_setuid(uid_t uid)
        old = current_cred();
 
        retval = -EPERM;
-       if (ns_capable(old->user_ns, CAP_SETUID)) {
+       if (ns_capable_setid(old->user_ns, CAP_SETUID)) {
                new->suid = new->uid = kuid;
                if (!uid_eq(kuid, old->uid)) {
                        retval = set_user(new);
@@ -646,7 +646,7 @@ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
        old = current_cred();
 
        retval = -EPERM;
-       if (!ns_capable(old->user_ns, CAP_SETUID)) {
+       if (!ns_capable_setid(old->user_ns, CAP_SETUID)) {
                if (ruid != (uid_t) -1        && !uid_eq(kruid, old->uid) &&
                    !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid))
                        goto error;
@@ -814,7 +814,7 @@ long __sys_setfsuid(uid_t uid)
 
        if (uid_eq(kuid, old->uid)  || uid_eq(kuid, old->euid)  ||
            uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) ||
-           ns_capable(old->user_ns, CAP_SETUID)) {
+           ns_capable_setid(old->user_ns, CAP_SETUID)) {
                if (!uid_eq(kuid, old->fsuid)) {
                        new->fsuid = kuid;
                        if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
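
Every CAP_SETUID check on these set*uid() paths now goes through ns_capable_setid(), which sets CAP_OPT_INSETID on its way down to security_capable(); that is what lets the SafeSetID hook added later in this series allow CAP_SETUID for approved id transitions while still refusing other uses of the capability, such as writing user-namespace uid maps.
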
index 27821480105e6fc86ef3d75a4d4abaf7a1fa0e69..217ef481fbbb68261fab5468c63a5bba4faee099 100644 (file)
@@ -1301,7 +1301,7 @@ static int parse_pred(const char *str, void *data,
                /* go past the last quote */
                i++;
 
-       } else if (isdigit(str[i])) {
+       } else if (isdigit(str[i]) || str[i] == '-') {
 
                /* Make sure the field is not a string */
                if (is_string_field(field)) {
@@ -1314,6 +1314,9 @@ static int parse_pred(const char *str, void *data,
                        goto err_free;
                }
 
+               if (str[i] == '-')
+                       i++;
+
                /* We allow 0xDEADBEEF */
                while (isalnum(str[i]))
                        i++;
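
The filter parser previously rejected a leading minus sign, so comparisons against negative values could not be written directly; with this change a predicate such as, for instance, 'ret == -2' on a syscall-exit event is accepted, provided the field is numeric, as the is_string_field() check above still enforces.
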
index 9eaf07f99212f797df29fb5f0b93f475c48f441f..99592c27465e1156a0dce1dd2d953aef9ef3e8c8 100644 (file)
@@ -865,7 +865,7 @@ fetch_store_strlen(unsigned long addr)
        u8 c;
 
        do {
-               ret = probe_mem_read(&c, (u8 *)addr + len, 1);
+               ret = probe_kernel_read(&c, (u8 *)addr + len, 1);
                len++;
        } while (c && ret == 0 && len < MAX_STRING_SIZE);
 
index d51c37dd9422967875aa6e393580b65bf38d1fb8..7abbeed1342109cc8ac83d36139282323d75c106 100644 (file)
@@ -648,7 +648,7 @@ static void set_work_pool_and_clear_pending(struct work_struct *work,
         * The following mb guarantees that previous clear of a PENDING bit
         * will not be reordered with any speculative LOADS or STORES from
         * work->current_func, which is executed afterwards.  This possible
-        * reordering can lead to a missed execution on attempt to qeueue
+        * reordering can lead to a missed execution on attempt to queue
         * the same @work.  E.g. consider this case:
         *
         *   CPU#0                         CPU#1
@@ -1353,7 +1353,7 @@ static bool is_chained_work(struct workqueue_struct *wq)
 
        worker = current_wq_worker();
        /*
-        * Return %true iff I'm a worker execuing a work item on @wq.  If
+        * Return %true iff I'm a worker executing a work item on @wq.  If
         * I'm @worker, it's safe to dereference it without locking.
         */
        return worker && worker->current_pwq->wq == wq;
@@ -1735,7 +1735,7 @@ static void rcu_work_rcufn(struct rcu_head *rcu)
  *
  * Return: %false if @rwork was already pending, %true otherwise.  Note
  * that a full RCU grace period is guaranteed only after a %true return.
- * While @rwork is guarnateed to be executed after a %false return, the
+ * While @rwork is guaranteed to be executed after a %false return, the
  * execution may happen before a full RCU grace period has passed.
  */
 bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork)
@@ -3027,6 +3027,9 @@ static bool __flush_work(struct work_struct *work, bool from_cancel)
        if (WARN_ON(!wq_online))
                return false;
 
+       if (WARN_ON(!work->func))
+               return false;
+
        if (!from_cancel) {
                lock_map_acquire(&work->lockdep_map);
                lock_map_release(&work->lockdep_map);
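
Besides the comment spelling fixes, __flush_work() now refuses, with a WARN_ON, to flush a work item whose ->func is still NULL, i.e. one that was never initialized. A brief illustration of the difference; the names are made up for the example:

    #include <linux/workqueue.h>

    static void demo_fn(struct work_struct *work) { }

    static DECLARE_WORK(demo_work, demo_fn);        /* properly initialized */
    static struct work_struct never_initialized;    /* ->func stays NULL */

    static void demo(void)
    {
            schedule_work(&demo_work);
            flush_work(&demo_work);             /* normal case, waits for demo_fn */

            flush_work(&never_initialized);     /* now WARNs and returns false */
    }
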
index 22291db50013d89d9427a2c50003dca0a25abbcf..f84e22685aaaaa7ff1167697af36a16960171a7d 100644 (file)
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1939,7 +1939,7 @@ static void gup_pgd_range(unsigned long addr, unsigned long end,
  * Check if it's allowed to use __get_user_pages_fast() for the range, or
  * we need to fall back to the slow version:
  */
-bool gup_fast_permitted(unsigned long start, int nr_pages, int write)
+bool gup_fast_permitted(unsigned long start, int nr_pages)
 {
        unsigned long len, end;
 
@@ -1981,7 +1981,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
         * block IPIs that come from THPs splitting.
         */
 
-       if (gup_fast_permitted(start, nr_pages, write)) {
+       if (gup_fast_permitted(start, nr_pages)) {
                local_irq_save(flags);
                gup_pgd_range(start, end, write, pages, &nr);
                local_irq_restore(flags);
@@ -2023,7 +2023,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
        if (unlikely(!access_ok((void __user *)start, len)))
                return -EFAULT;
 
-       if (gup_fast_permitted(start, nr_pages, write)) {
+       if (gup_fast_permitted(start, nr_pages)) {
                local_irq_disable();
                gup_pgd_range(addr, end, write, pages, &nr);
                local_irq_enable();
index 0f643dc2dc658695effd36a96a15fa869f59ad93..b68d5df147317295514e61ccae2b9f65f5311895 100644 (file)
@@ -67,7 +67,7 @@ static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
                pcpu_set_page_chunk(nth_page(pages, i), chunk);
 
        chunk->data = pages;
-       chunk->base_addr = page_address(pages) - pcpu_group_offsets[0];
+       chunk->base_addr = page_address(pages);
 
        spin_lock_irqsave(&pcpu_lock, flags);
        pcpu_chunk_populated(chunk, 0, nr_pages, false);
index db86282fd024580cbf5c41f01cb6d5447a9e1791..c5c750781628d7a793488dc8fd7d62019932bf90 100644 (file)
@@ -2384,7 +2384,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
        ai->atom_size = atom_size;
        ai->alloc_size = alloc_size;
 
-       for (group = 0, unit = 0; group_cnt[group]; group++) {
+       for (group = 0, unit = 0; group < nr_groups; group++) {
                struct pcpu_group_info *gi = &ai->groups[group];
 
                /*
index e4fe2f3c2c65a00eb8ea794178c73998b0e567de..1d6463fb1450c03b8739b102b48b64e05aaa533e 100644 (file)
@@ -40,8 +40,7 @@ config SECURITYFS
        bool "Enable the securityfs filesystem"
        help
          This will build the securityfs filesystem.  It is currently used by
-         the TPM bios character driver and IMA, an integrity provider.  It is
-         not used by SELinux or SMACK.
+         various security modules (AppArmor, IMA, SafeSetID, TOMOYO, TPM).
 
          If you are unsure how to answer this question, answer N.
 
@@ -236,45 +235,19 @@ source "security/tomoyo/Kconfig"
 source "security/apparmor/Kconfig"
 source "security/loadpin/Kconfig"
 source "security/yama/Kconfig"
+source "security/safesetid/Kconfig"
 
 source "security/integrity/Kconfig"
 
-choice
-       prompt "Default security module"
-       default DEFAULT_SECURITY_SELINUX if SECURITY_SELINUX
-       default DEFAULT_SECURITY_SMACK if SECURITY_SMACK
-       default DEFAULT_SECURITY_TOMOYO if SECURITY_TOMOYO
-       default DEFAULT_SECURITY_APPARMOR if SECURITY_APPARMOR
-       default DEFAULT_SECURITY_DAC
-
+config LSM
+       string "Ordered list of enabled LSMs"
+       default "yama,loadpin,safesetid,integrity,selinux,smack,tomoyo,apparmor"
        help
-         Select the security module that will be used by default if the
-         kernel parameter security= is not specified.
-
-       config DEFAULT_SECURITY_SELINUX
-               bool "SELinux" if SECURITY_SELINUX=y
-
-       config DEFAULT_SECURITY_SMACK
-               bool "Simplified Mandatory Access Control" if SECURITY_SMACK=y
-
-       config DEFAULT_SECURITY_TOMOYO
-               bool "TOMOYO" if SECURITY_TOMOYO=y
-
-       config DEFAULT_SECURITY_APPARMOR
-               bool "AppArmor" if SECURITY_APPARMOR=y
-
-       config DEFAULT_SECURITY_DAC
-               bool "Unix Discretionary Access Controls"
-
-endchoice
+         A comma-separated list of LSMs, in initialization order.
+         Any LSMs left off this list will be ignored. This can be
+         controlled at boot with the "lsm=" parameter.
 
-config DEFAULT_SECURITY
-       string
-       default "selinux" if DEFAULT_SECURITY_SELINUX
-       default "smack" if DEFAULT_SECURITY_SMACK
-       default "tomoyo" if DEFAULT_SECURITY_TOMOYO
-       default "apparmor" if DEFAULT_SECURITY_APPARMOR
-       default "" if DEFAULT_SECURITY_DAC
+         If unsure, leave this as the default.
 
 endmenu
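
The single "default security module" choice is thus replaced by CONFIG_LSM, an ordered, comma-separated list that determines which LSMs initialize and in what order; the same ordering can be overridden at boot, for example by passing lsm=loadpin,safesetid,integrity,apparmor on the kernel command line to trim and reorder the list without rebuilding.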
 
index 4d2d3782ddefd3fbdd6d2984a1faa8a2bd6561ae..c598b904938f90cd24fe170e9d21ed16e6caaf46 100644 (file)
@@ -10,6 +10,7 @@ subdir-$(CONFIG_SECURITY_TOMOYO)        += tomoyo
 subdir-$(CONFIG_SECURITY_APPARMOR)     += apparmor
 subdir-$(CONFIG_SECURITY_YAMA)         += yama
 subdir-$(CONFIG_SECURITY_LOADPIN)      += loadpin
+subdir-$(CONFIG_SECURITY_SAFESETID)    += safesetid
 
 # always enable default capabilities
 obj-y                                  += commoncap.o
@@ -25,6 +26,7 @@ obj-$(CONFIG_SECURITY_TOMOYO)         += tomoyo/
 obj-$(CONFIG_SECURITY_APPARMOR)                += apparmor/
 obj-$(CONFIG_SECURITY_YAMA)            += yama/
 obj-$(CONFIG_SECURITY_LOADPIN)         += loadpin/
+obj-$(CONFIG_SECURITY_SAFESETID)       += safesetid/
 obj-$(CONFIG_CGROUP_DEVICE)            += device_cgroup.o
 
 # Object integrity file lists
index b6b68a7750ce36597ecba044f82385c8e6efc71f..3de21f46c82af95176f379c5fb4c22d721bbd82d 100644 (file)
@@ -14,22 +14,6 @@ config SECURITY_APPARMOR
 
          If you are unsure how to answer this question, answer N.
 
-config SECURITY_APPARMOR_BOOTPARAM_VALUE
-       int "AppArmor boot parameter default value"
-       depends on SECURITY_APPARMOR
-       range 0 1
-       default 1
-       help
-         This option sets the default value for the kernel parameter
-         'apparmor', which allows AppArmor to be enabled or disabled
-          at boot.  If this option is set to 0 (zero), the AppArmor
-         kernel parameter will default to 0, disabling AppArmor at
-         boot.  If this option is set to 1 (one), the AppArmor
-         kernel parameter will default to 1, enabling AppArmor at
-         boot.
-
-         If you are unsure how to answer this question, answer 1.
-
 config SECURITY_APPARMOR_HASH
        bool "Enable introspection of sha1 hashes for loaded profiles"
        depends on SECURITY_APPARMOR
index eeaddfe0c0fb9c3999453451de1899db81a9ad84..5a8b9cded4f269a963a6dd3528689bb89e5759f8 100644 (file)
@@ -225,8 +225,7 @@ int aa_audit_rule_known(struct audit_krule *rule)
        return 0;
 }
 
-int aa_audit_rule_match(u32 sid, u32 field, u32 op, void *vrule,
-                       struct audit_context *actx)
+int aa_audit_rule_match(u32 sid, u32 field, u32 op, void *vrule)
 {
        struct aa_audit_rule *rule = vrule;
        struct aa_label *label;
index 253ef6e9d445355c0f1d3379d319a8e052541032..752f73980e3085e81daceccc269f45e09b812b3c 100644 (file)
@@ -110,13 +110,13 @@ static int audit_caps(struct common_audit_data *sa, struct aa_profile *profile,
  * profile_capable - test if profile allows use of capability @cap
  * @profile: profile being enforced    (NOT NULL, NOT unconfined)
  * @cap: capability to test if allowed
- * @audit: whether an audit record should be generated
+ * @opts: CAP_OPT_NOAUDIT bit determines whether audit record is generated
  * @sa: audit data (MAY BE NULL indicating no auditing)
  *
  * Returns: 0 if allowed else -EPERM
  */
-static int profile_capable(struct aa_profile *profile, int cap, int audit,
-                          struct common_audit_data *sa)
+static int profile_capable(struct aa_profile *profile, int cap,
+                          unsigned int opts, struct common_audit_data *sa)
 {
        int error;
 
@@ -126,7 +126,7 @@ static int profile_capable(struct aa_profile *profile, int cap, int audit,
        else
                error = -EPERM;
 
-       if (audit == SECURITY_CAP_NOAUDIT) {
+       if (opts & CAP_OPT_NOAUDIT) {
                if (!COMPLAIN_MODE(profile))
                        return error;
                /* audit the cap request in complain mode but note that it
@@ -142,13 +142,13 @@ static int profile_capable(struct aa_profile *profile, int cap, int audit,
  * aa_capable - test permission to use capability
  * @label: label being tested for capability (NOT NULL)
  * @cap: capability to be tested
- * @audit: whether an audit record should be generated
+ * @opts: CAP_OPT_NOAUDIT bit determines whether audit record is generated
  *
  * Look up capability in profile capability set.
  *
  * Returns: 0 on success, or else an error code.
  */
-int aa_capable(struct aa_label *label, int cap, int audit)
+int aa_capable(struct aa_label *label, int cap, unsigned int opts)
 {
        struct aa_profile *profile;
        int error = 0;
@@ -156,7 +156,7 @@ int aa_capable(struct aa_label *label, int cap, int audit)
 
        sa.u.cap = cap;
        error = fn_for_each_confined(label, profile,
-                       profile_capable(profile, cap, audit, &sa));
+                       profile_capable(profile, cap, opts, &sa));
 
        return error;
 }
index 11975ec8d5665957d145ae67f31a03bb41fc77e4..ca2dccf5b445e0ce68777320f681b87804c2c369 100644 (file)
@@ -572,7 +572,7 @@ static struct aa_label *x_to_label(struct aa_profile *profile,
                        stack = NULL;
                        break;
                }
-               /* fall through to X_NAME */
+               /* fall through to X_NAME */
        case AA_X_NAME:
                if (xindex & AA_X_CHILD)
                        /* released by caller */
@@ -975,7 +975,7 @@ int apparmor_bprm_set_creds(struct linux_binprm *bprm)
        }
        aa_put_label(cred_label(bprm->cred));
        /* transfer reference, released when cred is freed */
-       cred_label(bprm->cred) = new;
+       set_cred_label(bprm->cred, new);
 
 done:
        aa_put_label(label);
index b8c8b1066b0a126ad4ac50f343a721cbabcd8748..ee559bc2acb86c330a7ab545f89aa19ba1313b6e 100644 (file)
@@ -192,7 +192,6 @@ static inline int complain_error(int error)
 void aa_audit_rule_free(void *vrule);
 int aa_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule);
 int aa_audit_rule_known(struct audit_krule *rule);
-int aa_audit_rule_match(u32 sid, u32 field, u32 op, void *vrule,
-                       struct audit_context *actx);
+int aa_audit_rule_match(u32 sid, u32 field, u32 op, void *vrule);
 
 #endif /* __AA_AUDIT_H */
index e0304e2aeb7ffda8fe6862c235526c22dd10db20..1b3663b6ab129d649cd7d9604ae4e4cb2a51ef7b 100644 (file)
@@ -40,7 +40,7 @@ struct aa_caps {
 
 extern struct aa_sfs_entry aa_sfs_entry_caps[];
 
-int aa_capable(struct aa_label *label, int cap, int audit);
+int aa_capable(struct aa_label *label, int cap, unsigned int opts);
 
 static inline void aa_free_cap_rules(struct aa_caps *caps)
 {
index 265ae6641a0644e84e40c7f774f4f447a1df3fc2..b9504a05fddcbbc14561431fa0d1c8c5c4cc7b5f 100644 (file)
 #include "policy_ns.h"
 #include "task.h"
 
-#define cred_label(X) ((X)->security)
+static inline struct aa_label *cred_label(const struct cred *cred)
+{
+       struct aa_label **blob = cred->security + apparmor_blob_sizes.lbs_cred;
+
+       AA_BUG(!blob);
+       return *blob;
+}
 
+static inline void set_cred_label(const struct cred *cred,
+                                 struct aa_label *label)
+{
+       struct aa_label **blob = cred->security + apparmor_blob_sizes.lbs_cred;
+
+       AA_BUG(!blob);
+       *blob = label;
+}
 
 /**
  * aa_cred_raw_label - obtain cred's label
index 4c2c8ac8842f6d8c05c4480fa2680b4ba9ab75df..8be09208cf7c119ebd0f20e494c9f466fc74c15a 100644 (file)
@@ -32,7 +32,10 @@ struct path;
                                 AA_MAY_CHMOD | AA_MAY_CHOWN | AA_MAY_LOCK | \
                                 AA_EXEC_MMAP | AA_MAY_LINK)
 
-#define file_ctx(X) ((struct aa_file_ctx *)(X)->f_security)
+static inline struct aa_file_ctx *file_ctx(struct file *file)
+{
+       return file->f_security + apparmor_blob_sizes.lbs_file;
+}
 
 /* struct aa_file_ctx - the AppArmor context the file was opened in
  * @lock: lock to update the ctx
index 6505e1ad9e230605885f20f1e6f8df2029866cf2..bbe9b384d71d19684e5de3e041b3596346a922ff 100644 (file)
@@ -16,6 +16,7 @@
 
 #include <linux/slab.h>
 #include <linux/fs.h>
+#include <linux/lsm_hooks.h>
 
 #include "match.h"
 
@@ -55,6 +56,9 @@ const char *aa_splitn_fqname(const char *fqname, size_t n, const char **ns_name,
                             size_t *ns_len);
 void aa_info_message(const char *str);
 
+/* Security blob offsets */
+extern struct lsm_blob_sizes apparmor_blob_sizes;
+
 /**
  * aa_strneq - compare null terminated @str to a non null terminated substring
  * @str: a null terminated string
index 55edaa1d83f89c68142c3a00c996a90c21f7b021..311e652324e3a9395c36d11af474b2488bc9d4c2 100644 (file)
 #ifndef __AA_TASK_H
 #define __AA_TASK_H
 
-#define task_ctx(X) ((X)->security)
+static inline struct aa_task_ctx *task_ctx(struct task_struct *task)
+{
+       return task->security + apparmor_blob_sizes.lbs_task;
+}
 
 /*
  * struct aa_task_ctx - information for current task label change
@@ -36,17 +39,6 @@ int aa_set_current_hat(struct aa_label *label, u64 token);
 int aa_restore_previous_label(u64 cookie);
 struct aa_label *aa_get_task_label(struct task_struct *task);
 
-/**
- * aa_alloc_task_ctx - allocate a new task_ctx
- * @flags: gfp flags for allocation
- *
- * Returns: allocated buffer or NULL on failure
- */
-static inline struct aa_task_ctx *aa_alloc_task_ctx(gfp_t flags)
-{
-       return kzalloc(sizeof(struct aa_task_ctx), flags);
-}
-
 /**
  * aa_free_task_ctx - free a task_ctx
  * @ctx: task_ctx to free (MAYBE NULL)
@@ -57,8 +49,6 @@ static inline void aa_free_task_ctx(struct aa_task_ctx *ctx)
                aa_put_label(ctx->nnp);
                aa_put_label(ctx->previous);
                aa_put_label(ctx->onexec);
-
-               kzfree(ctx);
        }
 }
 
index 527ea1557120ece44aa0c04d901bfa3e8d9b9422..aacd1e95cb596888fec63a00de1124092c11a188 100644 (file)
@@ -107,7 +107,8 @@ static int profile_tracer_perm(struct aa_profile *tracer,
        aad(sa)->label = &tracer->label;
        aad(sa)->peer = tracee;
        aad(sa)->request = 0;
-       aad(sa)->error = aa_capable(&tracer->label, CAP_SYS_PTRACE, 1);
+       aad(sa)->error = aa_capable(&tracer->label, CAP_SYS_PTRACE,
+                                   CAP_OPT_NONE);
 
        return aa_audit(AUDIT_APPARMOR_AUTO, tracer, sa, audit_ptrace_cb);
 }
index 8db1731d046ad0b55f594266472308edd0b8217e..49d664ddff444810ef9c6e8a1b0276c5ba473c53 100644 (file)
@@ -60,7 +60,7 @@ DEFINE_PER_CPU(struct aa_buffers, aa_buffers);
 static void apparmor_cred_free(struct cred *cred)
 {
        aa_put_label(cred_label(cred));
-       cred_label(cred) = NULL;
+       set_cred_label(cred, NULL);
 }
 
 /*
@@ -68,7 +68,7 @@ static void apparmor_cred_free(struct cred *cred)
  */
 static int apparmor_cred_alloc_blank(struct cred *cred, gfp_t gfp)
 {
-       cred_label(cred) = NULL;
+       set_cred_label(cred, NULL);
        return 0;
 }
 
@@ -78,7 +78,7 @@ static int apparmor_cred_alloc_blank(struct cred *cred, gfp_t gfp)
 static int apparmor_cred_prepare(struct cred *new, const struct cred *old,
                                 gfp_t gfp)
 {
-       cred_label(new) = aa_get_newest_label(cred_label(old));
+       set_cred_label(new, aa_get_newest_label(cred_label(old)));
        return 0;
 }
 
@@ -87,26 +87,21 @@ static int apparmor_cred_prepare(struct cred *new, const struct cred *old,
  */
 static void apparmor_cred_transfer(struct cred *new, const struct cred *old)
 {
-       cred_label(new) = aa_get_newest_label(cred_label(old));
+       set_cred_label(new, aa_get_newest_label(cred_label(old)));
 }
 
 static void apparmor_task_free(struct task_struct *task)
 {
 
        aa_free_task_ctx(task_ctx(task));
-       task_ctx(task) = NULL;
 }
 
 static int apparmor_task_alloc(struct task_struct *task,
                               unsigned long clone_flags)
 {
-       struct aa_task_ctx *new = aa_alloc_task_ctx(GFP_KERNEL);
-
-       if (!new)
-               return -ENOMEM;
+       struct aa_task_ctx *new = task_ctx(task);
 
        aa_dup_task_ctx(new, task_ctx(current));
-       task_ctx(task) = new;
 
        return 0;
 }
@@ -177,14 +172,14 @@ static int apparmor_capget(struct task_struct *target, kernel_cap_t *effective,
 }
 
 static int apparmor_capable(const struct cred *cred, struct user_namespace *ns,
-                           int cap, int audit)
+                           int cap, unsigned int opts)
 {
        struct aa_label *label;
        int error = 0;
 
        label = aa_get_newest_cred_label(cred);
        if (!unconfined(label))
-               error = aa_capable(label, cap, audit);
+               error = aa_capable(label, cap, opts);
        aa_put_label(label);
 
        return error;
@@ -434,21 +429,21 @@ static int apparmor_file_open(struct file *file)
 
 static int apparmor_file_alloc_security(struct file *file)
 {
-       int error = 0;
-
-       /* freed by apparmor_file_free_security */
+       struct aa_file_ctx *ctx = file_ctx(file);
        struct aa_label *label = begin_current_label_crit_section();
-       file->f_security = aa_alloc_file_ctx(label, GFP_KERNEL);
-       if (!file_ctx(file))
-               error = -ENOMEM;
-       end_current_label_crit_section(label);
 
-       return error;
+       spin_lock_init(&ctx->lock);
+       rcu_assign_pointer(ctx->label, aa_get_label(label));
+       end_current_label_crit_section(label);
+       return 0;
 }
 
 static void apparmor_file_free_security(struct file *file)
 {
-       aa_free_file_ctx(file_ctx(file));
+       struct aa_file_ctx *ctx = file_ctx(file);
+
+       if (ctx)
+               aa_put_label(rcu_access_pointer(ctx->label));
 }
 
 static int common_file_perm(const char *op, struct file *file, u32 mask)
@@ -1151,6 +1146,15 @@ static int apparmor_inet_conn_request(struct sock *sk, struct sk_buff *skb,
 }
 #endif
 
+/*
+ * The cred blob is a pointer to, not an instance of, an aa_task_ctx.
+ */
+struct lsm_blob_sizes apparmor_blob_sizes __lsm_ro_after_init = {
+       .lbs_cred = sizeof(struct aa_task_ctx *),
+       .lbs_file = sizeof(struct aa_file_ctx),
+       .lbs_task = sizeof(struct aa_task_ctx),
+};
+
 static struct security_hook_list apparmor_hooks[] __lsm_ro_after_init = {
        LSM_HOOK_INIT(ptrace_access_check, apparmor_ptrace_access_check),
        LSM_HOOK_INIT(ptrace_traceme, apparmor_ptrace_traceme),
@@ -1333,8 +1337,8 @@ bool aa_g_paranoid_load = true;
 module_param_named(paranoid_load, aa_g_paranoid_load, aabool, S_IRUGO);
 
 /* Boot time disable flag */
-static bool apparmor_enabled = CONFIG_SECURITY_APPARMOR_BOOTPARAM_VALUE;
-module_param_named(enabled, apparmor_enabled, bool, S_IRUGO);
+static int apparmor_enabled __lsm_ro_after_init = 1;
+module_param_named(enabled, apparmor_enabled, int, 0444);
 
 static int __init apparmor_enabled_setup(char *str)
 {
@@ -1479,14 +1483,8 @@ static int param_set_mode(const char *val, const struct kernel_param *kp)
 static int __init set_init_ctx(void)
 {
        struct cred *cred = (struct cred *)current->real_cred;
-       struct aa_task_ctx *ctx;
-
-       ctx = aa_alloc_task_ctx(GFP_KERNEL);
-       if (!ctx)
-               return -ENOMEM;
 
-       cred_label(cred) = aa_get_label(ns_unconfined(root_ns));
-       task_ctx(current) = ctx;
+       set_cred_label(cred, aa_get_label(ns_unconfined(root_ns)));
 
        return 0;
 }
@@ -1665,12 +1663,6 @@ static int __init apparmor_init(void)
 {
        int error;
 
-       if (!apparmor_enabled || !security_module_enable("apparmor")) {
-               aa_info_message("AppArmor disabled by boot time parameter");
-               apparmor_enabled = false;
-               return 0;
-       }
-
        aa_secids_init();
 
        error = aa_setup_dfa_engine();
@@ -1731,5 +1723,8 @@ alloc_out:
 
 DEFINE_LSM(apparmor) = {
        .name = "apparmor",
+       .flags = LSM_FLAG_LEGACY_MAJOR | LSM_FLAG_EXCLUSIVE,
+       .enabled = &apparmor_enabled,
+       .blobs = &apparmor_blob_sizes,
        .init = apparmor_init,
 };
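
The AppArmor conversion illustrates the general pattern for LSM-managed security blobs: the module publishes its per-object sizes in a struct lsm_blob_sizes, hands that to the infrastructure via DEFINE_LSM(.blobs = ...), and reaches its own data by adding the offset the framework assigned (apparmor_blob_sizes.lbs_cred and friends) to the shared blob pointer, exactly as the new cred_label()/set_cred_label() and file_ctx() helpers do. A hedged sketch of the same pattern for a hypothetical module; all names below are invented for illustration:

    #include <linux/cred.h>
    #include <linux/lsm_hooks.h>

    struct demo_cred_ctx {
            u32 flags;
    };

    static struct lsm_blob_sizes demo_blob_sizes __lsm_ro_after_init = {
            .lbs_cred = sizeof(struct demo_cred_ctx),
    };

    static inline struct demo_cred_ctx *demo_cred(const struct cred *cred)
    {
            /* cred->security points at the blob shared by all LSMs; this
             * module's data lives at its assigned offset within it. */
            return cred->security + demo_blob_sizes.lbs_cred;
    }

    static int __init demo_init(void)
    {
            return 0;
    }

    DEFINE_LSM(demo) = {
            .name   = "demo",
            .blobs  = &demo_blob_sizes,
            .init   = demo_init,
    };
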
index 95fd26d09757f2c7fb7b39d541e1aa6fdc923a5d..552ed09cb47e75829a60e202b3a9465cecefe246 100644 (file)
@@ -124,7 +124,7 @@ int aa_task_setrlimit(struct aa_label *label, struct task_struct *task,
         */
 
        if (label != peer &&
-           aa_capable(label, CAP_SYS_RESOURCE, SECURITY_CAP_NOAUDIT) != 0)
+           aa_capable(label, CAP_SYS_RESOURCE, CAP_OPT_NOAUDIT) != 0)
                error = fn_for_each(label, profile,
                                audit_resource(profile, resource,
                                               new_rlim->rlim_max, peer,
index c6b78a14da91842a7288e4c92e6db490284f0c4c..4551110f049694311ab5958770a94cac2331ab9b 100644 (file)
@@ -81,7 +81,7 @@ int aa_replace_current_label(struct aa_label *label)
         */
        aa_get_label(label);
        aa_put_label(cred_label(new));
-       cred_label(new) = label;
+       set_cred_label(new, label);
 
        commit_creds(new);
        return 0;
@@ -138,7 +138,7 @@ int aa_set_current_hat(struct aa_label *label, u64 token)
                return -EACCES;
        }
 
-       cred_label(new) = aa_get_newest_label(label);
+       set_cred_label(new, aa_get_newest_label(label));
        /* clear exec on switching context */
        aa_put_label(ctx->onexec);
        ctx->onexec = NULL;
@@ -172,7 +172,7 @@ int aa_restore_previous_label(u64 token)
                return -ENOMEM;
 
        aa_put_label(cred_label(new));
-       cred_label(new) = aa_get_newest_label(ctx->previous);
+       set_cred_label(new, aa_get_newest_label(ctx->previous));
        AA_BUG(!cred_label(new));
        /* clear exec && prev information when restoring to previous context */
        aa_clear_task_ctx_trans(ctx);
index 232db019f0519f6af292eb5db97d300856eb1bd9..c477fb673701a11b3cb2577be4138ab579d11de3 100644 (file)
@@ -57,7 +57,7 @@ static void warn_setuid_and_fcaps_mixed(const char *fname)
  * @cred: The credentials to use
  * @ns:  The user namespace in which we need the capability
  * @cap: The capability to check for
- * @audit: Whether to write an audit message or not
+ * @opts: Bitmask of options defined in include/linux/security.h
  *
  * Determine whether the nominated task has the specified capability amongst
  * its effective set, returning 0 if it does, -ve if it does not.
@@ -68,7 +68,7 @@ static void warn_setuid_and_fcaps_mixed(const char *fname)
  * kernel's capable() and has_capability() returns 1 for this case.
  */
 int cap_capable(const struct cred *cred, struct user_namespace *targ_ns,
-               int cap, int audit)
+               int cap, unsigned int opts)
 {
        struct user_namespace *ns = targ_ns;
 
@@ -222,12 +222,11 @@ int cap_capget(struct task_struct *target, kernel_cap_t *effective,
  */
 static inline int cap_inh_is_capped(void)
 {
-
        /* they are so limited unless the current task has the CAP_SETPCAP
         * capability
         */
        if (cap_capable(current_cred(), current_cred()->user_ns,
-                       CAP_SETPCAP, SECURITY_CAP_AUDIT) == 0)
+                       CAP_SETPCAP, CAP_OPT_NONE) == 0)
                return 0;
        return 1;
 }
@@ -643,6 +642,8 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data
        cpu_caps->permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
        cpu_caps->inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
 
+       cpu_caps->rootid = rootkuid;
+
        return 0;
 }
 
@@ -1208,8 +1209,9 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
                    || ((old->securebits & SECURE_ALL_LOCKS & ~arg2))   /*[2]*/
                    || (arg2 & ~(SECURE_ALL_LOCKS | SECURE_ALL_BITS))   /*[3]*/
                    || (cap_capable(current_cred(),
-                                   current_cred()->user_ns, CAP_SETPCAP,
-                                   SECURITY_CAP_AUDIT) != 0)           /*[4]*/
+                                   current_cred()->user_ns,
+                                   CAP_SETPCAP,
+                                   CAP_OPT_NONE) != 0)                 /*[4]*/
                        /*
                         * [1] no changing of bits that are locked
                         * [2] no unlocking of locks
@@ -1304,9 +1306,10 @@ int cap_vm_enough_memory(struct mm_struct *mm, long pages)
 {
        int cap_sys_admin = 0;
 
-       if (cap_capable(current_cred(), &init_user_ns, CAP_SYS_ADMIN,
-                       SECURITY_CAP_NOAUDIT) == 0)
+       if (cap_capable(current_cred(), &init_user_ns,
+                               CAP_SYS_ADMIN, CAP_OPT_NOAUDIT) == 0)
                cap_sys_admin = 1;
+
        return cap_sys_admin;
 }
 
@@ -1325,7 +1328,7 @@ int cap_mmap_addr(unsigned long addr)
 
        if (addr < dac_mmap_min_addr) {
                ret = cap_capable(current_cred(), &init_user_ns, CAP_SYS_RAWIO,
-                                 SECURITY_CAP_AUDIT);
+                                 CAP_OPT_NONE);
                /* set PF_SUPERPRIV if it turns out we allow the low mmap */
                if (ret == 0)
                        current->flags |= PF_SUPERPRIV;
@@ -1362,10 +1365,17 @@ struct security_hook_list capability_hooks[] __lsm_ro_after_init = {
        LSM_HOOK_INIT(vm_enough_memory, cap_vm_enough_memory),
 };
 
-void __init capability_add_hooks(void)
+static int __init capability_init(void)
 {
        security_add_hooks(capability_hooks, ARRAY_SIZE(capability_hooks),
                                "capability");
+       return 0;
 }
 
+DEFINE_LSM(capability) = {
+       .name = "capability",
+       .order = LSM_ORDER_FIRST,
+       .init = capability_init,
+};
+
 #endif /* CONFIG_SECURITY */
index cc12f3449a728e27030bacb821242ef1c380d746..026163f37ba1e8a0a5842242658e027e6d98588e 100644 (file)
@@ -307,8 +307,7 @@ static inline int security_filter_rule_init(u32 field, u32 op, char *rulestr,
 }
 
 static inline int security_filter_rule_match(u32 secid, u32 field, u32 op,
-                                            void *lsmrule,
-                                            struct audit_context *actx)
+                                            void *lsmrule)
 {
        return -EINVAL;
 }
index a2baa85ea2f57d8a72d8bc872716e0ca987c3550..5fb7127bbe68e154abee6039036482b14c1ea77a 100644 (file)
@@ -114,6 +114,7 @@ static void ima_set_cache_status(struct integrity_iint_cache *iint,
                break;
        case CREDS_CHECK:
                iint->ima_creds_status = status;
+               break;
        case FILE_CHECK:
        case POST_SETATTR:
                iint->ima_file_status = status;
index 8bc8a1c8cb3f8cc3cf03fc3255d568438baa580f..e0cc323f948f465bc2343c3da7a113c0138f045a 100644 (file)
@@ -340,8 +340,7 @@ retry:
                        rc = security_filter_rule_match(osid,
                                                        rule->lsm[i].type,
                                                        Audit_equal,
-                                                       rule->lsm[i].rule,
-                                                       NULL);
+                                                       rule->lsm[i].rule);
                        break;
                case LSM_SUBJ_USER:
                case LSM_SUBJ_ROLE:
@@ -349,8 +348,7 @@ retry:
                        rc = security_filter_rule_match(secid,
                                                        rule->lsm[i].type,
                                                        Audit_equal,
-                                                       rule->lsm[i].rule,
-                                                       NULL);
+                                                       rule->lsm[i].rule);
                default:
                        break;
                }
@@ -938,10 +936,12 @@ static int ima_parse_rule(char *rule, struct ima_rule_entry *entry)
                case Opt_uid_gt:
                case Opt_euid_gt:
                        entry->uid_op = &uid_gt;
+                       /* fall through */
                case Opt_uid_lt:
                case Opt_euid_lt:
                        if ((token == Opt_uid_lt) || (token == Opt_euid_lt))
                                entry->uid_op = &uid_lt;
+                       /* fall through */
                case Opt_uid_eq:
                case Opt_euid_eq:
                        uid_token = (token == Opt_uid_eq) ||
@@ -970,9 +970,11 @@ static int ima_parse_rule(char *rule, struct ima_rule_entry *entry)
                        break;
                case Opt_fowner_gt:
                        entry->fowner_op = &uid_gt;
+                       /* fall through */
                case Opt_fowner_lt:
                        if (token == Opt_fowner_lt)
                                entry->fowner_op = &uid_lt;
+                       /* fall through */
                case Opt_fowner_eq:
                        ima_log_string_op(ab, "fowner", args[0].from,
                                          entry->fowner_op);
index 43752002c2223ca0315237b3dc2bf5df9ac5c4c0..513b457ae900ea00cff17247f324d8b966d98246 100644 (file)
@@ -83,6 +83,7 @@ static void ima_show_template_data_ascii(struct seq_file *m,
                /* skip ':' and '\0' */
                buf_ptr += 2;
                buflen -= buf_ptr - field_data->data;
+               /* fall through */
        case DATA_FMT_DIGEST:
        case DATA_FMT_HEX:
                if (!buflen)
index 7bbe03593e581116c0a9678a9e50788fdbac80ed..3e4053a217c326bc92f7c9488293bd2f45f2b974 100644 (file)
@@ -1752,7 +1752,7 @@ SYSCALL_DEFINE5(keyctl, int, option, unsigned long, arg2, unsigned long, arg3,
                        return -EINVAL;
                return keyctl_pkey_query((key_serial_t)arg2,
                                         (const char __user *)arg4,
-                                        (struct keyctl_pkey_query *)arg5);
+                                        (struct keyctl_pkey_query __user *)arg5);
 
        case KEYCTL_PKEY_ENCRYPT:
        case KEYCTL_PKEY_DECRYPT:
index f81372f53dd706ba966d89c1dffb9ab1fe1b63e7..e14f09e3a4b027376bcf6f78f1464a4374cb4550 100644 (file)
@@ -246,6 +246,7 @@ static unsigned long keyring_get_key_chunk(const void *data, int level)
                                    (ASSOC_ARRAY_KEY_CHUNK_SIZE - 8));
                n--;
                offset = 1;
+               /* fall through */
        default:
                offset += sizeof(chunk) - 1;
                offset += (level - 3) * sizeof(chunk);
index 0e0b9ccad2f882f8f62540da055740f88383e2d4..9320424c4a462b751c5a09e8e67240d709f6413d 100644 (file)
@@ -380,6 +380,7 @@ key_ref_t search_my_process_keyrings(struct keyring_search_context *ctx)
                case -EAGAIN: /* no key */
                        if (ret)
                                break;
+                       /* fall through */
                case -ENOKEY: /* negative key */
                        ret = key_ref;
                        break;
@@ -404,6 +405,7 @@ key_ref_t search_my_process_keyrings(struct keyring_search_context *ctx)
                case -EAGAIN: /* no key */
                        if (ret)
                                break;
+                       /* fall through */
                case -ENOKEY: /* negative key */
                        ret = key_ref;
                        break;
@@ -424,6 +426,7 @@ key_ref_t search_my_process_keyrings(struct keyring_search_context *ctx)
                case -EAGAIN: /* no key */
                        if (ret)
                                break;
+                       /* fall through */
                case -ENOKEY: /* negative key */
                        ret = key_ref;
                        break;
index 7a0c6b666ff03a5e59e979a70fd14ed808b46861..2f17d84d46f1a52df1dad31d2e818358055672fc 100644 (file)
@@ -273,16 +273,19 @@ static int construct_get_dest_keyring(struct key **_dest_keyring)
                                }
                        }
 
+                       /* fall through */
                case KEY_REQKEY_DEFL_THREAD_KEYRING:
                        dest_keyring = key_get(cred->thread_keyring);
                        if (dest_keyring)
                                break;
 
+                       /* fall through */
                case KEY_REQKEY_DEFL_PROCESS_KEYRING:
                        dest_keyring = key_get(cred->process_keyring);
                        if (dest_keyring)
                                break;
 
+                       /* fall through */
                case KEY_REQKEY_DEFL_SESSION_KEYRING:
                        rcu_read_lock();
                        dest_keyring = key_get(
@@ -292,6 +295,7 @@ static int construct_get_dest_keyring(struct key **_dest_keyring)
                        if (dest_keyring)
                                break;
 
+                       /* fall through */
                case KEY_REQKEY_DEFL_USER_SESSION_KEYRING:
                        dest_keyring =
                                key_get(cred->user->session_keyring);
index 48f39631b370a45677f166f8f746b8918c5dfe83..055fb0a6416932c89d7933514f95993bf46b8f68 100644 (file)
@@ -187,13 +187,19 @@ static struct security_hook_list loadpin_hooks[] __lsm_ro_after_init = {
        LSM_HOOK_INIT(kernel_load_data, loadpin_load_data),
 };
 
-void __init loadpin_add_hooks(void)
+static int __init loadpin_init(void)
 {
        pr_info("ready to pin (currently %senforcing)\n",
                enforce ? "" : "not ");
        security_add_hooks(loadpin_hooks, ARRAY_SIZE(loadpin_hooks), "loadpin");
+       return 0;
 }
 
+DEFINE_LSM(loadpin) = {
+       .name = "loadpin",
+       .init = loadpin_init,
+};
+
 /* Should not be mutable after boot, so not listed in sysfs (perm == 0). */
 module_param(enforce, int, 0);
 MODULE_PARM_DESC(enforce, "Enforce module/firmware pinning");
diff --git a/security/safesetid/Kconfig b/security/safesetid/Kconfig
new file mode 100644 (file)
index 0000000..4f415c4
--- /dev/null
@@ -0,0 +1,14 @@
+config SECURITY_SAFESETID
+        bool "Gate setid transitions to limit CAP_SET{U/G}ID capabilities"
+        depends on SECURITY
+        select SECURITYFS
+        default n
+        help
+          SafeSetID is an LSM module that gates the setid family of syscalls to
+          restrict UID/GID transitions from a given UID/GID to only those
+          approved by a system-wide whitelist. These restrictions also prohibit
+          the given UIDs/GIDs from obtaining auxiliary privileges associated
+          with CAP_SET{U/G}ID, such as allowing a user to set up user namespace
+          UID mappings.
+
+          If you are unsure how to answer this question, answer N.
diff --git a/security/safesetid/Makefile b/security/safesetid/Makefile
new file mode 100644 (file)
index 0000000..6b06603
--- /dev/null
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for the safesetid LSM.
+#
+
+obj-$(CONFIG_SECURITY_SAFESETID) := safesetid.o
+safesetid-y := lsm.o securityfs.o
diff --git a/security/safesetid/lsm.c b/security/safesetid/lsm.c
new file mode 100644 (file)
index 0000000..cecd38e
--- /dev/null
@@ -0,0 +1,277 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * SafeSetID Linux Security Module
+ *
+ * Author: Micah Morton <mortonm@chromium.org>
+ *
+ * Copyright (C) 2018 The Chromium OS Authors.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2, as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#define pr_fmt(fmt) "SafeSetID: " fmt
+
+#include <linux/hashtable.h>
+#include <linux/lsm_hooks.h>
+#include <linux/module.h>
+#include <linux/ptrace.h>
+#include <linux/sched/task_stack.h>
+#include <linux/security.h>
+
+/* Flag indicating whether initialization completed */
+int safesetid_initialized;
+
+#define NUM_BITS 8 /* 256 buckets in hash table */
+
+static DEFINE_HASHTABLE(safesetid_whitelist_hashtable, NUM_BITS);
+
+/*
+ * Hash table entry to store safesetid policy signifying that 'parent' user
+ * can setid to 'child' user.
+ */
+struct entry {
+       struct hlist_node next;
+       struct hlist_node dlist; /* for deletion cleanup */
+       uint64_t parent_kuid;
+       uint64_t child_kuid;
+};
+
+static DEFINE_SPINLOCK(safesetid_whitelist_hashtable_spinlock);
+
+static bool check_setuid_policy_hashtable_key(kuid_t parent)
+{
+       struct entry *entry;
+
+       rcu_read_lock();
+       hash_for_each_possible_rcu(safesetid_whitelist_hashtable,
+                                  entry, next, __kuid_val(parent)) {
+               if (entry->parent_kuid == __kuid_val(parent)) {
+                       rcu_read_unlock();
+                       return true;
+               }
+       }
+       rcu_read_unlock();
+
+       return false;
+}
+
+static bool check_setuid_policy_hashtable_key_value(kuid_t parent,
+                                                   kuid_t child)
+{
+       struct entry *entry;
+
+       rcu_read_lock();
+       hash_for_each_possible_rcu(safesetid_whitelist_hashtable,
+                                  entry, next, __kuid_val(parent)) {
+               if (entry->parent_kuid == __kuid_val(parent) &&
+                   entry->child_kuid == __kuid_val(child)) {
+                       rcu_read_unlock();
+                       return true;
+               }
+       }
+       rcu_read_unlock();
+
+       return false;
+}
+
+static int safesetid_security_capable(const struct cred *cred,
+                                     struct user_namespace *ns,
+                                     int cap,
+                                     unsigned int opts)
+{
+       if (cap == CAP_SETUID &&
+           check_setuid_policy_hashtable_key(cred->uid)) {
+               if (!(opts & CAP_OPT_INSETID)) {
+                       /*
+                        * Deny if we're not in a set*uid() syscall to avoid
+                        * giving powers gated by CAP_SETUID that are related
+                        * to functionality other than calling set*uid() (e.g.
+                        * allowing user to set up userns uid mappings).
+                        */
+                       pr_warn("Operation requires CAP_SETUID, which is not available to UID %u for operations besides approved set*uid transitions",
+                               __kuid_val(cred->uid));
+                       return -1;
+               }
+       }
+       return 0;
+}
+
+static int check_uid_transition(kuid_t parent, kuid_t child)
+{
+       if (check_setuid_policy_hashtable_key_value(parent, child))
+               return 0;
+       pr_warn("UID transition (%d -> %d) blocked",
+               __kuid_val(parent),
+               __kuid_val(child));
+       /*
+        * Kill this process to avoid potential security vulnerabilities
+        * that could arise from a missing whitelist entry preventing a
+        * privileged process from dropping to a lesser-privileged one.
+        */
+       force_sig(SIGKILL, current);
+       return -EACCES;
+}
+
+/*
+ * Check whether there is either an exception for user under old cred struct to
+ * set*uid to user under new cred struct, or the UID transition is allowed (by
+ * Linux set*uid rules) even without CAP_SETUID.
+ */
+static int safesetid_task_fix_setuid(struct cred *new,
+                                    const struct cred *old,
+                                    int flags)
+{
+
+       /* Do nothing if there are no setuid restrictions for this UID. */
+       if (!check_setuid_policy_hashtable_key(old->uid))
+               return 0;
+
+       switch (flags) {
+       case LSM_SETID_RE:
+               /*
+                * Users for which setuid restrictions exist can only set the
+                * real UID to the real UID or the effective UID, unless an
+                * explicit whitelist policy allows the transition.
+                */
+               if (!uid_eq(old->uid, new->uid) &&
+                       !uid_eq(old->euid, new->uid)) {
+                       return check_uid_transition(old->uid, new->uid);
+               }
+               /*
+                * Users for which setuid restrictions exist can only set the
+                * effective UID to the real UID, the effective UID, or the
+                * saved set-UID, unless an explicit whitelist policy allows
+                * the transition.
+                */
+               if (!uid_eq(old->uid, new->euid) &&
+                       !uid_eq(old->euid, new->euid) &&
+                       !uid_eq(old->suid, new->euid)) {
+                       return check_uid_transition(old->euid, new->euid);
+               }
+               break;
+       case LSM_SETID_ID:
+               /*
+                * Users for which setuid restrictions exist cannot change the
+                * real UID or saved set-UID unless an explicit whitelist
+                * policy allows the transition.
+                */
+               if (!uid_eq(old->uid, new->uid))
+                       return check_uid_transition(old->uid, new->uid);
+               if (!uid_eq(old->suid, new->suid))
+                       return check_uid_transition(old->suid, new->suid);
+               break;
+       case LSM_SETID_RES:
+               /*
+                * Users for which setuid restrictions exist cannot change the
+                * real UID, effective UID, or saved set-UID to anything but
+                * one of: the current real UID, the current effective UID or
+                * the current saved set-user-ID unless an explicit whitelist
+                * policy allows the transition.
+                */
+               if (!uid_eq(new->uid, old->uid) &&
+                       !uid_eq(new->uid, old->euid) &&
+                       !uid_eq(new->uid, old->suid)) {
+                       return check_uid_transition(old->uid, new->uid);
+               }
+               if (!uid_eq(new->euid, old->uid) &&
+                       !uid_eq(new->euid, old->euid) &&
+                       !uid_eq(new->euid, old->suid)) {
+                       return check_uid_transition(old->euid, new->euid);
+               }
+               if (!uid_eq(new->suid, old->uid) &&
+                       !uid_eq(new->suid, old->euid) &&
+                       !uid_eq(new->suid, old->suid)) {
+                       return check_uid_transition(old->suid, new->suid);
+               }
+               break;
+       case LSM_SETID_FS:
+               /*
+                * Users for which setuid restrictions exist cannot change the
+                * filesystem UID to anything but one of: the current real UID,
+                * the current effective UID or the current saved set-UID
+                * unless an explicit whitelist policy allows the transition.
+                */
+               if (!uid_eq(new->fsuid, old->uid)  &&
+                       !uid_eq(new->fsuid, old->euid)  &&
+                       !uid_eq(new->fsuid, old->suid) &&
+                       !uid_eq(new->fsuid, old->fsuid)) {
+                       return check_uid_transition(old->fsuid, new->fsuid);
+               }
+               break;
+       default:
+               pr_warn("Unknown setid state %d\n", flags);
+               force_sig(SIGKILL, current);
+               return -EINVAL;
+       }
+       return 0;
+}
+
+int add_safesetid_whitelist_entry(kuid_t parent, kuid_t child)
+{
+       struct entry *new;
+
+       /* Return if entry already exists */
+       if (check_setuid_policy_hashtable_key_value(parent, child))
+               return 0;
+
+       new = kzalloc(sizeof(struct entry), GFP_KERNEL);
+       if (!new)
+               return -ENOMEM;
+       new->parent_kuid = __kuid_val(parent);
+       new->child_kuid = __kuid_val(child);
+       spin_lock(&safesetid_whitelist_hashtable_spinlock);
+       hash_add_rcu(safesetid_whitelist_hashtable,
+                    &new->next,
+                    __kuid_val(parent));
+       spin_unlock(&safesetid_whitelist_hashtable_spinlock);
+       return 0;
+}
+
+void flush_safesetid_whitelist_entries(void)
+{
+       struct entry *entry;
+       struct hlist_node *hlist_node;
+       unsigned int bkt_loop_cursor;
+       HLIST_HEAD(free_list);
+
+       /*
+        * Could probably use hash_for_each_rcu here instead, but this should
+        * be fine as well.
+        */
+       spin_lock(&safesetid_whitelist_hashtable_spinlock);
+       hash_for_each_safe(safesetid_whitelist_hashtable, bkt_loop_cursor,
+                          hlist_node, entry, next) {
+               hash_del_rcu(&entry->next);
+               hlist_add_head(&entry->dlist, &free_list);
+       }
+       spin_unlock(&safesetid_whitelist_hashtable_spinlock);
+       synchronize_rcu();
+       hlist_for_each_entry_safe(entry, hlist_node, &free_list, dlist) {
+               hlist_del(&entry->dlist);
+               kfree(entry);
+       }
+}
+
+static struct security_hook_list safesetid_security_hooks[] = {
+       LSM_HOOK_INIT(task_fix_setuid, safesetid_task_fix_setuid),
+       LSM_HOOK_INIT(capable, safesetid_security_capable)
+};
+
+static int __init safesetid_security_init(void)
+{
+       security_add_hooks(safesetid_security_hooks,
+                          ARRAY_SIZE(safesetid_security_hooks), "safesetid");
+
+       /* Report that SafeSetID successfully initialized */
+       safesetid_initialized = 1;
+
+       return 0;
+}
+
+DEFINE_LSM(safesetid_security_init) = {
+       .init = safesetid_security_init,
+       .name = "safesetid",
+};
diff --git a/security/safesetid/lsm.h b/security/safesetid/lsm.h
new file mode 100644 (file)
index 0000000..c1ea3c2
--- /dev/null
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * SafeSetID Linux Security Module
+ *
+ * Author: Micah Morton <mortonm@chromium.org>
+ *
+ * Copyright (C) 2018 The Chromium OS Authors.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2, as
+ * published by the Free Software Foundation.
+ *
+ */
+#ifndef _SAFESETID_H
+#define _SAFESETID_H
+
+#include <linux/types.h>
+
+/* Flag indicating whether initialization completed */
+extern int safesetid_initialized;
+
+/* Type of write operation a whitelist securityfs file performs. */
+enum safesetid_whitelist_file_write_type {
+       SAFESETID_WHITELIST_ADD, /* Add whitelist policy. */
+       SAFESETID_WHITELIST_FLUSH, /* Flush whitelist policies. */
+};
+
+/* Add entry to safesetid whitelist to allow 'parent' to setid to 'child'. */
+int add_safesetid_whitelist_entry(kuid_t parent, kuid_t child);
+
+void flush_safesetid_whitelist_entries(void);
+
+#endif /* _SAFESETID_H */
diff --git a/security/safesetid/securityfs.c b/security/safesetid/securityfs.c
new file mode 100644 (file)
index 0000000..2c6c829
--- /dev/null
@@ -0,0 +1,193 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * SafeSetID Linux Security Module
+ *
+ * Author: Micah Morton <mortonm@chromium.org>
+ *
+ * Copyright (C) 2018 The Chromium OS Authors.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2, as
+ * published by the Free Software Foundation.
+ *
+ */
+#include <linux/security.h>
+#include <linux/cred.h>
+
+#include "lsm.h"
+
+static struct dentry *safesetid_policy_dir;
+
+struct safesetid_file_entry {
+       const char *name;
+       enum safesetid_whitelist_file_write_type type;
+       struct dentry *dentry;
+};
+
+static struct safesetid_file_entry safesetid_files[] = {
+       {.name = "add_whitelist_policy",
+        .type = SAFESETID_WHITELIST_ADD},
+       {.name = "flush_whitelist_policies",
+        .type = SAFESETID_WHITELIST_FLUSH},
+};
+
+/*
+ * In the case where the input buffer contains one or more invalid UIDs, the
+ * kuid_t variables pointed to by 'parent' and 'child' may still be updated,
+ * but this function will return an error.
+ */
+static int parse_safesetid_whitelist_policy(const char __user *buf,
+                                           size_t len,
+                                           kuid_t *parent,
+                                           kuid_t *child)
+{
+       char *kern_buf;
+       char *parent_buf;
+       char *child_buf;
+       const char separator[] = ":";
+       int ret;
+       size_t first_substring_length;
+       long parsed_parent;
+       long parsed_child;
+
+       /* Duplicate string from user memory and NULL-terminate */
+       kern_buf = memdup_user_nul(buf, len);
+       if (IS_ERR(kern_buf))
+               return PTR_ERR(kern_buf);
+
+       /*
+        * Format of |buf| string should be <UID>:<UID>.
+        * Find location of ":" in kern_buf (copied from |buf|).
+        */
+       first_substring_length = strcspn(kern_buf, separator);
+       if (first_substring_length == 0 || first_substring_length == len) {
+               ret = -EINVAL;
+               goto free_kern;
+       }
+
+       parent_buf = kmemdup_nul(kern_buf, first_substring_length, GFP_KERNEL);
+       if (!parent_buf) {
+               ret = -ENOMEM;
+               goto free_kern;
+       }
+
+       ret = kstrtol(parent_buf, 0, &parsed_parent);
+       if (ret)
+               goto free_both;
+
+       child_buf = kern_buf + first_substring_length + 1;
+       ret = kstrtol(child_buf, 0, &parsed_child);
+       if (ret)
+               goto free_both;
+
+       *parent = make_kuid(current_user_ns(), parsed_parent);
+       if (!uid_valid(*parent)) {
+               ret = -EINVAL;
+               goto free_both;
+       }
+
+       *child = make_kuid(current_user_ns(), parsed_child);
+       if (!uid_valid(*child)) {
+               ret = -EINVAL;
+               goto free_both;
+       }
+
+free_both:
+       kfree(parent_buf);
+free_kern:
+       kfree(kern_buf);
+       return ret;
+}
+
+static ssize_t safesetid_file_write(struct file *file,
+                                   const char __user *buf,
+                                   size_t len,
+                                   loff_t *ppos)
+{
+       struct safesetid_file_entry *file_entry =
+               file->f_inode->i_private;
+       kuid_t parent;
+       kuid_t child;
+       int ret;
+
+       if (!ns_capable(current_user_ns(), CAP_MAC_ADMIN))
+               return -EPERM;
+
+       if (*ppos != 0)
+               return -EINVAL;
+
+       switch (file_entry->type) {
+       case SAFESETID_WHITELIST_FLUSH:
+               flush_safesetid_whitelist_entries();
+               break;
+       case SAFESETID_WHITELIST_ADD:
+               ret = parse_safesetid_whitelist_policy(buf, len, &parent,
+                                                                &child);
+               if (ret)
+                       return ret;
+
+               ret = add_safesetid_whitelist_entry(parent, child);
+               if (ret)
+                       return ret;
+               break;
+       default:
+               pr_warn("Unknown securityfs file %d\n", file_entry->type);
+               break;
+       }
+
+       /* Return len on success so caller won't keep trying to write */
+       return len;
+}
+
+static const struct file_operations safesetid_file_fops = {
+       .write = safesetid_file_write,
+};
+
+static void safesetid_shutdown_securityfs(void)
+{
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(safesetid_files); ++i) {
+               struct safesetid_file_entry *entry =
+                       &safesetid_files[i];
+               securityfs_remove(entry->dentry);
+               entry->dentry = NULL;
+       }
+
+       securityfs_remove(safesetid_policy_dir);
+       safesetid_policy_dir = NULL;
+}
+
+static int __init safesetid_init_securityfs(void)
+{
+       int i;
+       int ret;
+
+       if (!safesetid_initialized)
+               return 0;
+
+       safesetid_policy_dir = securityfs_create_dir("safesetid", NULL);
+       if (IS_ERR(safesetid_policy_dir)) {
+               ret = PTR_ERR(safesetid_policy_dir);
+               goto error;
+       }
+
+       for (i = 0; i < ARRAY_SIZE(safesetid_files); ++i) {
+               struct safesetid_file_entry *entry =
+                       &safesetid_files[i];
+               entry->dentry = securityfs_create_file(
+                       entry->name, 0200, safesetid_policy_dir,
+                       entry, &safesetid_file_fops);
+               if (IS_ERR(entry->dentry)) {
+                       ret = PTR_ERR(entry->dentry);
+                       goto error;
+               }
+       }
+
+       return 0;
+
+error:
+       safesetid_shutdown_securityfs();
+       return ret;
+}
+fs_initcall(safesetid_init_securityfs);
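
For illustration only, not part of this patch: a minimal userspace sketch of how an administrator could load a whitelist policy through the securityfs interface created above. It assumes securityfs is mounted at /sys/kernel/security, the UID values 123 and 456 are made up, and the writing process must hold CAP_MAC_ADMIN; the file expects the "<UID>:<UID>" format handled by parse_safesetid_whitelist_policy().

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Path assumes the usual securityfs mount point. */
	const char *path =
		"/sys/kernel/security/safesetid/add_whitelist_policy";
	/* Hypothetical policy: allow UID 123 to transition to UID 456. */
	const char *policy = "123:456";
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* The kernel returns len on success, so a single write suffices. */
	if (write(fd, policy, strlen(policy)) < 0) {
		perror("write");
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}

Writing anything to flush_whitelist_policies in the same directory drops every stored entry via flush_safesetid_whitelist_entries().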
index 55bc49027ba97c13c06b4c1076656cef349cee3c..301b141b9a32b6380df7cae74281910b50cae0f4 100644 (file)
 #include <linux/personality.h>
 #include <linux/backing-dev.h>
 #include <linux/string.h>
+#include <linux/msg.h>
 #include <net/flow.h>
 
 #define MAX_LSM_EVM_XATTR      2
 
-/* Maximum number of letters for an LSM name string */
-#define SECURITY_NAME_MAX      10
+/* How many LSMs were built into the kernel? */
+#define LSM_COUNT (__end_lsm_info - __start_lsm_info)
 
 struct security_hook_heads security_hook_heads __lsm_ro_after_init;
 static ATOMIC_NOTIFIER_HEAD(lsm_notifier_chain);
 
+static struct kmem_cache *lsm_file_cache;
+static struct kmem_cache *lsm_inode_cache;
+
 char *lsm_names;
+static struct lsm_blob_sizes blob_sizes __lsm_ro_after_init;
+
 /* Boot-time LSM user choice */
-static __initdata char chosen_lsm[SECURITY_NAME_MAX + 1] =
-       CONFIG_DEFAULT_SECURITY;
+static __initdata const char *chosen_lsm_order;
+static __initdata const char *chosen_major_lsm;
+
+static __initconst const char * const builtin_lsm_order = CONFIG_LSM;
+
+/* Ordered list of LSMs to initialize. */
+static __initdata struct lsm_info **ordered_lsms;
+static __initdata struct lsm_info *exclusive;
 
 static __initdata bool debug;
 #define init_debug(...)                                                \
@@ -52,18 +64,269 @@ static __initdata bool debug;
                        pr_info(__VA_ARGS__);                   \
        } while (0)
 
-static void __init major_lsm_init(void)
+static bool __init is_enabled(struct lsm_info *lsm)
 {
-       struct lsm_info *lsm;
-       int ret;
+       if (!lsm->enabled)
+               return false;
+
+       return *lsm->enabled;
+}
+
+/* Mark an LSM's enabled flag. */
+static int lsm_enabled_true __initdata = 1;
+static int lsm_enabled_false __initdata = 0;
+static void __init set_enabled(struct lsm_info *lsm, bool enabled)
+{
+       /*
+        * When an LSM hasn't configured an enable variable, we can use
+        * a hard-coded location for storing the default enabled state.
+        */
+       if (!lsm->enabled) {
+               if (enabled)
+                       lsm->enabled = &lsm_enabled_true;
+               else
+                       lsm->enabled = &lsm_enabled_false;
+       } else if (lsm->enabled == &lsm_enabled_true) {
+               if (!enabled)
+                       lsm->enabled = &lsm_enabled_false;
+       } else if (lsm->enabled == &lsm_enabled_false) {
+               if (enabled)
+                       lsm->enabled = &lsm_enabled_true;
+       } else {
+               *lsm->enabled = enabled;
+       }
+}
+
+/* Is an LSM already listed in the ordered LSMs list? */
+static bool __init exists_ordered_lsm(struct lsm_info *lsm)
+{
+       struct lsm_info **check;
+
+       for (check = ordered_lsms; *check; check++)
+               if (*check == lsm)
+                       return true;
+
+       return false;
+}
+
+/* Append an LSM to the list of ordered LSMs to initialize. */
+static int last_lsm __initdata;
+static void __init append_ordered_lsm(struct lsm_info *lsm, const char *from)
+{
+       /* Ignore duplicate selections. */
+       if (exists_ordered_lsm(lsm))
+               return;
+
+       if (WARN(last_lsm == LSM_COUNT, "%s: out of LSM slots!?\n", from))
+               return;
+
+       /* Enable this LSM, if it is not already set. */
+       if (!lsm->enabled)
+               lsm->enabled = &lsm_enabled_true;
+       ordered_lsms[last_lsm++] = lsm;
+
+       init_debug("%s ordering: %s (%sabled)\n", from, lsm->name,
+                  is_enabled(lsm) ? "en" : "dis");
+}
+
+/* Is an LSM allowed to be initialized? */
+static bool __init lsm_allowed(struct lsm_info *lsm)
+{
+       /* Skip if the LSM is disabled. */
+       if (!is_enabled(lsm))
+               return false;
+
+       /* Not allowed if another exclusive LSM already initialized. */
+       if ((lsm->flags & LSM_FLAG_EXCLUSIVE) && exclusive) {
+               init_debug("exclusive disabled: %s\n", lsm->name);
+               return false;
+       }
+
+       return true;
+}
+
+static void __init lsm_set_blob_size(int *need, int *lbs)
+{
+       int offset;
+
+       if (*need > 0) {
+               offset = *lbs;
+               *lbs += *need;
+               *need = offset;
+       }
+}
+
+static void __init lsm_set_blob_sizes(struct lsm_blob_sizes *needed)
+{
+       if (!needed)
+               return;
+
+       lsm_set_blob_size(&needed->lbs_cred, &blob_sizes.lbs_cred);
+       lsm_set_blob_size(&needed->lbs_file, &blob_sizes.lbs_file);
+       /*
+        * The inode blob gets an rcu_head in addition to
+        * what the modules might need.
+        */
+       if (needed->lbs_inode && blob_sizes.lbs_inode == 0)
+               blob_sizes.lbs_inode = sizeof(struct rcu_head);
+       lsm_set_blob_size(&needed->lbs_inode, &blob_sizes.lbs_inode);
+       lsm_set_blob_size(&needed->lbs_ipc, &blob_sizes.lbs_ipc);
+       lsm_set_blob_size(&needed->lbs_msg_msg, &blob_sizes.lbs_msg_msg);
+       lsm_set_blob_size(&needed->lbs_task, &blob_sizes.lbs_task);
+}
+
+/* Prepare LSM for initialization. */
+static void __init prepare_lsm(struct lsm_info *lsm)
+{
+       int enabled = lsm_allowed(lsm);
+
+       /* Record enablement (to handle any following exclusive LSMs). */
+       set_enabled(lsm, enabled);
+
+       /* If enabled, do pre-initialization work. */
+       if (enabled) {
+               if ((lsm->flags & LSM_FLAG_EXCLUSIVE) && !exclusive) {
+                       exclusive = lsm;
+                       init_debug("exclusive chosen: %s\n", lsm->name);
+               }
+
+               lsm_set_blob_sizes(lsm->blobs);
+       }
+}
+
+/* Initialize a given LSM, if it is enabled. */
+static void __init initialize_lsm(struct lsm_info *lsm)
+{
+       if (is_enabled(lsm)) {
+               int ret;
 
-       for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
                init_debug("initializing %s\n", lsm->name);
                ret = lsm->init();
                WARN(ret, "%s failed to initialize: %d\n", lsm->name, ret);
        }
 }
 
+/* Populate ordered LSMs list from comma-separated LSM name list. */
+static void __init ordered_lsm_parse(const char *order, const char *origin)
+{
+       struct lsm_info *lsm;
+       char *sep, *name, *next;
+
+       /* LSM_ORDER_FIRST is always first. */
+       for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
+               if (lsm->order == LSM_ORDER_FIRST)
+                       append_ordered_lsm(lsm, "first");
+       }
+
+       /* Process "security=", if given. */
+       if (chosen_major_lsm) {
+               struct lsm_info *major;
+
+               /*
+                * To match the original "security=" behavior, this
+                * explicitly does NOT fall back to another Legacy Major
+                * if the selected one was separately disabled: disable
+                * all non-matching Legacy Major LSMs.
+                */
+               for (major = __start_lsm_info; major < __end_lsm_info;
+                    major++) {
+                       if ((major->flags & LSM_FLAG_LEGACY_MAJOR) &&
+                           strcmp(major->name, chosen_major_lsm) != 0) {
+                               set_enabled(major, false);
+                               init_debug("security=%s disabled: %s\n",
+                                          chosen_major_lsm, major->name);
+                       }
+               }
+       }
+
+       sep = kstrdup(order, GFP_KERNEL);
+       next = sep;
+       /* Walk the list, looking for matching LSMs. */
+       while ((name = strsep(&next, ",")) != NULL) {
+               bool found = false;
+
+               for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
+                       if (lsm->order == LSM_ORDER_MUTABLE &&
+                           strcmp(lsm->name, name) == 0) {
+                               append_ordered_lsm(lsm, origin);
+                               found = true;
+                       }
+               }
+
+               if (!found)
+                       init_debug("%s ignored: %s\n", origin, name);
+       }
+
+       /* Append the LSM chosen via "security=" if it is not already listed. */
+       if (chosen_major_lsm) {
+               for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
+                       if (exists_ordered_lsm(lsm))
+                               continue;
+                       if (strcmp(lsm->name, chosen_major_lsm) == 0)
+                               append_ordered_lsm(lsm, "security=");
+               }
+       }
+
+       /* Disable all LSMs not in the ordered list. */
+       for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
+               if (exists_ordered_lsm(lsm))
+                       continue;
+               set_enabled(lsm, false);
+               init_debug("%s disabled: %s\n", origin, lsm->name);
+       }
+
+       kfree(sep);
+}
+
+static void __init lsm_early_cred(struct cred *cred);
+static void __init lsm_early_task(struct task_struct *task);
+
+static void __init ordered_lsm_init(void)
+{
+       struct lsm_info **lsm;
+
+       ordered_lsms = kcalloc(LSM_COUNT + 1, sizeof(*ordered_lsms),
+                               GFP_KERNEL);
+
+       if (chosen_lsm_order) {
+               if (chosen_major_lsm) {
+                       pr_info("security= is ignored because it is superseded by lsm=\n");
+                       chosen_major_lsm = NULL;
+               }
+               ordered_lsm_parse(chosen_lsm_order, "cmdline");
+       } else
+               ordered_lsm_parse(builtin_lsm_order, "builtin");
+
+       for (lsm = ordered_lsms; *lsm; lsm++)
+               prepare_lsm(*lsm);
+
+       init_debug("cred blob size     = %d\n", blob_sizes.lbs_cred);
+       init_debug("file blob size     = %d\n", blob_sizes.lbs_file);
+       init_debug("inode blob size    = %d\n", blob_sizes.lbs_inode);
+       init_debug("ipc blob size      = %d\n", blob_sizes.lbs_ipc);
+       init_debug("msg_msg blob size  = %d\n", blob_sizes.lbs_msg_msg);
+       init_debug("task blob size     = %d\n", blob_sizes.lbs_task);
+
+       /*
+        * Create any kmem_caches needed for blobs
+        */
+       if (blob_sizes.lbs_file)
+               lsm_file_cache = kmem_cache_create("lsm_file_cache",
+                                                  blob_sizes.lbs_file, 0,
+                                                  SLAB_PANIC, NULL);
+       if (blob_sizes.lbs_inode)
+               lsm_inode_cache = kmem_cache_create("lsm_inode_cache",
+                                                   blob_sizes.lbs_inode, 0,
+                                                   SLAB_PANIC, NULL);
+
+       lsm_early_cred((struct cred *) current->cred);
+       lsm_early_task(current);
+       for (lsm = ordered_lsms; *lsm; lsm++)
+               initialize_lsm(*lsm);
+
+       kfree(ordered_lsms);
+}
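
As an illustration of the blob bookkeeping above (a hedged sketch, not code from this series): a hypothetical minor LSM called "demo" would declare how many bytes of per-cred state it needs, and, after lsm_set_blob_size() rewrites that request into an offset, locate its slot inside the shared blob allocated by lsm_cred_alloc(). The "demo" name, struct demo_cred and the accessor are invented for this example; only the lsm_blob_sizes/DEFINE_LSM plumbing comes from the patch.

#include <linux/cred.h>
#include <linux/lsm_hooks.h>

struct demo_cred {
	u32 tainted;	/* example per-cred state */
};

/*
 * Before ordered_lsm_init() runs, lbs_cred holds the size this module
 * asks for; prepare_lsm() -> lsm_set_blob_size() rewrites it into the
 * module's byte offset within the shared cred blob.
 */
static struct lsm_blob_sizes demo_blob_sizes __lsm_ro_after_init = {
	.lbs_cred = sizeof(struct demo_cred),
};

static inline struct demo_cred *demo_cred(const struct cred *cred)
{
	return cred->security + demo_blob_sizes.lbs_cred;
}

static int demo_cred_prepare(struct cred *new, const struct cred *old,
			     gfp_t gfp)
{
	/* The framework has already allocated and zeroed the blob. */
	demo_cred(new)->tainted = demo_cred(old)->tainted;
	return 0;
}

static struct security_hook_list demo_hooks[] = {
	LSM_HOOK_INIT(cred_prepare, demo_cred_prepare),
};

static int __init demo_init(void)
{
	security_add_hooks(demo_hooks, ARRAY_SIZE(demo_hooks), "demo");
	return 0;
}

DEFINE_LSM(demo) = {
	.name = "demo",
	.init = demo_init,
	.blobs = &demo_blob_sizes,
};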
+
 /**
  * security_init - initializes the security framework
  *
@@ -80,28 +343,27 @@ int __init security_init(void)
             i++)
                INIT_HLIST_HEAD(&list[i]);
 
-       /*
-        * Load minor LSMs, with the capability module always first.
-        */
-       capability_add_hooks();
-       yama_add_hooks();
-       loadpin_add_hooks();
-
-       /*
-        * Load all the remaining security modules.
-        */
-       major_lsm_init();
+       /* Load LSMs in specified order. */
+       ordered_lsm_init();
 
        return 0;
 }
 
 /* Save user chosen LSM */
-static int __init choose_lsm(char *str)
+static int __init choose_major_lsm(char *str)
+{
+       chosen_major_lsm = str;
+       return 1;
+}
+__setup("security=", choose_major_lsm);
+
+/* Explicitly choose LSM initialization order. */
+static int __init choose_lsm_order(char *str)
 {
-       strncpy(chosen_lsm, str, SECURITY_NAME_MAX);
+       chosen_lsm_order = str;
        return 1;
 }
-__setup("security=", choose_lsm);
+__setup("lsm=", choose_lsm_order);
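
For illustration (module names assumed to be built into the kernel): booting with

    lsm=loadpin,yama,safesetid,selinux

initializes exactly those LSMs in that order; ordered_lsm_parse() disables any built-in LSM left off the list, and a conflicting security= option is ignored with the pr_info() logged by ordered_lsm_init() above.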
 
 /* Enable LSM order debugging. */
 static int __init enable_debug(char *str)
@@ -147,29 +409,6 @@ static int lsm_append(char *new, char **result)
        return 0;
 }
 
-/**
- * security_module_enable - Load given security module on boot ?
- * @module: the name of the module
- *
- * Each LSM must pass this method before registering its own operations
- * to avoid security registration races. This method may also be used
- * to check if your LSM is currently loaded during kernel initialization.
- *
- * Returns:
- *
- * true if:
- *
- * - The passed LSM is the one chosen by user at boot time,
- * - or the passed LSM is configured as the default and the user did not
- *   choose an alternate LSM at boot time.
- *
- * Otherwise, return false.
- */
-int __init security_module_enable(const char *module)
-{
-       return !strcmp(module, chosen_lsm);
-}
-
 /**
  * security_add_hooks - Add a module's hooks to the hook lists.
  * @hooks: the hooks to add
@@ -209,6 +448,161 @@ int unregister_lsm_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL(unregister_lsm_notifier);
 
+/**
+ * lsm_cred_alloc - allocate a composite cred blob
+ * @cred: the cred that needs a blob
+ * @gfp: allocation type
+ *
+ * Allocate the cred blob for all the modules
+ *
+ * Returns 0, or -ENOMEM if memory can't be allocated.
+ */
+static int lsm_cred_alloc(struct cred *cred, gfp_t gfp)
+{
+       if (blob_sizes.lbs_cred == 0) {
+               cred->security = NULL;
+               return 0;
+       }
+
+       cred->security = kzalloc(blob_sizes.lbs_cred, gfp);
+       if (cred->security == NULL)
+               return -ENOMEM;
+       return 0;
+}
+
+/**
+ * lsm_early_cred - during initialization allocate a composite cred blob
+ * @cred: the cred that needs a blob
+ *
+ * Allocate the cred blob for all the modules
+ */
+static void __init lsm_early_cred(struct cred *cred)
+{
+       int rc = lsm_cred_alloc(cred, GFP_KERNEL);
+
+       if (rc)
+               panic("%s: Early cred alloc failed.\n", __func__);
+}
+
+/**
+ * lsm_file_alloc - allocate a composite file blob
+ * @file: the file that needs a blob
+ *
+ * Allocate the file blob for all the modules
+ *
+ * Returns 0, or -ENOMEM if memory can't be allocated.
+ */
+static int lsm_file_alloc(struct file *file)
+{
+       if (!lsm_file_cache) {
+               file->f_security = NULL;
+               return 0;
+       }
+
+       file->f_security = kmem_cache_zalloc(lsm_file_cache, GFP_KERNEL);
+       if (file->f_security == NULL)
+               return -ENOMEM;
+       return 0;
+}
+
+/**
+ * lsm_inode_alloc - allocate a composite inode blob
+ * @inode: the inode that needs a blob
+ *
+ * Allocate the inode blob for all the modules
+ *
+ * Returns 0, or -ENOMEM if memory can't be allocated.
+ */
+int lsm_inode_alloc(struct inode *inode)
+{
+       if (!lsm_inode_cache) {
+               inode->i_security = NULL;
+               return 0;
+       }
+
+       inode->i_security = kmem_cache_zalloc(lsm_inode_cache, GFP_NOFS);
+       if (inode->i_security == NULL)
+               return -ENOMEM;
+       return 0;
+}
+
+/**
+ * lsm_task_alloc - allocate a composite task blob
+ * @task: the task that needs a blob
+ *
+ * Allocate the task blob for all the modules
+ *
+ * Returns 0, or -ENOMEM if memory can't be allocated.
+ */
+static int lsm_task_alloc(struct task_struct *task)
+{
+       if (blob_sizes.lbs_task == 0) {
+               task->security = NULL;
+               return 0;
+       }
+
+       task->security = kzalloc(blob_sizes.lbs_task, GFP_KERNEL);
+       if (task->security == NULL)
+               return -ENOMEM;
+       return 0;
+}
+
+/**
+ * lsm_ipc_alloc - allocate a composite ipc blob
+ * @kip: the ipc that needs a blob
+ *
+ * Allocate the ipc blob for all the modules
+ *
+ * Returns 0, or -ENOMEM if memory can't be allocated.
+ */
+static int lsm_ipc_alloc(struct kern_ipc_perm *kip)
+{
+       if (blob_sizes.lbs_ipc == 0) {
+               kip->security = NULL;
+               return 0;
+       }
+
+       kip->security = kzalloc(blob_sizes.lbs_ipc, GFP_KERNEL);
+       if (kip->security == NULL)
+               return -ENOMEM;
+       return 0;
+}
+
+/**
+ * lsm_msg_msg_alloc - allocate a composite msg_msg blob
+ * @mp: the msg_msg that needs a blob
+ *
+ * Allocate the msg_msg blob for all the modules
+ *
+ * Returns 0, or -ENOMEM if memory can't be allocated.
+ */
+static int lsm_msg_msg_alloc(struct msg_msg *mp)
+{
+       if (blob_sizes.lbs_msg_msg == 0) {
+               mp->security = NULL;
+               return 0;
+       }
+
+       mp->security = kzalloc(blob_sizes.lbs_msg_msg, GFP_KERNEL);
+       if (mp->security == NULL)
+               return -ENOMEM;
+       return 0;
+}
+
+/**
+ * lsm_early_task - during initialization allocate a composite task blob
+ * @task: the task that needs a blob
+ *
+ * Allocate the task blob for all the modules
+ */
+static void __init lsm_early_task(struct task_struct *task)
+{
+       int rc = lsm_task_alloc(task);
+
+       if (rc)
+               panic("%s: Early task alloc failed.\n", __func__);
+}
+
 /*
  * Hook list operation macros.
  *
@@ -294,16 +688,12 @@ int security_capset(struct cred *new, const struct cred *old,
                                effective, inheritable, permitted);
 }
 
-int security_capable(const struct cred *cred, struct user_namespace *ns,
-                    int cap)
+int security_capable(const struct cred *cred,
+                    struct user_namespace *ns,
+                    int cap,
+                    unsigned int opts)
 {
-       return call_int_hook(capable, 0, cred, ns, cap, SECURITY_CAP_AUDIT);
-}
-
-int security_capable_noaudit(const struct cred *cred, struct user_namespace *ns,
-                            int cap)
-{
-       return call_int_hook(capable, 0, cred, ns, cap, SECURITY_CAP_NOAUDIT);
+       return call_int_hook(capable, 0, cred, ns, cap, opts);
 }
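
A hypothetical caller-side sketch (not from this diff) of the single entry point that replaces security_capable_noaudit(): audit behaviour is now selected by the opts argument, with CAP_OPT_NOAUDIT consumed the same way cred_has_capability() handles it later in this series.

#include <linux/capability.h>
#include <linux/cred.h>
#include <linux/security.h>
#include <linux/user_namespace.h>

/* Invented helper: probe CAP_SYS_ADMIN without generating an audit record. */
static bool may_admin_quietly(const struct cred *cred)
{
	return security_capable(cred, &init_user_ns, CAP_SYS_ADMIN,
				CAP_OPT_NOAUDIT) == 0;
}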
 
 int security_quotactl(int cmds, int type, int id, struct super_block *sb)
@@ -468,14 +858,40 @@ EXPORT_SYMBOL(security_add_mnt_opt);
 
 int security_inode_alloc(struct inode *inode)
 {
-       inode->i_security = NULL;
-       return call_int_hook(inode_alloc_security, 0, inode);
+       int rc = lsm_inode_alloc(inode);
+
+       if (unlikely(rc))
+               return rc;
+       rc = call_int_hook(inode_alloc_security, 0, inode);
+       if (unlikely(rc))
+               security_inode_free(inode);
+       return rc;
+}
+
+static void inode_free_by_rcu(struct rcu_head *head)
+{
+       /*
+        * The rcu head is at the start of the inode blob
+        */
+       kmem_cache_free(lsm_inode_cache, head);
 }
 
 void security_inode_free(struct inode *inode)
 {
        integrity_inode_free(inode);
        call_void_hook(inode_free_security, inode);
+       /*
+        * The inode may still be referenced in a path walk and
+        * a call to security_inode_permission() can be made
+        * after inode_free_security() is called. Ideally, the VFS
+        * wouldn't do this, but fixing that is a much harder
+        * job. For now, simply free the i_security via RCU, and
+        * leave the current inode->i_security pointer intact.
+        * The inode will be freed after the RCU grace period too.
+        */
+       if (inode->i_security)
+               call_rcu((struct rcu_head *)inode->i_security,
+                               inode_free_by_rcu);
 }
 
 int security_dentry_init_security(struct dentry *dentry, int mode,
@@ -905,12 +1321,27 @@ int security_file_permission(struct file *file, int mask)
 
 int security_file_alloc(struct file *file)
 {
-       return call_int_hook(file_alloc_security, 0, file);
+       int rc = lsm_file_alloc(file);
+
+       if (rc)
+               return rc;
+       rc = call_int_hook(file_alloc_security, 0, file);
+       if (unlikely(rc))
+               security_file_free(file);
+       return rc;
 }
 
 void security_file_free(struct file *file)
 {
+       void *blob;
+
        call_void_hook(file_free_security, file);
+
+       blob = file->f_security;
+       if (blob) {
+               file->f_security = NULL;
+               kmem_cache_free(lsm_file_cache, blob);
+       }
 }
 
 int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
@@ -1012,17 +1443,35 @@ int security_file_open(struct file *file)
 
 int security_task_alloc(struct task_struct *task, unsigned long clone_flags)
 {
-       return call_int_hook(task_alloc, 0, task, clone_flags);
+       int rc = lsm_task_alloc(task);
+
+       if (rc)
+               return rc;
+       rc = call_int_hook(task_alloc, 0, task, clone_flags);
+       if (unlikely(rc))
+               security_task_free(task);
+       return rc;
 }
 
 void security_task_free(struct task_struct *task)
 {
        call_void_hook(task_free, task);
+
+       kfree(task->security);
+       task->security = NULL;
 }
 
 int security_cred_alloc_blank(struct cred *cred, gfp_t gfp)
 {
-       return call_int_hook(cred_alloc_blank, 0, cred, gfp);
+       int rc = lsm_cred_alloc(cred, gfp);
+
+       if (rc)
+               return rc;
+
+       rc = call_int_hook(cred_alloc_blank, 0, cred, gfp);
+       if (unlikely(rc))
+               security_cred_free(cred);
+       return rc;
 }
 
 void security_cred_free(struct cred *cred)
@@ -1035,11 +1484,22 @@ void security_cred_free(struct cred *cred)
                return;
 
        call_void_hook(cred_free, cred);
+
+       kfree(cred->security);
+       cred->security = NULL;
 }
 
 int security_prepare_creds(struct cred *new, const struct cred *old, gfp_t gfp)
 {
-       return call_int_hook(cred_prepare, 0, new, old, gfp);
+       int rc = lsm_cred_alloc(new, gfp);
+
+       if (rc)
+               return rc;
+
+       rc = call_int_hook(cred_prepare, 0, new, old, gfp);
+       if (unlikely(rc))
+               security_cred_free(new);
+       return rc;
 }
 
 void security_transfer_creds(struct cred *new, const struct cred *old)
@@ -1220,22 +1680,40 @@ void security_ipc_getsecid(struct kern_ipc_perm *ipcp, u32 *secid)
 
 int security_msg_msg_alloc(struct msg_msg *msg)
 {
-       return call_int_hook(msg_msg_alloc_security, 0, msg);
+       int rc = lsm_msg_msg_alloc(msg);
+
+       if (unlikely(rc))
+               return rc;
+       rc = call_int_hook(msg_msg_alloc_security, 0, msg);
+       if (unlikely(rc))
+               security_msg_msg_free(msg);
+       return rc;
 }
 
 void security_msg_msg_free(struct msg_msg *msg)
 {
        call_void_hook(msg_msg_free_security, msg);
+       kfree(msg->security);
+       msg->security = NULL;
 }
 
 int security_msg_queue_alloc(struct kern_ipc_perm *msq)
 {
-       return call_int_hook(msg_queue_alloc_security, 0, msq);
+       int rc = lsm_ipc_alloc(msq);
+
+       if (unlikely(rc))
+               return rc;
+       rc = call_int_hook(msg_queue_alloc_security, 0, msq);
+       if (unlikely(rc))
+               security_msg_queue_free(msq);
+       return rc;
 }
 
 void security_msg_queue_free(struct kern_ipc_perm *msq)
 {
        call_void_hook(msg_queue_free_security, msq);
+       kfree(msq->security);
+       msq->security = NULL;
 }
 
 int security_msg_queue_associate(struct kern_ipc_perm *msq, int msqflg)
@@ -1262,12 +1740,21 @@ int security_msg_queue_msgrcv(struct kern_ipc_perm *msq, struct msg_msg *msg,
 
 int security_shm_alloc(struct kern_ipc_perm *shp)
 {
-       return call_int_hook(shm_alloc_security, 0, shp);
+       int rc = lsm_ipc_alloc(shp);
+
+       if (unlikely(rc))
+               return rc;
+       rc = call_int_hook(shm_alloc_security, 0, shp);
+       if (unlikely(rc))
+               security_shm_free(shp);
+       return rc;
 }
 
 void security_shm_free(struct kern_ipc_perm *shp)
 {
        call_void_hook(shm_free_security, shp);
+       kfree(shp->security);
+       shp->security = NULL;
 }
 
 int security_shm_associate(struct kern_ipc_perm *shp, int shmflg)
@@ -1287,12 +1774,21 @@ int security_shm_shmat(struct kern_ipc_perm *shp, char __user *shmaddr, int shmf
 
 int security_sem_alloc(struct kern_ipc_perm *sma)
 {
-       return call_int_hook(sem_alloc_security, 0, sma);
+       int rc = lsm_ipc_alloc(sma);
+
+       if (unlikely(rc))
+               return rc;
+       rc = call_int_hook(sem_alloc_security, 0, sma);
+       if (unlikely(rc))
+               security_sem_free(sma);
+       return rc;
 }
 
 void security_sem_free(struct kern_ipc_perm *sma)
 {
        call_void_hook(sem_free_security, sma);
+       kfree(sma->security);
+       sma->security = NULL;
 }
 
 int security_sem_associate(struct kern_ipc_perm *sma, int semflg)
@@ -1319,14 +1815,30 @@ void security_d_instantiate(struct dentry *dentry, struct inode *inode)
 }
 EXPORT_SYMBOL(security_d_instantiate);
 
-int security_getprocattr(struct task_struct *p, char *name, char **value)
+int security_getprocattr(struct task_struct *p, const char *lsm, char *name,
+                               char **value)
 {
-       return call_int_hook(getprocattr, -EINVAL, p, name, value);
+       struct security_hook_list *hp;
+
+       hlist_for_each_entry(hp, &security_hook_heads.getprocattr, list) {
+               if (lsm != NULL && strcmp(lsm, hp->lsm))
+                       continue;
+               return hp->hook.getprocattr(p, name, value);
+       }
+       return -EINVAL;
 }
 
-int security_setprocattr(const char *name, void *value, size_t size)
+int security_setprocattr(const char *lsm, const char *name, void *value,
+                        size_t size)
 {
-       return call_int_hook(setprocattr, -EINVAL, name, value, size);
+       struct security_hook_list *hp;
+
+       hlist_for_each_entry(hp, &security_hook_heads.setprocattr, list) {
+               if (lsm != NULL && strcmp(lsm, hp->lsm))
+                       continue;
+               return hp->hook.setprocattr(name, value, size);
+       }
+       return -EINVAL;
 }
 
 int security_netlink_send(struct sock *sk, struct sk_buff *skb)
@@ -1790,11 +2302,9 @@ void security_audit_rule_free(void *lsmrule)
        call_void_hook(audit_rule_free, lsmrule);
 }
 
-int security_audit_rule_match(u32 secid, u32 field, u32 op, void *lsmrule,
-                             struct audit_context *actx)
+int security_audit_rule_match(u32 secid, u32 field, u32 op, void *lsmrule)
 {
-       return call_int_hook(audit_rule_match, 0, secid, field, op, lsmrule,
-                               actx);
+       return call_int_hook(audit_rule_match, 0, secid, field, op, lsmrule);
 }
 #endif /* CONFIG_AUDIT */
 
index 8af7a690eb40a15d06f1cba8b5ee25f60c8cdbf9..55f032f1fc2d84aff37a49b519a30f4d051b2018 100644 (file)
@@ -22,21 +22,6 @@ config SECURITY_SELINUX_BOOTPARAM
 
          If you are unsure how to answer this question, answer N.
 
-config SECURITY_SELINUX_BOOTPARAM_VALUE
-       int "NSA SELinux boot parameter default value"
-       depends on SECURITY_SELINUX_BOOTPARAM
-       range 0 1
-       default 1
-       help
-         This option sets the default value for the kernel parameter
-         'selinux', which allows SELinux to be disabled at boot.  If this
-         option is set to 0 (zero), the SELinux kernel parameter will
-         default to 0, disabling SELinux at bootup.  If this option is
-         set to 1 (one), the SELinux kernel parameter will default to 1,
-         enabling SELinux at bootup.
-
-         If you are unsure how to answer this question, answer 1.
-
 config SECURITY_SELINUX_DISABLE
        bool "NSA SELinux runtime disable"
        depends on SECURITY_SELINUX
index c7161f8792b2dc9312df1b600eee8d4adb571482..ccf95040938419a6980de885f5331380e24a6e9e 100644 (file)
@@ -6,7 +6,7 @@
 obj-$(CONFIG_SECURITY_SELINUX) := selinux.o
 
 selinux-y := avc.o hooks.o selinuxfs.o netlink.o nlmsgtab.o netif.o \
-            netnode.o netport.o ibpkey.o exports.o \
+            netnode.o netport.o ibpkey.o \
             ss/ebitmap.o ss/hashtab.o ss/symtab.o ss/sidtab.o ss/avtab.o \
             ss/policydb.o ss/services.o ss/conditional.o ss/mls.o ss/status.o
 
index 635e5c1e3e48ff90c491b3d7376e95b01669715b..8346a4f7c5d7802289371f2dc786d02133be863b 100644 (file)
@@ -129,75 +129,6 @@ static inline int avc_hash(u32 ssid, u32 tsid, u16 tclass)
        return (ssid ^ (tsid<<2) ^ (tclass<<4)) & (AVC_CACHE_SLOTS - 1);
 }
 
-/**
- * avc_dump_av - Display an access vector in human-readable form.
- * @tclass: target security class
- * @av: access vector
- */
-static void avc_dump_av(struct audit_buffer *ab, u16 tclass, u32 av)
-{
-       const char **perms;
-       int i, perm;
-
-       if (av == 0) {
-               audit_log_format(ab, " null");
-               return;
-       }
-
-       BUG_ON(!tclass || tclass >= ARRAY_SIZE(secclass_map));
-       perms = secclass_map[tclass-1].perms;
-
-       audit_log_format(ab, " {");
-       i = 0;
-       perm = 1;
-       while (i < (sizeof(av) * 8)) {
-               if ((perm & av) && perms[i]) {
-                       audit_log_format(ab, " %s", perms[i]);
-                       av &= ~perm;
-               }
-               i++;
-               perm <<= 1;
-       }
-
-       if (av)
-               audit_log_format(ab, " 0x%x", av);
-
-       audit_log_format(ab, " }");
-}
-
-/**
- * avc_dump_query - Display a SID pair and a class in human-readable form.
- * @ssid: source security identifier
- * @tsid: target security identifier
- * @tclass: target security class
- */
-static void avc_dump_query(struct audit_buffer *ab, struct selinux_state *state,
-                          u32 ssid, u32 tsid, u16 tclass)
-{
-       int rc;
-       char *scontext;
-       u32 scontext_len;
-
-       rc = security_sid_to_context(state, ssid, &scontext, &scontext_len);
-       if (rc)
-               audit_log_format(ab, "ssid=%d", ssid);
-       else {
-               audit_log_format(ab, "scontext=%s", scontext);
-               kfree(scontext);
-       }
-
-       rc = security_sid_to_context(state, tsid, &scontext, &scontext_len);
-       if (rc)
-               audit_log_format(ab, " tsid=%d", tsid);
-       else {
-               audit_log_format(ab, " tcontext=%s", scontext);
-               kfree(scontext);
-       }
-
-       BUG_ON(!tclass || tclass >= ARRAY_SIZE(secclass_map));
-       audit_log_format(ab, " tclass=%s", secclass_map[tclass-1].name);
-}
-
 /**
  * avc_init - Initialize the AVC.
  *
@@ -735,11 +666,36 @@ out:
 static void avc_audit_pre_callback(struct audit_buffer *ab, void *a)
 {
        struct common_audit_data *ad = a;
-       audit_log_format(ab, "avc:  %s ",
-                        ad->selinux_audit_data->denied ? "denied" : "granted");
-       avc_dump_av(ab, ad->selinux_audit_data->tclass,
-                       ad->selinux_audit_data->audited);
-       audit_log_format(ab, " for ");
+       struct selinux_audit_data *sad = ad->selinux_audit_data;
+       u32 av = sad->audited;
+       const char **perms;
+       int i, perm;
+
+       audit_log_format(ab, "avc:  %s ", sad->denied ? "denied" : "granted");
+
+       if (av == 0) {
+               audit_log_format(ab, " null");
+               return;
+       }
+
+       perms = secclass_map[sad->tclass-1].perms;
+
+       audit_log_format(ab, " {");
+       i = 0;
+       perm = 1;
+       while (i < (sizeof(av) * 8)) {
+               if ((perm & av) && perms[i]) {
+                       audit_log_format(ab, " %s", perms[i]);
+                       av &= ~perm;
+               }
+               i++;
+               perm <<= 1;
+       }
+
+       if (av)
+               audit_log_format(ab, " 0x%x", av);
+
+       audit_log_format(ab, " } for ");
 }
 
 /**
@@ -751,14 +707,47 @@ static void avc_audit_pre_callback(struct audit_buffer *ab, void *a)
 static void avc_audit_post_callback(struct audit_buffer *ab, void *a)
 {
        struct common_audit_data *ad = a;
-       audit_log_format(ab, " ");
-       avc_dump_query(ab, ad->selinux_audit_data->state,
-                      ad->selinux_audit_data->ssid,
-                      ad->selinux_audit_data->tsid,
-                      ad->selinux_audit_data->tclass);
-       if (ad->selinux_audit_data->denied) {
-               audit_log_format(ab, " permissive=%u",
-                                ad->selinux_audit_data->result ? 0 : 1);
+       struct selinux_audit_data *sad = ad->selinux_audit_data;
+       char *scontext;
+       u32 scontext_len;
+       int rc;
+
+       rc = security_sid_to_context(sad->state, sad->ssid, &scontext,
+                                    &scontext_len);
+       if (rc)
+               audit_log_format(ab, " ssid=%d", sad->ssid);
+       else {
+               audit_log_format(ab, " scontext=%s", scontext);
+               kfree(scontext);
+       }
+
+       rc = security_sid_to_context(sad->state, sad->tsid, &scontext,
+                                    &scontext_len);
+       if (rc)
+               audit_log_format(ab, " tsid=%d", sad->tsid);
+       else {
+               audit_log_format(ab, " tcontext=%s", scontext);
+               kfree(scontext);
+       }
+
+       audit_log_format(ab, " tclass=%s", secclass_map[sad->tclass-1].name);
+
+       if (sad->denied)
+               audit_log_format(ab, " permissive=%u", sad->result ? 0 : 1);
+
+       /* If the context is invalid, also report the raw context string. */
+       rc = security_sid_to_context_inval(sad->state, sad->ssid, &scontext,
+                                          &scontext_len);
+       if (!rc && scontext) {
+               audit_log_format(ab, " srawcon=%s", scontext);
+               kfree(scontext);
+       }
+
+       rc = security_sid_to_context_inval(sad->state, sad->tsid, &scontext,
+                                          &scontext_len);
+       if (!rc && scontext) {
+               audit_log_format(ab, " trawcon=%s", scontext);
+               kfree(scontext);
        }
 }
 
@@ -772,6 +761,9 @@ noinline int slow_avc_audit(struct selinux_state *state,
        struct common_audit_data stack_data;
        struct selinux_audit_data sad;
 
+       if (WARN_ON(!tclass || tclass >= ARRAY_SIZE(secclass_map)))
+               return -EINVAL;
+
        if (!a) {
                a = &stack_data;
                a->type = LSM_AUDIT_DATA_NONE;
@@ -838,6 +830,7 @@ out:
  * @ssid,@tsid,@tclass : identifier of an AVC entry
  * @seqno : sequence number when decision was made
  * @xpd: extended_perms_decision to be added to the node
+ * @flags: the AVC_* flags, e.g. AVC_NONBLOCKING, AVC_EXTENDED_PERMS, or 0.
  *
  * If a valid AVC entry doesn't exist, this function returns -ENOENT.
  * If the internal call to kmalloc() returns NULL, this function returns -ENOMEM.
@@ -856,6 +849,22 @@ static int avc_update_node(struct selinux_avc *avc,
        struct hlist_head *head;
        spinlock_t *lock;
 
+       /*
+        * If we are in a non-blocking code path, e.g. VFS RCU walk,
+        * then we must not add permissions to a cache entry
+        * because we cannot safely audit the denial.  Otherwise,
+        * during the subsequent blocking retry (e.g. VFS ref walk), we
+        * will find the permissions already granted in the cache entry
+        * and won't audit anything at all, leading to silent denials in
+        * permissive mode that only appear when in enforcing mode.
+        *
+        * See the corresponding handling in slow_avc_audit(), and the
+        * logic in selinux_inode_permission for the MAY_NOT_BLOCK flag,
+        * which is transliterated into AVC_NONBLOCKING.
+        */
+       if (flags & AVC_NONBLOCKING)
+               return 0;
+
        node = avc_alloc_node(avc);
        if (!node) {
                rc = -ENOMEM;
@@ -1050,7 +1059,8 @@ int avc_has_extended_perms(struct selinux_state *state,
        int rc = 0, rc2;
 
        xp_node = &local_xp_node;
-       BUG_ON(!requested);
+       if (WARN_ON(!requested))
+               return -EACCES;
 
        rcu_read_lock();
 
@@ -1115,7 +1125,7 @@ decision:
  * @tsid: target security identifier
  * @tclass: target security class
  * @requested: requested permissions, interpreted based on @tclass
- * @flags:  AVC_STRICT or 0
+ * @flags:  AVC_STRICT, AVC_NONBLOCKING, or 0
  * @avd: access vector decisions
  *
  * Check the AVC to determine whether the @requested permissions are granted
@@ -1140,7 +1150,8 @@ inline int avc_has_perm_noaudit(struct selinux_state *state,
        int rc = 0;
        u32 denied;
 
-       BUG_ON(!requested);
+       if (WARN_ON(!requested))
+               return -EACCES;
 
        rcu_read_lock();
 
@@ -1191,24 +1202,6 @@ int avc_has_perm(struct selinux_state *state, u32 ssid, u32 tsid, u16 tclass,
        return rc;
 }
 
-int avc_has_perm_flags(struct selinux_state *state,
-                      u32 ssid, u32 tsid, u16 tclass, u32 requested,
-                      struct common_audit_data *auditdata,
-                      int flags)
-{
-       struct av_decision avd;
-       int rc, rc2;
-
-       rc = avc_has_perm_noaudit(state, ssid, tsid, tclass, requested, 0,
-                                 &avd);
-
-       rc2 = avc_audit(state, ssid, tsid, tclass, requested, &avd, rc,
-                       auditdata, flags);
-       if (rc2)
-               return rc2;
-       return rc;
-}
-
 u32 avc_policy_seqno(struct selinux_state *state)
 {
        return state->avc->avc_cache.latest_notif;
diff --git a/security/selinux/exports.c b/security/selinux/exports.c
deleted file mode 100644 (file)
index e75dd94..0000000
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * SELinux services exported to the rest of the kernel.
- *
- * Author: James Morris <jmorris@redhat.com>
- *
- * Copyright (C) 2005 Red Hat, Inc., James Morris <jmorris@redhat.com>
- * Copyright (C) 2006 Trusted Computer Solutions, Inc. <dgoeddel@trustedcs.com>
- * Copyright (C) 2006 IBM Corporation, Timothy R. Chavez <tinytim@us.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2,
- * as published by the Free Software Foundation.
- */
-#include <linux/module.h>
-#include <linux/selinux.h>
-
-#include "security.h"
-
-bool selinux_is_enabled(void)
-{
-       return selinux_enabled;
-}
-EXPORT_SYMBOL_GPL(selinux_is_enabled);
index f0e36c3492baee41642fbd232625c94720ac31a4..2f82a54f870382440b621fde195ce69859bd286a 100644 (file)
@@ -79,7 +79,6 @@
 #include <linux/personality.h>
 #include <linux/audit.h>
 #include <linux/string.h>
-#include <linux/selinux.h>
 #include <linux/mutex.h>
 #include <linux/posix-timers.h>
 #include <linux/syslog.h>
@@ -121,9 +120,8 @@ __setup("enforcing=", enforcing_setup);
 #define selinux_enforcing_boot 1
 #endif
 
+int selinux_enabled __lsm_ro_after_init = 1;
 #ifdef CONFIG_SECURITY_SELINUX_BOOTPARAM
-int selinux_enabled = CONFIG_SECURITY_SELINUX_BOOTPARAM_VALUE;
-
 static int __init selinux_enabled_setup(char *str)
 {
        unsigned long enabled;
@@ -132,8 +130,6 @@ static int __init selinux_enabled_setup(char *str)
        return 1;
 }
 __setup("selinux=", selinux_enabled_setup);
-#else
-int selinux_enabled = 1;
 #endif
 
 static unsigned int selinux_checkreqprot_boot =
@@ -149,9 +145,6 @@ static int __init checkreqprot_setup(char *str)
 }
 __setup("checkreqprot=", checkreqprot_setup);
 
-static struct kmem_cache *sel_inode_cache;
-static struct kmem_cache *file_security_cache;
-
 /**
  * selinux_secmark_enabled - Check to see if SECMARK is currently enabled
  *
@@ -214,12 +207,8 @@ static void cred_init_security(void)
        struct cred *cred = (struct cred *) current->real_cred;
        struct task_security_struct *tsec;
 
-       tsec = kzalloc(sizeof(struct task_security_struct), GFP_KERNEL);
-       if (!tsec)
-               panic("SELinux:  Failed to initialize initial task.\n");
-
+       tsec = selinux_cred(cred);
        tsec->osid = tsec->sid = SECINITSID_KERNEL;
-       cred->security = tsec;
 }
 
 /*
@@ -229,7 +218,7 @@ static inline u32 cred_sid(const struct cred *cred)
 {
        const struct task_security_struct *tsec;
 
-       tsec = cred->security;
+       tsec = selinux_cred(cred);
        return tsec->sid;
 }
 
@@ -250,13 +239,9 @@ static inline u32 task_sid(const struct task_struct *task)
 
 static int inode_alloc_security(struct inode *inode)
 {
-       struct inode_security_struct *isec;
+       struct inode_security_struct *isec = selinux_inode(inode);
        u32 sid = current_sid();
 
-       isec = kmem_cache_zalloc(sel_inode_cache, GFP_NOFS);
-       if (!isec)
-               return -ENOMEM;
-
        spin_lock_init(&isec->lock);
        INIT_LIST_HEAD(&isec->list);
        isec->inode = inode;
@@ -264,7 +249,6 @@ static int inode_alloc_security(struct inode *inode)
        isec->sclass = SECCLASS_FILE;
        isec->task_sid = sid;
        isec->initialized = LABEL_INVALID;
-       inode->i_security = isec;
 
        return 0;
 }
@@ -281,7 +265,7 @@ static int __inode_security_revalidate(struct inode *inode,
                                       struct dentry *dentry,
                                       bool may_sleep)
 {
-       struct inode_security_struct *isec = inode->i_security;
+       struct inode_security_struct *isec = selinux_inode(inode);
 
        might_sleep_if(may_sleep);
 
@@ -302,7 +286,7 @@ static int __inode_security_revalidate(struct inode *inode,
 
 static struct inode_security_struct *inode_security_novalidate(struct inode *inode)
 {
-       return inode->i_security;
+       return selinux_inode(inode);
 }
 
 static struct inode_security_struct *inode_security_rcu(struct inode *inode, bool rcu)
@@ -312,7 +296,7 @@ static struct inode_security_struct *inode_security_rcu(struct inode *inode, boo
        error = __inode_security_revalidate(inode, NULL, !rcu);
        if (error)
                return ERR_PTR(error);
-       return inode->i_security;
+       return selinux_inode(inode);
 }
 
 /*
@@ -321,14 +305,14 @@ static struct inode_security_struct *inode_security_rcu(struct inode *inode, boo
 static struct inode_security_struct *inode_security(struct inode *inode)
 {
        __inode_security_revalidate(inode, NULL, true);
-       return inode->i_security;
+       return selinux_inode(inode);
 }
 
 static struct inode_security_struct *backing_inode_security_novalidate(struct dentry *dentry)
 {
        struct inode *inode = d_backing_inode(dentry);
 
-       return inode->i_security;
+       return selinux_inode(inode);
 }
 
 /*
@@ -339,22 +323,17 @@ static struct inode_security_struct *backing_inode_security(struct dentry *dentr
        struct inode *inode = d_backing_inode(dentry);
 
        __inode_security_revalidate(inode, dentry, true);
-       return inode->i_security;
-}
-
-static void inode_free_rcu(struct rcu_head *head)
-{
-       struct inode_security_struct *isec;
-
-       isec = container_of(head, struct inode_security_struct, rcu);
-       kmem_cache_free(sel_inode_cache, isec);
+       return selinux_inode(inode);
 }
 
 static void inode_free_security(struct inode *inode)
 {
-       struct inode_security_struct *isec = inode->i_security;
-       struct superblock_security_struct *sbsec = inode->i_sb->s_security;
+       struct inode_security_struct *isec = selinux_inode(inode);
+       struct superblock_security_struct *sbsec;
 
+       if (!isec)
+               return;
+       sbsec = inode->i_sb->s_security;
        /*
         * As not all inode security structures are in a list, we check for
         * empty list outside of the lock to make sure that we won't waste
@@ -370,42 +349,19 @@ static void inode_free_security(struct inode *inode)
                list_del_init(&isec->list);
                spin_unlock(&sbsec->isec_lock);
        }
-
-       /*
-        * The inode may still be referenced in a path walk and
-        * a call to selinux_inode_permission() can be made
-        * after inode_free_security() is called. Ideally, the VFS
-        * wouldn't do this, but fixing that is a much harder
-        * job. For now, simply free the i_security via RCU, and
-        * leave the current inode->i_security pointer intact.
-        * The inode will be freed after the RCU grace period too.
-        */
-       call_rcu(&isec->rcu, inode_free_rcu);
 }
 
 static int file_alloc_security(struct file *file)
 {
-       struct file_security_struct *fsec;
+       struct file_security_struct *fsec = selinux_file(file);
        u32 sid = current_sid();
 
-       fsec = kmem_cache_zalloc(file_security_cache, GFP_KERNEL);
-       if (!fsec)
-               return -ENOMEM;
-
        fsec->sid = sid;
        fsec->fown_sid = sid;
-       file->f_security = fsec;
 
        return 0;
 }
 
-static void file_free_security(struct file *file)
-{
-       struct file_security_struct *fsec = file->f_security;
-       file->f_security = NULL;
-       kmem_cache_free(file_security_cache, fsec);
-}
-
 static int superblock_alloc_security(struct super_block *sb)
 {
        struct superblock_security_struct *sbsec;
@@ -501,7 +457,7 @@ static int may_context_mount_sb_relabel(u32 sid,
                        struct superblock_security_struct *sbsec,
                        const struct cred *cred)
 {
-       const struct task_security_struct *tsec = cred->security;
+       const struct task_security_struct *tsec = selinux_cred(cred);
        int rc;
 
        rc = avc_has_perm(&selinux_state,
@@ -520,7 +476,7 @@ static int may_context_mount_inode_relabel(u32 sid,
                        struct superblock_security_struct *sbsec,
                        const struct cred *cred)
 {
-       const struct task_security_struct *tsec = cred->security;
+       const struct task_security_struct *tsec = selinux_cred(cred);
        int rc;
        rc = avc_has_perm(&selinux_state,
                          tsec->sid, sbsec->sid, SECCLASS_FILESYSTEM,
@@ -534,16 +490,10 @@ static int may_context_mount_inode_relabel(u32 sid,
        return rc;
 }
 
-static int selinux_is_sblabel_mnt(struct super_block *sb)
+static int selinux_is_genfs_special_handling(struct super_block *sb)
 {
-       struct superblock_security_struct *sbsec = sb->s_security;
-
-       return sbsec->behavior == SECURITY_FS_USE_XATTR ||
-               sbsec->behavior == SECURITY_FS_USE_TRANS ||
-               sbsec->behavior == SECURITY_FS_USE_TASK ||
-               sbsec->behavior == SECURITY_FS_USE_NATIVE ||
-               /* Special handling. Genfs but also in-core setxattr handler */
-               !strcmp(sb->s_type->name, "sysfs") ||
+       /* Special handling. Genfs but also in-core setxattr handler */
+       return  !strcmp(sb->s_type->name, "sysfs") ||
                !strcmp(sb->s_type->name, "pstore") ||
                !strcmp(sb->s_type->name, "debugfs") ||
                !strcmp(sb->s_type->name, "tracefs") ||
@@ -553,6 +503,34 @@ static int selinux_is_sblabel_mnt(struct super_block *sb)
                  !strcmp(sb->s_type->name, "cgroup2")));
 }
 
+static int selinux_is_sblabel_mnt(struct super_block *sb)
+{
+       struct superblock_security_struct *sbsec = sb->s_security;
+
+       /*
+        * IMPORTANT: Double-check logic in this function when adding a new
+        * SECURITY_FS_USE_* definition!
+        */
+       BUILD_BUG_ON(SECURITY_FS_USE_MAX != 7);
+
+       switch (sbsec->behavior) {
+       case SECURITY_FS_USE_XATTR:
+       case SECURITY_FS_USE_TRANS:
+       case SECURITY_FS_USE_TASK:
+       case SECURITY_FS_USE_NATIVE:
+               return 1;
+
+       case SECURITY_FS_USE_GENFS:
+               return selinux_is_genfs_special_handling(sb);
+
+       /* Never allow relabeling on context mounts */
+       case SECURITY_FS_USE_MNTPOINT:
+       case SECURITY_FS_USE_NONE:
+       default:
+               return 0;
+       }
+}
+
 static int sb_finish_set_opts(struct super_block *sb)
 {
        struct superblock_security_struct *sbsec = sb->s_security;
@@ -1374,7 +1352,7 @@ static int selinux_genfs_get_sid(struct dentry *dentry,
 static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dentry)
 {
        struct superblock_security_struct *sbsec = NULL;
-       struct inode_security_struct *isec = inode->i_security;
+       struct inode_security_struct *isec = selinux_inode(inode);
        u32 task_sid, sid = 0;
        u16 sclass;
        struct dentry *dentry;
@@ -1621,7 +1599,7 @@ static inline u32 signal_to_av(int sig)
 
 /* Check whether a task is allowed to use a capability. */
 static int cred_has_capability(const struct cred *cred,
-                              int cap, int audit, bool initns)
+                              int cap, unsigned int opts, bool initns)
 {
        struct common_audit_data ad;
        struct av_decision avd;
@@ -1648,7 +1626,7 @@ static int cred_has_capability(const struct cred *cred,
 
        rc = avc_has_perm_noaudit(&selinux_state,
                                  sid, sid, sclass, av, 0, &avd);
-       if (audit == SECURITY_CAP_AUDIT) {
+       if (!(opts & CAP_OPT_NOAUDIT)) {
                int rc2 = avc_audit(&selinux_state,
                                    sid, sid, sclass, av, &avd, rc, &ad, 0);
                if (rc2)
@@ -1674,7 +1652,7 @@ static int inode_has_perm(const struct cred *cred,
                return 0;
 
        sid = cred_sid(cred);
-       isec = inode->i_security;
+       isec = selinux_inode(inode);
 
        return avc_has_perm(&selinux_state,
                            sid, isec->sid, isec->sclass, perms, adp);
@@ -1740,7 +1718,7 @@ static int file_has_perm(const struct cred *cred,
                         struct file *file,
                         u32 av)
 {
-       struct file_security_struct *fsec = file->f_security;
+       struct file_security_struct *fsec = selinux_file(file);
        struct inode *inode = file_inode(file);
        struct common_audit_data ad;
        u32 sid = cred_sid(cred);
@@ -1806,7 +1784,7 @@ static int may_create(struct inode *dir,
                      struct dentry *dentry,
                      u16 tclass)
 {
-       const struct task_security_struct *tsec = current_security();
+       const struct task_security_struct *tsec = selinux_cred(current_cred());
        struct inode_security_struct *dsec;
        struct superblock_security_struct *sbsec;
        u32 sid, newsid;
@@ -1828,7 +1806,7 @@ static int may_create(struct inode *dir,
        if (rc)
                return rc;
 
-       rc = selinux_determine_inode_label(current_security(), dir,
+       rc = selinux_determine_inode_label(selinux_cred(current_cred()), dir,
                                           &dentry->d_name, tclass, &newsid);
        if (rc)
                return rc;
@@ -2084,7 +2062,7 @@ static int selinux_binder_transfer_file(struct task_struct *from,
                                        struct file *file)
 {
        u32 sid = task_sid(to);
-       struct file_security_struct *fsec = file->f_security;
+       struct file_security_struct *fsec = selinux_file(file);
        struct dentry *dentry = file->f_path.dentry;
        struct inode_security_struct *isec;
        struct common_audit_data ad;
@@ -2168,9 +2146,9 @@ static int selinux_capset(struct cred *new, const struct cred *old,
  */
 
 static int selinux_capable(const struct cred *cred, struct user_namespace *ns,
-                          int cap, int audit)
+                          int cap, unsigned int opts)
 {
-       return cred_has_capability(cred, cap, audit, ns == &init_user_ns);
+       return cred_has_capability(cred, cap, opts, ns == &init_user_ns);
 }
 
 static int selinux_quotactl(int cmds, int type, int id, struct super_block *sb)
@@ -2244,7 +2222,7 @@ static int selinux_vm_enough_memory(struct mm_struct *mm, long pages)
        int rc, cap_sys_admin = 0;
 
        rc = cred_has_capability(current_cred(), CAP_SYS_ADMIN,
-                                SECURITY_CAP_NOAUDIT, true);
+                                CAP_OPT_NOAUDIT, true);
        if (rc == 0)
                cap_sys_admin = 1;
 
@@ -2335,8 +2313,8 @@ static int selinux_bprm_set_creds(struct linux_binprm *bprm)
        if (bprm->called_set_creds)
                return 0;
 
-       old_tsec = current_security();
-       new_tsec = bprm->cred->security;
+       old_tsec = selinux_cred(current_cred());
+       new_tsec = selinux_cred(bprm->cred);
        isec = inode_security(inode);
 
        /* Default to the current task SID. */
@@ -2500,7 +2478,7 @@ static void selinux_bprm_committing_creds(struct linux_binprm *bprm)
        struct rlimit *rlim, *initrlim;
        int rc, i;
 
-       new_tsec = bprm->cred->security;
+       new_tsec = selinux_cred(bprm->cred);
        if (new_tsec->sid == new_tsec->osid)
                return;
 
@@ -2543,7 +2521,7 @@ static void selinux_bprm_committing_creds(struct linux_binprm *bprm)
  */
 static void selinux_bprm_committed_creds(struct linux_binprm *bprm)
 {
-       const struct task_security_struct *tsec = current_security();
+       const struct task_security_struct *tsec = selinux_cred(current_cred());
        struct itimerval itimer;
        u32 osid, sid;
        int rc, i;
@@ -2780,7 +2758,7 @@ static int selinux_dentry_init_security(struct dentry *dentry, int mode,
        u32 newsid;
        int rc;
 
-       rc = selinux_determine_inode_label(current_security(),
+       rc = selinux_determine_inode_label(selinux_cred(current_cred()),
                                           d_inode(dentry->d_parent), name,
                                           inode_mode_to_security_class(mode),
                                           &newsid);
@@ -2800,14 +2778,14 @@ static int selinux_dentry_create_files_as(struct dentry *dentry, int mode,
        int rc;
        struct task_security_struct *tsec;
 
-       rc = selinux_determine_inode_label(old->security,
+       rc = selinux_determine_inode_label(selinux_cred(old),
                                           d_inode(dentry->d_parent), name,
                                           inode_mode_to_security_class(mode),
                                           &newsid);
        if (rc)
                return rc;
 
-       tsec = new->security;
+       tsec = selinux_cred(new);
        tsec->create_sid = newsid;
        return 0;
 }
@@ -2817,7 +2795,7 @@ static int selinux_inode_init_security(struct inode *inode, struct inode *dir,
                                       const char **name,
                                       void **value, size_t *len)
 {
-       const struct task_security_struct *tsec = current_security();
+       const struct task_security_struct *tsec = selinux_cred(current_cred());
        struct superblock_security_struct *sbsec;
        u32 newsid, clen;
        int rc;
@@ -2827,7 +2805,7 @@ static int selinux_inode_init_security(struct inode *inode, struct inode *dir,
 
        newsid = tsec->create_sid;
 
-       rc = selinux_determine_inode_label(current_security(),
+       rc = selinux_determine_inode_label(selinux_cred(current_cred()),
                dir, qstr,
                inode_mode_to_security_class(inode->i_mode),
                &newsid);
@@ -2836,7 +2814,7 @@ static int selinux_inode_init_security(struct inode *inode, struct inode *dir,
 
        /* Possibly defer initialization to selinux_complete_init. */
        if (sbsec->flags & SE_SBINITIALIZED) {
-               struct inode_security_struct *isec = inode->i_security;
+               struct inode_security_struct *isec = selinux_inode(inode);
                isec->sclass = inode_mode_to_security_class(inode->i_mode);
                isec->sid = newsid;
                isec->initialized = LABEL_INITIALIZED;
@@ -2925,9 +2903,8 @@ static int selinux_inode_follow_link(struct dentry *dentry, struct inode *inode,
        if (IS_ERR(isec))
                return PTR_ERR(isec);
 
-       return avc_has_perm_flags(&selinux_state,
-                                 sid, isec->sid, isec->sclass, FILE__READ, &ad,
-                                 rcu ? MAY_NOT_BLOCK : 0);
+       return avc_has_perm(&selinux_state,
+                           sid, isec->sid, isec->sclass, FILE__READ, &ad);
 }
 
 static noinline int audit_inode_permission(struct inode *inode,
@@ -2936,7 +2913,7 @@ static noinline int audit_inode_permission(struct inode *inode,
                                           unsigned flags)
 {
        struct common_audit_data ad;
-       struct inode_security_struct *isec = inode->i_security;
+       struct inode_security_struct *isec = selinux_inode(inode);
        int rc;
 
        ad.type = LSM_AUDIT_DATA_INODE;
@@ -2982,7 +2959,9 @@ static int selinux_inode_permission(struct inode *inode, int mask)
                return PTR_ERR(isec);
 
        rc = avc_has_perm_noaudit(&selinux_state,
-                                 sid, isec->sid, isec->sclass, perms, 0, &avd);
+                                 sid, isec->sid, isec->sclass, perms,
+                                 (flags & MAY_NOT_BLOCK) ? AVC_NONBLOCKING : 0,
+                                 &avd);
        audited = avc_audit_required(perms, &avd, rc,
                                     from_access ? FILE__AUDIT_ACCESS : 0,
                                     &denied);
@@ -3031,11 +3010,11 @@ static int selinux_inode_getattr(const struct path *path)
 static bool has_cap_mac_admin(bool audit)
 {
        const struct cred *cred = current_cred();
-       int cap_audit = audit ? SECURITY_CAP_AUDIT : SECURITY_CAP_NOAUDIT;
+       unsigned int opts = audit ? CAP_OPT_NONE : CAP_OPT_NOAUDIT;
 
-       if (cap_capable(cred, &init_user_ns, CAP_MAC_ADMIN, cap_audit))
+       if (cap_capable(cred, &init_user_ns, CAP_MAC_ADMIN, opts))
                return false;
-       if (cred_has_capability(cred, CAP_MAC_ADMIN, cap_audit, true))
+       if (cred_has_capability(cred, CAP_MAC_ADMIN, opts, true))
                return false;
        return true;
 }
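The SECURITY_CAP_AUDIT/SECURITY_CAP_NOAUDIT pair becomes a bitmask of CAP_OPT_*
flags, so callers combine options and test them with a mask rather than compare
against a single value, as has_cap_mac_admin() above now does. A hedged sketch
of the calling convention (the wrapper name is illustrative; cred_has_capability()
and the CAP_OPT_* values are the ones visible in this diff):

    static int example_admin_check(const struct cred *cred, bool quiet)
    {
            unsigned int opts = quiet ? CAP_OPT_NOAUDIT : CAP_OPT_NONE;

            return cred_has_capability(cred, CAP_SYS_ADMIN, opts, true);
    }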
@@ -3241,12 +3220,16 @@ static int selinux_inode_setsecurity(struct inode *inode, const char *name,
                                     const void *value, size_t size, int flags)
 {
        struct inode_security_struct *isec = inode_security_novalidate(inode);
+       struct superblock_security_struct *sbsec = inode->i_sb->s_security;
        u32 newsid;
        int rc;
 
        if (strcmp(name, XATTR_SELINUX_SUFFIX))
                return -EOPNOTSUPP;
 
+       if (!(sbsec->flags & SBLABEL_MNT))
+               return -EOPNOTSUPP;
+
        if (!value || !size)
                return -EACCES;
 
@@ -3289,7 +3272,7 @@ static int selinux_inode_copy_up(struct dentry *src, struct cred **new)
                        return -ENOMEM;
        }
 
-       tsec = new_creds->security;
+       tsec = selinux_cred(new_creds);
        /* Get label from overlay inode and set it in create_sid */
        selinux_inode_getsecid(d_inode(src), &sid);
        tsec->create_sid = sid;
@@ -3330,7 +3313,7 @@ static int selinux_revalidate_file_permission(struct file *file, int mask)
 static int selinux_file_permission(struct file *file, int mask)
 {
        struct inode *inode = file_inode(file);
-       struct file_security_struct *fsec = file->f_security;
+       struct file_security_struct *fsec = selinux_file(file);
        struct inode_security_struct *isec;
        u32 sid = current_sid();
 
@@ -3352,11 +3335,6 @@ static int selinux_file_alloc_security(struct file *file)
        return file_alloc_security(file);
 }
 
-static void selinux_file_free_security(struct file *file)
-{
-       file_free_security(file);
-}
-
 /*
  * Check whether a task has the ioctl permission and cmd
  * operation to an inode.
@@ -3365,7 +3343,7 @@ static int ioctl_has_perm(const struct cred *cred, struct file *file,
                u32 requested, u16 cmd)
 {
        struct common_audit_data ad;
-       struct file_security_struct *fsec = file->f_security;
+       struct file_security_struct *fsec = selinux_file(file);
        struct inode *inode = file_inode(file);
        struct inode_security_struct *isec;
        struct lsm_ioctlop_audit ioctl;
@@ -3435,7 +3413,7 @@ static int selinux_file_ioctl(struct file *file, unsigned int cmd,
        case KDSKBENT:
        case KDSKBSENT:
                error = cred_has_capability(cred, CAP_SYS_TTY_CONFIG,
-                                           SECURITY_CAP_AUDIT, true);
+                                           CAP_OPT_NONE, true);
                break;
 
        /* default case assumes that the command will go
@@ -3617,7 +3595,7 @@ static void selinux_file_set_fowner(struct file *file)
 {
        struct file_security_struct *fsec;
 
-       fsec = file->f_security;
+       fsec = selinux_file(file);
        fsec->fown_sid = current_sid();
 }
 
@@ -3632,7 +3610,7 @@ static int selinux_file_send_sigiotask(struct task_struct *tsk,
        /* struct fown_struct is never outside the context of a struct file */
        file = container_of(fown, struct file, f_owner);
 
-       fsec = file->f_security;
+       fsec = selinux_file(file);
 
        if (!signum)
                perm = signal_to_av(SIGIO); /* as per send_sigio_to_task */
@@ -3656,7 +3634,7 @@ static int selinux_file_open(struct file *file)
        struct file_security_struct *fsec;
        struct inode_security_struct *isec;
 
-       fsec = file->f_security;
+       fsec = selinux_file(file);
        isec = inode_security(file_inode(file));
        /*
         * Save inode label and policy sequence number
@@ -3689,53 +3667,16 @@ static int selinux_task_alloc(struct task_struct *task,
                            sid, sid, SECCLASS_PROCESS, PROCESS__FORK, NULL);
 }
 
-/*
- * allocate the SELinux part of blank credentials
- */
-static int selinux_cred_alloc_blank(struct cred *cred, gfp_t gfp)
-{
-       struct task_security_struct *tsec;
-
-       tsec = kzalloc(sizeof(struct task_security_struct), gfp);
-       if (!tsec)
-               return -ENOMEM;
-
-       cred->security = tsec;
-       return 0;
-}
-
-/*
- * detach and free the LSM part of a set of credentials
- */
-static void selinux_cred_free(struct cred *cred)
-{
-       struct task_security_struct *tsec = cred->security;
-
-       /*
-        * cred->security == NULL if security_cred_alloc_blank() or
-        * security_prepare_creds() returned an error.
-        */
-       BUG_ON(cred->security && (unsigned long) cred->security < PAGE_SIZE);
-       cred->security = (void *) 0x7UL;
-       kfree(tsec);
-}
-
 /*
  * prepare a new set of credentials for modification
  */
 static int selinux_cred_prepare(struct cred *new, const struct cred *old,
                                gfp_t gfp)
 {
-       const struct task_security_struct *old_tsec;
-       struct task_security_struct *tsec;
-
-       old_tsec = old->security;
-
-       tsec = kmemdup(old_tsec, sizeof(struct task_security_struct), gfp);
-       if (!tsec)
-               return -ENOMEM;
+       const struct task_security_struct *old_tsec = selinux_cred(old);
+       struct task_security_struct *tsec = selinux_cred(new);
 
-       new->security = tsec;
+       *tsec = *old_tsec;
        return 0;
 }
 
@@ -3744,8 +3685,8 @@ static int selinux_cred_prepare(struct cred *new, const struct cred *old,
  */
 static void selinux_cred_transfer(struct cred *new, const struct cred *old)
 {
-       const struct task_security_struct *old_tsec = old->security;
-       struct task_security_struct *tsec = new->security;
+       const struct task_security_struct *old_tsec = selinux_cred(old);
+       struct task_security_struct *tsec = selinux_cred(new);
 
        *tsec = *old_tsec;
 }
@@ -3761,7 +3702,7 @@ static void selinux_cred_getsecid(const struct cred *c, u32 *secid)
  */
 static int selinux_kernel_act_as(struct cred *new, u32 secid)
 {
-       struct task_security_struct *tsec = new->security;
+       struct task_security_struct *tsec = selinux_cred(new);
        u32 sid = current_sid();
        int ret;
 
@@ -3786,7 +3727,7 @@ static int selinux_kernel_act_as(struct cred *new, u32 secid)
 static int selinux_kernel_create_files_as(struct cred *new, struct inode *inode)
 {
        struct inode_security_struct *isec = inode_security(inode);
-       struct task_security_struct *tsec = new->security;
+       struct task_security_struct *tsec = selinux_cred(new);
        u32 sid = current_sid();
        int ret;
 
@@ -3832,7 +3773,7 @@ static int selinux_kernel_module_from_file(struct file *file)
        ad.type = LSM_AUDIT_DATA_FILE;
        ad.u.file = file;
 
-       fsec = file->f_security;
+       fsec = selinux_file(file);
        if (sid != fsec->sid) {
                rc = avc_has_perm(&selinux_state,
                                  sid, fsec->sid, SECCLASS_FD, FD__USE, &ad);
@@ -3998,7 +3939,7 @@ static int selinux_task_kill(struct task_struct *p, struct kernel_siginfo *info,
 static void selinux_task_to_inode(struct task_struct *p,
                                  struct inode *inode)
 {
-       struct inode_security_struct *isec = inode->i_security;
+       struct inode_security_struct *isec = selinux_inode(inode);
        u32 sid = task_sid(p);
 
        spin_lock(&isec->lock);
@@ -4335,7 +4276,7 @@ static int sock_has_perm(struct sock *sk, u32 perms)
 static int selinux_socket_create(int family, int type,
                                 int protocol, int kern)
 {
-       const struct task_security_struct *tsec = current_security();
+       const struct task_security_struct *tsec = selinux_cred(current_cred());
        u32 newsid;
        u16 secclass;
        int rc;
@@ -4355,7 +4296,7 @@ static int selinux_socket_create(int family, int type,
 static int selinux_socket_post_create(struct socket *sock, int family,
                                      int type, int protocol, int kern)
 {
-       const struct task_security_struct *tsec = current_security();
+       const struct task_security_struct *tsec = selinux_cred(current_cred());
        struct inode_security_struct *isec = inode_security_novalidate(SOCK_INODE(sock));
        struct sk_security_struct *sksec;
        u16 sclass = socket_type_to_security_class(family, type, protocol);
@@ -5236,7 +5177,7 @@ static int selinux_secmark_relabel_packet(u32 sid)
        const struct task_security_struct *__tsec;
        u32 tsid;
 
-       __tsec = current_security();
+       __tsec = selinux_cred(current_cred());
        tsid = __tsec->sid;
 
        return avc_has_perm(&selinux_state,
@@ -5711,51 +5652,22 @@ static int selinux_netlink_send(struct sock *sk, struct sk_buff *skb)
        return selinux_nlmsg_perm(sk, skb);
 }
 
-static int ipc_alloc_security(struct kern_ipc_perm *perm,
-                             u16 sclass)
+static void ipc_init_security(struct ipc_security_struct *isec, u16 sclass)
 {
-       struct ipc_security_struct *isec;
-
-       isec = kzalloc(sizeof(struct ipc_security_struct), GFP_KERNEL);
-       if (!isec)
-               return -ENOMEM;
-
        isec->sclass = sclass;
        isec->sid = current_sid();
-       perm->security = isec;
-
-       return 0;
-}
-
-static void ipc_free_security(struct kern_ipc_perm *perm)
-{
-       struct ipc_security_struct *isec = perm->security;
-       perm->security = NULL;
-       kfree(isec);
 }
 
 static int msg_msg_alloc_security(struct msg_msg *msg)
 {
        struct msg_security_struct *msec;
 
-       msec = kzalloc(sizeof(struct msg_security_struct), GFP_KERNEL);
-       if (!msec)
-               return -ENOMEM;
-
+       msec = selinux_msg_msg(msg);
        msec->sid = SECINITSID_UNLABELED;
-       msg->security = msec;
 
        return 0;
 }
 
-static void msg_msg_free_security(struct msg_msg *msg)
-{
-       struct msg_security_struct *msec = msg->security;
-
-       msg->security = NULL;
-       kfree(msec);
-}
-
 static int ipc_has_perm(struct kern_ipc_perm *ipc_perms,
                        u32 perms)
 {
@@ -5763,7 +5675,7 @@ static int ipc_has_perm(struct kern_ipc_perm *ipc_perms,
        struct common_audit_data ad;
        u32 sid = current_sid();
 
-       isec = ipc_perms->security;
+       isec = selinux_ipc(ipc_perms);
 
        ad.type = LSM_AUDIT_DATA_IPC;
        ad.u.ipc_id = ipc_perms->key;
@@ -5777,11 +5689,6 @@ static int selinux_msg_msg_alloc_security(struct msg_msg *msg)
        return msg_msg_alloc_security(msg);
 }
 
-static void selinux_msg_msg_free_security(struct msg_msg *msg)
-{
-       msg_msg_free_security(msg);
-}
-
 /* message queue security operations */
 static int selinux_msg_queue_alloc_security(struct kern_ipc_perm *msq)
 {
@@ -5790,11 +5697,8 @@ static int selinux_msg_queue_alloc_security(struct kern_ipc_perm *msq)
        u32 sid = current_sid();
        int rc;
 
-       rc = ipc_alloc_security(msq, SECCLASS_MSGQ);
-       if (rc)
-               return rc;
-
-       isec = msq->security;
+       isec = selinux_ipc(msq);
+       ipc_init_security(isec, SECCLASS_MSGQ);
 
        ad.type = LSM_AUDIT_DATA_IPC;
        ad.u.ipc_id = msq->key;
@@ -5802,16 +5706,7 @@ static int selinux_msg_queue_alloc_security(struct kern_ipc_perm *msq)
        rc = avc_has_perm(&selinux_state,
                          sid, isec->sid, SECCLASS_MSGQ,
                          MSGQ__CREATE, &ad);
-       if (rc) {
-               ipc_free_security(msq);
-               return rc;
-       }
-       return 0;
-}
-
-static void selinux_msg_queue_free_security(struct kern_ipc_perm *msq)
-{
-       ipc_free_security(msq);
+       return rc;
 }
 
 static int selinux_msg_queue_associate(struct kern_ipc_perm *msq, int msqflg)
@@ -5820,7 +5715,7 @@ static int selinux_msg_queue_associate(struct kern_ipc_perm *msq, int msqflg)
        struct common_audit_data ad;
        u32 sid = current_sid();
 
-       isec = msq->security;
+       isec = selinux_ipc(msq);
 
        ad.type = LSM_AUDIT_DATA_IPC;
        ad.u.ipc_id = msq->key;
@@ -5869,8 +5764,8 @@ static int selinux_msg_queue_msgsnd(struct kern_ipc_perm *msq, struct msg_msg *m
        u32 sid = current_sid();
        int rc;
 
-       isec = msq->security;
-       msec = msg->security;
+       isec = selinux_ipc(msq);
+       msec = selinux_msg_msg(msg);
 
        /*
         * First time through, need to assign label to the message
@@ -5917,8 +5812,8 @@ static int selinux_msg_queue_msgrcv(struct kern_ipc_perm *msq, struct msg_msg *m
        u32 sid = task_sid(target);
        int rc;
 
-       isec = msq->security;
-       msec = msg->security;
+       isec = selinux_ipc(msq);
+       msec = selinux_msg_msg(msg);
 
        ad.type = LSM_AUDIT_DATA_IPC;
        ad.u.ipc_id = msq->key;
@@ -5941,11 +5836,8 @@ static int selinux_shm_alloc_security(struct kern_ipc_perm *shp)
        u32 sid = current_sid();
        int rc;
 
-       rc = ipc_alloc_security(shp, SECCLASS_SHM);
-       if (rc)
-               return rc;
-
-       isec = shp->security;
+       isec = selinux_ipc(shp);
+       ipc_init_security(isec, SECCLASS_SHM);
 
        ad.type = LSM_AUDIT_DATA_IPC;
        ad.u.ipc_id = shp->key;
@@ -5953,16 +5845,7 @@ static int selinux_shm_alloc_security(struct kern_ipc_perm *shp)
        rc = avc_has_perm(&selinux_state,
                          sid, isec->sid, SECCLASS_SHM,
                          SHM__CREATE, &ad);
-       if (rc) {
-               ipc_free_security(shp);
-               return rc;
-       }
-       return 0;
-}
-
-static void selinux_shm_free_security(struct kern_ipc_perm *shp)
-{
-       ipc_free_security(shp);
+       return rc;
 }
 
 static int selinux_shm_associate(struct kern_ipc_perm *shp, int shmflg)
@@ -5971,7 +5854,7 @@ static int selinux_shm_associate(struct kern_ipc_perm *shp, int shmflg)
        struct common_audit_data ad;
        u32 sid = current_sid();
 
-       isec = shp->security;
+       isec = selinux_ipc(shp);
 
        ad.type = LSM_AUDIT_DATA_IPC;
        ad.u.ipc_id = shp->key;
@@ -6038,11 +5921,8 @@ static int selinux_sem_alloc_security(struct kern_ipc_perm *sma)
        u32 sid = current_sid();
        int rc;
 
-       rc = ipc_alloc_security(sma, SECCLASS_SEM);
-       if (rc)
-               return rc;
-
-       isec = sma->security;
+       isec = selinux_ipc(sma);
+       ipc_init_security(isec, SECCLASS_SEM);
 
        ad.type = LSM_AUDIT_DATA_IPC;
        ad.u.ipc_id = sma->key;
@@ -6050,16 +5930,7 @@ static int selinux_sem_alloc_security(struct kern_ipc_perm *sma)
        rc = avc_has_perm(&selinux_state,
                          sid, isec->sid, SECCLASS_SEM,
                          SEM__CREATE, &ad);
-       if (rc) {
-               ipc_free_security(sma);
-               return rc;
-       }
-       return 0;
-}
-
-static void selinux_sem_free_security(struct kern_ipc_perm *sma)
-{
-       ipc_free_security(sma);
+       return rc;
 }
 
 static int selinux_sem_associate(struct kern_ipc_perm *sma, int semflg)
@@ -6068,7 +5939,7 @@ static int selinux_sem_associate(struct kern_ipc_perm *sma, int semflg)
        struct common_audit_data ad;
        u32 sid = current_sid();
 
-       isec = sma->security;
+       isec = selinux_ipc(sma);
 
        ad.type = LSM_AUDIT_DATA_IPC;
        ad.u.ipc_id = sma->key;
@@ -6154,7 +6025,7 @@ static int selinux_ipc_permission(struct kern_ipc_perm *ipcp, short flag)
 
 static void selinux_ipc_getsecid(struct kern_ipc_perm *ipcp, u32 *secid)
 {
-       struct ipc_security_struct *isec = ipcp->security;
+       struct ipc_security_struct *isec = selinux_ipc(ipcp);
        *secid = isec->sid;
 }
 
@@ -6173,7 +6044,7 @@ static int selinux_getprocattr(struct task_struct *p,
        unsigned len;
 
        rcu_read_lock();
-       __tsec = __task_cred(p)->security;
+       __tsec = selinux_cred(__task_cred(p));
 
        if (current != p) {
                error = avc_has_perm(&selinux_state,
@@ -6296,7 +6167,7 @@ static int selinux_setprocattr(const char *name, void *value, size_t size)
           operation.  See selinux_bprm_set_creds for the execve
           checks and may_create for the file creation checks. The
           operation will then fail if the context is not permitted. */
-       tsec = new->security;
+       tsec = selinux_cred(new);
        if (!strcmp(name, "exec")) {
                tsec->exec_sid = sid;
        } else if (!strcmp(name, "fscreate")) {
@@ -6380,7 +6251,7 @@ static void selinux_release_secctx(char *secdata, u32 seclen)
 
 static void selinux_inode_invalidate_secctx(struct inode *inode)
 {
-       struct inode_security_struct *isec = inode->i_security;
+       struct inode_security_struct *isec = selinux_inode(inode);
 
        spin_lock(&isec->lock);
        isec->initialized = LABEL_INVALID;
@@ -6392,7 +6263,10 @@ static void selinux_inode_invalidate_secctx(struct inode *inode)
  */
 static int selinux_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen)
 {
-       return selinux_inode_setsecurity(inode, XATTR_SELINUX_SUFFIX, ctx, ctxlen, 0);
+       int rc = selinux_inode_setsecurity(inode, XATTR_SELINUX_SUFFIX,
+                                          ctx, ctxlen, 0);
+       /* Do not return error when suppressing label (SBLABEL_MNT not set). */
+       return rc == -EOPNOTSUPP ? 0 : rc;
 }
 
 /*
@@ -6425,7 +6299,7 @@ static int selinux_key_alloc(struct key *k, const struct cred *cred,
        if (!ksec)
                return -ENOMEM;
 
-       tsec = cred->security;
+       tsec = selinux_cred(cred);
        if (tsec->keycreate_sid)
                ksec->sid = tsec->keycreate_sid;
        else
@@ -6688,6 +6562,14 @@ static void selinux_bpf_prog_free(struct bpf_prog_aux *aux)
 }
 #endif
 
+struct lsm_blob_sizes selinux_blob_sizes __lsm_ro_after_init = {
+       .lbs_cred = sizeof(struct task_security_struct),
+       .lbs_file = sizeof(struct file_security_struct),
+       .lbs_inode = sizeof(struct inode_security_struct),
+       .lbs_ipc = sizeof(struct ipc_security_struct),
+       .lbs_msg_msg = sizeof(struct msg_security_struct),
+};
+
 static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = {
        LSM_HOOK_INIT(binder_set_context_mgr, selinux_binder_set_context_mgr),
        LSM_HOOK_INIT(binder_transaction, selinux_binder_transaction),
@@ -6757,7 +6639,6 @@ static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = {
 
        LSM_HOOK_INIT(file_permission, selinux_file_permission),
        LSM_HOOK_INIT(file_alloc_security, selinux_file_alloc_security),
-       LSM_HOOK_INIT(file_free_security, selinux_file_free_security),
        LSM_HOOK_INIT(file_ioctl, selinux_file_ioctl),
        LSM_HOOK_INIT(mmap_file, selinux_mmap_file),
        LSM_HOOK_INIT(mmap_addr, selinux_mmap_addr),
@@ -6771,8 +6652,6 @@ static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = {
        LSM_HOOK_INIT(file_open, selinux_file_open),
 
        LSM_HOOK_INIT(task_alloc, selinux_task_alloc),
-       LSM_HOOK_INIT(cred_alloc_blank, selinux_cred_alloc_blank),
-       LSM_HOOK_INIT(cred_free, selinux_cred_free),
        LSM_HOOK_INIT(cred_prepare, selinux_cred_prepare),
        LSM_HOOK_INIT(cred_transfer, selinux_cred_transfer),
        LSM_HOOK_INIT(cred_getsecid, selinux_cred_getsecid),
@@ -6800,24 +6679,20 @@ static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = {
        LSM_HOOK_INIT(ipc_getsecid, selinux_ipc_getsecid),
 
        LSM_HOOK_INIT(msg_msg_alloc_security, selinux_msg_msg_alloc_security),
-       LSM_HOOK_INIT(msg_msg_free_security, selinux_msg_msg_free_security),
 
        LSM_HOOK_INIT(msg_queue_alloc_security,
                        selinux_msg_queue_alloc_security),
-       LSM_HOOK_INIT(msg_queue_free_security, selinux_msg_queue_free_security),
        LSM_HOOK_INIT(msg_queue_associate, selinux_msg_queue_associate),
        LSM_HOOK_INIT(msg_queue_msgctl, selinux_msg_queue_msgctl),
        LSM_HOOK_INIT(msg_queue_msgsnd, selinux_msg_queue_msgsnd),
        LSM_HOOK_INIT(msg_queue_msgrcv, selinux_msg_queue_msgrcv),
 
        LSM_HOOK_INIT(shm_alloc_security, selinux_shm_alloc_security),
-       LSM_HOOK_INIT(shm_free_security, selinux_shm_free_security),
        LSM_HOOK_INIT(shm_associate, selinux_shm_associate),
        LSM_HOOK_INIT(shm_shmctl, selinux_shm_shmctl),
        LSM_HOOK_INIT(shm_shmat, selinux_shm_shmat),
 
        LSM_HOOK_INIT(sem_alloc_security, selinux_sem_alloc_security),
-       LSM_HOOK_INIT(sem_free_security, selinux_sem_free_security),
        LSM_HOOK_INIT(sem_associate, selinux_sem_associate),
        LSM_HOOK_INIT(sem_semctl, selinux_sem_semctl),
        LSM_HOOK_INIT(sem_semop, selinux_sem_semop),
@@ -6928,16 +6803,6 @@ static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = {
 
 static __init int selinux_init(void)
 {
-       if (!security_module_enable("selinux")) {
-               selinux_enabled = 0;
-               return 0;
-       }
-
-       if (!selinux_enabled) {
-               pr_info("SELinux:  Disabled at boot.\n");
-               return 0;
-       }
-
        pr_info("SELinux:  Initializing.\n");
 
        memset(&selinux_state, 0, sizeof(selinux_state));
@@ -6951,12 +6816,6 @@ static __init int selinux_init(void)
 
        default_noexec = !(VM_DATA_DEFAULT_FLAGS & VM_EXEC);
 
-       sel_inode_cache = kmem_cache_create("selinux_inode_security",
-                                           sizeof(struct inode_security_struct),
-                                           0, SLAB_PANIC, NULL);
-       file_security_cache = kmem_cache_create("selinux_file_security",
-                                           sizeof(struct file_security_struct),
-                                           0, SLAB_PANIC, NULL);
        avc_init();
 
        avtab_cache_init();
@@ -6999,6 +6858,9 @@ void selinux_complete_init(void)
    all processes and objects when they are created. */
 DEFINE_LSM(selinux) = {
        .name = "selinux",
+       .flags = LSM_FLAG_LEGACY_MAJOR | LSM_FLAG_EXCLUSIVE,
+       .enabled = &selinux_enabled,
+       .blobs = &selinux_blob_sizes,
        .init = selinux_init,
 };
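With the enabled flag, the LSM flags, and the blob sizes all handed to
DEFINE_LSM(), the LSM infrastructure rather than the module decides whether
SELinux runs and pre-allocates every per-object security blob before
selinux_init() is called, which is why the boot-time enable checks and the
inode/file kmem caches above could be deleted. A hypothetical minimal
registration following the same pattern (all example_* names are illustrative
only):

    struct example_cred {
            u32 sid;
    };

    struct lsm_blob_sizes example_blob_sizes __lsm_ro_after_init = {
            .lbs_cred = sizeof(struct example_cred),
    };

    static int __init example_init(void)
    {
            pr_info("example LSM: initializing\n");
            return 0;
    }

    DEFINE_LSM(example) = {
            .name = "example",
            .blobs = &example_blob_sizes,
            .init = example_init,
    };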
 
index 1bdf973433cc20c7d5dc74fe8172047ce965d48a..682e2b5de2a49cb350aff8d56361cb46c4e429fc 100644 (file)
@@ -1,9 +1,6 @@
 /*
  * SELinux support for the Audit LSM hooks
  *
- * Most of below header was moved from include/linux/selinux.h which
- * is released under below copyrights:
- *
  * Author: James Morris <jmorris@redhat.com>
  *
  * Copyright (C) 2005 Red Hat, Inc., James Morris <jmorris@redhat.com>
@@ -46,13 +43,11 @@ void selinux_audit_rule_free(void *rule);
  *     @field: the field this rule refers to
 *     @op: the operator the rule uses
  *     @rule: pointer to the audit rule to check against
- *     @actx: the audit context (can be NULL) associated with the check
  *
  *     Returns 1 if the context id matches the rule, 0 if it does not, and
  *     -errno on failure.
  */
-int selinux_audit_rule_match(u32 sid, u32 field, u32 op, void *rule,
-                            struct audit_context *actx);
+int selinux_audit_rule_match(u32 sid, u32 field, u32 op, void *rule);
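With the audit context argument dropped, a filter check needs only the SID and
the rule triple. A hedged caller sketch (AUDIT_SUBJ_USER and Audit_equal are
standard audit definitions used here purely for illustration; rule is the
opaque pointer produced by selinux_audit_rule_init(), and the handling of a
positive match is a placeholder):

    int match = selinux_audit_rule_match(sid, AUDIT_SUBJ_USER, Audit_equal, rule);

    if (match < 0)
            pr_warn("selinux audit rule match failed: %d\n", match);
    else if (match == 1)
            record_matches = true;  /* placeholder for the caller's action */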
 
 /**
  *     selinux_audit_rule_known - check to see if rule contains selinux fields.
index ef899bcfd2cb5eaf2afec5b599f29de35c1c2c8c..7be0e1e90e8be0fcf0f84e14367b7173d7f1dd90 100644 (file)
@@ -142,6 +142,7 @@ static inline int avc_audit(struct selinux_state *state,
 
 #define AVC_STRICT 1 /* Ignore permissive mode. */
 #define AVC_EXTENDED_PERMS 2   /* update extended permissions */
+#define AVC_NONBLOCKING    4   /* non-blocking */
 int avc_has_perm_noaudit(struct selinux_state *state,
                         u32 ssid, u32 tsid,
                         u16 tclass, u32 requested,
@@ -152,11 +153,6 @@ int avc_has_perm(struct selinux_state *state,
                 u32 ssid, u32 tsid,
                 u16 tclass, u32 requested,
                 struct common_audit_data *auditdata);
-int avc_has_perm_flags(struct selinux_state *state,
-                      u32 ssid, u32 tsid,
-                      u16 tclass, u32 requested,
-                      struct common_audit_data *auditdata,
-                      int flags);
 
 int avc_has_extended_perms(struct selinux_state *state,
                           u32 ssid, u32 tsid, u16 tclass, u32 requested,
index cc5e26b0161b462108e6261a4e4a87cfafccf934..231262d8eac9189e8dec1576f9a25264d8633567 100644 (file)
@@ -25,6 +25,8 @@
 #include <linux/binfmts.h>
 #include <linux/in.h>
 #include <linux/spinlock.h>
+#include <linux/lsm_hooks.h>
+#include <linux/msg.h>
 #include <net/net_namespace.h>
 #include "flask.h"
 #include "avc.h"
@@ -56,10 +58,7 @@ enum label_initialized {
 
 struct inode_security_struct {
        struct inode *inode;    /* back pointer to inode object */
-       union {
-               struct list_head list;  /* list of inode_security_struct */
-               struct rcu_head rcu;    /* for freeing the inode_security_struct */
-       };
+       struct list_head list;  /* list of inode_security_struct */
        u32 task_sid;           /* SID of creating task */
        u32 sid;                /* SID of this object */
        u16 sclass;             /* security class of this object */
@@ -158,4 +157,35 @@ struct bpf_security_struct {
        u32 sid;  /* SID of bpf obj creator */
 };
 
+extern struct lsm_blob_sizes selinux_blob_sizes;
+static inline struct task_security_struct *selinux_cred(const struct cred *cred)
+{
+       return cred->security + selinux_blob_sizes.lbs_cred;
+}
+
+static inline struct file_security_struct *selinux_file(const struct file *file)
+{
+       return file->f_security + selinux_blob_sizes.lbs_file;
+}
+
+static inline struct inode_security_struct *selinux_inode(
+                                               const struct inode *inode)
+{
+       if (unlikely(!inode->i_security))
+               return NULL;
+       return inode->i_security + selinux_blob_sizes.lbs_inode;
+}
+
+static inline struct msg_security_struct *selinux_msg_msg(
+                                               const struct msg_msg *msg_msg)
+{
+       return msg_msg->security + selinux_blob_sizes.lbs_msg_msg;
+}
+
+static inline struct ipc_security_struct *selinux_ipc(
+                                               const struct kern_ipc_perm *ipc)
+{
+       return ipc->security + selinux_blob_sizes.lbs_ipc;
+}
+
 #endif /* _SELINUX_OBJSEC_H_ */
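Each accessor above simply offsets into the blob that the LSM infrastructure
allocated for the object, so lookups cannot fail; the one exception is spelled
out in selinux_inode(), which returns NULL for inodes that do not have an
i_security blob attached yet. A hedged usage sketch (the surrounding function
is hypothetical; the constants are ones already used elsewhere in this series):

    static int example_label_inode(struct inode *inode)
    {
            struct inode_security_struct *isec = selinux_inode(inode);

            if (!isec)              /* no blob attached to this inode yet */
                    return 0;

            isec->sclass = SECCLASS_FILE;
            isec->sid = SECINITSID_UNLABELED;
            isec->initialized = LABEL_INITIALIZED;
            return 0;
    }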
index ba8eedf42b90a2daab57bb26595dade9df4bbfb0..f68fb25b57020eb0813e44d8b6f03371775fc1e7 100644 (file)
@@ -255,6 +255,9 @@ int security_sid_to_context(struct selinux_state *state, u32 sid,
 int security_sid_to_context_force(struct selinux_state *state,
                                  u32 sid, char **scontext, u32 *scontext_len);
 
+int security_sid_to_context_inval(struct selinux_state *state,
+                                 u32 sid, char **scontext, u32 *scontext_len);
+
 int security_context_to_sid(struct selinux_state *state,
                            const char *scontext, u32 scontext_len,
                            u32 *out_sid, gfp_t gfp);
index f3a5a138a096d8cb7d1d42cd23e07eecbbd4a2a0..145ee62f205a632a24e85d010e48dad600dfaa9a 100644 (file)
@@ -1378,7 +1378,7 @@ static int sel_make_bools(struct selinux_fs_info *fsi)
                        goto out;
                }
 
-               isec = (struct inode_security_struct *)inode->i_security;
+               isec = selinux_inode(inode);
                ret = security_genfs_sid(fsi->state, "selinuxfs", page,
                                         SECCLASS_FILE, &sid);
                if (ret) {
@@ -1953,7 +1953,7 @@ static int sel_fill_super(struct super_block *sb, void *data, int silent)
        }
 
        inode->i_ino = ++fsi->last_ino;
-       isec = (struct inode_security_struct *)inode->i_security;
+       isec = selinux_inode(inode);
        isec->sid = SECINITSID_DEVNULL;
        isec->sclass = SECCLASS_CHR_FILE;
        isec->initialized = LABEL_INITIALIZED;
index dd44126c8d1445ab4f26e225a2fd8537c88bbf1a..1269e2be3c2d9ff1b03f21376c7354e2cb354b0c 100644 (file)
@@ -49,7 +49,6 @@
 #include <linux/sched.h>
 #include <linux/audit.h>
 #include <linux/mutex.h>
-#include <linux/selinux.h>
 #include <linux/flex_array.h>
 #include <linux/vmalloc.h>
 #include <net/netlabel.h>
@@ -1281,7 +1280,8 @@ const char *security_get_initial_sid_context(u32 sid)
 
 static int security_sid_to_context_core(struct selinux_state *state,
                                        u32 sid, char **scontext,
-                                       u32 *scontext_len, int force)
+                                       u32 *scontext_len, int force,
+                                       int only_invalid)
 {
        struct policydb *policydb;
        struct sidtab *sidtab;
@@ -1326,8 +1326,14 @@ static int security_sid_to_context_core(struct selinux_state *state,
                rc = -EINVAL;
                goto out_unlock;
        }
-       rc = context_struct_to_string(policydb, context, scontext,
-                                     scontext_len);
+       if (only_invalid && !context->len) {
+               scontext = NULL;
+               scontext_len = 0;
+               rc = 0;
+       } else {
+               rc = context_struct_to_string(policydb, context, scontext,
+                                             scontext_len);
+       }
 out_unlock:
        read_unlock(&state->ss->policy_rwlock);
 out:
@@ -1349,14 +1355,34 @@ int security_sid_to_context(struct selinux_state *state,
                            u32 sid, char **scontext, u32 *scontext_len)
 {
        return security_sid_to_context_core(state, sid, scontext,
-                                           scontext_len, 0);
+                                           scontext_len, 0, 0);
 }
 
 int security_sid_to_context_force(struct selinux_state *state, u32 sid,
                                  char **scontext, u32 *scontext_len)
 {
        return security_sid_to_context_core(state, sid, scontext,
-                                           scontext_len, 1);
+                                           scontext_len, 1, 0);
+}
+
+/**
+ * security_sid_to_context_inval - Obtain a context for a given SID if it
+ *                                 is invalid.
+ * @sid: security identifier, SID
+ * @scontext: security context
+ * @scontext_len: length in bytes
+ *
+ * Write the string representation of the context associated with @sid
+ * into a dynamically allocated string of the correct size, but only if the
+ * context is invalid in the current policy.  Set @scontext to point to
+ * this string (or NULL if the context is valid) and set @scontext_len to
+ * the length of the string (or 0 if the context is valid).
+ */
+int security_sid_to_context_inval(struct selinux_state *state, u32 sid,
+                                 char **scontext, u32 *scontext_len)
+{
+       return security_sid_to_context_core(state, sid, scontext,
+                                           scontext_len, 1, 1);
 }
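Per the kernel-doc above, the _inval variant only allocates a context string
when the SID's context fails to validate under the loaded policy; for a valid
context it succeeds without producing a string, so the caller below initializes
ctx to NULL and only logs when a string came back. A hedged caller sketch (the
message is illustrative; the real consumer sits in the AVC audit path):

    char *ctx = NULL;
    u32 ctx_len = 0;

    if (!security_sid_to_context_inval(&selinux_state, sid, &ctx, &ctx_len) &&
        ctx) {
            pr_warn("SELinux: invalid context %s for sid %u\n", ctx, sid);
            kfree(ctx);
    }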
 
 /*
@@ -3376,8 +3402,7 @@ int selinux_audit_rule_known(struct audit_krule *rule)
        return 0;
 }
 
-int selinux_audit_rule_match(u32 sid, u32 field, u32 op, void *vrule,
-                            struct audit_context *actx)
+int selinux_audit_rule_match(u32 sid, u32 field, u32 op, void *vrule)
 {
        struct selinux_state *state = &selinux_state;
        struct context *ctxt;
index bd7d18bdb147a8cad2003a41db60a94a19444fb4..7c57cb7e4146c9b56d7dd40a086c3eaecf5fca77 100644 (file)
@@ -79,7 +79,7 @@ static int selinux_xfrm_alloc_user(struct xfrm_sec_ctx **ctxp,
                                   gfp_t gfp)
 {
        int rc;
-       const struct task_security_struct *tsec = current_security();
+       const struct task_security_struct *tsec = selinux_cred(current_cred());
        struct xfrm_sec_ctx *ctx = NULL;
        u32 str_len;
 
@@ -138,7 +138,7 @@ static void selinux_xfrm_free(struct xfrm_sec_ctx *ctx)
  */
 static int selinux_xfrm_delete(struct xfrm_sec_ctx *ctx)
 {
-       const struct task_security_struct *tsec = current_security();
+       const struct task_security_struct *tsec = selinux_cred(current_cred());
 
        if (!ctx)
                return 0;
index f7db791fb5660ad14479af3d4b48e104d8bc37ed..9c7c95a5c4974c73ee48007ebc49ac47d560d6d4 100644 (file)
@@ -24,6 +24,7 @@
 #include <linux/list.h>
 #include <linux/rculist.h>
 #include <linux/lsm_audit.h>
+#include <linux/msg.h>
 
 /*
  * Use IPv6 port labeling if IPv6 is enabled and secmarks
@@ -336,6 +337,7 @@ extern struct smack_known *smack_syslog_label;
 extern struct smack_known *smack_unconfined;
 #endif
 extern int smack_ptrace_rule;
+extern struct lsm_blob_sizes smack_blob_sizes;
 
 extern struct smack_known smack_known_floor;
 extern struct smack_known smack_known_hat;
@@ -356,12 +358,38 @@ extern struct list_head smack_onlycap_list;
 #define SMACK_HASH_SLOTS 16
 extern struct hlist_head smack_known_hash[SMACK_HASH_SLOTS];
 
+static inline struct task_smack *smack_cred(const struct cred *cred)
+{
+       return cred->security + smack_blob_sizes.lbs_cred;
+}
+
+static inline struct smack_known **smack_file(const struct file *file)
+{
+       return (struct smack_known **)(file->f_security +
+                                      smack_blob_sizes.lbs_file);
+}
+
+static inline struct inode_smack *smack_inode(const struct inode *inode)
+{
+       return inode->i_security + smack_blob_sizes.lbs_inode;
+}
+
+static inline struct smack_known **smack_msg_msg(const struct msg_msg *msg)
+{
+       return msg->security + smack_blob_sizes.lbs_msg_msg;
+}
+
+static inline struct smack_known **smack_ipc(const struct kern_ipc_perm *ipc)
+{
+       return ipc->security + smack_blob_sizes.lbs_ipc;
+}
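Unlike the SELinux blobs, several of the Smack blobs hold nothing more than a
pointer to a struct smack_known, so these accessors hand back a pointer into
the shared blob rather than a structure. A hedged fragment mirroring how
smack_file_alloc_security() and smack_file_send_sigiotask() below use it
(file and skp come from the surrounding hook):

    struct smack_known **blob = smack_file(file);
    struct smack_known *skp;

    *blob = smk_of_current();       /* store the current task's label */
    skp = *blob;                    /* later, read it back through the slot */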
+
 /*
  * Is the directory transmuting?
  */
 static inline int smk_inode_transmutable(const struct inode *isp)
 {
-       struct inode_smack *sip = isp->i_security;
+       struct inode_smack *sip = smack_inode(isp);
        return (sip->smk_flags & SMK_INODE_TRANSMUTE) != 0;
 }
 
@@ -370,7 +398,7 @@ static inline int smk_inode_transmutable(const struct inode *isp)
  */
 static inline struct smack_known *smk_of_inode(const struct inode *isp)
 {
-       struct inode_smack *sip = isp->i_security;
+       struct inode_smack *sip = smack_inode(isp);
        return sip->smk_inode;
 }
 
@@ -382,13 +410,19 @@ static inline struct smack_known *smk_of_task(const struct task_smack *tsp)
        return tsp->smk_task;
 }
 
-static inline struct smack_known *smk_of_task_struct(const struct task_struct *t)
+static inline struct smack_known *smk_of_task_struct(
+                                               const struct task_struct *t)
 {
        struct smack_known *skp;
+       const struct cred *cred;
 
        rcu_read_lock();
-       skp = smk_of_task(__task_cred(t)->security);
+
+       cred = __task_cred(t);
+       skp = smk_of_task(smack_cred(cred));
+
        rcu_read_unlock();
+
        return skp;
 }
 
@@ -405,7 +439,7 @@ static inline struct smack_known *smk_of_forked(const struct task_smack *tsp)
  */
 static inline struct smack_known *smk_of_current(void)
 {
-       return smk_of_task(current_security());
+       return smk_of_task(smack_cred(current_cred()));
 }
 
 /*
index 9a4c0ad46518d12d38564a703fdc51f3b202a5ae..fe2ce3a658220b6ba5810ef11952960667eac1ce 100644 (file)
@@ -275,7 +275,7 @@ out_audit:
 int smk_curacc(struct smack_known *obj_known,
               u32 mode, struct smk_audit_info *a)
 {
-       struct task_smack *tsp = current_security();
+       struct task_smack *tsp = smack_cred(current_cred());
 
        return smk_tskacc(tsp, obj_known, mode, a);
 }
@@ -635,12 +635,12 @@ DEFINE_MUTEX(smack_onlycap_lock);
  */
 bool smack_privileged_cred(int cap, const struct cred *cred)
 {
-       struct task_smack *tsp = cred->security;
+       struct task_smack *tsp = smack_cred(cred);
        struct smack_known *skp = tsp->smk_task;
        struct smack_known_list_elem *sklep;
        int rc;
 
-       rc = cap_capable(cred, &init_user_ns, cap, SECURITY_CAP_AUDIT);
+       rc = cap_capable(cred, &init_user_ns, cap, CAP_OPT_NONE);
        if (rc)
                return false;
 
index 430d4f35e55c0f2a77cfd81c3f3685b8c8390500..424bce4ef21df19718587e544863de168b9a0057 100644 (file)
@@ -139,7 +139,7 @@ static int smk_bu_note(char *note, struct smack_known *sskp,
 static int smk_bu_current(char *note, struct smack_known *oskp,
                          int mode, int rc)
 {
-       struct task_smack *tsp = current_security();
+       struct task_smack *tsp = smack_cred(current_cred());
        char acc[SMK_NUM_ACCESS_TYPE + 1];
 
        if (rc <= 0)
@@ -160,7 +160,7 @@ static int smk_bu_current(char *note, struct smack_known *oskp,
 #ifdef CONFIG_SECURITY_SMACK_BRINGUP
 static int smk_bu_task(struct task_struct *otp, int mode, int rc)
 {
-       struct task_smack *tsp = current_security();
+       struct task_smack *tsp = smack_cred(current_cred());
        struct smack_known *smk_task = smk_of_task_struct(otp);
        char acc[SMK_NUM_ACCESS_TYPE + 1];
 
@@ -182,8 +182,8 @@ static int smk_bu_task(struct task_struct *otp, int mode, int rc)
 #ifdef CONFIG_SECURITY_SMACK_BRINGUP
 static int smk_bu_inode(struct inode *inode, int mode, int rc)
 {
-       struct task_smack *tsp = current_security();
-       struct inode_smack *isp = inode->i_security;
+       struct task_smack *tsp = smack_cred(current_cred());
+       struct inode_smack *isp = smack_inode(inode);
        char acc[SMK_NUM_ACCESS_TYPE + 1];
 
        if (isp->smk_flags & SMK_INODE_IMPURE)
@@ -212,10 +212,10 @@ static int smk_bu_inode(struct inode *inode, int mode, int rc)
 #ifdef CONFIG_SECURITY_SMACK_BRINGUP
 static int smk_bu_file(struct file *file, int mode, int rc)
 {
-       struct task_smack *tsp = current_security();
+       struct task_smack *tsp = smack_cred(current_cred());
        struct smack_known *sskp = tsp->smk_task;
        struct inode *inode = file_inode(file);
-       struct inode_smack *isp = inode->i_security;
+       struct inode_smack *isp = smack_inode(inode);
        char acc[SMK_NUM_ACCESS_TYPE + 1];
 
        if (isp->smk_flags & SMK_INODE_IMPURE)
@@ -242,10 +242,10 @@ static int smk_bu_file(struct file *file, int mode, int rc)
 static int smk_bu_credfile(const struct cred *cred, struct file *file,
                                int mode, int rc)
 {
-       struct task_smack *tsp = cred->security;
+       struct task_smack *tsp = smack_cred(cred);
        struct smack_known *sskp = tsp->smk_task;
        struct inode *inode = file_inode(file);
-       struct inode_smack *isp = inode->i_security;
+       struct inode_smack *isp = smack_inode(inode);
        char acc[SMK_NUM_ACCESS_TYPE + 1];
 
        if (isp->smk_flags & SMK_INODE_IMPURE)
@@ -305,50 +305,35 @@ static struct smack_known *smk_fetch(const char *name, struct inode *ip,
 }
 
 /**
- * new_inode_smack - allocate an inode security blob
+ * init_inode_smack - initialize an inode security blob
+ * @isp: the blob to initialize
  * @skp: a pointer to the Smack label entry to use in the blob
  *
- * Returns the new blob or NULL if there's no memory available
  */
-static struct inode_smack *new_inode_smack(struct smack_known *skp)
+static void init_inode_smack(struct inode *inode, struct smack_known *skp)
 {
-       struct inode_smack *isp;
-
-       isp = kmem_cache_zalloc(smack_inode_cache, GFP_NOFS);
-       if (isp == NULL)
-               return NULL;
+       struct inode_smack *isp = smack_inode(inode);
 
        isp->smk_inode = skp;
        isp->smk_flags = 0;
        mutex_init(&isp->smk_lock);
-
-       return isp;
 }
 
 /**
- * new_task_smack - allocate a task security blob
+ * init_task_smack - initialize a task security blob
+ * @tsp: blob to initialize
  * @task: a pointer to the Smack label for the running task
  * @forked: a pointer to the Smack label for the forked task
- * @gfp: type of the memory for the allocation
  *
- * Returns the new blob or NULL if there's no memory available
  */
-static struct task_smack *new_task_smack(struct smack_known *task,
-                                       struct smack_known *forked, gfp_t gfp)
+static void init_task_smack(struct task_smack *tsp, struct smack_known *task,
+                                       struct smack_known *forked)
 {
-       struct task_smack *tsp;
-
-       tsp = kzalloc(sizeof(struct task_smack), gfp);
-       if (tsp == NULL)
-               return NULL;
-
        tsp->smk_task = task;
        tsp->smk_forked = forked;
        INIT_LIST_HEAD(&tsp->smk_rules);
        INIT_LIST_HEAD(&tsp->smk_relabel);
        mutex_init(&tsp->smk_rules_lock);
-
-       return tsp;
 }
 
 /**
@@ -448,7 +433,7 @@ static int smk_ptrace_rule_check(struct task_struct *tracer,
 
        rcu_read_lock();
        tracercred = __task_cred(tracer);
-       tsp = tracercred->security;
+       tsp = smack_cred(tracercred);
        tracer_known = smk_of_task(tsp);
 
        if ((mode & PTRACE_MODE_ATTACH) &&
@@ -515,7 +500,7 @@ static int smack_ptrace_traceme(struct task_struct *ptp)
        int rc;
        struct smack_known *skp;
 
-       skp = smk_of_task(current_security());
+       skp = smk_of_task(smack_cred(current_cred()));
 
        rc = smk_ptrace_rule_check(ptp, skp, PTRACE_MODE_ATTACH, __func__);
        return rc;
@@ -718,6 +703,13 @@ static int smack_set_mnt_opts(struct super_block *sb,
        if (sp->smk_flags & SMK_SB_INITIALIZED)
                return 0;
 
+       if (inode->i_security == NULL) {
+               int rc = lsm_inode_alloc(inode);
+
+               if (rc)
+                       return rc;
+       }
+
        if (!smack_privileged(CAP_MAC_ADMIN)) {
                /*
                 * Unprivileged mounts don't get to specify Smack values.
@@ -782,17 +774,12 @@ static int smack_set_mnt_opts(struct super_block *sb,
        /*
         * Initialize the root inode.
         */
-       isp = inode->i_security;
-       if (isp == NULL) {
-               isp = new_inode_smack(sp->smk_root);
-               if (isp == NULL)
-                       return -ENOMEM;
-               inode->i_security = isp;
-       } else
-               isp->smk_inode = sp->smk_root;
+       init_inode_smack(inode, sp->smk_root);
 
-       if (transmute)
+       if (transmute) {
+               isp = smack_inode(inode);
                isp->smk_flags |= SMK_INODE_TRANSMUTE;
+       }
 
        return 0;
 }
@@ -831,7 +818,7 @@ static int smack_sb_statfs(struct dentry *dentry)
 static int smack_bprm_set_creds(struct linux_binprm *bprm)
 {
        struct inode *inode = file_inode(bprm->file);
-       struct task_smack *bsp = bprm->cred->security;
+       struct task_smack *bsp = smack_cred(bprm->cred);
        struct inode_smack *isp;
        struct superblock_smack *sbsp;
        int rc;
@@ -839,7 +826,7 @@ static int smack_bprm_set_creds(struct linux_binprm *bprm)
        if (bprm->called_set_creds)
                return 0;
 
-       isp = inode->i_security;
+       isp = smack_inode(inode);
        if (isp->smk_task == NULL || isp->smk_task == bsp->smk_task)
                return 0;
 
@@ -890,48 +877,10 @@ static int smack_inode_alloc_security(struct inode *inode)
 {
        struct smack_known *skp = smk_of_current();
 
-       inode->i_security = new_inode_smack(skp);
-       if (inode->i_security == NULL)
-               return -ENOMEM;
+       init_inode_smack(inode, skp);
        return 0;
 }
 
-/**
- * smack_inode_free_rcu - Free inode_smack blob from cache
- * @head: the rcu_head for getting inode_smack pointer
- *
- *  Call back function called from call_rcu() to free
- *  the i_security blob pointer in inode
- */
-static void smack_inode_free_rcu(struct rcu_head *head)
-{
-       struct inode_smack *issp;
-
-       issp = container_of(head, struct inode_smack, smk_rcu);
-       kmem_cache_free(smack_inode_cache, issp);
-}
-
-/**
- * smack_inode_free_security - free an inode blob using call_rcu()
- * @inode: the inode with a blob
- *
- * Clears the blob pointer in inode using RCU
- */
-static void smack_inode_free_security(struct inode *inode)
-{
-       struct inode_smack *issp = inode->i_security;
-
-       /*
-        * The inode may still be referenced in a path walk and
-        * a call to smack_inode_permission() can be made
-        * after smack_inode_free_security() is called.
-        * To avoid race condition free the i_security via RCU
-        * and leave the current inode->i_security pointer intact.
-        * The inode will be freed after the RCU grace period too.
-        */
-       call_rcu(&issp->smk_rcu, smack_inode_free_rcu);
-}
-
 /**
  * smack_inode_init_security - copy out the smack from an inode
  * @inode: the newly created inode
@@ -947,7 +896,7 @@ static int smack_inode_init_security(struct inode *inode, struct inode *dir,
                                     const struct qstr *qstr, const char **name,
                                     void **value, size_t *len)
 {
-       struct inode_smack *issp = inode->i_security;
+       struct inode_smack *issp = smack_inode(inode);
        struct smack_known *skp = smk_of_current();
        struct smack_known *isp = smk_of_inode(inode);
        struct smack_known *dsp = smk_of_inode(dir);
@@ -1285,7 +1234,7 @@ static void smack_inode_post_setxattr(struct dentry *dentry, const char *name,
                                      const void *value, size_t size, int flags)
 {
        struct smack_known *skp;
-       struct inode_smack *isp = d_backing_inode(dentry)->i_security;
+       struct inode_smack *isp = smack_inode(d_backing_inode(dentry));
 
        if (strcmp(name, XATTR_NAME_SMACKTRANSMUTE) == 0) {
                isp->smk_flags |= SMK_INODE_TRANSMUTE;
@@ -1366,7 +1315,7 @@ static int smack_inode_removexattr(struct dentry *dentry, const char *name)
        if (rc != 0)
                return rc;
 
-       isp = d_backing_inode(dentry)->i_security;
+       isp = smack_inode(d_backing_inode(dentry));
        /*
         * Don't do anything special for these.
         *      XATTR_NAME_SMACKIPIN
@@ -1498,24 +1447,12 @@ static void smack_inode_getsecid(struct inode *inode, u32 *secid)
  */
 static int smack_file_alloc_security(struct file *file)
 {
-       struct smack_known *skp = smk_of_current();
+       struct smack_known **blob = smack_file(file);
 
-       file->f_security = skp;
+       *blob = smk_of_current();
        return 0;
 }
 
-/**
- * smack_file_free_security - clear a file security blob
- * @file: the object
- *
- * The security blob for a file is a pointer to the master
- * label list, so no memory is freed.
- */
-static void smack_file_free_security(struct file *file)
-{
-       file->f_security = NULL;
-}
-
 /**
  * smack_file_ioctl - Smack check on ioctls
  * @file: the object
@@ -1653,7 +1590,7 @@ static int smack_mmap_file(struct file *file,
        if (unlikely(IS_PRIVATE(file_inode(file))))
                return 0;
 
-       isp = file_inode(file)->i_security;
+       isp = smack_inode(file_inode(file));
        if (isp->smk_mmap == NULL)
                return 0;
        sbsp = file_inode(file)->i_sb->s_security;
@@ -1662,7 +1599,7 @@ static int smack_mmap_file(struct file *file,
                return -EACCES;
        mkp = isp->smk_mmap;
 
-       tsp = current_security();
+       tsp = smack_cred(current_cred());
        skp = smk_of_current();
        rc = 0;
 
@@ -1740,7 +1677,9 @@ static int smack_mmap_file(struct file *file,
  */
 static void smack_file_set_fowner(struct file *file)
 {
-       file->f_security = smk_of_current();
+       struct smack_known **blob = smack_file(file);
+
+       *blob = smk_of_current();
 }
 
 /**
@@ -1757,8 +1696,9 @@ static void smack_file_set_fowner(struct file *file)
 static int smack_file_send_sigiotask(struct task_struct *tsk,
                                     struct fown_struct *fown, int signum)
 {
+       struct smack_known **blob;
        struct smack_known *skp;
-       struct smack_known *tkp = smk_of_task(tsk->cred->security);
+       struct smack_known *tkp = smk_of_task(smack_cred(tsk->cred));
        const struct cred *tcred;
        struct file *file;
        int rc;
@@ -1770,7 +1710,8 @@ static int smack_file_send_sigiotask(struct task_struct *tsk,
        file = container_of(fown, struct file, f_owner);
 
        /* we don't log here as rc can be overridden */
-       skp = file->f_security;
+       blob = smack_file(file);
+       skp = *blob;
        rc = smk_access(skp, tkp, MAY_DELIVER, NULL);
        rc = smk_bu_note("sigiotask", skp, tkp, MAY_DELIVER, rc);
 
@@ -1811,7 +1752,7 @@ static int smack_file_receive(struct file *file)
        if (inode->i_sb->s_magic == SOCKFS_MAGIC) {
                sock = SOCKET_I(inode);
                ssp = sock->sk->sk_security;
-               tsp = current_security();
+               tsp = smack_cred(current_cred());
                /*
                 * If the receiving process can't write to the
                 * passed socket or if the passed socket can't
@@ -1853,7 +1794,7 @@ static int smack_file_receive(struct file *file)
  */
 static int smack_file_open(struct file *file)
 {
-       struct task_smack *tsp = file->f_cred->security;
+       struct task_smack *tsp = smack_cred(file->f_cred);
        struct inode *inode = file_inode(file);
        struct smk_audit_info ad;
        int rc;
@@ -1881,14 +1822,7 @@ static int smack_file_open(struct file *file)
  */
 static int smack_cred_alloc_blank(struct cred *cred, gfp_t gfp)
 {
-       struct task_smack *tsp;
-
-       tsp = new_task_smack(NULL, NULL, gfp);
-       if (tsp == NULL)
-               return -ENOMEM;
-
-       cred->security = tsp;
-
+       init_task_smack(smack_cred(cred), NULL, NULL);
        return 0;
 }
 
@@ -1900,15 +1834,11 @@ static int smack_cred_alloc_blank(struct cred *cred, gfp_t gfp)
  */
 static void smack_cred_free(struct cred *cred)
 {
-       struct task_smack *tsp = cred->security;
+       struct task_smack *tsp = smack_cred(cred);
        struct smack_rule *rp;
        struct list_head *l;
        struct list_head *n;
 
-       if (tsp == NULL)
-               return;
-       cred->security = NULL;
-
        smk_destroy_label_list(&tsp->smk_relabel);
 
        list_for_each_safe(l, n, &tsp->smk_rules) {
@@ -1916,7 +1846,6 @@ static void smack_cred_free(struct cred *cred)
                list_del(&rp->list);
                kfree(rp);
        }
-       kfree(tsp);
 }
 
 /**
@@ -1930,15 +1859,11 @@ static void smack_cred_free(struct cred *cred)
 static int smack_cred_prepare(struct cred *new, const struct cred *old,
                              gfp_t gfp)
 {
-       struct task_smack *old_tsp = old->security;
-       struct task_smack *new_tsp;
+       struct task_smack *old_tsp = smack_cred(old);
+       struct task_smack *new_tsp = smack_cred(new);
        int rc;
 
-       new_tsp = new_task_smack(old_tsp->smk_task, old_tsp->smk_task, gfp);
-       if (new_tsp == NULL)
-               return -ENOMEM;
-
-       new->security = new_tsp;
+       init_task_smack(new_tsp, old_tsp->smk_task, old_tsp->smk_task);
 
        rc = smk_copy_rules(&new_tsp->smk_rules, &old_tsp->smk_rules, gfp);
        if (rc != 0)
@@ -1946,10 +1871,7 @@ static int smack_cred_prepare(struct cred *new, const struct cred *old,
 
        rc = smk_copy_relabel(&new_tsp->smk_relabel, &old_tsp->smk_relabel,
                                gfp);
-       if (rc != 0)
-               return rc;
-
-       return 0;
+       return rc;
 }
 
 /**
@@ -1961,15 +1883,14 @@ static int smack_cred_prepare(struct cred *new, const struct cred *old,
  */
 static void smack_cred_transfer(struct cred *new, const struct cred *old)
 {
-       struct task_smack *old_tsp = old->security;
-       struct task_smack *new_tsp = new->security;
+       struct task_smack *old_tsp = smack_cred(old);
+       struct task_smack *new_tsp = smack_cred(new);
 
        new_tsp->smk_task = old_tsp->smk_task;
        new_tsp->smk_forked = old_tsp->smk_task;
        mutex_init(&new_tsp->smk_rules_lock);
        INIT_LIST_HEAD(&new_tsp->smk_rules);
 
-
        /* cbs copy rule list */
 }
 
@@ -1980,12 +1901,12 @@ static void smack_cred_transfer(struct cred *new, const struct cred *old)
  *
  * Sets the secid to contain a u32 version of the smack label.
  */
-static void smack_cred_getsecid(const struct cred *c, u32 *secid)
+static void smack_cred_getsecid(const struct cred *cred, u32 *secid)
 {
        struct smack_known *skp;
 
        rcu_read_lock();
-       skp = smk_of_task(c->security);
+       skp = smk_of_task(smack_cred(cred));
        *secid = skp->smk_secid;
        rcu_read_unlock();
 }
@@ -1999,7 +1920,7 @@ static void smack_cred_getsecid(const struct cred *c, u32 *secid)
  */
 static int smack_kernel_act_as(struct cred *new, u32 secid)
 {
-       struct task_smack *new_tsp = new->security;
+       struct task_smack *new_tsp = smack_cred(new);
 
        new_tsp->smk_task = smack_from_secid(secid);
        return 0;
@@ -2016,8 +1937,8 @@ static int smack_kernel_act_as(struct cred *new, u32 secid)
 static int smack_kernel_create_files_as(struct cred *new,
                                        struct inode *inode)
 {
-       struct inode_smack *isp = inode->i_security;
-       struct task_smack *tsp = new->security;
+       struct inode_smack *isp = smack_inode(inode);
+       struct task_smack *tsp = smack_cred(new);
 
        tsp->smk_forked = isp->smk_inode;
        tsp->smk_task = tsp->smk_forked;
@@ -2201,7 +2122,7 @@ static int smack_task_kill(struct task_struct *p, struct kernel_siginfo *info,
         * specific behavior. This is not clean. For one thing
         * we can't take privilege into account.
         */
-       skp = smk_of_task(cred->security);
+       skp = smk_of_task(smack_cred(cred));
        rc = smk_access(skp, tkp, MAY_DELIVER, &ad);
        rc = smk_bu_note("USB signal", skp, tkp, MAY_DELIVER, rc);
        return rc;
@@ -2216,7 +2137,7 @@ static int smack_task_kill(struct task_struct *p, struct kernel_siginfo *info,
  */
 static void smack_task_to_inode(struct task_struct *p, struct inode *inode)
 {
-       struct inode_smack *isp = inode->i_security;
+       struct inode_smack *isp = smack_inode(inode);
        struct smack_known *skp = smk_of_task_struct(p);
 
        isp->smk_inode = skp;
@@ -2679,7 +2600,7 @@ static int smack_inode_setsecurity(struct inode *inode, const char *name,
                                   const void *value, size_t size, int flags)
 {
        struct smack_known *skp;
-       struct inode_smack *nsp = inode->i_security;
+       struct inode_smack *nsp = smack_inode(inode);
        struct socket_smack *ssp;
        struct socket *sock;
        int rc = 0;
@@ -2888,23 +2809,12 @@ static int smack_flags_to_may(int flags)
  */
 static int smack_msg_msg_alloc_security(struct msg_msg *msg)
 {
-       struct smack_known *skp = smk_of_current();
+       struct smack_known **blob = smack_msg_msg(msg);
 
-       msg->security = skp;
+       *blob = smk_of_current();
        return 0;
 }
 
-/**
- * smack_msg_msg_free_security - Clear the security blob for msg_msg
- * @msg: the object
- *
- * Clears the blob pointer
- */
-static void smack_msg_msg_free_security(struct msg_msg *msg)
-{
-       msg->security = NULL;
-}
-
 /**
  * smack_of_ipc - the smack pointer for the ipc
  * @isp: the object
@@ -2913,7 +2823,9 @@ static void smack_msg_msg_free_security(struct msg_msg *msg)
  */
 static struct smack_known *smack_of_ipc(struct kern_ipc_perm *isp)
 {
-       return (struct smack_known *)isp->security;
+       struct smack_known **blob = smack_ipc(isp);
+
+       return *blob;
 }
 
 /**
@@ -2924,23 +2836,12 @@ static struct smack_known *smack_of_ipc(struct kern_ipc_perm *isp)
  */
 static int smack_ipc_alloc_security(struct kern_ipc_perm *isp)
 {
-       struct smack_known *skp = smk_of_current();
+       struct smack_known **blob = smack_ipc(isp);
 
-       isp->security = skp;
+       *blob = smk_of_current();
        return 0;
 }
 
-/**
- * smack_ipc_free_security - Clear the security blob for ipc
- * @isp: the object
- *
- * Clears the blob pointer
- */
-static void smack_ipc_free_security(struct kern_ipc_perm *isp)
-{
-       isp->security = NULL;
-}
-
 /**
  * smk_curacc_shm : check if current has access on shm
  * @isp : the object
@@ -3238,7 +3139,8 @@ static int smack_msg_queue_msgrcv(struct kern_ipc_perm *isp, struct msg_msg *msg
  */
 static int smack_ipc_permission(struct kern_ipc_perm *ipp, short flag)
 {
-       struct smack_known *iskp = ipp->security;
+       struct smack_known **blob = smack_ipc(ipp);
+       struct smack_known *iskp = *blob;
        int may = smack_flags_to_may(flag);
        struct smk_audit_info ad;
        int rc;
@@ -3259,7 +3161,8 @@ static int smack_ipc_permission(struct kern_ipc_perm *ipp, short flag)
  */
 static void smack_ipc_getsecid(struct kern_ipc_perm *ipp, u32 *secid)
 {
-       struct smack_known *iskp = ipp->security;
+       struct smack_known **blob = smack_ipc(ipp);
+       struct smack_known *iskp = *blob;
 
        *secid = iskp->smk_secid;
 }
@@ -3287,7 +3190,7 @@ static void smack_d_instantiate(struct dentry *opt_dentry, struct inode *inode)
        if (inode == NULL)
                return;
 
-       isp = inode->i_security;
+       isp = smack_inode(inode);
 
        mutex_lock(&isp->smk_lock);
        /*
@@ -3390,13 +3293,12 @@ static void smack_d_instantiate(struct dentry *opt_dentry, struct inode *inode)
                 */
                final = &smack_known_star;
                /*
-                * Fall through.
-                *
                 * If a smack value has been set we want to use it,
                 * but since tmpfs isn't giving us the opportunity
                 * to set mount options simulate setting the
                 * superblock default.
                 */
+               /* Fall through */
        default:
                /*
                 * This isn't an understood special case.
@@ -3528,7 +3430,7 @@ static int smack_getprocattr(struct task_struct *p, char *name, char **value)
  */
 static int smack_setprocattr(const char *name, void *value, size_t size)
 {
-       struct task_smack *tsp = current_security();
+       struct task_smack *tsp = smack_cred(current_cred());
        struct cred *new;
        struct smack_known *skp;
        struct smack_known_list_elem *sklep;
@@ -3569,7 +3471,7 @@ static int smack_setprocattr(const char *name, void *value, size_t size)
        if (new == NULL)
                return -ENOMEM;
 
-       tsp = new->security;
+       tsp = smack_cred(new);
        tsp->smk_task = skp;
        /*
         * process can change its label only once
@@ -4214,7 +4116,7 @@ static void smack_inet_csk_clone(struct sock *sk,
 static int smack_key_alloc(struct key *key, const struct cred *cred,
                           unsigned long flags)
 {
-       struct smack_known *skp = smk_of_task(cred->security);
+       struct smack_known *skp = smk_of_task(smack_cred(cred));
 
        key->security = skp;
        return 0;
@@ -4245,7 +4147,7 @@ static int smack_key_permission(key_ref_t key_ref,
 {
        struct key *keyp;
        struct smk_audit_info ad;
-       struct smack_known *tkp = smk_of_task(cred->security);
+       struct smack_known *tkp = smk_of_task(smack_cred(cred));
        int request = 0;
        int rc;
 
@@ -4393,13 +4295,11 @@ static int smack_audit_rule_known(struct audit_krule *krule)
  * @field: audit rule flags given from user-space
  * @op: required testing operator
  * @vrule: smack internal rule presentation
- * @actx: audit context associated with the check
  *
  * The core Audit hook. It's used to take the decision of
  * whether to audit or not to audit a given object.
  */
-static int smack_audit_rule_match(u32 secid, u32 field, u32 op, void *vrule,
-                                 struct audit_context *actx)
+static int smack_audit_rule_match(u32 secid, u32 field, u32 op, void *vrule)
 {
        struct smack_known *skp;
        char *rule = vrule;
@@ -4520,12 +4420,12 @@ static int smack_inode_copy_up(struct dentry *dentry, struct cred **new)
                        return -ENOMEM;
        }
 
-       tsp = new_creds->security;
+       tsp = smack_cred(new_creds);
 
        /*
         * Get label from overlay inode and set it in create_sid
         */
-       isp = d_inode(dentry->d_parent)->i_security;
+       isp = smack_inode(d_inode(dentry->d_parent));
        skp = isp->smk_inode;
        tsp->smk_task = skp;
        *new = new_creds;
@@ -4548,8 +4448,8 @@ static int smack_dentry_create_files_as(struct dentry *dentry, int mode,
                                        const struct cred *old,
                                        struct cred *new)
 {
-       struct task_smack *otsp = old->security;
-       struct task_smack *ntsp = new->security;
+       struct task_smack *otsp = smack_cred(old);
+       struct task_smack *ntsp = smack_cred(new);
        struct inode_smack *isp;
        int may;
 
@@ -4562,7 +4462,7 @@ static int smack_dentry_create_files_as(struct dentry *dentry, int mode,
        /*
         * the attribute of the containing directory
         */
-       isp = d_inode(dentry->d_parent)->i_security;
+       isp = smack_inode(d_inode(dentry->d_parent));
 
        if (isp->smk_flags & SMK_INODE_TRANSMUTE) {
                rcu_read_lock();
@@ -4582,6 +4482,14 @@ static int smack_dentry_create_files_as(struct dentry *dentry, int mode,
        return 0;
 }
 
+struct lsm_blob_sizes smack_blob_sizes __lsm_ro_after_init = {
+       .lbs_cred = sizeof(struct task_smack),
+       .lbs_file = sizeof(struct smack_known *),
+       .lbs_inode = sizeof(struct inode_smack),
+       .lbs_ipc = sizeof(struct smack_known *),
+       .lbs_msg_msg = sizeof(struct smack_known *),
+};
+
 static struct security_hook_list smack_hooks[] __lsm_ro_after_init = {
        LSM_HOOK_INIT(ptrace_access_check, smack_ptrace_access_check),
        LSM_HOOK_INIT(ptrace_traceme, smack_ptrace_traceme),
@@ -4597,7 +4505,6 @@ static struct security_hook_list smack_hooks[] __lsm_ro_after_init = {
        LSM_HOOK_INIT(bprm_set_creds, smack_bprm_set_creds),
 
        LSM_HOOK_INIT(inode_alloc_security, smack_inode_alloc_security),
-       LSM_HOOK_INIT(inode_free_security, smack_inode_free_security),
        LSM_HOOK_INIT(inode_init_security, smack_inode_init_security),
        LSM_HOOK_INIT(inode_link, smack_inode_link),
        LSM_HOOK_INIT(inode_unlink, smack_inode_unlink),
@@ -4616,7 +4523,6 @@ static struct security_hook_list smack_hooks[] __lsm_ro_after_init = {
        LSM_HOOK_INIT(inode_getsecid, smack_inode_getsecid),
 
        LSM_HOOK_INIT(file_alloc_security, smack_file_alloc_security),
-       LSM_HOOK_INIT(file_free_security, smack_file_free_security),
        LSM_HOOK_INIT(file_ioctl, smack_file_ioctl),
        LSM_HOOK_INIT(file_lock, smack_file_lock),
        LSM_HOOK_INIT(file_fcntl, smack_file_fcntl),
@@ -4652,23 +4558,19 @@ static struct security_hook_list smack_hooks[] __lsm_ro_after_init = {
        LSM_HOOK_INIT(ipc_getsecid, smack_ipc_getsecid),
 
        LSM_HOOK_INIT(msg_msg_alloc_security, smack_msg_msg_alloc_security),
-       LSM_HOOK_INIT(msg_msg_free_security, smack_msg_msg_free_security),
 
        LSM_HOOK_INIT(msg_queue_alloc_security, smack_ipc_alloc_security),
-       LSM_HOOK_INIT(msg_queue_free_security, smack_ipc_free_security),
        LSM_HOOK_INIT(msg_queue_associate, smack_msg_queue_associate),
        LSM_HOOK_INIT(msg_queue_msgctl, smack_msg_queue_msgctl),
        LSM_HOOK_INIT(msg_queue_msgsnd, smack_msg_queue_msgsnd),
        LSM_HOOK_INIT(msg_queue_msgrcv, smack_msg_queue_msgrcv),
 
        LSM_HOOK_INIT(shm_alloc_security, smack_ipc_alloc_security),
-       LSM_HOOK_INIT(shm_free_security, smack_ipc_free_security),
        LSM_HOOK_INIT(shm_associate, smack_shm_associate),
        LSM_HOOK_INIT(shm_shmctl, smack_shm_shmctl),
        LSM_HOOK_INIT(shm_shmat, smack_shm_shmat),
 
        LSM_HOOK_INIT(sem_alloc_security, smack_ipc_alloc_security),
-       LSM_HOOK_INIT(sem_free_security, smack_ipc_free_security),
        LSM_HOOK_INIT(sem_associate, smack_sem_associate),
        LSM_HOOK_INIT(sem_semctl, smack_sem_semctl),
        LSM_HOOK_INIT(sem_semop, smack_sem_semop),
@@ -4759,23 +4661,23 @@ static __init void init_smack_known_list(void)
  */
 static __init int smack_init(void)
 {
-       struct cred *cred;
+       struct cred *cred = (struct cred *) current->cred;
        struct task_smack *tsp;
 
-       if (!security_module_enable("smack"))
-               return 0;
-
        smack_inode_cache = KMEM_CACHE(inode_smack, 0);
        if (!smack_inode_cache)
                return -ENOMEM;
 
-       tsp = new_task_smack(&smack_known_floor, &smack_known_floor,
-                               GFP_KERNEL);
-       if (tsp == NULL) {
-               kmem_cache_destroy(smack_inode_cache);
-               return -ENOMEM;
-       }
+       /*
+        * Set the security state for the initial task.
+        */
+       tsp = smack_cred(cred);
+       init_task_smack(tsp, &smack_known_floor, &smack_known_floor);
 
+       /*
+        * Register with LSM
+        */
+       security_add_hooks(smack_hooks, ARRAY_SIZE(smack_hooks), "smack");
        smack_enabled = 1;
 
        pr_info("Smack:  Initializing.\n");
@@ -4789,20 +4691,9 @@ static __init int smack_init(void)
        pr_info("Smack:  IPv6 Netfilter enabled.\n");
 #endif
 
-       /*
-        * Set the security state for the initial task.
-        */
-       cred = (struct cred *) current->cred;
-       cred->security = tsp;
-
        /* initialize the smack_known_list */
        init_smack_known_list();
 
-       /*
-        * Register with LSM
-        */
-       security_add_hooks(smack_hooks, ARRAY_SIZE(smack_hooks), "smack");
-
        return 0;
 }
 
@@ -4812,5 +4703,7 @@ static __init int smack_init(void)
  */
 DEFINE_LSM(smack) = {
        .name = "smack",
+       .flags = LSM_FLAG_LEGACY_MAJOR | LSM_FLAG_EXCLUSIVE,
+       .blobs = &smack_blob_sizes,
        .init = smack_init,
 };
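
The Smack hunks above stop allocating and freeing per-object security data by hand: the LSM core now hands each object a shared security blob sized from smack_blob_sizes, and Smack reads its slice through typed accessors such as smack_cred(), smack_inode() and smack_file(). Those accessors are defined in Smack's headers and are not part of this excerpt; the standalone sketch below (plain userspace C, illustrative names, not kernel code) shows the offset scheme such accessors rely on, assuming the core turns each module's requested size into an offset inside one shared allocation. Because the core owns the whole blob, the module no longer frees anything itself, which is why the *_free_security() hooks are dropped from smack_hooks above.

/*
 * Illustrative userspace sketch of the blob-offset scheme (not kernel code).
 * Only the shape of smack_cred()/lsm_blob_sizes mirrors the hunks above;
 * every other name here is made up for the example.
 */
#include <stdio.h>
#include <stdlib.h>

struct lsm_blob_sizes {
	size_t lbs_cred;	/* requested size, then offset after setup */
};

struct task_smack {		/* stand-in for the real structure */
	const char *smk_task;
};

struct cred {			/* stand-in: only the blob pointer matters */
	void *security;
};

static struct lsm_blob_sizes smack_blob_sizes = {
	.lbs_cred = sizeof(struct task_smack),
};

static size_t cred_blob_total;	/* size of the shared cred blob */

/* The core converts each module's requested size into an offset. */
static void lsm_set_blob_size(size_t *need, size_t *total)
{
	size_t sz = *need;

	*need = *total;		/* the field now holds this module's offset */
	*total += sz;
}

/* Typed accessor with the same shape as smack_cred() used in the hunks. */
static struct task_smack *smack_cred(const struct cred *cred)
{
	return (struct task_smack *)((char *)cred->security +
				     smack_blob_sizes.lbs_cred);
}

int main(void)
{
	struct cred cred;

	lsm_set_blob_size(&smack_blob_sizes.lbs_cred, &cred_blob_total);
	cred.security = calloc(1, cred_blob_total);
	if (!cred.security)
		return 1;

	smack_cred(&cred)->smk_task = "_";	/* cf. init_task_smack() */
	printf("smack label: %s\n", smack_cred(&cred)->smk_task);

	free(cred.security);	/* one free for the whole blob */
	return 0;
}
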
index 06b517075ec06acad0b213d4aa27eec10986f436..faf2ea3968b33f9cc69b778100532c3f3bef1b47 100644
@@ -2208,14 +2208,14 @@ static const struct file_operations smk_logging_ops = {
 
 static void *load_self_seq_start(struct seq_file *s, loff_t *pos)
 {
-       struct task_smack *tsp = current_security();
+       struct task_smack *tsp = smack_cred(current_cred());
 
        return smk_seq_start(s, pos, &tsp->smk_rules);
 }
 
 static void *load_self_seq_next(struct seq_file *s, void *v, loff_t *pos)
 {
-       struct task_smack *tsp = current_security();
+       struct task_smack *tsp = smack_cred(current_cred());
 
        return smk_seq_next(s, v, pos, &tsp->smk_rules);
 }
@@ -2262,7 +2262,7 @@ static int smk_open_load_self(struct inode *inode, struct file *file)
 static ssize_t smk_write_load_self(struct file *file, const char __user *buf,
                              size_t count, loff_t *ppos)
 {
-       struct task_smack *tsp = current_security();
+       struct task_smack *tsp = smack_cred(current_cred());
 
        return smk_write_rules_list(file, buf, count, ppos, &tsp->smk_rules,
                                    &tsp->smk_rules_lock, SMK_FIXED24_FMT);
@@ -2414,14 +2414,14 @@ static const struct file_operations smk_load2_ops = {
 
 static void *load_self2_seq_start(struct seq_file *s, loff_t *pos)
 {
-       struct task_smack *tsp = current_security();
+       struct task_smack *tsp = smack_cred(current_cred());
 
        return smk_seq_start(s, pos, &tsp->smk_rules);
 }
 
 static void *load_self2_seq_next(struct seq_file *s, void *v, loff_t *pos)
 {
-       struct task_smack *tsp = current_security();
+       struct task_smack *tsp = smack_cred(current_cred());
 
        return smk_seq_next(s, v, pos, &tsp->smk_rules);
 }
@@ -2467,7 +2467,7 @@ static int smk_open_load_self2(struct inode *inode, struct file *file)
 static ssize_t smk_write_load_self2(struct file *file, const char __user *buf,
                              size_t count, loff_t *ppos)
 {
-       struct task_smack *tsp = current_security();
+       struct task_smack *tsp = smack_cred(current_cred());
 
        return smk_write_rules_list(file, buf, count, ppos, &tsp->smk_rules,
                                    &tsp->smk_rules_lock, SMK_LONG_FMT);
@@ -2681,14 +2681,14 @@ static const struct file_operations smk_syslog_ops = {
 
 static void *relabel_self_seq_start(struct seq_file *s, loff_t *pos)
 {
-       struct task_smack *tsp = current_security();
+       struct task_smack *tsp = smack_cred(current_cred());
 
        return smk_seq_start(s, pos, &tsp->smk_relabel);
 }
 
 static void *relabel_self_seq_next(struct seq_file *s, void *v, loff_t *pos)
 {
-       struct task_smack *tsp = current_security();
+       struct task_smack *tsp = smack_cred(current_cred());
 
        return smk_seq_next(s, v, pos, &tsp->smk_relabel);
 }
@@ -2736,7 +2736,7 @@ static int smk_open_relabel_self(struct inode *inode, struct file *file)
 static ssize_t smk_write_relabel_self(struct file *file, const char __user *buf,
                                size_t count, loff_t *ppos)
 {
-       struct task_smack *tsp = current_security();
+       struct task_smack *tsp = smack_cred(current_cred());
        char *data;
        int rc;
        LIST_HEAD(list_tmp);
index 479b03a7a17effb75a81212f48fa16580dc7fc3a..3c96e8402e94d516f9665e4a544496c5ea87572e 100644
@@ -32,6 +32,7 @@ static char *tomoyo_print_bprm(struct linux_binprm *bprm,
        int argv_count = bprm->argc;
        int envp_count = bprm->envc;
        bool truncated = false;
+
        if (!buffer)
                return NULL;
        len = snprintf(buffer, tomoyo_buffer_len - 1, "argv[]={ ");
@@ -49,6 +50,7 @@ static char *tomoyo_print_bprm(struct linux_binprm *bprm,
                while (offset < PAGE_SIZE) {
                        const char *kaddr = dump->data;
                        const unsigned char c = kaddr[offset++];
+
                        if (cp == last_start)
                                *cp++ = '"';
                        if (cp >= buffer + tomoyo_buffer_len - 32) {
@@ -154,19 +156,18 @@ static char *tomoyo_print_header(struct tomoyo_request_info *r)
        char *buffer = kmalloc(tomoyo_buffer_len, GFP_NOFS);
        int pos;
        u8 i;
+
        if (!buffer)
                return NULL;
 
        tomoyo_convert_time(ktime_get_real_seconds(), &stamp);
 
        pos = snprintf(buffer, tomoyo_buffer_len - 1,
-                      "#%04u/%02u/%02u %02u:%02u:%02u# profile=%u mode=%s "
-                      "granted=%s (global-pid=%u) task={ pid=%u ppid=%u "
-                      "uid=%u gid=%u euid=%u egid=%u suid=%u sgid=%u "
-                      "fsuid=%u fsgid=%u }", stamp.year, stamp.month,
-                      stamp.day, stamp.hour, stamp.min, stamp.sec, r->profile,
-                      tomoyo_mode[r->mode], tomoyo_yesno(r->granted), gpid,
-                      tomoyo_sys_getpid(), tomoyo_sys_getppid(),
+                      "#%04u/%02u/%02u %02u:%02u:%02u# profile=%u mode=%s granted=%s (global-pid=%u) task={ pid=%u ppid=%u uid=%u gid=%u euid=%u egid=%u suid=%u sgid=%u fsuid=%u fsgid=%u }",
+                      stamp.year, stamp.month, stamp.day, stamp.hour,
+                      stamp.min, stamp.sec, r->profile, tomoyo_mode[r->mode],
+                      tomoyo_yesno(r->granted), gpid, tomoyo_sys_getpid(),
+                      tomoyo_sys_getppid(),
                       from_kuid(&init_user_ns, current_uid()),
                       from_kgid(&init_user_ns, current_gid()),
                       from_kuid(&init_user_ns, current_euid()),
@@ -185,6 +186,7 @@ static char *tomoyo_print_header(struct tomoyo_request_info *r)
                struct tomoyo_mini_stat *stat;
                unsigned int dev;
                umode_t mode;
+
                if (!obj->stat_valid[i])
                        continue;
                stat = &obj->stat[i];
@@ -193,8 +195,8 @@ static char *tomoyo_print_header(struct tomoyo_request_info *r)
                if (i & 1) {
                        pos += snprintf(buffer + pos,
                                        tomoyo_buffer_len - 1 - pos,
-                                       " path%u.parent={ uid=%u gid=%u "
-                                       "ino=%lu perm=0%o }", (i >> 1) + 1,
+                                       " path%u.parent={ uid=%u gid=%u ino=%lu perm=0%o }",
+                                       (i >> 1) + 1,
                                        from_kuid(&init_user_ns, stat->uid),
                                        from_kgid(&init_user_ns, stat->gid),
                                        (unsigned long)stat->ino,
@@ -202,8 +204,8 @@ static char *tomoyo_print_header(struct tomoyo_request_info *r)
                        continue;
                }
                pos += snprintf(buffer + pos, tomoyo_buffer_len - 1 - pos,
-                               " path%u={ uid=%u gid=%u ino=%lu major=%u"
-                               " minor=%u perm=0%o type=%s", (i >> 1) + 1,
+                               " path%u={ uid=%u gid=%u ino=%lu major=%u minor=%u perm=0%o type=%s",
+                               (i >> 1) + 1,
                                from_kuid(&init_user_ns, stat->uid),
                                from_kgid(&init_user_ns, stat->gid),
                                (unsigned long)stat->ino,
@@ -249,6 +251,7 @@ char *tomoyo_init_log(struct tomoyo_request_info *r, int len, const char *fmt,
        const char *symlink = NULL;
        int pos;
        const char *domainname = r->domain->domainname->name;
+
        header = tomoyo_print_header(r);
        if (!header)
                return NULL;
@@ -256,6 +259,7 @@ char *tomoyo_init_log(struct tomoyo_request_info *r, int len, const char *fmt,
        len += strlen(domainname) + strlen(header) + 10;
        if (r->ee) {
                struct file *file = r->ee->bprm->file;
+
                realpath = tomoyo_realpath_from_path(&file->f_path);
                bprm_info = tomoyo_print_bprm(r->ee->bprm, &r->ee->dump);
                if (!realpath || !bprm_info)
@@ -275,6 +279,7 @@ char *tomoyo_init_log(struct tomoyo_request_info *r, int len, const char *fmt,
        pos = snprintf(buf, len, "%s", header);
        if (realpath) {
                struct linux_binprm *bprm = r->ee->bprm;
+
                pos += snprintf(buf + pos, len - pos,
                                " exec={ realpath=\"%s\" argc=%d envc=%d %s }",
                                realpath, bprm->argc, bprm->envc, bprm_info);
@@ -328,6 +333,7 @@ static bool tomoyo_get_audit(const struct tomoyo_policy_namespace *ns,
        const u8 category = tomoyo_index2category[index] +
                TOMOYO_MAX_MAC_INDEX;
        struct tomoyo_profile *p;
+
        if (!tomoyo_policy_loaded)
                return false;
        p = tomoyo_profile(ns, profile);
@@ -362,6 +368,7 @@ void tomoyo_write_log2(struct tomoyo_request_info *r, int len, const char *fmt,
        char *buf;
        struct tomoyo_log *entry;
        bool quota_exceeded = false;
+
        if (!tomoyo_get_audit(r->domain->ns, r->profile, r->type,
                              r->matched_acl, r->granted))
                goto out;
@@ -413,6 +420,7 @@ void tomoyo_write_log(struct tomoyo_request_info *r, const char *fmt, ...)
 {
        va_list args;
        int len;
+
        va_start(args, fmt);
        len = vsnprintf((char *) &len, 1, fmt, args) + 1;
        va_end(args);
@@ -431,6 +439,7 @@ void tomoyo_write_log(struct tomoyo_request_info *r, const char *fmt, ...)
 void tomoyo_read_log(struct tomoyo_io_buffer *head)
 {
        struct tomoyo_log *ptr = NULL;
+
        if (head->r.w_pos)
                return;
        kfree(head->read_buf);
index c598aa00d5e3170206eca0914216eec52ee3fef9..57988d95d33de2050bacd73cfc6252cf8d836861 100644
@@ -197,6 +197,7 @@ static void tomoyo_addprintf(char *buffer, int len, const char *fmt, ...)
 {
        va_list args;
        const int pos = strlen(buffer);
+
        va_start(args, fmt);
        vsnprintf(buffer + pos, len - pos - 1, fmt, args);
        va_end(args);
@@ -214,6 +215,7 @@ static bool tomoyo_flush(struct tomoyo_io_buffer *head)
        while (head->r.w_pos) {
                const char *w = head->r.w[0];
                size_t len = strlen(w);
+
                if (len) {
                        if (len > head->read_user_buf_avail)
                                len = head->read_user_buf_avail;
@@ -279,6 +281,7 @@ static void tomoyo_io_printf(struct tomoyo_io_buffer *head, const char *fmt,
        size_t len;
        size_t pos = head->r.avail;
        int size = head->readbuf_size - pos;
+
        if (size <= 0)
                return;
        va_start(args, fmt);
@@ -344,13 +347,14 @@ static bool tomoyo_namespace_enabled;
 void tomoyo_init_policy_namespace(struct tomoyo_policy_namespace *ns)
 {
        unsigned int idx;
+
        for (idx = 0; idx < TOMOYO_MAX_ACL_GROUPS; idx++)
                INIT_LIST_HEAD(&ns->acl_group[idx]);
        for (idx = 0; idx < TOMOYO_MAX_GROUP; idx++)
                INIT_LIST_HEAD(&ns->group_list[idx]);
        for (idx = 0; idx < TOMOYO_MAX_POLICY; idx++)
                INIT_LIST_HEAD(&ns->policy_list[idx]);
-       ns->profile_version = 20110903;
+       ns->profile_version = 20150505;
        tomoyo_namespace_enabled = !list_empty(&tomoyo_namespace_list);
        list_add_tail_rcu(&ns->namespace_list, &tomoyo_namespace_list);
 }
@@ -433,6 +437,7 @@ static void tomoyo_print_number_union_nospace
                u8 min_type = ptr->value_type[0];
                const u8 max_type = ptr->value_type[1];
                char buffer[128];
+
                buffer[0] = '\0';
                for (i = 0; i < 2; i++) {
                        switch (min_type) {
@@ -487,6 +492,7 @@ static struct tomoyo_profile *tomoyo_assign_profile
 {
        struct tomoyo_profile *ptr;
        struct tomoyo_profile *entry;
+
        if (profile >= TOMOYO_MAX_PROFILES)
                return NULL;
        ptr = ns->profile_ptr[profile];
@@ -530,6 +536,7 @@ struct tomoyo_profile *tomoyo_profile(const struct tomoyo_policy_namespace *ns,
 {
        static struct tomoyo_profile tomoyo_null_profile;
        struct tomoyo_profile *ptr = ns->profile_ptr[profile];
+
        if (!ptr)
                ptr = &tomoyo_null_profile;
        return ptr;
@@ -546,6 +553,7 @@ struct tomoyo_profile *tomoyo_profile(const struct tomoyo_policy_namespace *ns,
 static s8 tomoyo_find_yesno(const char *string, const char *find)
 {
        const char *cp = strstr(string, find);
+
        if (cp) {
                cp += strlen(find);
                if (!strncmp(cp, "=yes", 4))
@@ -569,6 +577,7 @@ static void tomoyo_set_uint(unsigned int *i, const char *string,
                            const char *find)
 {
        const char *cp = strstr(string, find);
+
        if (cp)
                sscanf(cp + strlen(find), "=%u", i);
 }
@@ -587,6 +596,7 @@ static int tomoyo_set_mode(char *name, const char *value,
 {
        u8 i;
        u8 config;
+
        if (!strcmp(name, "CONFIG")) {
                i = TOMOYO_MAX_MAC_INDEX + TOMOYO_MAX_MAC_CATEGORY_INDEX;
                config = profile->default_config;
@@ -595,10 +605,12 @@ static int tomoyo_set_mode(char *name, const char *value,
                for (i = 0; i < TOMOYO_MAX_MAC_INDEX
                             + TOMOYO_MAX_MAC_CATEGORY_INDEX; i++) {
                        int len = 0;
+
                        if (i < TOMOYO_MAX_MAC_INDEX) {
                                const u8 c = tomoyo_index2category[i];
                                const char *category =
                                        tomoyo_category_keywords[c];
+
                                len = strlen(category);
                                if (strncmp(name, category, len) ||
                                    name[len++] != ':' || name[len++] != ':')
@@ -618,6 +630,7 @@ static int tomoyo_set_mode(char *name, const char *value,
                config = TOMOYO_CONFIG_USE_DEFAULT;
        } else {
                u8 mode;
+
                for (mode = 0; mode < 4; mode++)
                        if (strstr(value, tomoyo_mode[mode]))
                                /*
@@ -664,6 +677,7 @@ static int tomoyo_write_profile(struct tomoyo_io_buffer *head)
        unsigned int i;
        char *cp;
        struct tomoyo_profile *profile;
+
        if (sscanf(data, "PROFILE_VERSION=%u", &head->w.ns->profile_version)
            == 1)
                return 0;
@@ -683,6 +697,7 @@ static int tomoyo_write_profile(struct tomoyo_io_buffer *head)
                const struct tomoyo_path_info *new_comment
                        = tomoyo_get_name(cp);
                const struct tomoyo_path_info *old_comment;
+
                if (!new_comment)
                        return -ENOMEM;
                spin_lock(&lock);
@@ -732,6 +747,7 @@ static void tomoyo_read_profile(struct tomoyo_io_buffer *head)
        struct tomoyo_policy_namespace *ns =
                container_of(head->r.ns, typeof(*ns), namespace_list);
        const struct tomoyo_profile *profile;
+
        if (head->r.eof)
                return;
  next:
@@ -760,6 +776,7 @@ static void tomoyo_read_profile(struct tomoyo_io_buffer *head)
                        u8 i;
                        const struct tomoyo_path_info *comment =
                                profile->comment;
+
                        tomoyo_print_namespace(head);
                        tomoyo_io_printf(head, "%u-COMMENT=", index);
                        tomoyo_set_string(head, comment ? comment->name : "");
@@ -788,6 +805,7 @@ static void tomoyo_read_profile(struct tomoyo_io_buffer *head)
                              + TOMOYO_MAX_MAC_CATEGORY_INDEX; head->r.bit++) {
                        const u8 i = head->r.bit;
                        const u8 config = profile->config[i];
+
                        if (config == TOMOYO_CONFIG_USE_DEFAULT)
                                continue;
                        tomoyo_print_namespace(head);
@@ -847,10 +865,10 @@ static int tomoyo_update_manager_entry(const char *manager,
        struct tomoyo_acl_param param = {
                /* .ns = &tomoyo_kernel_namespace, */
                .is_delete = is_delete,
-               .list = &tomoyo_kernel_namespace.
-               policy_list[TOMOYO_ID_MANAGER],
+               .list = &tomoyo_kernel_namespace.policy_list[TOMOYO_ID_MANAGER],
        };
        int error = is_delete ? -ENOENT : -ENOMEM;
+
        if (!tomoyo_correct_domain(manager) &&
            !tomoyo_correct_word(manager))
                return -EINVAL;
@@ -894,10 +912,10 @@ static void tomoyo_read_manager(struct tomoyo_io_buffer *head)
 {
        if (head->r.eof)
                return;
-       list_for_each_cookie(head->r.acl, &tomoyo_kernel_namespace.
-                            policy_list[TOMOYO_ID_MANAGER]) {
+       list_for_each_cookie(head->r.acl, &tomoyo_kernel_namespace.policy_list[TOMOYO_ID_MANAGER]) {
                struct tomoyo_manager *ptr =
                        list_entry(head->r.acl, typeof(*ptr), head.list);
+
                if (ptr->head.is_deleted)
                        continue;
                if (!tomoyo_flush(head))
@@ -933,8 +951,7 @@ static bool tomoyo_manager(void)
        exe = tomoyo_get_exe();
        if (!exe)
                return false;
-       list_for_each_entry_rcu(ptr, &tomoyo_kernel_namespace.
-                               policy_list[TOMOYO_ID_MANAGER], head.list) {
+       list_for_each_entry_rcu(ptr, &tomoyo_kernel_namespace.policy_list[TOMOYO_ID_MANAGER], head.list) {
                if (!ptr->head.is_deleted &&
                    (!tomoyo_pathcmp(domainname, ptr->manager) ||
                     !strcmp(exe, ptr->manager->name))) {
@@ -945,9 +962,10 @@ static bool tomoyo_manager(void)
        if (!found) { /* Reduce error messages. */
                static pid_t last_pid;
                const pid_t pid = current->pid;
+
                if (last_pid != pid) {
-                       printk(KERN_WARNING "%s ( %s ) is not permitted to "
-                              "update policies.\n", domainname->name, exe);
+                       pr_warn("%s ( %s ) is not permitted to update policies.\n",
+                               domainname->name, exe);
                        last_pid = pid;
                }
        }
@@ -974,19 +992,21 @@ static bool tomoyo_select_domain(struct tomoyo_io_buffer *head,
        unsigned int pid;
        struct tomoyo_domain_info *domain = NULL;
        bool global_pid = false;
+
        if (strncmp(data, "select ", 7))
                return false;
        data += 7;
        if (sscanf(data, "pid=%u", &pid) == 1 ||
            (global_pid = true, sscanf(data, "global-pid=%u", &pid) == 1)) {
                struct task_struct *p;
+
                rcu_read_lock();
                if (global_pid)
                        p = find_task_by_pid_ns(pid, &init_pid_ns);
                else
                        p = find_task_by_vpid(pid);
                if (p)
-                       domain = tomoyo_real_domain(p);
+                       domain = tomoyo_task(p)->domain_info;
                rcu_read_unlock();
        } else if (!strncmp(data, "domain=", 7)) {
                if (tomoyo_domain_def(data + 7))
@@ -1020,10 +1040,11 @@ static bool tomoyo_select_domain(struct tomoyo_io_buffer *head,
  * Returns true if @a == @b, false otherwise.
  */
 static bool tomoyo_same_task_acl(const struct tomoyo_acl_info *a,
-                             const struct tomoyo_acl_info *b)
+                                const struct tomoyo_acl_info *b)
 {
        const struct tomoyo_task_acl *p1 = container_of(a, typeof(*p1), head);
        const struct tomoyo_task_acl *p2 = container_of(b, typeof(*p2), head);
+
        return p1->domainname == p2->domainname;
 }
 
@@ -1039,11 +1060,13 @@ static bool tomoyo_same_task_acl(const struct tomoyo_acl_info *a,
 static int tomoyo_write_task(struct tomoyo_acl_param *param)
 {
        int error = -EINVAL;
+
        if (tomoyo_str_starts(&param->data, "manual_domain_transition ")) {
                struct tomoyo_task_acl e = {
                        .head.type = TOMOYO_TYPE_MANUAL_TASK_ACL,
                        .domainname = tomoyo_get_domainname(param),
                };
+
                if (e.domainname)
                        error = tomoyo_update_domain(&e.head, sizeof(e), param,
                                                     tomoyo_same_task_acl,
@@ -1110,7 +1133,7 @@ static int tomoyo_write_domain2(struct tomoyo_policy_namespace *ns,
        };
        static const struct {
                const char *keyword;
-               int (*write) (struct tomoyo_acl_param *);
+               int (*write)(struct tomoyo_acl_param *param);
        } tomoyo_callback[5] = {
                { "file ", tomoyo_write_file },
                { "network inet ", tomoyo_write_inet_network },
@@ -1151,9 +1174,11 @@ static int tomoyo_write_domain(struct tomoyo_io_buffer *head)
        struct tomoyo_domain_info *domain = head->w.domain;
        const bool is_delete = head->w.is_delete;
        bool is_select = !is_delete && tomoyo_str_starts(&data, "select ");
-       unsigned int profile;
+       unsigned int idx;
+
        if (*data == '<') {
                int ret = 0;
+
                domain = NULL;
                if (is_delete)
                        ret = tomoyo_delete_domain(data);
@@ -1167,23 +1192,27 @@ static int tomoyo_write_domain(struct tomoyo_io_buffer *head)
        if (!domain)
                return -EINVAL;
        ns = domain->ns;
-       if (sscanf(data, "use_profile %u", &profile) == 1
-           && profile < TOMOYO_MAX_PROFILES) {
-               if (!tomoyo_policy_loaded || ns->profile_ptr[profile])
-                       domain->profile = (u8) profile;
+       if (sscanf(data, "use_profile %u", &idx) == 1
+           && idx < TOMOYO_MAX_PROFILES) {
+               if (!tomoyo_policy_loaded || ns->profile_ptr[idx])
+                       if (!is_delete)
+                               domain->profile = (u8) idx;
                return 0;
        }
-       if (sscanf(data, "use_group %u\n", &profile) == 1
-           && profile < TOMOYO_MAX_ACL_GROUPS) {
+       if (sscanf(data, "use_group %u\n", &idx) == 1
+           && idx < TOMOYO_MAX_ACL_GROUPS) {
                if (!is_delete)
-                       domain->group = (u8) profile;
+                       set_bit(idx, domain->group);
+               else
+                       clear_bit(idx, domain->group);
                return 0;
        }
-       for (profile = 0; profile < TOMOYO_MAX_DOMAIN_INFO_FLAGS; profile++) {
-               const char *cp = tomoyo_dif[profile];
+       for (idx = 0; idx < TOMOYO_MAX_DOMAIN_INFO_FLAGS; idx++) {
+               const char *cp = tomoyo_dif[idx];
+
                if (strncmp(data, cp, strlen(cp) - 1))
                        continue;
-               domain->flags[profile] = !is_delete;
+               domain->flags[idx] = !is_delete;
                return 0;
        }
        return tomoyo_write_domain2(ns, &domain->acl_info_list, data,
@@ -1225,9 +1254,11 @@ static bool tomoyo_print_condition(struct tomoyo_io_buffer *head,
                        const struct tomoyo_envp *envp =
                                (typeof(envp)) (argv + cond->argc);
                        u16 skip;
+
                        for (skip = 0; skip < head->r.cond_index; skip++) {
                                const u8 left = condp->left;
                                const u8 right = condp->right;
+
                                condp++;
                                switch (left) {
                                case TOMOYO_ARGV_ENTRY:
@@ -1253,6 +1284,7 @@ static bool tomoyo_print_condition(struct tomoyo_io_buffer *head,
                                const u8 match = condp->equals;
                                const u8 left = condp->left;
                                const u8 right = condp->right;
+
                                if (!tomoyo_flush(head))
                                        return false;
                                condp++;
@@ -1262,8 +1294,7 @@ static bool tomoyo_print_condition(struct tomoyo_io_buffer *head,
                                case TOMOYO_ARGV_ENTRY:
                                        tomoyo_io_printf(head,
                                                         "exec.argv[%lu]%s=\"",
-                                                        argv->index, argv->
-                                                        is_not ? "!" : "");
+                                                        argv->index, argv->is_not ? "!" : "");
                                        tomoyo_set_string(head,
                                                          argv->value->name);
                                        tomoyo_set_string(head, "\"");
@@ -1274,12 +1305,10 @@ static bool tomoyo_print_condition(struct tomoyo_io_buffer *head,
                                                          "exec.envp[\"");
                                        tomoyo_set_string(head,
                                                          envp->name->name);
-                                       tomoyo_io_printf(head, "\"]%s=", envp->
-                                                        is_not ? "!" : "");
+                                       tomoyo_io_printf(head, "\"]%s=", envp->is_not ? "!" : "");
                                        if (envp->value) {
                                                tomoyo_set_string(head, "\"");
-                                               tomoyo_set_string(head, envp->
-                                                                 value->name);
+                                               tomoyo_set_string(head, envp->value->name);
                                                tomoyo_set_string(head, "\"");
                                        } else {
                                                tomoyo_set_string(head,
@@ -1375,6 +1404,7 @@ static bool tomoyo_print_entry(struct tomoyo_io_buffer *head,
                struct tomoyo_path_acl *ptr =
                        container_of(acl, typeof(*ptr), head);
                const u16 perm = ptr->perm;
+
                for (bit = 0; bit < TOMOYO_MAX_PATH_OPERATION; bit++) {
                        if (!(perm & (1 << bit)))
                                continue;
@@ -1395,6 +1425,7 @@ static bool tomoyo_print_entry(struct tomoyo_io_buffer *head,
        } else if (acl_type == TOMOYO_TYPE_MANUAL_TASK_ACL) {
                struct tomoyo_task_acl *ptr =
                        container_of(acl, typeof(*ptr), head);
+
                tomoyo_set_group(head, "task ");
                tomoyo_set_string(head, "manual_domain_transition ");
                tomoyo_set_string(head, ptr->domainname->name);
@@ -1404,6 +1435,7 @@ static bool tomoyo_print_entry(struct tomoyo_io_buffer *head,
                struct tomoyo_path2_acl *ptr =
                        container_of(acl, typeof(*ptr), head);
                const u8 perm = ptr->perm;
+
                for (bit = 0; bit < TOMOYO_MAX_PATH2_OPERATION; bit++) {
                        if (!(perm & (1 << bit)))
                                continue;
@@ -1424,6 +1456,7 @@ static bool tomoyo_print_entry(struct tomoyo_io_buffer *head,
                struct tomoyo_path_number_acl *ptr =
                        container_of(acl, typeof(*ptr), head);
                const u8 perm = ptr->perm;
+
                for (bit = 0; bit < TOMOYO_MAX_PATH_NUMBER_OPERATION; bit++) {
                        if (!(perm & (1 << bit)))
                                continue;
@@ -1444,6 +1477,7 @@ static bool tomoyo_print_entry(struct tomoyo_io_buffer *head,
                struct tomoyo_mkdev_acl *ptr =
                        container_of(acl, typeof(*ptr), head);
                const u8 perm = ptr->perm;
+
                for (bit = 0; bit < TOMOYO_MAX_MKDEV_OPERATION; bit++) {
                        if (!(perm & (1 << bit)))
                                continue;
@@ -1490,6 +1524,7 @@ static bool tomoyo_print_entry(struct tomoyo_io_buffer *head,
                                          ->name);
                } else {
                        char buf[128];
+
                        tomoyo_print_ip(buf, sizeof(buf), &ptr->address);
                        tomoyo_io_printf(head, "%s", buf);
                }
@@ -1519,6 +1554,7 @@ static bool tomoyo_print_entry(struct tomoyo_io_buffer *head,
        } else if (acl_type == TOMOYO_TYPE_MOUNT_ACL) {
                struct tomoyo_mount_acl *ptr =
                        container_of(acl, typeof(*ptr), head);
+
                tomoyo_set_group(head, "file mount");
                tomoyo_print_name_union(head, &ptr->dev_name);
                tomoyo_print_name_union(head, &ptr->dir_name);
@@ -1562,6 +1598,7 @@ static bool tomoyo_read_domain2(struct tomoyo_io_buffer *head,
        list_for_each_cookie(head->r.acl, list) {
                struct tomoyo_acl_info *ptr =
                        list_entry(head->r.acl, typeof(*ptr), list);
+
                if (!tomoyo_print_entry(head, ptr))
                        return false;
        }
@@ -1583,8 +1620,9 @@ static void tomoyo_read_domain(struct tomoyo_io_buffer *head)
        list_for_each_cookie(head->r.domain, &tomoyo_domain_list) {
                struct tomoyo_domain_info *domain =
                        list_entry(head->r.domain, typeof(*domain), list);
+               u8 i;
+
                switch (head->r.step) {
-                       u8 i;
                case 0:
                        if (domain->is_deleted &&
                            !head->r.print_this_domain_only)
@@ -1594,22 +1632,33 @@ static void tomoyo_read_domain(struct tomoyo_io_buffer *head)
                        tomoyo_set_lf(head);
                        tomoyo_io_printf(head, "use_profile %u\n",
                                         domain->profile);
-                       tomoyo_io_printf(head, "use_group %u\n",
-                                        domain->group);
                        for (i = 0; i < TOMOYO_MAX_DOMAIN_INFO_FLAGS; i++)
                                if (domain->flags[i])
                                        tomoyo_set_string(head, tomoyo_dif[i]);
+                       head->r.index = 0;
                        head->r.step++;
-                       tomoyo_set_lf(head);
                        /* fall through */
                case 1:
+                       while (head->r.index < TOMOYO_MAX_ACL_GROUPS) {
+                               i = head->r.index++;
+                               if (!test_bit(i, domain->group))
+                                       continue;
+                               tomoyo_io_printf(head, "use_group %u\n", i);
+                               if (!tomoyo_flush(head))
+                                       return;
+                       }
+                       head->r.index = 0;
+                       head->r.step++;
+                       tomoyo_set_lf(head);
+                       /* fall through */
+               case 2:
                        if (!tomoyo_read_domain2(head, &domain->acl_info_list))
                                return;
                        head->r.step++;
                        if (!tomoyo_set_lf(head))
                                return;
                        /* fall through */
-               case 2:
+               case 3:
                        head->r.step = 0;
                        if (head->r.print_this_domain_only)
                                goto done;
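
The use_group rework above replaces a single u8 group index with a bitmap, so one domain can draw ACLs from several groups: the write side flips bits with set_bit()/clear_bit() and the new step in tomoyo_read_domain() re-emits one "use_group N" line per set bit. Below is a self-contained sketch of that representation (plain userspace C with hand-rolled helpers standing in for the kernel bitops; the 256-group limit is only assumed here for the example).

/* Illustrative only: a fixed-size bitmap replacing "u8 group". */
#include <stdio.h>
#include <limits.h>

#define MAX_ACL_GROUPS	256	/* assumed limit for this example */
#define BITS_PER_LONG	(sizeof(long) * CHAR_BIT)
#define GROUP_WORDS	((MAX_ACL_GROUPS + BITS_PER_LONG - 1) / BITS_PER_LONG)

struct domain {
	unsigned long group[GROUP_WORDS];	/* was: u8 group; */
};

static void set_group(unsigned int nr, unsigned long *map)
{
	map[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
}

static int test_group(unsigned int nr, const unsigned long *map)
{
	return (map[nr / BITS_PER_LONG] >> (nr % BITS_PER_LONG)) & 1;
}

int main(void)
{
	struct domain d = { { 0 } };
	unsigned int i;

	set_group(0, d.group);			/* "use_group 0" */
	set_group(5, d.group);			/* "use_group 5" */
	for (i = 0; i < MAX_ACL_GROUPS; i++)	/* read side, cf. step 1 loop */
		if (test_group(i, d.group))
			printf("use_group %u\n", i);
	return 0;
}
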
@@ -1668,7 +1717,7 @@ static void tomoyo_read_pid(struct tomoyo_io_buffer *head)
        else
                p = find_task_by_vpid(pid);
        if (p)
-               domain = tomoyo_real_domain(p);
+               domain = tomoyo_task(p)->domain_info;
        rcu_read_unlock();
        if (!domain)
                return;
@@ -1711,6 +1760,7 @@ static int tomoyo_write_exception(struct tomoyo_io_buffer *head)
                .data = head->write_buf,
        };
        u8 i;
+
        if (tomoyo_str_starts(&param.data, "aggregator "))
                return tomoyo_write_aggregator(&param);
        for (i = 0; i < TOMOYO_MAX_TRANSITION_TYPE; i++)
@@ -1722,6 +1772,7 @@ static int tomoyo_write_exception(struct tomoyo_io_buffer *head)
        if (tomoyo_str_starts(&param.data, "acl_group ")) {
                unsigned int group;
                char *data;
+
                group = simple_strtoul(param.data, &data, 10);
                if (group < TOMOYO_MAX_ACL_GROUPS && *data++ == ' ')
                        return tomoyo_write_domain2
@@ -1746,12 +1797,15 @@ static bool tomoyo_read_group(struct tomoyo_io_buffer *head, const int idx)
        struct tomoyo_policy_namespace *ns =
                container_of(head->r.ns, typeof(*ns), namespace_list);
        struct list_head *list = &ns->group_list[idx];
+
        list_for_each_cookie(head->r.group, list) {
                struct tomoyo_group *group =
                        list_entry(head->r.group, typeof(*group), head.list);
+
                list_for_each_cookie(head->r.acl, &group->member_list) {
                        struct tomoyo_acl_head *ptr =
                                list_entry(head->r.acl, typeof(*ptr), list);
+
                        if (ptr->is_deleted)
                                continue;
                        if (!tomoyo_flush(head))
@@ -1771,10 +1825,10 @@ static bool tomoyo_read_group(struct tomoyo_io_buffer *head, const int idx)
                                                           head)->number);
                        } else if (idx == TOMOYO_ADDRESS_GROUP) {
                                char buffer[128];
-
                                struct tomoyo_address_group *member =
                                        container_of(ptr, typeof(*member),
                                                     head);
+
                                tomoyo_print_ip(buffer, sizeof(buffer),
                                                &member->address);
                                tomoyo_io_printf(head, " %s", buffer);
@@ -1802,6 +1856,7 @@ static bool tomoyo_read_policy(struct tomoyo_io_buffer *head, const int idx)
        struct tomoyo_policy_namespace *ns =
                container_of(head->r.ns, typeof(*ns), namespace_list);
        struct list_head *list = &ns->policy_list[idx];
+
        list_for_each_cookie(head->r.acl, list) {
                struct tomoyo_acl_head *acl =
                        container_of(head->r.acl, typeof(*acl), list);
@@ -1814,6 +1869,7 @@ static bool tomoyo_read_policy(struct tomoyo_io_buffer *head, const int idx)
                        {
                                struct tomoyo_transition_control *ptr =
                                        container_of(acl, typeof(*ptr), head);
+
                                tomoyo_print_namespace(head);
                                tomoyo_set_string(head, tomoyo_transition_type
                                                  [ptr->type]);
@@ -1829,6 +1885,7 @@ static bool tomoyo_read_policy(struct tomoyo_io_buffer *head, const int idx)
                        {
                                struct tomoyo_aggregator *ptr =
                                        container_of(acl, typeof(*ptr), head);
+
                                tomoyo_print_namespace(head);
                                tomoyo_set_string(head, "aggregator ");
                                tomoyo_set_string(head,
@@ -1858,6 +1915,7 @@ static void tomoyo_read_exception(struct tomoyo_io_buffer *head)
 {
        struct tomoyo_policy_namespace *ns =
                container_of(head->r.ns, typeof(*ns), namespace_list);
+
        if (head->r.eof)
                return;
        while (head->r.step < TOMOYO_MAX_POLICY &&
@@ -1921,6 +1979,7 @@ static atomic_t tomoyo_query_observers = ATOMIC_INIT(0);
 static int tomoyo_truncate(char *str)
 {
        char *start = str;
+
        while (*(unsigned char *) str > (unsigned char) ' ')
                str++;
        *str = '\0';
@@ -1943,6 +2002,7 @@ static void tomoyo_add_entry(struct tomoyo_domain_info *domain, char *header)
        char *symlink = NULL;
        char *cp = strchr(header, '\n');
        int len;
+
        if (!cp)
                return;
        cp = strchr(cp + 1, '\n');
@@ -2002,6 +2062,7 @@ int tomoyo_supervisor(struct tomoyo_request_info *r, const char *fmt, ...)
        static unsigned int tomoyo_serial;
        struct tomoyo_query entry = { };
        bool quota_exceeded = false;
+
        va_start(args, fmt);
        len = vsnprintf((char *) &len, 1, fmt, args) + 1;
        va_end(args);
@@ -2063,8 +2124,7 @@ int tomoyo_supervisor(struct tomoyo_request_info *r, const char *fmt, ...)
                    (tomoyo_answer_wait, entry.answer ||
                     !atomic_read(&tomoyo_query_observers), HZ))
                        break;
-               else
-                       entry.timer++;
+               entry.timer++;
        }
        spin_lock(&tomoyo_query_list_lock);
        list_del(&entry.list);
@@ -2100,6 +2160,7 @@ static struct tomoyo_domain_info *tomoyo_find_domain_by_qid
 {
        struct tomoyo_query *ptr;
        struct tomoyo_domain_info *domain = NULL;
+
        spin_lock(&tomoyo_query_list_lock);
        list_for_each_entry(ptr, &tomoyo_query_list, list) {
                if (ptr->serial != serial)
@@ -2142,15 +2203,15 @@ static void tomoyo_read_query(struct tomoyo_io_buffer *head)
        unsigned int pos = 0;
        size_t len = 0;
        char *buf;
+
        if (head->r.w_pos)
                return;
-       if (head->read_buf) {
-               kfree(head->read_buf);
-               head->read_buf = NULL;
-       }
+       kfree(head->read_buf);
+       head->read_buf = NULL;
        spin_lock(&tomoyo_query_list_lock);
        list_for_each(tmp, &tomoyo_query_list) {
                struct tomoyo_query *ptr = list_entry(tmp, typeof(*ptr), list);
+
                if (pos++ != head->r.query_index)
                        continue;
                len = ptr->query_len;
@@ -2168,6 +2229,7 @@ static void tomoyo_read_query(struct tomoyo_io_buffer *head)
        spin_lock(&tomoyo_query_list_lock);
        list_for_each(tmp, &tomoyo_query_list) {
                struct tomoyo_query *ptr = list_entry(tmp, typeof(*ptr), list);
+
                if (pos++ != head->r.query_index)
                        continue;
                /*
@@ -2202,9 +2264,11 @@ static int tomoyo_write_answer(struct tomoyo_io_buffer *head)
        struct list_head *tmp;
        unsigned int serial;
        unsigned int answer;
+
        spin_lock(&tomoyo_query_list_lock);
        list_for_each(tmp, &tomoyo_query_list) {
                struct tomoyo_query *ptr = list_entry(tmp, typeof(*ptr), list);
+
                ptr->timer = 0;
        }
        spin_unlock(&tomoyo_query_list_lock);
@@ -2213,6 +2277,7 @@ static int tomoyo_write_answer(struct tomoyo_io_buffer *head)
        spin_lock(&tomoyo_query_list_lock);
        list_for_each(tmp, &tomoyo_query_list) {
                struct tomoyo_query *ptr = list_entry(tmp, typeof(*ptr), list);
+
                if (ptr->serial != serial)
                        continue;
                ptr->answer = answer;
@@ -2235,7 +2300,7 @@ static int tomoyo_write_answer(struct tomoyo_io_buffer *head)
 static void tomoyo_read_version(struct tomoyo_io_buffer *head)
 {
        if (!head->r.eof) {
-               tomoyo_io_printf(head, "2.5.0");
+               tomoyo_io_printf(head, "2.6.0");
                head->r.eof = true;
        }
 }
@@ -2287,6 +2352,7 @@ static void tomoyo_read_stat(struct tomoyo_io_buffer *head)
 {
        u8 i;
        unsigned int total = 0;
+
        if (head->r.eof)
                return;
        for (i = 0; i < TOMOYO_MAX_POLICY_STAT; i++) {
@@ -2295,9 +2361,9 @@ static void tomoyo_read_stat(struct tomoyo_io_buffer *head)
                                 tomoyo_stat_updated[i]);
                if (tomoyo_stat_modified[i]) {
                        struct tomoyo_time stamp;
+
                        tomoyo_convert_time(tomoyo_stat_modified[i], &stamp);
-                       tomoyo_io_printf(head, " (Last: %04u/%02u/%02u "
-                                        "%02u:%02u:%02u)",
+                       tomoyo_io_printf(head, " (Last: %04u/%02u/%02u %02u:%02u:%02u)",
                                         stamp.year, stamp.month, stamp.day,
                                         stamp.hour, stamp.min, stamp.sec);
                }
@@ -2305,6 +2371,7 @@ static void tomoyo_read_stat(struct tomoyo_io_buffer *head)
        }
        for (i = 0; i < TOMOYO_MAX_MEMORY_STAT; i++) {
                unsigned int used = tomoyo_memory_used[i];
+
                total += used;
                tomoyo_io_printf(head, "Memory used by %-22s %10u",
                                 tomoyo_memory_headers[i], used);
@@ -2329,6 +2396,7 @@ static int tomoyo_write_stat(struct tomoyo_io_buffer *head)
 {
        char *data = head->write_buf;
        u8 i;
+
        if (tomoyo_str_starts(&data, "Memory used by "))
                for (i = 0; i < TOMOYO_MAX_MEMORY_STAT; i++)
                        if (tomoyo_str_starts(&data, tomoyo_memory_headers[i]))
@@ -2457,6 +2525,7 @@ int tomoyo_open_control(const u8 type, struct file *file)
 __poll_t tomoyo_poll_control(struct file *file, poll_table *wait)
 {
        struct tomoyo_io_buffer *head = file->private_data;
+
        if (head->poll)
                return head->poll(file, wait) | EPOLLOUT | EPOLLWRNORM;
        return EPOLLIN | EPOLLRDNORM | EPOLLOUT | EPOLLWRNORM;
@@ -2472,6 +2541,7 @@ __poll_t tomoyo_poll_control(struct file *file, poll_table *wait)
 static inline void tomoyo_set_namespace_cursor(struct tomoyo_io_buffer *head)
 {
        struct list_head *ns;
+
        if (head->type != TOMOYO_EXCEPTIONPOLICY &&
            head->type != TOMOYO_PROFILE)
                return;
@@ -2517,7 +2587,7 @@ ssize_t tomoyo_read_control(struct tomoyo_io_buffer *head, char __user *buffer,
        int idx;
 
        if (!head->read)
-               return -ENOSYS;
+               return -EINVAL;
        if (mutex_lock_interruptible(&head->io_sem))
                return -EINTR;
        head->read_user_buf = buffer;
@@ -2557,6 +2627,7 @@ static int tomoyo_parse_policy(struct tomoyo_io_buffer *head, char *line)
            head->type == TOMOYO_PROFILE) {
                if (*line == '<') {
                        char *cp = strchr(line, ' ');
+
                        if (cp) {
                                *cp++ = '\0';
                                head->w.ns = tomoyo_assign_namespace(line);
@@ -2589,8 +2660,9 @@ ssize_t tomoyo_write_control(struct tomoyo_io_buffer *head,
        size_t avail_len = buffer_len;
        char *cp0 = head->write_buf;
        int idx;
+
        if (!head->write)
-               return -ENOSYS;
+               return -EINVAL;
        if (!access_ok(buffer, buffer_len))
                return -EFAULT;
        if (mutex_lock_interruptible(&head->io_sem))
@@ -2600,9 +2672,11 @@ ssize_t tomoyo_write_control(struct tomoyo_io_buffer *head,
        /* Read a line and dispatch it to the policy handler. */
        while (avail_len > 0) {
                char c;
+
                if (head->w.avail >= head->writebuf_size - 1) {
                        const int len = head->writebuf_size * 2;
                        char *cp = kzalloc(len, GFP_NOFS);
+
                        if (!cp) {
                                error = -ENOMEM;
                                break;
@@ -2701,30 +2775,32 @@ void tomoyo_check_profile(void)
 {
        struct tomoyo_domain_info *domain;
        const int idx = tomoyo_read_lock();
+
        tomoyo_policy_loaded = true;
-       printk(KERN_INFO "TOMOYO: 2.5.0\n");
+       pr_info("TOMOYO: 2.6.0\n");
        list_for_each_entry_rcu(domain, &tomoyo_domain_list, list) {
                const u8 profile = domain->profile;
-               const struct tomoyo_policy_namespace *ns = domain->ns;
-               if (ns->profile_version != 20110903)
-                       printk(KERN_ERR
-                              "Profile version %u is not supported.\n",
+               struct tomoyo_policy_namespace *ns = domain->ns;
+
+               if (ns->profile_version == 20110903) {
+                       pr_info_once("Converting profile version from %u to %u.\n",
+                                    20110903, 20150505);
+                       ns->profile_version = 20150505;
+               }
+               if (ns->profile_version != 20150505)
+                       pr_err("Profile version %u is not supported.\n",
                               ns->profile_version);
                else if (!ns->profile_ptr[profile])
-                       printk(KERN_ERR
-                              "Profile %u (used by '%s') is not defined.\n",
+                       pr_err("Profile %u (used by '%s') is not defined.\n",
                               profile, domain->domainname->name);
                else
                        continue;
-               printk(KERN_ERR
-                      "Userland tools for TOMOYO 2.5 must be installed and "
-                      "policy must be initialized.\n");
-               printk(KERN_ERR "Please see http://tomoyo.sourceforge.jp/2.5/ "
-                      "for more information.\n");
+               pr_err("Userland tools for TOMOYO 2.6 must be installed and policy must be initialized.\n");
+               pr_err("Please see https://tomoyo.osdn.jp/2.6/ for more information.\n");
                panic("STOP!");
        }
        tomoyo_read_unlock(idx);
-       printk(KERN_INFO "Mandatory Access Control activated.\n");
+       pr_info("Mandatory Access Control activated.\n");
 }
 
 /**
@@ -2743,9 +2819,11 @@ void __init tomoyo_load_builtin_policy(void)
 #include "builtin-policy.h"
        u8 i;
        const int idx = tomoyo_read_lock();
+
        for (i = 0; i < 5; i++) {
                struct tomoyo_io_buffer head = { };
                char *start = "";
+
                switch (i) {
                case 0:
                        start = tomoyo_builtin_profile;
@@ -2775,6 +2853,7 @@ void __init tomoyo_load_builtin_policy(void)
                }
                while (1) {
                        char *end = strchr(start, '\n');
+
                        if (!end)
                                break;
                        *end = '\0';
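
Not part of the patch: the tomoyo_check_profile() hunk above means a profile written for the TOMOYO 2.5 userland keeps loading, because the old version number is rewritten in memory once (with a pr_info notice) instead of being rejected. Illustratively, only the header line of the profile differs between the two userland generations; the exact file layout is an assumption here, not something this diff shows:

	PROFILE_VERSION=20110903   <- still accepted, converted to 20150505 on first check
	PROFILE_VERSION=20150505   <- native format expected by the TOMOYO 2.6 tools
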
index 539bcdd30bb8deb22f9cd27e45258fe5e4a3d4a1..050473df5809f13f467c4cdebffd981a773948a9 100644 (file)
@@ -10,6 +10,8 @@
 #ifndef _SECURITY_TOMOYO_COMMON_H
 #define _SECURITY_TOMOYO_COMMON_H
 
+#define pr_fmt(fmt) fmt
+
 #include <linux/ctype.h>
 #include <linux/string.h>
 #include <linux/mm.h>
@@ -29,6 +31,7 @@
 #include <linux/in.h>
 #include <linux/in6.h>
 #include <linux/un.h>
+#include <linux/lsm_hooks.h>
 #include <net/sock.h>
 #include <net/af_unix.h>
 #include <net/ip.h>
@@ -681,11 +684,12 @@ struct tomoyo_domain_info {
        const struct tomoyo_path_info *domainname;
        /* Namespace for this domain. Never NULL. */
        struct tomoyo_policy_namespace *ns;
+       /* Group numbers to use.   */
+       unsigned long group[TOMOYO_MAX_ACL_GROUPS / BITS_PER_LONG];
        u8 profile;        /* Profile number to use. */
-       u8 group;          /* Group number to use.   */
        bool is_deleted;   /* Delete flag.           */
        bool flags[TOMOYO_MAX_DOMAIN_INFO_FLAGS];
-       atomic_t users; /* Number of referring credentials. */
+       atomic_t users; /* Number of referring tasks. */
 };
 
 /*
@@ -787,9 +791,9 @@ struct tomoyo_acl_param {
  * interfaces.
  */
 struct tomoyo_io_buffer {
-       void (*read) (struct tomoyo_io_buffer *);
-       int (*write) (struct tomoyo_io_buffer *);
-       __poll_t (*poll) (struct file *file, poll_table *wait);
+       void (*read)(struct tomoyo_io_buffer *head);
+       int (*write)(struct tomoyo_io_buffer *head);
+       __poll_t (*poll)(struct file *file, poll_table *wait);
        /* Exclusive lock for this structure.   */
        struct mutex io_sem;
        char __user *read_user_buf;
@@ -906,12 +910,18 @@ struct tomoyo_policy_namespace {
        struct list_head acl_group[TOMOYO_MAX_ACL_GROUPS];
        /* List for connecting to tomoyo_namespace_list list. */
        struct list_head namespace_list;
-       /* Profile version. Currently only 20110903 is defined. */
+       /* Profile version. Currently only 20150505 is defined. */
        unsigned int profile_version;
        /* Name of this namespace (e.g. "<kernel>", "</usr/sbin/httpd>" ). */
        const char *name;
 };
 
+/* Structure for "struct task_struct"->security. */
+struct tomoyo_task {
+       struct tomoyo_domain_info *domain_info;
+       struct tomoyo_domain_info *old_domain_info;
+};
+
 /********** Function prototypes. **********/
 
 bool tomoyo_address_matches_group(const bool is_ipv6, const __be32 *address,
@@ -1020,6 +1030,7 @@ ssize_t tomoyo_write_control(struct tomoyo_io_buffer *head,
 struct tomoyo_condition *tomoyo_get_condition(struct tomoyo_acl_param *param);
 struct tomoyo_domain_info *tomoyo_assign_domain(const char *domainname,
                                                const bool transit);
+struct tomoyo_domain_info *tomoyo_domain(void);
 struct tomoyo_domain_info *tomoyo_find_domain(const char *domainname);
 struct tomoyo_group *tomoyo_get_group(struct tomoyo_acl_param *param,
                                      const u8 idx);
@@ -1034,8 +1045,8 @@ void *tomoyo_commit_ok(void *data, const unsigned int size);
 void __init tomoyo_load_builtin_policy(void);
 void __init tomoyo_mm_init(void);
 void tomoyo_check_acl(struct tomoyo_request_info *r,
-                     bool (*check_entry) (struct tomoyo_request_info *,
-                                          const struct tomoyo_acl_info *));
+                     bool (*check_entry)(struct tomoyo_request_info *,
+                                         const struct tomoyo_acl_info *));
 void tomoyo_check_profile(void);
 void tomoyo_convert_time(time64_t time, struct tomoyo_time *stamp);
 void tomoyo_del_condition(struct list_head *element);
@@ -1062,6 +1073,7 @@ void tomoyo_write_log2(struct tomoyo_request_info *r, int len, const char *fmt,
 /********** External variable definitions. **********/
 
 extern bool tomoyo_policy_loaded;
+extern int tomoyo_enabled;
 extern const char * const tomoyo_condition_keyword
 [TOMOYO_MAX_CONDITION_KEYWORD];
 extern const char * const tomoyo_dif[TOMOYO_MAX_DOMAIN_INFO_FLAGS];
@@ -1085,6 +1097,7 @@ extern struct tomoyo_domain_info tomoyo_kernel_domain;
 extern struct tomoyo_policy_namespace tomoyo_kernel_namespace;
 extern unsigned int tomoyo_memory_quota[TOMOYO_MAX_MEMORY_STAT];
 extern unsigned int tomoyo_memory_used[TOMOYO_MAX_MEMORY_STAT];
+extern struct lsm_blob_sizes tomoyo_blob_sizes;
 
 /********** Inlined functions. **********/
 
@@ -1121,6 +1134,7 @@ static inline void tomoyo_read_unlock(int idx)
 static inline pid_t tomoyo_sys_getppid(void)
 {
        pid_t pid;
+
        rcu_read_lock();
        pid = task_tgid_vnr(rcu_dereference(current->real_parent));
        rcu_read_unlock();
@@ -1197,26 +1211,15 @@ static inline void tomoyo_put_group(struct tomoyo_group *group)
 }
 
 /**
- * tomoyo_domain - Get "struct tomoyo_domain_info" for current thread.
- *
- * Returns pointer to "struct tomoyo_domain_info" for current thread.
- */
-static inline struct tomoyo_domain_info *tomoyo_domain(void)
-{
-       return current_cred()->security;
-}
-
-/**
- * tomoyo_real_domain - Get "struct tomoyo_domain_info" for specified thread.
+ * tomoyo_task - Get "struct tomoyo_task" for specified thread.
  *
- * @task: Pointer to "struct task_struct".
+ * @task - Pointer to "struct task_struct".
  *
- * Returns pointer to "struct tomoyo_security" for specified thread.
+ * Returns pointer to "struct tomoyo_task" for specified thread.
  */
-static inline struct tomoyo_domain_info *tomoyo_real_domain(struct task_struct
-                                                           *task)
+static inline struct tomoyo_task *tomoyo_task(struct task_struct *task)
 {
-       return task_cred_xxx(task, security);
+       return task->security + tomoyo_blob_sizes.lbs_task;
 }
 
 /**
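
Not part of the patch: the header above now only declares tomoyo_domain(), since the old inline returning current_cred()->security is gone. A minimal sketch of the relocated helper, assuming it simply dereferences the new per-task blob (the real definition lives in tomoyo.c, outside this hunk, and also manages the old_domain_info reference, which is omitted here):

	struct tomoyo_domain_info *tomoyo_domain(void)
	{
		struct tomoyo_task *s = tomoyo_task(current);

		/* Domain state now hangs off task->security rather than the cred. */
		return s->domain_info;
	}
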
index 8d0e1b9c9c573b4fac3f8ca78b2c3e715a614eac..8f6d57c15df6ac42c91f76387317c7fbc348d2c5 100644 (file)
@@ -28,9 +28,11 @@ static bool tomoyo_argv(const unsigned int index, const char *arg_ptr,
 {
        int i;
        struct tomoyo_path_info arg;
+
        arg.name = arg_ptr;
        for (i = 0; i < argc; argv++, checked++, i++) {
                bool result;
+
                if (index != argv->index)
                        continue;
                *checked = 1;
@@ -62,12 +64,14 @@ static bool tomoyo_envp(const char *env_name, const char *env_value,
        int i;
        struct tomoyo_path_info name;
        struct tomoyo_path_info value;
+
        name.name = env_name;
        tomoyo_fill_path_info(&name);
        value.name = env_value;
        tomoyo_fill_path_info(&value);
        for (i = 0; i < envc; envp++, checked++, i++) {
                bool result;
+
                if (!tomoyo_path_matches_pattern(&name, envp->name))
                        continue;
                *checked = 1;
@@ -113,6 +117,7 @@ static bool tomoyo_scan_bprm(struct tomoyo_execve *ee,
        bool result = true;
        u8 local_checked[32];
        u8 *checked;
+
        if (argc + envc <= sizeof(local_checked)) {
                checked = local_checked;
                memset(local_checked, 0, sizeof(local_checked));
@@ -131,6 +136,7 @@ static bool tomoyo_scan_bprm(struct tomoyo_execve *ee,
                        /* Read. */
                        const char *kaddr = dump->data;
                        const unsigned char c = kaddr[offset++];
+
                        if (c && arg_len < TOMOYO_EXEC_TMPSIZE - 10) {
                                if (c == '\\') {
                                        arg_ptr[arg_len++] = '\\';
@@ -160,6 +166,7 @@ static bool tomoyo_scan_bprm(struct tomoyo_execve *ee,
                                argv_count--;
                        } else if (envp_count) {
                                char *cp = strchr(arg_ptr, '=');
+
                                if (cp) {
                                        *cp = '\0';
                                        if (!tomoyo_envp(arg_ptr, cp + 1,
@@ -182,6 +189,7 @@ static bool tomoyo_scan_bprm(struct tomoyo_execve *ee,
 out:
        if (result) {
                int i;
+
                /* Check not-yet-checked entries. */
                for (i = 0; i < argc; i++) {
                        if (checked[i])
@@ -229,6 +237,7 @@ static bool tomoyo_scan_exec_realpath(struct file *file,
 {
        bool result;
        struct tomoyo_path_info exe;
+
        if (!file)
                return false;
        exe.name = tomoyo_realpath_from_path(&file->f_path);
@@ -250,6 +259,7 @@ static bool tomoyo_scan_exec_realpath(struct file *file,
 static const struct tomoyo_path_info *tomoyo_get_dqword(char *start)
 {
        char *cp = start + strlen(start) - 1;
+
        if (cp == start || *start++ != '"' || *cp != '"')
                return NULL;
        *cp = '\0';
@@ -270,6 +280,7 @@ static bool tomoyo_parse_name_union_quoted(struct tomoyo_acl_param *param,
                                           struct tomoyo_name_union *ptr)
 {
        char *filename = param->data;
+
        if (*filename == '@')
                return tomoyo_parse_name_union(param, ptr);
        ptr->filename = tomoyo_get_dqword(filename);
@@ -310,6 +321,7 @@ static bool tomoyo_parse_envp(char *left, char *right,
        const struct tomoyo_path_info *name;
        const struct tomoyo_path_info *value;
        char *cp = left + strlen(left) - 1;
+
        if (*cp-- != ']' || *cp != '"')
                goto out;
        *cp = '\0';
@@ -364,6 +376,7 @@ static inline bool tomoyo_same_condition(const struct tomoyo_condition *a,
 static u8 tomoyo_condition_type(const char *word)
 {
        u8 i;
+
        for (i = 0; i < TOMOYO_MAX_CONDITION_KEYWORD; i++) {
                if (!strcmp(word, tomoyo_condition_keyword[i]))
                        break;
@@ -395,6 +408,7 @@ static struct tomoyo_condition *tomoyo_commit_condition
 {
        struct tomoyo_condition *ptr;
        bool found = false;
+
        if (mutex_lock_interruptible(&tomoyo_policy_lock)) {
                dprintk(KERN_WARNING "%u: %s failed\n", __LINE__, __func__);
                ptr = NULL;
@@ -442,12 +456,14 @@ static char *tomoyo_get_transit_preference(struct tomoyo_acl_param *param,
 {
        char * const pos = param->data;
        bool flag;
+
        if (*pos == '<') {
                e->transit = tomoyo_get_domainname(param);
                goto done;
        }
        {
                char *cp = strchr(pos, ' ');
+
                if (cp)
                        *cp = '\0';
                flag = tomoyo_correct_path(pos) || !strcmp(pos, "keep") ||
@@ -489,6 +505,7 @@ struct tomoyo_condition *tomoyo_get_condition(struct tomoyo_acl_param *param)
                tomoyo_get_transit_preference(param, &e);
        char * const end_of_string = start_of_string + strlen(start_of_string);
        char *pos;
+
 rerun:
        pos = start_of_string;
        while (1) {
@@ -498,6 +515,7 @@ rerun:
                char *cp;
                char *right_word;
                bool is_not;
+
                if (!*left_word)
                        break;
                /*
@@ -622,8 +640,8 @@ rerun:
                }
 store_value:
                if (!condp) {
-                       dprintk(KERN_WARNING "%u: dry_run left=%u right=%u "
-                               "match=%u\n", __LINE__, left, right, !is_not);
+                       dprintk(KERN_WARNING "%u: dry_run left=%u right=%u match=%u\n",
+                               __LINE__, left, right, !is_not);
                        continue;
                }
                condp->left = left;
@@ -660,6 +678,7 @@ store_value:
        envp = (struct tomoyo_envp *) (argv + e.argc);
        {
                bool flag = false;
+
                for (pos = start_of_string; pos < end_of_string; pos++) {
                        if (*pos)
                                continue;
@@ -698,6 +717,7 @@ void tomoyo_get_attributes(struct tomoyo_obj_info *obj)
 
        for (i = 0; i < TOMOYO_MAX_PATH_STAT; i++) {
                struct inode *inode;
+
                switch (i) {
                case TOMOYO_PATH1:
                        dentry = obj->path1.dentry;
@@ -718,6 +738,7 @@ void tomoyo_get_attributes(struct tomoyo_obj_info *obj)
                inode = d_backing_inode(dentry);
                if (inode) {
                        struct tomoyo_mini_stat *stat = &obj->stat[i];
+
                        stat->uid  = inode->i_uid;
                        stat->gid  = inode->i_gid;
                        stat->ino  = inode->i_ino;
@@ -726,8 +747,7 @@ void tomoyo_get_attributes(struct tomoyo_obj_info *obj)
                        stat->rdev = inode->i_rdev;
                        obj->stat_valid[i] = true;
                }
-               if (i & 1) /* i == TOMOYO_PATH1_PARENT ||
-                             i == TOMOYO_PATH2_PARENT */
+               if (i & 1) /* TOMOYO_PATH1_PARENT or TOMOYO_PATH2_PARENT */
                        dput(dentry);
        }
 }
@@ -758,6 +778,7 @@ bool tomoyo_condition(struct tomoyo_request_info *r,
        u16 argc;
        u16 envc;
        struct linux_binprm *bprm = NULL;
+
        if (!cond)
                return true;
        condc = cond->condc;
@@ -780,6 +801,7 @@ bool tomoyo_condition(struct tomoyo_request_info *r,
                const u8 right = condp->right;
                bool is_bitop[2] = { false, false };
                u8 j;
+
                condp++;
                /* Check argv[] and envp[] later. */
                if (left == TOMOYO_ARGV_ENTRY || left == TOMOYO_ENVP_ENTRY)
@@ -787,10 +809,11 @@ bool tomoyo_condition(struct tomoyo_request_info *r,
                /* Check string expressions. */
                if (right == TOMOYO_NAME_UNION) {
                        const struct tomoyo_name_union *ptr = names_p++;
+                       struct tomoyo_path_info *symlink;
+                       struct tomoyo_execve *ee;
+                       struct file *file;
+
                        switch (left) {
-                               struct tomoyo_path_info *symlink;
-                               struct tomoyo_execve *ee;
-                               struct file *file;
                        case TOMOYO_SYMLINK_TARGET:
                                symlink = obj ? obj->symlink_target : NULL;
                                if (!symlink ||
@@ -812,6 +835,7 @@ bool tomoyo_condition(struct tomoyo_request_info *r,
                for (j = 0; j < 2; j++) {
                        const u8 index = j ? right : left;
                        unsigned long value = 0;
+
                        switch (index) {
                        case TOMOYO_TASK_UID:
                                value = from_kuid(&init_user_ns, current_uid());
@@ -874,31 +898,31 @@ bool tomoyo_condition(struct tomoyo_request_info *r,
                                value = S_ISVTX;
                                break;
                        case TOMOYO_MODE_OWNER_READ:
-                               value = S_IRUSR;
+                               value = 0400;
                                break;
                        case TOMOYO_MODE_OWNER_WRITE:
-                               value = S_IWUSR;
+                               value = 0200;
                                break;
                        case TOMOYO_MODE_OWNER_EXECUTE:
-                               value = S_IXUSR;
+                               value = 0100;
                                break;
                        case TOMOYO_MODE_GROUP_READ:
-                               value = S_IRGRP;
+                               value = 0040;
                                break;
                        case TOMOYO_MODE_GROUP_WRITE:
-                               value = S_IWGRP;
+                               value = 0020;
                                break;
                        case TOMOYO_MODE_GROUP_EXECUTE:
-                               value = S_IXGRP;
+                               value = 0010;
                                break;
                        case TOMOYO_MODE_OTHERS_READ:
-                               value = S_IROTH;
+                               value = 0004;
                                break;
                        case TOMOYO_MODE_OTHERS_WRITE:
-                               value = S_IWOTH;
+                               value = 0002;
                                break;
                        case TOMOYO_MODE_OTHERS_EXECUTE:
-                               value = S_IXOTH;
+                               value = 0001;
                                break;
                        case TOMOYO_EXEC_ARGC:
                                if (!bprm)
@@ -923,6 +947,7 @@ bool tomoyo_condition(struct tomoyo_request_info *r,
                                {
                                        u8 stat_index;
                                        struct tomoyo_mini_stat *stat;
+
                                        switch (index) {
                                        case TOMOYO_PATH1_UID:
                                        case TOMOYO_PATH1_GID:
@@ -1036,12 +1061,14 @@ bool tomoyo_condition(struct tomoyo_request_info *r,
                if (left == TOMOYO_NUMBER_UNION) {
                        /* Fetch values now. */
                        const struct tomoyo_number_union *ptr = numbers_p++;
+
                        min_v[0] = ptr->values[0];
                        max_v[0] = ptr->values[1];
                }
                if (right == TOMOYO_NUMBER_UNION) {
                        /* Fetch values now. */
                        const struct tomoyo_number_union *ptr = numbers_p++;
+
                        if (ptr->group) {
                                if (tomoyo_number_matches_group(min_v[0],
                                                                max_v[0],
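
Not part of the patch: the TOMOYO_MODE_* cases above swap the S_I* macros for their literal octal values; the two spellings are numerically identical, so condition matching is unchanged. A compile-time check one could temporarily drop into any function body to confirm this (illustrative only):

	BUILD_BUG_ON(S_IRUSR != 0400 || S_IWUSR != 0200 || S_IXUSR != 0100);
	BUILD_BUG_ON(S_IRGRP != 0040 || S_IWGRP != 0020 || S_IXGRP != 0010);
	BUILD_BUG_ON(S_IROTH != 0004 || S_IWOTH != 0002 || S_IXOTH != 0001);
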
index f6758dad981f7f32d8897f4f2c790ea69a168db0..8526a0a74023855bbf383e56528e7ebf8107a8a9 100644 (file)
@@ -30,10 +30,10 @@ struct tomoyo_domain_info tomoyo_kernel_domain;
  */
 int tomoyo_update_policy(struct tomoyo_acl_head *new_entry, const int size,
                         struct tomoyo_acl_param *param,
-                        bool (*check_duplicate) (const struct tomoyo_acl_head
-                                                 *,
-                                                 const struct tomoyo_acl_head
-                                                 *))
+                        bool (*check_duplicate)(const struct tomoyo_acl_head
+                                                *,
+                                                const struct tomoyo_acl_head
+                                                *))
 {
        int error = param->is_delete ? -ENOENT : -ENOMEM;
        struct tomoyo_acl_head *entry;
@@ -90,13 +90,13 @@ static inline bool tomoyo_same_acl_head(const struct tomoyo_acl_info *a,
  */
 int tomoyo_update_domain(struct tomoyo_acl_info *new_entry, const int size,
                         struct tomoyo_acl_param *param,
-                        bool (*check_duplicate) (const struct tomoyo_acl_info
-                                                 *,
-                                                 const struct tomoyo_acl_info
-                                                 *),
-                        bool (*merge_duplicate) (struct tomoyo_acl_info *,
-                                                 struct tomoyo_acl_info *,
-                                                 const bool))
+                        bool (*check_duplicate)(const struct tomoyo_acl_info
+                                                *,
+                                                const struct tomoyo_acl_info
+                                                *),
+                        bool (*merge_duplicate)(struct tomoyo_acl_info *,
+                                                struct tomoyo_acl_info *,
+                                                const bool))
 {
        const bool is_delete = param->is_delete;
        int error = is_delete ? -ENOENT : -ENOMEM;
@@ -157,13 +157,13 @@ out:
  * Caller holds tomoyo_read_lock().
  */
 void tomoyo_check_acl(struct tomoyo_request_info *r,
-                     bool (*check_entry) (struct tomoyo_request_info *,
-                                          const struct tomoyo_acl_info *))
+                     bool (*check_entry)(struct tomoyo_request_info *,
+                                         const struct tomoyo_acl_info *))
 {
        const struct tomoyo_domain_info *domain = r->domain;
        struct tomoyo_acl_info *ptr;
-       bool retried = false;
        const struct list_head *list = &domain->acl_info_list;
+       u16 i = 0;
 
 retry:
        list_for_each_entry_rcu(ptr, list, list) {
@@ -177,9 +177,10 @@ retry:
                r->granted = true;
                return;
        }
-       if (!retried) {
-               retried = true;
-               list = &domain->ns->acl_group[domain->group];
+       for (; i < TOMOYO_MAX_ACL_GROUPS; i++) {
+               if (!test_bit(i, domain->group))
+                       continue;
+               list = &domain->ns->acl_group[i++];
                goto retry;
        }
        r->granted = false;
@@ -198,6 +199,7 @@ LIST_HEAD(tomoyo_domain_list);
 static const char *tomoyo_last_word(const char *name)
 {
        const char *cp = strrchr(name, ' ');
+
        if (cp)
                return cp + 1;
        return name;
@@ -220,6 +222,7 @@ static bool tomoyo_same_transition_control(const struct tomoyo_acl_head *a,
        const struct tomoyo_transition_control *p2 = container_of(b,
                                                                  typeof(*p2),
                                                                  head);
+
        return p1->type == p2->type && p1->is_last_name == p2->is_last_name
                && p1->domainname == p2->domainname
                && p1->program == p2->program;
@@ -240,6 +243,7 @@ int tomoyo_write_transition_control(struct tomoyo_acl_param *param,
        int error = param->is_delete ? -ENOENT : -ENOMEM;
        char *program = param->data;
        char *domainname = strstr(program, " from ");
+
        if (domainname) {
                *domainname = '\0';
                domainname += 6;
@@ -293,6 +297,7 @@ static inline bool tomoyo_scan_transition
  const enum tomoyo_transition_type type)
 {
        const struct tomoyo_transition_control *ptr;
+
        list_for_each_entry_rcu(ptr, list, head.list) {
                if (ptr->head.is_deleted || ptr->type != type)
                        continue;
@@ -338,9 +343,11 @@ static enum tomoyo_transition_type tomoyo_transition_type
 {
        const char *last_name = tomoyo_last_word(domainname->name);
        enum tomoyo_transition_type type = TOMOYO_TRANSITION_CONTROL_NO_RESET;
+
        while (type < TOMOYO_MAX_TRANSITION_TYPE) {
                const struct list_head * const list =
                        &ns->policy_list[TOMOYO_ID_TRANSITION_CONTROL];
+
                if (!tomoyo_scan_transition(list, domainname, program,
                                            last_name, type)) {
                        type++;
@@ -375,6 +382,7 @@ static bool tomoyo_same_aggregator(const struct tomoyo_acl_head *a,
                                                          head);
        const struct tomoyo_aggregator *p2 = container_of(b, typeof(*p2),
                                                          head);
+
        return p1->original_name == p2->original_name &&
                p1->aggregated_name == p2->aggregated_name;
 }
@@ -394,6 +402,7 @@ int tomoyo_write_aggregator(struct tomoyo_acl_param *param)
        int error = param->is_delete ? -ENOENT : -ENOMEM;
        const char *original_name = tomoyo_read_token(param);
        const char *aggregated_name = tomoyo_read_token(param);
+
        if (!tomoyo_correct_word(original_name) ||
            !tomoyo_correct_path(aggregated_name))
                return -EINVAL;
@@ -426,6 +435,7 @@ static struct tomoyo_policy_namespace *tomoyo_find_namespace
 (const char *name, const unsigned int len)
 {
        struct tomoyo_policy_namespace *ns;
+
        list_for_each_entry(ns, &tomoyo_namespace_list, namespace_list) {
                if (strncmp(name, ns->name, len) ||
                    (name[len] && name[len] != ' '))
@@ -451,6 +461,7 @@ struct tomoyo_policy_namespace *tomoyo_assign_namespace(const char *domainname)
        struct tomoyo_policy_namespace *entry;
        const char *cp = domainname;
        unsigned int len = 0;
+
        while (*cp && *cp++ != ' ')
                len++;
        ptr = tomoyo_find_namespace(domainname, len);
@@ -466,6 +477,7 @@ struct tomoyo_policy_namespace *tomoyo_assign_namespace(const char *domainname)
        ptr = tomoyo_find_namespace(domainname, len);
        if (!ptr && tomoyo_memory_ok(entry)) {
                char *name = (char *) (entry + 1);
+
                ptr = entry;
                memmove(name, domainname, len);
                name[len] = '\0';
@@ -490,6 +502,7 @@ static bool tomoyo_namespace_jump(const char *domainname)
 {
        const char *namespace = tomoyo_current_namespace()->name;
        const int len = strlen(namespace);
+
        return strncmp(domainname, namespace, len) ||
                (domainname[len] && domainname[len] != ' ');
 }
@@ -510,6 +523,7 @@ struct tomoyo_domain_info *tomoyo_assign_domain(const char *domainname,
        struct tomoyo_domain_info e = { };
        struct tomoyo_domain_info *entry = tomoyo_find_domain(domainname);
        bool created = false;
+
        if (entry) {
                if (transit) {
                        /*
@@ -546,8 +560,9 @@ struct tomoyo_domain_info *tomoyo_assign_domain(const char *domainname,
         */
        if (transit) {
                const struct tomoyo_domain_info *domain = tomoyo_domain();
+
                e.profile = domain->profile;
-               e.group = domain->group;
+               memcpy(e.group, domain->group, sizeof(e.group));
        }
        e.domainname = tomoyo_get_name(domainname);
        if (!e.domainname)
@@ -569,12 +584,17 @@ out:
        if (entry && transit) {
                if (created) {
                        struct tomoyo_request_info r;
+                       int i;
+
                        tomoyo_init_request_info(&r, entry,
                                                 TOMOYO_MAC_FILE_EXECUTE);
                        r.granted = false;
                        tomoyo_write_log(&r, "use_profile %u\n",
                                         entry->profile);
-                       tomoyo_write_log(&r, "use_group %u\n", entry->group);
+                       for (i = 0; i < TOMOYO_MAX_ACL_GROUPS; i++)
+                               if (test_bit(i, entry->group))
+                                       tomoyo_write_log(&r, "use_group %u\n",
+                                                        i);
                        tomoyo_update_stat(TOMOYO_STAT_POLICY_UPDATES);
                }
        }
@@ -712,6 +732,7 @@ retry:
                struct tomoyo_aggregator *ptr;
                struct list_head *list =
                        &old_domain->ns->policy_list[TOMOYO_ID_AGGREGATOR];
+
                /* Check 'aggregator' directive. */
                candidate = &exename;
                list_for_each_entry_rcu(ptr, list, head.list) {
@@ -747,6 +768,7 @@ retry:
         */
        if (ee->transition) {
                const char *domainname = ee->transition->name;
+
                reject_on_transition_failure = true;
                if (!strcmp(domainname, "keep"))
                        goto force_keep_domain;
@@ -758,6 +780,7 @@ retry:
                        goto force_initialize_domain;
                if (!strcmp(domainname, "parent")) {
                        char *cp;
+
                        strncpy(ee->tmp, old_domain->domainname->name,
                                TOMOYO_EXEC_TMPSIZE - 1);
                        cp = strrchr(ee->tmp, ' ');
@@ -822,8 +845,7 @@ force_jump_domain:
        if (domain)
                retval = 0;
        else if (reject_on_transition_failure) {
-               printk(KERN_WARNING "ERROR: Domain '%s' not ready.\n",
-                      ee->tmp);
+               pr_warn("ERROR: Domain '%s' not ready.\n", ee->tmp);
                retval = -ENOMEM;
        } else if (ee->r.mode == TOMOYO_CONFIG_ENFORCING)
                retval = -ENOMEM;
@@ -834,16 +856,20 @@ force_jump_domain:
                        ee->r.granted = false;
                        tomoyo_write_log(&ee->r, "%s", tomoyo_dif
                                         [TOMOYO_DIF_TRANSITION_FAILED]);
-                       printk(KERN_WARNING
-                              "ERROR: Domain '%s' not defined.\n", ee->tmp);
+                       pr_warn("ERROR: Domain '%s' not defined.\n", ee->tmp);
                }
        }
  out:
        if (!domain)
                domain = old_domain;
        /* Update reference count on "struct tomoyo_domain_info". */
-       atomic_inc(&domain->users);
-       bprm->cred->security = domain;
+       {
+               struct tomoyo_task *s = tomoyo_task(current);
+
+               s->old_domain_info = s->domain_info;
+               s->domain_info = domain;
+               atomic_inc(&domain->users);
+       }
        kfree(exename.name);
        if (!retval) {
                ee->r.domain = domain;
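
Not part of the patch: with domain->group now a bitmap, a domain can inherit ACLs from several acl_group slots at once, and tomoyo_check_acl() above walks &ns->acl_group[i] for every bit that is set. The write side is outside this hunk; a hedged sketch of how a "use_group N" policy line presumably lands in the bitmap (the helper name and its caller are assumptions):

	/* Sketch only: set or clear one acl_group bit for a domain. */
	static int tomoyo_set_group_bit(struct tomoyo_domain_info *domain,
					unsigned int group, bool is_delete)
	{
		if (group >= TOMOYO_MAX_ACL_GROUPS)
			return -EINVAL;
		if (is_delete)
			clear_bit(group, domain->group);
		else
			set_bit(group, domain->group);
		return 0;
	}
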
index 2a374b4da8f5cec172041876e7dee0c986cd45dc..86f7d1b90212a9b0f6bb6ce461d8e95b7a164c32 100644 (file)
@@ -214,6 +214,7 @@ static int tomoyo_audit_path_number_log(struct tomoyo_request_info *r)
        const u8 type = r->param.path_number.operation;
        u8 radix;
        char buffer[64];
+
        switch (type) {
        case TOMOYO_TYPE_CREATE:
        case TOMOYO_TYPE_MKDIR:
@@ -253,6 +254,7 @@ static bool tomoyo_check_path_acl(struct tomoyo_request_info *r,
 {
        const struct tomoyo_path_acl *acl = container_of(ptr, typeof(*acl),
                                                         head);
+
        if (acl->perm & (1 << r->param.path.operation)) {
                r->param.path.matched_path =
                        tomoyo_compare_name_union(r->param.path.filename,
@@ -275,6 +277,7 @@ static bool tomoyo_check_path_number_acl(struct tomoyo_request_info *r,
 {
        const struct tomoyo_path_number_acl *acl =
                container_of(ptr, typeof(*acl), head);
+
        return (acl->perm & (1 << r->param.path_number.operation)) &&
                tomoyo_compare_number_union(r->param.path_number.number,
                                            &acl->number) &&
@@ -295,6 +298,7 @@ static bool tomoyo_check_path2_acl(struct tomoyo_request_info *r,
 {
        const struct tomoyo_path2_acl *acl =
                container_of(ptr, typeof(*acl), head);
+
        return (acl->perm & (1 << r->param.path2.operation)) &&
                tomoyo_compare_name_union(r->param.path2.filename1, &acl->name1)
                && tomoyo_compare_name_union(r->param.path2.filename2,
@@ -314,6 +318,7 @@ static bool tomoyo_check_mkdev_acl(struct tomoyo_request_info *r,
 {
        const struct tomoyo_mkdev_acl *acl =
                container_of(ptr, typeof(*acl), head);
+
        return (acl->perm & (1 << r->param.mkdev.operation)) &&
                tomoyo_compare_number_union(r->param.mkdev.mode,
                                            &acl->mode) &&
@@ -338,6 +343,7 @@ static bool tomoyo_same_path_acl(const struct tomoyo_acl_info *a,
 {
        const struct tomoyo_path_acl *p1 = container_of(a, typeof(*p1), head);
        const struct tomoyo_path_acl *p2 = container_of(b, typeof(*p2), head);
+
        return tomoyo_same_name_union(&p1->name, &p2->name);
 }
 
@@ -358,6 +364,7 @@ static bool tomoyo_merge_path_acl(struct tomoyo_acl_info *a,
                ->perm;
        u16 perm = *a_perm;
        const u16 b_perm = container_of(b, struct tomoyo_path_acl, head)->perm;
+
        if (is_delete)
                perm &= ~b_perm;
        else
@@ -384,6 +391,7 @@ static int tomoyo_update_path_acl(const u16 perm,
                .perm = perm
        };
        int error;
+
        if (!tomoyo_parse_name_union(param, &e.name))
                error = -EINVAL;
        else
@@ -407,6 +415,7 @@ static bool tomoyo_same_mkdev_acl(const struct tomoyo_acl_info *a,
 {
        const struct tomoyo_mkdev_acl *p1 = container_of(a, typeof(*p1), head);
        const struct tomoyo_mkdev_acl *p2 = container_of(b, typeof(*p2), head);
+
        return tomoyo_same_name_union(&p1->name, &p2->name) &&
                tomoyo_same_number_union(&p1->mode, &p2->mode) &&
                tomoyo_same_number_union(&p1->major, &p2->major) &&
@@ -431,6 +440,7 @@ static bool tomoyo_merge_mkdev_acl(struct tomoyo_acl_info *a,
        u8 perm = *a_perm;
        const u8 b_perm = container_of(b, struct tomoyo_mkdev_acl, head)
                ->perm;
+
        if (is_delete)
                perm &= ~b_perm;
        else
@@ -457,6 +467,7 @@ static int tomoyo_update_mkdev_acl(const u8 perm,
                .perm = perm
        };
        int error;
+
        if (!tomoyo_parse_name_union(param, &e.name) ||
            !tomoyo_parse_number_union(param, &e.mode) ||
            !tomoyo_parse_number_union(param, &e.major) ||
@@ -486,6 +497,7 @@ static bool tomoyo_same_path2_acl(const struct tomoyo_acl_info *a,
 {
        const struct tomoyo_path2_acl *p1 = container_of(a, typeof(*p1), head);
        const struct tomoyo_path2_acl *p2 = container_of(b, typeof(*p2), head);
+
        return tomoyo_same_name_union(&p1->name1, &p2->name1) &&
                tomoyo_same_name_union(&p1->name2, &p2->name2);
 }
@@ -507,6 +519,7 @@ static bool tomoyo_merge_path2_acl(struct tomoyo_acl_info *a,
                ->perm;
        u8 perm = *a_perm;
        const u8 b_perm = container_of(b, struct tomoyo_path2_acl, head)->perm;
+
        if (is_delete)
                perm &= ~b_perm;
        else
@@ -533,6 +546,7 @@ static int tomoyo_update_path2_acl(const u8 perm,
                .perm = perm
        };
        int error;
+
        if (!tomoyo_parse_name_union(param, &e.name1) ||
            !tomoyo_parse_name_union(param, &e.name2))
                error = -EINVAL;
@@ -621,6 +635,7 @@ static bool tomoyo_same_path_number_acl(const struct tomoyo_acl_info *a,
                                                               head);
        const struct tomoyo_path_number_acl *p2 = container_of(b, typeof(*p2),
                                                               head);
+
        return tomoyo_same_name_union(&p1->name, &p2->name) &&
                tomoyo_same_number_union(&p1->number, &p2->number);
 }
@@ -643,6 +658,7 @@ static bool tomoyo_merge_path_number_acl(struct tomoyo_acl_info *a,
        u8 perm = *a_perm;
        const u8 b_perm = container_of(b, struct tomoyo_path_number_acl, head)
                ->perm;
+
        if (is_delete)
                perm &= ~b_perm;
        else
@@ -667,6 +683,7 @@ static int tomoyo_update_path_number_acl(const u8 perm,
                .perm = perm
        };
        int error;
+
        if (!tomoyo_parse_name_union(param, &e.name) ||
            !tomoyo_parse_number_union(param, &e.number))
                error = -EINVAL;
@@ -947,6 +964,7 @@ static bool tomoyo_same_mount_acl(const struct tomoyo_acl_info *a,
 {
        const struct tomoyo_mount_acl *p1 = container_of(a, typeof(*p1), head);
        const struct tomoyo_mount_acl *p2 = container_of(b, typeof(*p2), head);
+
        return tomoyo_same_name_union(&p1->dev_name, &p2->dev_name) &&
                tomoyo_same_name_union(&p1->dir_name, &p2->dir_name) &&
                tomoyo_same_name_union(&p1->fs_type, &p2->fs_type) &&
@@ -966,6 +984,7 @@ static int tomoyo_update_mount_acl(struct tomoyo_acl_param *param)
 {
        struct tomoyo_mount_acl e = { .head.type = TOMOYO_TYPE_MOUNT_ACL };
        int error;
+
        if (!tomoyo_parse_name_union(param, &e.dev_name) ||
            !tomoyo_parse_name_union(param, &e.dir_name) ||
            !tomoyo_parse_name_union(param, &e.fs_type) ||
@@ -995,6 +1014,7 @@ int tomoyo_write_file(struct tomoyo_acl_param *param)
        u16 perm = 0;
        u8 type;
        const char *operation = tomoyo_read_token(param);
+
        for (type = 0; type < TOMOYO_MAX_PATH_OPERATION; type++)
                if (tomoyo_permstr(operation, tomoyo_path_keyword[type]))
                        perm |= 1 << type;
index e22bea811c574b990af47fa583d4fc8a071cb35b..9537832fca18336b9d46af859f7ca37f6e8c593f 100644 (file)
@@ -77,11 +77,13 @@ static bool tomoyo_name_used_by_io_buffer(const char *string)
        spin_lock(&tomoyo_io_buffer_list_lock);
        list_for_each_entry(head, &tomoyo_io_buffer_list, list) {
                int i;
+
                head->users++;
                spin_unlock(&tomoyo_io_buffer_list_lock);
                mutex_lock(&head->io_sem);
                for (i = 0; i < TOMOYO_MAX_IO_READ_QUEUE; i++) {
                        const char *w = head->r.w[i];
+
                        if (w < string || w > string + size)
                                continue;
                        in_use = true;
@@ -108,6 +110,7 @@ static inline void tomoyo_del_transition_control(struct list_head *element)
 {
        struct tomoyo_transition_control *ptr =
                container_of(element, typeof(*ptr), head.list);
+
        tomoyo_put_name(ptr->domainname);
        tomoyo_put_name(ptr->program);
 }
@@ -123,6 +126,7 @@ static inline void tomoyo_del_aggregator(struct list_head *element)
 {
        struct tomoyo_aggregator *ptr =
                container_of(element, typeof(*ptr), head.list);
+
        tomoyo_put_name(ptr->original_name);
        tomoyo_put_name(ptr->aggregated_name);
 }
@@ -138,6 +142,7 @@ static inline void tomoyo_del_manager(struct list_head *element)
 {
        struct tomoyo_manager *ptr =
                container_of(element, typeof(*ptr), head.list);
+
        tomoyo_put_name(ptr->manager);
 }
 
@@ -152,6 +157,7 @@ static void tomoyo_del_acl(struct list_head *element)
 {
        struct tomoyo_acl_info *acl =
                container_of(element, typeof(*acl), list);
+
        tomoyo_put_condition(acl->cond);
        switch (acl->type) {
        case TOMOYO_TYPE_PATH_ACL:
@@ -226,6 +232,7 @@ static void tomoyo_del_acl(struct list_head *element)
                {
                        struct tomoyo_task_acl *entry =
                                container_of(acl, typeof(*entry), head);
+
                        tomoyo_put_name(entry->domainname);
                }
                break;
@@ -247,6 +254,7 @@ static inline void tomoyo_del_domain(struct list_head *element)
                container_of(element, typeof(*domain), list);
        struct tomoyo_acl_info *acl;
        struct tomoyo_acl_info *tmp;
+
        /*
         * Since this domain is referenced from neither
         * "struct tomoyo_io_buffer" nor "struct cred"->security, we can delete
@@ -286,6 +294,7 @@ void tomoyo_del_condition(struct list_head *element)
                = (const struct tomoyo_argv *) (names_p + names_count);
        const struct tomoyo_envp *envp
                = (const struct tomoyo_envp *) (argv + argc);
+
        for (i = 0; i < numbers_count; i++)
                tomoyo_put_number_union(numbers_p++);
        for (i = 0; i < names_count; i++)
@@ -321,6 +330,7 @@ static inline void tomoyo_del_path_group(struct list_head *element)
 {
        struct tomoyo_path_group *member =
                container_of(element, typeof(*member), head.list);
+
        tomoyo_put_name(member->member_name);
 }
 
@@ -335,6 +345,7 @@ static inline void tomoyo_del_group(struct list_head *element)
 {
        struct tomoyo_group *group =
                container_of(element, typeof(*group), head.list);
+
        tomoyo_put_name(group->group_name);
 }
 
@@ -476,6 +487,7 @@ static void tomoyo_collect_member(const enum tomoyo_policy_id id,
 {
        struct tomoyo_acl_head *member;
        struct tomoyo_acl_head *tmp;
+
        list_for_each_entry_safe(member, tmp, member_list, list) {
                if (!member->is_deleted)
                        continue;
@@ -495,6 +507,7 @@ static void tomoyo_collect_acl(struct list_head *list)
 {
        struct tomoyo_acl_info *acl;
        struct tomoyo_acl_info *tmp;
+
        list_for_each_entry_safe(acl, tmp, list, list) {
                if (!acl->is_deleted)
                        continue;
@@ -513,10 +526,12 @@ static void tomoyo_collect_entry(void)
        int i;
        enum tomoyo_policy_id id;
        struct tomoyo_policy_namespace *ns;
+
        mutex_lock(&tomoyo_policy_lock);
        {
                struct tomoyo_domain_info *domain;
                struct tomoyo_domain_info *tmp;
+
                list_for_each_entry_safe(domain, tmp, &tomoyo_domain_list,
                                         list) {
                        tomoyo_collect_acl(&domain->acl_info_list);
@@ -534,6 +549,7 @@ static void tomoyo_collect_entry(void)
        {
                struct tomoyo_shared_acl_head *ptr;
                struct tomoyo_shared_acl_head *tmp;
+
                list_for_each_entry_safe(ptr, tmp, &tomoyo_condition_list,
                                         list) {
                        if (atomic_read(&ptr->users) > 0)
@@ -547,6 +563,7 @@ static void tomoyo_collect_entry(void)
                        struct list_head *list = &ns->group_list[i];
                        struct tomoyo_group *group;
                        struct tomoyo_group *tmp;
+
                        switch (i) {
                        case 0:
                                id = TOMOYO_ID_PATH_GROUP;
@@ -574,6 +591,7 @@ static void tomoyo_collect_entry(void)
                struct list_head *list = &tomoyo_name_list[i];
                struct tomoyo_shared_acl_head *ptr;
                struct tomoyo_shared_acl_head *tmp;
+
                list_for_each_entry_safe(ptr, tmp, list, list) {
                        if (atomic_read(&ptr->users) > 0)
                                continue;
@@ -595,6 +613,7 @@ static int tomoyo_gc_thread(void *unused)
 {
        /* Garbage collector thread is exclusive. */
        static DEFINE_MUTEX(tomoyo_gc_mutex);
+
        if (!mutex_trylock(&tomoyo_gc_mutex))
                goto out;
        tomoyo_collect_entry();
index 21b0cc3a7e1a34ddad1d224826c0edac47ba357d..a37c7dc66e4448e41a498b5005950e6d127833e3 100644 (file)
@@ -75,11 +75,13 @@ int tomoyo_write_group(struct tomoyo_acl_param *param, const u8 type)
 {
        struct tomoyo_group *group = tomoyo_get_group(param, type);
        int error = -EINVAL;
+
        if (!group)
                return -ENOMEM;
        param->list = &group->member_list;
        if (type == TOMOYO_PATH_GROUP) {
                struct tomoyo_path_group e = { };
+
                e.member_name = tomoyo_get_name(tomoyo_read_token(param));
                if (!e.member_name) {
                        error = -ENOMEM;
@@ -90,6 +92,7 @@ int tomoyo_write_group(struct tomoyo_acl_param *param, const u8 type)
                tomoyo_put_name(e.member_name);
        } else if (type == TOMOYO_NUMBER_GROUP) {
                struct tomoyo_number_group e = { };
+
                if (param->data[0] == '@' ||
                    !tomoyo_parse_number_union(param, &e.number))
                        goto out;
@@ -129,6 +132,7 @@ tomoyo_path_matches_group(const struct tomoyo_path_info *pathname,
                          const struct tomoyo_group *group)
 {
        struct tomoyo_path_group *member;
+
        list_for_each_entry_rcu(member, &group->member_list, head.list) {
                if (member->head.is_deleted)
                        continue;
@@ -156,6 +160,7 @@ bool tomoyo_number_matches_group(const unsigned long min,
 {
        struct tomoyo_number_group *member;
        bool matched = false;
+
        list_for_each_entry_rcu(member, &group->member_list, head.list) {
                if (member->head.is_deleted)
                        continue;
index 81b95165205119fb41c6508044c2729e2c48a2b6..3445ae6fd4794eaf430384ec63622004bc93f8fb 100644 (file)
@@ -37,11 +37,12 @@ __setup("TOMOYO_loader=", tomoyo_loader_setup);
 static bool tomoyo_policy_loader_exists(void)
 {
        struct path path;
+
        if (!tomoyo_loader)
                tomoyo_loader = CONFIG_SECURITY_TOMOYO_POLICY_LOADER;
        if (kern_path(tomoyo_loader, LOOKUP_FOLLOW, &path)) {
-               printk(KERN_INFO "Not activating Mandatory Access Control "
-                      "as %s does not exist.\n", tomoyo_loader);
+               pr_info("Not activating Mandatory Access Control as %s does not exist.\n",
+                       tomoyo_loader);
                return false;
        }
        path_put(&path);
@@ -96,8 +97,7 @@ void tomoyo_load_policy(const char *filename)
        if (!tomoyo_policy_loader_exists())
                return;
        done = true;
-       printk(KERN_INFO "Calling %s to load policy. Please wait.\n",
-              tomoyo_loader);
+       pr_info("Calling %s to load policy. Please wait.\n", tomoyo_loader);
        argv[0] = (char *) tomoyo_loader;
        argv[1] = NULL;
        envp[0] = "HOME=/";
index 12477e0b0a1109f615b0b53c1329f2daeb410452..2e7fcfa923c9199a2ee83ae0bd776e9aecb6ec93 100644 (file)
@@ -19,9 +19,9 @@ void tomoyo_warn_oom(const char *function)
        /* Reduce error messages. */
        static pid_t tomoyo_last_pid;
        const pid_t pid = current->pid;
+
        if (tomoyo_last_pid != pid) {
-               printk(KERN_WARNING "ERROR: Out of memory at %s.\n",
-                      function);
+               pr_warn("ERROR: Out of memory at %s.\n", function);
                tomoyo_last_pid = pid;
        }
        if (!tomoyo_policy_loaded)
@@ -48,6 +48,7 @@ bool tomoyo_memory_ok(void *ptr)
 {
        if (ptr) {
                const size_t s = ksize(ptr);
+
                tomoyo_memory_used[TOMOYO_MEMORY_POLICY] += s;
                if (!tomoyo_memory_quota[TOMOYO_MEMORY_POLICY] ||
                    tomoyo_memory_used[TOMOYO_MEMORY_POLICY] <=
@@ -73,6 +74,7 @@ bool tomoyo_memory_ok(void *ptr)
 void *tomoyo_commit_ok(void *data, const unsigned int size)
 {
        void *ptr = kzalloc(size, GFP_NOFS);
+
        if (tomoyo_memory_ok(ptr)) {
                memmove(ptr, data, size);
                memset(data, 0, size);
@@ -98,6 +100,7 @@ struct tomoyo_group *tomoyo_get_group(struct tomoyo_acl_param *param,
        struct list_head *list;
        const char *group_name = tomoyo_read_token(param);
        bool found = false;
+
        if (!tomoyo_correct_word(group_name) || idx >= TOMOYO_MAX_GROUP)
                return NULL;
        e.group_name = tomoyo_get_name(group_name);
@@ -116,6 +119,7 @@ struct tomoyo_group *tomoyo_get_group(struct tomoyo_acl_param *param,
        }
        if (!found) {
                struct tomoyo_group *entry = tomoyo_commit_ok(&e, sizeof(e));
+
                if (entry) {
                        INIT_LIST_HEAD(&entry->member_list);
                        atomic_set(&entry->head.users, 1);
@@ -191,6 +195,7 @@ struct tomoyo_policy_namespace tomoyo_kernel_namespace;
 void __init tomoyo_mm_init(void)
 {
        int idx;
+
        for (idx = 0; idx < TOMOYO_MAX_HASH; idx++)
                INIT_LIST_HEAD(&tomoyo_name_list[idx]);
        tomoyo_kernel_namespace.name = "<kernel>";
index 7dc7f59b7ddecabc7a70253f1a213a581f943b90..2755971f50dfe025f2ad81f53c88c0ed1da6de0c 100644 (file)
@@ -49,6 +49,7 @@ static bool tomoyo_check_mount_acl(struct tomoyo_request_info *r,
 {
        const struct tomoyo_mount_acl *acl =
                container_of(ptr, typeof(*acl), head);
+
        return tomoyo_compare_number_union(r->param.mount.flags,
                                           &acl->flags) &&
                tomoyo_compare_name_union(r->param.mount.type,
@@ -89,6 +90,7 @@ static int tomoyo_mount_acl(struct tomoyo_request_info *r,
        struct tomoyo_path_info rdir;
        int need_dev = 0;
        int error = -ENOMEM;
+
        r->obj = &obj;
 
        /* Get fstype. */
index 6ff8c21e4fff5edfcfb3fed3bd511d37271120f4..85e6e31dd1e5d29c7465c6475e1790a907ba72c2 100644 (file)
@@ -94,11 +94,13 @@ static char *tomoyo_get_absolute_path(const struct path *path, char * const buff
                                      const int buflen)
 {
        char *pos = ERR_PTR(-ENOMEM);
+
        if (buflen >= 256) {
                /* go to whatever namespace root we are under */
                pos = d_absolute_path(path, buffer, buflen - 1);
                if (!IS_ERR(pos) && *pos == '/' && pos[1]) {
                        struct inode *inode = d_backing_inode(path->dentry);
+
                        if (inode && S_ISDIR(inode->i_mode)) {
                                buffer[buflen - 2] = '/';
                                buffer[buflen - 1] = '\0';
@@ -123,10 +125,12 @@ static char *tomoyo_get_dentry_path(struct dentry *dentry, char * const buffer,
                                    const int buflen)
 {
        char *pos = ERR_PTR(-ENOMEM);
+
        if (buflen >= 256) {
                pos = dentry_path_raw(dentry, buffer, buflen - 1);
                if (!IS_ERR(pos) && *pos == '/' && pos[1]) {
                        struct inode *inode = d_backing_inode(dentry);
+
                        if (inode && S_ISDIR(inode->i_mode)) {
                                buffer[buflen - 2] = '/';
                                buffer[buflen - 1] = '\0';
@@ -150,12 +154,14 @@ static char *tomoyo_get_local_path(struct dentry *dentry, char * const buffer,
 {
        struct super_block *sb = dentry->d_sb;
        char *pos = tomoyo_get_dentry_path(dentry, buffer, buflen);
+
        if (IS_ERR(pos))
                return pos;
        /* Convert from $PID to self if $PID is current thread. */
        if (sb->s_magic == PROC_SUPER_MAGIC && *pos == '/') {
                char *ep;
                const pid_t pid = (pid_t) simple_strtoul(pos + 1, &ep, 10);
+
                if (*ep == '/' && pid && pid ==
                    task_tgid_nr_ns(current, sb->s_fs_info)) {
                        pos = ep - 5;
@@ -170,6 +176,7 @@ static char *tomoyo_get_local_path(struct dentry *dentry, char * const buffer,
                goto prepend_filesystem_name;
        {
                struct inode *inode = d_backing_inode(sb->s_root);
+
                /*
                 * Use filesystem name if filesystem does not support rename()
                 * operation.
@@ -182,6 +189,7 @@ static char *tomoyo_get_local_path(struct dentry *dentry, char * const buffer,
                char name[64];
                int name_len;
                const dev_t dev = sb->s_dev;
+
                name[sizeof(name) - 1] = '\0';
                snprintf(name, sizeof(name) - 1, "dev(%u,%u):", MAJOR(dev),
                         MINOR(dev));
@@ -197,6 +205,7 @@ prepend_filesystem_name:
        {
                const char *name = sb->s_type->name;
                const int name_len = strlen(name);
+
                pos -= name_len + 1;
                if (pos < buffer)
                        goto out;
@@ -223,10 +232,10 @@ static char *tomoyo_get_socket_name(const struct path *path, char * const buffer
        struct inode *inode = d_backing_inode(path->dentry);
        struct socket *sock = inode ? SOCKET_I(inode) : NULL;
        struct sock *sk = sock ? sock->sk : NULL;
+
        if (sk) {
-               snprintf(buffer, buflen, "socket:[family=%u:type=%u:"
-                        "protocol=%u]", sk->sk_family, sk->sk_type,
-                        sk->sk_protocol);
+               snprintf(buffer, buflen, "socket:[family=%u:type=%u:protocol=%u]",
+                        sk->sk_family, sk->sk_type, sk->sk_protocol);
        } else {
                snprintf(buffer, buflen, "socket:[unknown]");
        }
@@ -255,12 +264,14 @@ char *tomoyo_realpath_from_path(const struct path *path)
        unsigned int buf_len = PAGE_SIZE / 2;
        struct dentry *dentry = path->dentry;
        struct super_block *sb;
+
        if (!dentry)
                return NULL;
        sb = dentry->d_sb;
        while (1) {
                char *pos;
                struct inode *inode;
+
                buf_len <<= 1;
                kfree(buf);
                buf = kmalloc(buf_len, GFP_NOFS);
@@ -323,6 +334,7 @@ char *tomoyo_realpath_nofollow(const char *pathname)
 
        if (pathname && kern_path(pathname, 0, &path) == 0) {
                char *buf = tomoyo_realpath_from_path(&path);
+
                path_put(&path);
                return buf;
        }
index 1d3d7e7a1f055a187aac486506ca3d13cba7da20..546281c5b233afb4a91968d524bffceeba9f96d4 100644 (file)
@@ -21,6 +21,7 @@ static bool tomoyo_check_task_acl(struct tomoyo_request_info *r,
 {
        const struct tomoyo_task_acl *acl = container_of(ptr, typeof(*acl),
                                                         head);
+
        return !tomoyo_pathcmp(r->param.task.domainname, acl->domainname);
 }
 
@@ -42,6 +43,7 @@ static ssize_t tomoyo_write_self(struct file *file, const char __user *buf,
 {
        char *data;
        int error;
+
        if (!count || count >= TOMOYO_EXEC_TMPSIZE - 10)
                return -ENOMEM;
        data = memdup_user_nul(buf, count);
@@ -52,6 +54,7 @@ static ssize_t tomoyo_write_self(struct file *file, const char __user *buf,
                const int idx = tomoyo_read_lock();
                struct tomoyo_path_info name;
                struct tomoyo_request_info r;
+
                name.name = data;
                tomoyo_fill_path_info(&name);
                /* Check "task manual_domain_transition" permission. */
@@ -67,18 +70,14 @@ static ssize_t tomoyo_write_self(struct file *file, const char __user *buf,
                        if (!new_domain) {
                                error = -ENOENT;
                        } else {
-                               struct cred *cred = prepare_creds();
-                               if (!cred) {
-                                       error = -ENOMEM;
-                               } else {
-                                       struct tomoyo_domain_info *old_domain =
-                                               cred->security;
-                                       cred->security = new_domain;
-                                       atomic_inc(&new_domain->users);
-                                       atomic_dec(&old_domain->users);
-                                       commit_creds(cred);
-                                       error = 0;
-                               }
+                               struct tomoyo_task *s = tomoyo_task(current);
+                               struct tomoyo_domain_info *old_domain =
+                                       s->domain_info;
+
+                               s->domain_info = new_domain;
+                               atomic_inc(&new_domain->users);
+                               atomic_dec(&old_domain->users);
+                               error = 0;
                        }
                }
                tomoyo_read_unlock(idx);
@@ -104,6 +103,7 @@ static ssize_t tomoyo_read_self(struct file *file, char __user *buf,
        const char *domain = tomoyo_domain()->domainname->name;
        loff_t len = strlen(domain);
        loff_t pos = *ppos;
+
        if (pos >= len || !count)
                return 0;
        len -= pos;
@@ -234,10 +234,14 @@ static void __init tomoyo_create_entry(const char *name, const umode_t mode,
  */
 static int __init tomoyo_initerface_init(void)
 {
+       struct tomoyo_domain_info *domain;
        struct dentry *tomoyo_dir;
 
+       if (!tomoyo_enabled)
+               return 0;
+       domain = tomoyo_domain();
        /* Don't create securityfs entries unless registered. */
-       if (current_cred()->security != &tomoyo_kernel_domain)
+       if (domain != &tomoyo_kernel_domain)
                return 0;
 
        tomoyo_dir = securityfs_create_dir("tomoyo", NULL);
index 1b5b5097efd77a5ca12376f5066123125e20824e..716c92ec941adc595c1887995afd897a77d8ab26 100644 (file)
@@ -9,17 +9,19 @@
 #include "common.h"
 
 /**
- * tomoyo_cred_alloc_blank - Target for security_cred_alloc_blank().
+ * tomoyo_domain - Get "struct tomoyo_domain_info" for current thread.
  *
- * @new: Pointer to "struct cred".
- * @gfp: Memory allocation flags.
- *
- * Returns 0.
+ * Returns pointer to "struct tomoyo_domain_info" for current thread.
  */
-static int tomoyo_cred_alloc_blank(struct cred *new, gfp_t gfp)
+struct tomoyo_domain_info *tomoyo_domain(void)
 {
-       new->security = NULL;
-       return 0;
+       struct tomoyo_task *s = tomoyo_task(current);
+
+       if (s->old_domain_info && !current->in_execve) {
+               atomic_dec(&s->old_domain_info->users);
+               s->old_domain_info = NULL;
+       }
+       return s->domain_info;
 }
 
 /**
@@ -34,42 +36,38 @@ static int tomoyo_cred_alloc_blank(struct cred *new, gfp_t gfp)
 static int tomoyo_cred_prepare(struct cred *new, const struct cred *old,
                               gfp_t gfp)
 {
-       struct tomoyo_domain_info *domain = old->security;
-       new->security = domain;
-       if (domain)
-               atomic_inc(&domain->users);
+       /* Restore old_domain_info saved by previous execve() request. */
+       struct tomoyo_task *s = tomoyo_task(current);
+
+       if (s->old_domain_info && !current->in_execve) {
+               atomic_dec(&s->domain_info->users);
+               s->domain_info = s->old_domain_info;
+               s->old_domain_info = NULL;
+       }
        return 0;
 }
 
 /**
- * tomoyo_cred_transfer - Target for security_transfer_creds().
+ * tomoyo_bprm_committed_creds - Target for security_bprm_committed_creds().
  *
- * @new: Pointer to "struct cred".
- * @old: Pointer to "struct cred".
+ * @bprm: Pointer to "struct linux_binprm".
  */
-static void tomoyo_cred_transfer(struct cred *new, const struct cred *old)
+static void tomoyo_bprm_committed_creds(struct linux_binprm *bprm)
 {
-       tomoyo_cred_prepare(new, old, 0);
-}
+       /* Clear old_domain_info saved by execve() request. */
+       struct tomoyo_task *s = tomoyo_task(current);
 
-/**
- * tomoyo_cred_free - Target for security_cred_free().
- *
- * @cred: Pointer to "struct cred".
- */
-static void tomoyo_cred_free(struct cred *cred)
-{
-       struct tomoyo_domain_info *domain = cred->security;
-       if (domain)
-               atomic_dec(&domain->users);
+       atomic_dec(&s->old_domain_info->users);
+       s->old_domain_info = NULL;
 }
 
+#ifndef CONFIG_SECURITY_TOMOYO_OMIT_USERSPACE_LOADER
 /**
  * tomoyo_bprm_set_creds - Target for security_bprm_set_creds().
  *
  * @bprm: Pointer to "struct linux_binprm".
  *
- * Returns 0 on success, negative value otherwise.
+ * Returns 0.
  */
 static int tomoyo_bprm_set_creds(struct linux_binprm *bprm)
 {
@@ -79,29 +77,15 @@ static int tomoyo_bprm_set_creds(struct linux_binprm *bprm)
         */
        if (bprm->called_set_creds)
                return 0;
-#ifndef CONFIG_SECURITY_TOMOYO_OMIT_USERSPACE_LOADER
        /*
         * Load policy if /sbin/tomoyo-init exists and /sbin/init is requested
         * for the first time.
         */
        if (!tomoyo_policy_loaded)
                tomoyo_load_policy(bprm->filename);
-#endif
-       /*
-        * Release reference to "struct tomoyo_domain_info" stored inside
-        * "bprm->cred->security". New reference to "struct tomoyo_domain_info"
-        * stored inside "bprm->cred->security" will be acquired later inside
-        * tomoyo_find_next_domain().
-        */
-       atomic_dec(&((struct tomoyo_domain_info *)
-                    bprm->cred->security)->users);
-       /*
-        * Tell tomoyo_bprm_check_security() is called for the first time of an
-        * execve operation.
-        */
-       bprm->cred->security = NULL;
        return 0;
 }
+#endif
 
 /**
  * tomoyo_bprm_check_security - Target for security_bprm_check().
@@ -112,23 +96,24 @@ static int tomoyo_bprm_set_creds(struct linux_binprm *bprm)
  */
 static int tomoyo_bprm_check_security(struct linux_binprm *bprm)
 {
-       struct tomoyo_domain_info *domain = bprm->cred->security;
+       struct tomoyo_task *s = tomoyo_task(current);
 
        /*
         * Execute permission is checked against pathname passed to do_execve()
         * using current domain.
         */
-       if (!domain) {
+       if (!s->old_domain_info) {
                const int idx = tomoyo_read_lock();
                const int err = tomoyo_find_next_domain(bprm);
+
                tomoyo_read_unlock(idx);
                return err;
        }
        /*
         * Read permission is checked against interpreters using next domain.
         */
-       return tomoyo_check_open_permission(domain, &bprm->file->f_path,
-                                           O_RDONLY);
+       return tomoyo_check_open_permission(s->domain_info,
+                                           &bprm->file->f_path, O_RDONLY);
 }
 
 /**
@@ -167,6 +152,7 @@ static int tomoyo_path_truncate(const struct path *path)
 static int tomoyo_path_unlink(const struct path *parent, struct dentry *dentry)
 {
        struct path path = { .mnt = parent->mnt, .dentry = dentry };
+
        return tomoyo_path_perm(TOMOYO_TYPE_UNLINK, &path, NULL);
 }
 
@@ -183,6 +169,7 @@ static int tomoyo_path_mkdir(const struct path *parent, struct dentry *dentry,
                             umode_t mode)
 {
        struct path path = { .mnt = parent->mnt, .dentry = dentry };
+
        return tomoyo_path_number_perm(TOMOYO_TYPE_MKDIR, &path,
                                       mode & S_IALLUGO);
 }
@@ -198,6 +185,7 @@ static int tomoyo_path_mkdir(const struct path *parent, struct dentry *dentry,
 static int tomoyo_path_rmdir(const struct path *parent, struct dentry *dentry)
 {
        struct path path = { .mnt = parent->mnt, .dentry = dentry };
+
        return tomoyo_path_perm(TOMOYO_TYPE_RMDIR, &path, NULL);
 }
 
@@ -214,6 +202,7 @@ static int tomoyo_path_symlink(const struct path *parent, struct dentry *dentry,
                               const char *old_name)
 {
        struct path path = { .mnt = parent->mnt, .dentry = dentry };
+
        return tomoyo_path_perm(TOMOYO_TYPE_SYMLINK, &path, old_name);
 }
 
@@ -271,6 +260,7 @@ static int tomoyo_path_link(struct dentry *old_dentry, const struct path *new_di
 {
        struct path path1 = { .mnt = new_dir->mnt, .dentry = old_dentry };
        struct path path2 = { .mnt = new_dir->mnt, .dentry = new_dentry };
+
        return tomoyo_path2_perm(TOMOYO_TYPE_LINK, &path1, &path2);
 }
 
@@ -291,6 +281,7 @@ static int tomoyo_path_rename(const struct path *old_parent,
 {
        struct path path1 = { .mnt = old_parent->mnt, .dentry = old_dentry };
        struct path path2 = { .mnt = new_parent->mnt, .dentry = new_dentry };
+
        return tomoyo_path2_perm(TOMOYO_TYPE_RENAME, &path1, &path2);
 }
 
@@ -322,11 +313,11 @@ static int tomoyo_file_fcntl(struct file *file, unsigned int cmd,
  */
 static int tomoyo_file_open(struct file *f)
 {
-       int flags = f->f_flags;
        /* Don't check read permission here if called from do_execve(). */
        if (current->in_execve)
                return 0;
-       return tomoyo_check_open_permission(tomoyo_domain(), &f->f_path, flags);
+       return tomoyo_check_open_permission(tomoyo_domain(), &f->f_path,
+                                           f->f_flags);
 }
 
 /**
@@ -370,6 +361,7 @@ static int tomoyo_path_chmod(const struct path *path, umode_t mode)
 static int tomoyo_path_chown(const struct path *path, kuid_t uid, kgid_t gid)
 {
        int error = 0;
+
        if (uid_valid(uid))
                error = tomoyo_path_number_perm(TOMOYO_TYPE_CHOWN, path,
                                                from_kuid(&init_user_ns, uid));
@@ -419,6 +411,7 @@ static int tomoyo_sb_mount(const char *dev_name, const struct path *path,
 static int tomoyo_sb_umount(struct vfsmount *mnt, int flags)
 {
        struct path path = { .mnt = mnt, .dentry = mnt->mnt_root };
+
        return tomoyo_path_perm(TOMOYO_TYPE_UMOUNT, &path, NULL);
 }
 
@@ -493,16 +486,61 @@ static int tomoyo_socket_sendmsg(struct socket *sock, struct msghdr *msg,
        return tomoyo_socket_sendmsg_permission(sock, msg, size);
 }
 
+struct lsm_blob_sizes tomoyo_blob_sizes __lsm_ro_after_init = {
+       .lbs_task = sizeof(struct tomoyo_task),
+};
+
+/**
+ * tomoyo_task_alloc - Target for security_task_alloc().
+ *
+ * @task:  Pointer to "struct task_struct".
+ * @flags: clone() flags.
+ *
+ * Returns 0.
+ */
+static int tomoyo_task_alloc(struct task_struct *task,
+                            unsigned long clone_flags)
+{
+       struct tomoyo_task *old = tomoyo_task(current);
+       struct tomoyo_task *new = tomoyo_task(task);
+
+       new->domain_info = old->domain_info;
+       atomic_inc(&new->domain_info->users);
+       new->old_domain_info = NULL;
+       return 0;
+}
+
+/**
+ * tomoyo_task_free - Target for security_task_free().
+ *
+ * @task: Pointer to "struct task_struct".
+ */
+static void tomoyo_task_free(struct task_struct *task)
+{
+       struct tomoyo_task *s = tomoyo_task(task);
+
+       if (s->domain_info) {
+               atomic_dec(&s->domain_info->users);
+               s->domain_info = NULL;
+       }
+       if (s->old_domain_info) {
+               atomic_dec(&s->old_domain_info->users);
+               s->old_domain_info = NULL;
+       }
+}
+
 /*
  * tomoyo_security_ops is a "struct security_operations" which is used for
  * registering TOMOYO.
  */
 static struct security_hook_list tomoyo_hooks[] __lsm_ro_after_init = {
-       LSM_HOOK_INIT(cred_alloc_blank, tomoyo_cred_alloc_blank),
        LSM_HOOK_INIT(cred_prepare, tomoyo_cred_prepare),
-       LSM_HOOK_INIT(cred_transfer, tomoyo_cred_transfer),
-       LSM_HOOK_INIT(cred_free, tomoyo_cred_free),
+       LSM_HOOK_INIT(bprm_committed_creds, tomoyo_bprm_committed_creds),
+       LSM_HOOK_INIT(task_alloc, tomoyo_task_alloc),
+       LSM_HOOK_INIT(task_free, tomoyo_task_free),
+#ifndef CONFIG_SECURITY_TOMOYO_OMIT_USERSPACE_LOADER
        LSM_HOOK_INIT(bprm_set_creds, tomoyo_bprm_set_creds),
+#endif
        LSM_HOOK_INIT(bprm_check_security, tomoyo_bprm_check_security),
        LSM_HOOK_INIT(file_fcntl, tomoyo_file_fcntl),
        LSM_HOOK_INIT(file_open, tomoyo_file_open),
@@ -531,6 +569,8 @@ static struct security_hook_list tomoyo_hooks[] __lsm_ro_after_init = {
 /* Lock for GC. */
 DEFINE_SRCU(tomoyo_ss);
 
+int tomoyo_enabled __lsm_ro_after_init = 1;
+
 /**
  * tomoyo_init - Register TOMOYO Linux as a LSM module.
  *
@@ -538,19 +578,23 @@ DEFINE_SRCU(tomoyo_ss);
  */
 static int __init tomoyo_init(void)
 {
-       struct cred *cred = (struct cred *) current_cred();
+       struct tomoyo_task *s = tomoyo_task(current);
 
-       if (!security_module_enable("tomoyo"))
-               return 0;
        /* register ourselves with the security framework */
        security_add_hooks(tomoyo_hooks, ARRAY_SIZE(tomoyo_hooks), "tomoyo");
-       printk(KERN_INFO "TOMOYO Linux initialized\n");
-       cred->security = &tomoyo_kernel_domain;
+       pr_info("TOMOYO Linux initialized\n");
+       s->domain_info = &tomoyo_kernel_domain;
+       atomic_inc(&tomoyo_kernel_domain.users);
+       s->old_domain_info = NULL;
        tomoyo_mm_init();
+
        return 0;
 }
 
 DEFINE_LSM(tomoyo) = {
        .name = "tomoyo",
+       .enabled = &tomoyo_enabled,
+       .flags = LSM_FLAG_LEGACY_MAJOR,
+       .blobs = &tomoyo_blob_sizes,
        .init = tomoyo_init,
 };
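The new hooks above reach their per-task state through tomoyo_task(), which lives in common.h and is not part of this hunk. A minimal sketch of the blob layout and accessor, assuming the generic lsm_blob_sizes offset scheme and using the field names seen in the hooks above (not the committed implementation):

/* Per-task TOMOYO blob, reserved via tomoyo_blob_sizes.lbs_task (sketch). */
struct tomoyo_task {
	struct tomoyo_domain_info *domain_info;
	struct tomoyo_domain_info *old_domain_info;
};

/* Return the TOMOYO slice of the task security blob (sketch). */
static inline struct tomoyo_task *tomoyo_task(struct task_struct *task)
{
	return task->security + tomoyo_blob_sizes.lbs_task;
}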
index badffc8271c8f0f4ea1b5a37e98d84f46e2bb642..0517cbdd72756eb35e95e797edc607a182f6e01e 100644 (file)
@@ -91,6 +91,7 @@ const u8 tomoyo_index2category[TOMOYO_MAX_MAC_INDEX] = {
 void tomoyo_convert_time(time64_t time64, struct tomoyo_time *stamp)
 {
        struct tm tm;
+
        time64_to_tm(time64, 0, &tm);
        stamp->sec = tm.tm_sec;
        stamp->min = tm.tm_min;
@@ -113,6 +114,7 @@ void tomoyo_convert_time(time64_t time64, struct tomoyo_time *stamp)
 bool tomoyo_permstr(const char *string, const char *keyword)
 {
        const char *cp = strstr(string, keyword);
+
        if (cp)
                return cp == string || *(cp - 1) == '/';
        return false;
@@ -132,6 +134,7 @@ char *tomoyo_read_token(struct tomoyo_acl_param *param)
 {
        char *pos = param->data;
        char *del = strchr(pos, ' ');
+
        if (del)
                *del++ = '\0';
        else
@@ -152,6 +155,7 @@ const struct tomoyo_path_info *tomoyo_get_domainname
 {
        char *start = param->data;
        char *pos = start;
+
        while (*pos) {
                if (*pos++ != ' ' || *pos++ == '/')
                        continue;
@@ -181,8 +185,10 @@ u8 tomoyo_parse_ulong(unsigned long *result, char **str)
        const char *cp = *str;
        char *ep;
        int base = 10;
+
        if (*cp == '0') {
                char c = *(cp + 1);
+
                if (c == 'x' || c == 'X') {
                        base = 16;
                        cp += 2;
@@ -240,6 +246,7 @@ bool tomoyo_parse_name_union(struct tomoyo_acl_param *param,
                             struct tomoyo_name_union *ptr)
 {
        char *filename;
+
        if (param->data[0] == '@') {
                param->data++;
                ptr->group = tomoyo_get_group(param, TOMOYO_PATH_GROUP);
@@ -266,6 +273,7 @@ bool tomoyo_parse_number_union(struct tomoyo_acl_param *param,
        char *data;
        u8 type;
        unsigned long v;
+
        memset(ptr, 0, sizeof(*ptr));
        if (param->data[0] == '@') {
                param->data++;
@@ -429,6 +437,7 @@ static bool tomoyo_correct_word2(const char *string, size_t len)
        unsigned char c;
        unsigned char d;
        unsigned char e;
+
        if (!len)
                goto out;
        while (len--) {
@@ -533,6 +542,7 @@ bool tomoyo_correct_domain(const unsigned char *domainname)
                return true;
        while (1) {
                const unsigned char *cp = strchr(domainname, ' ');
+
                if (!cp)
                        break;
                if (*domainname != '/' ||
@@ -554,6 +564,7 @@ bool tomoyo_domain_def(const unsigned char *buffer)
 {
        const unsigned char *cp;
        int len;
+
        if (*buffer != '<')
                return false;
        cp = strchr(buffer, ' ');
@@ -668,6 +679,9 @@ static bool tomoyo_file_matches_pattern2(const char *filename,
 {
        while (filename < filename_end && pattern < pattern_end) {
                char c;
+               int i;
+               int j;
+
                if (*pattern != '\\') {
                        if (*filename++ != *pattern++)
                                return false;
@@ -676,8 +690,6 @@ static bool tomoyo_file_matches_pattern2(const char *filename,
                c = *filename;
                pattern++;
                switch (*pattern) {
-                       int i;
-                       int j;
                case '?':
                        if (c == '/') {
                                return false;
@@ -985,6 +997,7 @@ int tomoyo_init_request_info(struct tomoyo_request_info *r,
                             struct tomoyo_domain_info *domain, const u8 index)
 {
        u8 profile;
+
        memset(r, 0, sizeof(*r));
        if (!domain)
                domain = tomoyo_domain();
@@ -1018,6 +1031,7 @@ bool tomoyo_domain_quota_is_ok(struct tomoyo_request_info *r)
        list_for_each_entry_rcu(ptr, &domain->acl_info_list, list) {
                u16 perm;
                u8 i;
+
                if (ptr->is_deleted)
                        continue;
                switch (ptr->type) {
@@ -1062,9 +1076,8 @@ bool tomoyo_domain_quota_is_ok(struct tomoyo_request_info *r)
                domain->flags[TOMOYO_DIF_QUOTA_WARNED] = true;
                /* r->granted = false; */
                tomoyo_write_log(r, "%s", tomoyo_dif[TOMOYO_DIF_QUOTA_WARNED]);
-               printk(KERN_WARNING "WARNING: "
-                      "Domain '%s' has too many ACLs to hold. "
-                      "Stopped learning mode.\n", domain->domainname->name);
+               pr_warn("WARNING: Domain '%s' has too many ACLs to hold. Stopped learning mode.\n",
+                       domain->domainname->name);
        }
        return false;
 }
index 02514fe558b416be1fd2bc407b01a6fb1e0b59b8..57cc60722dd3855021c56a3e46d900e1f0ad0efe 100644 (file)
@@ -479,9 +479,15 @@ static void __init yama_init_sysctl(void)
 static inline void yama_init_sysctl(void) { }
 #endif /* CONFIG_SYSCTL */
 
-void __init yama_add_hooks(void)
+static int __init yama_init(void)
 {
        pr_info("Yama: becoming mindful.\n");
        security_add_hooks(yama_hooks, ARRAY_SIZE(yama_hooks), "yama");
        yama_init_sysctl();
+       return 0;
 }
+
+DEFINE_LSM(yama) = {
+       .name = "yama",
+       .init = yama_init,
+};
index ecc14d68e101338a46b55d0f4760932fea6141b5..908de689a902a2bda5438bd3de35d1018f126d03 100644 (file)
@@ -25,7 +25,7 @@ unsigned long long clock_frequency;
 unsigned long long timebase_frequency;
 double timebase_multiplier;
 
-static inline unsigned long long mftb(void)
+static inline unsigned long mftb(void)
 {
        unsigned long low;
 
index 52b4710469d23a840813fa5fc5751fd967e436a2..96043b9b9829e996a610af3c0708312ccc8d39be 100644 (file)
 #define TEXASR_TE      0x0000000004000000
 #define TEXASR_ROT     0x0000000002000000
 
+/* MSR register bits */
+#define MSR_TS_S_LG     33              /* Trans Mem state: Suspended */
+
+#define __MASK(X)       (1UL<<(X))
+
+/* macro to check TM MSR bits */
+#define MSR_TS_S        __MASK(MSR_TS_S_LG)   /* Transaction Suspended */
+
 /* Vector Instructions */
 #define VSX_XX1(xs, ra, rb)    (((xs) & 0x1f) << 21 | ((ra) << 16) |  \
                                 ((rb) << 11) | (((xs) >> 5)))
index ae43a614835de78756efdc60db92668575361adf..7636bf45d5d5eb0bab70dbbd7d50d508a40498c2 100644 (file)
@@ -102,8 +102,10 @@ do {                                                               \
 
 #if defined(__powerpc64__)
 #define UCONTEXT_NIA(UC)       (UC)->uc_mcontext.gp_regs[PT_NIP]
+#define UCONTEXT_MSR(UC)       (UC)->uc_mcontext.gp_regs[PT_MSR]
 #elif defined(__powerpc__)
 #define UCONTEXT_NIA(UC)       (UC)->uc_mcontext.uc_regs->gregs[PT_NIP]
+#define UCONTEXT_MSR(UC)       (UC)->uc_mcontext.uc_regs->gregs[PT_MSR]
 #else
 #error implement UCONTEXT_NIA
 #endif
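Together with the MSR_TS_S bit added in the previous file, UCONTEXT_MSR() lets a signal handler inspect or force the transactional state recorded in a saved signal context. A minimal sketch, assuming the selftest headers above are included and that uc is the third argument of a SA_SIGINFO handler:

static void handler(int signo, siginfo_t *si, void *uc)
{
	ucontext_t *ucp = uc;

	/* MSR_TS_S is 1UL << 33: mark the saved context as "transaction suspended". */
	if (!(UCONTEXT_MSR(ucp) & MSR_TS_S))
		UCONTEXT_MSR(ucp) |= MSR_TS_S;
}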
index 167135bd92a8e61e9324fdf7523e1d0fe049a52e..af1b8026507650e8bd22c43588d2fd8159727157 100644 (file)
@@ -11,7 +11,6 @@
 #include <sys/wait.h>
 #include <unistd.h>
 #include <setjmp.h>
-#include <signal.h>
 
 #include "ebb.h"
 
index 208452a93e2ca8c2567a29b9873812b66dc08fac..951fe855f7cd465a03dc1f03b344bbaa0d8f9264 100644 (file)
@@ -11,6 +11,7 @@ tm-signal-context-chk-fpu
 tm-signal-context-chk-gpr
 tm-signal-context-chk-vmx
 tm-signal-context-chk-vsx
+tm-signal-context-force-tm
 tm-signal-sigreturn-nt
 tm-vmx-unavail
 tm-unavailable
index 75a6853591294d3fe07f0cc85029389b567b7698..c0734ed0ef56bb67ed9330ff2f84462759aca97a 100644 (file)
@@ -4,7 +4,8 @@ SIGNAL_CONTEXT_CHK_TESTS := tm-signal-context-chk-gpr tm-signal-context-chk-fpu
 
 TEST_GEN_PROGS := tm-resched-dscr tm-syscall tm-signal-msr-resv tm-signal-stack \
        tm-vmxcopy tm-fork tm-tar tm-tmspr tm-vmx-unavail tm-unavailable tm-trap \
-       $(SIGNAL_CONTEXT_CHK_TESTS) tm-sigreturn tm-signal-sigreturn-nt
+       $(SIGNAL_CONTEXT_CHK_TESTS) tm-sigreturn tm-signal-sigreturn-nt \
+       tm-signal-context-force-tm
 
 top_srcdir = ../../../../..
 include ../../lib.mk
@@ -20,6 +21,7 @@ $(OUTPUT)/tm-vmx-unavail: CFLAGS += -pthread -m64
 $(OUTPUT)/tm-resched-dscr: ../pmu/lib.c
 $(OUTPUT)/tm-unavailable: CFLAGS += -O0 -pthread -m64 -Wno-error=uninitialized -mvsx
 $(OUTPUT)/tm-trap: CFLAGS += -O0 -pthread -m64
+$(OUTPUT)/tm-signal-context-force-tm: CFLAGS += -pthread -m64
 
 SIGNAL_CONTEXT_CHK_TESTS := $(patsubst %,$(OUTPUT)/%,$(SIGNAL_CONTEXT_CHK_TESTS))
 $(SIGNAL_CONTEXT_CHK_TESTS): tm-signal.S
diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-context-force-tm.c b/tools/testing/selftests/powerpc/tm/tm-signal-context-force-tm.c
new file mode 100644 (file)
index 0000000..3171762
--- /dev/null
@@ -0,0 +1,184 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2018, Breno Leitao, Gustavo Romero, IBM Corp.
+ *
+ * This test raises a SIGUSR1 signal and toggles the MSR[TS]
+ * fields in the signal handler. With MSR[TS] set, the kernel will
+ * force a recheckpoint, which may cause a segfault when returning to
+ * user space. Since the test needs to re-run, the segfault needs to be
+ * caught and handled.
+ *
+ * In order to continue the test even after a segfault, the context is
+ * saved prior to the signal being raised, and it is restored when there
+ * is a segmentation fault. This happens COUNT_MAX times.
+ *
+ * This test never returns EXIT_FAILURE. It either succeeds, or crashes
+ * the kernel (on a buggy kernel).
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <string.h>
+#include <ucontext.h>
+#include <unistd.h>
+#include <sys/mman.h>
+
+#include "tm.h"
+#include "utils.h"
+#include "reg.h"
+
+#define COUNT_MAX       5000           /* Number of iterations */
+
+/*
+ * This test only runs on 64-bit systems. MSR_TS_S is redefined to 0 to
+ * avoid a compilation issue on 32-bit systems. There is no side effect,
+ * since the whole test is skipped when not running on a 64-bit system.
+ */
+#ifndef __powerpc64__
+#undef  MSR_TS_S
+#define MSR_TS_S       0
+#endif
+
+/* Setting contexts because the test will crash and we want to recover */
+ucontext_t init_context, main_context;
+
+static int count, first_time;
+
+void usr_signal_handler(int signo, siginfo_t *si, void *uc)
+{
+       ucontext_t *ucp = uc;
+       int ret;
+
+       /*
+        * Memory is allocated in the signal handler and never freed on
+        * purpose, forcing the heap to grow; this memory leak is exactly
+        * what we want here.
+        */
+       ucp->uc_link = mmap(NULL, sizeof(ucontext_t),
+                           PROT_READ | PROT_WRITE,
+                           MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
+       if (ucp->uc_link == (void *)-1) {
+               perror("Mmap failed");
+               exit(-1);
+       }
+
+       /* Force the page to be allocated through a page fault */
+       ret = madvise(ucp->uc_link, sizeof(ucontext_t), MADV_DONTNEED);
+       if (ret) {
+               perror("madvise failed");
+               exit(-1);
+       }
+
+       memcpy(&ucp->uc_link->uc_mcontext, &ucp->uc_mcontext,
+               sizeof(ucp->uc_mcontext));
+
+       /* Force MSR[TS] into the suspended state */
+       UCONTEXT_MSR(ucp) |= MSR_TS_S;
+
+       /*
+        * A fork inside a signal handler seems to be more efficient than a
+        * fork() prior to the signal being raised.
+        */
+       if (fork() == 0) {
+               /*
+                * Both child and parent will return, but, child returns
+                * with count set so it will exit in the next segfault.
+                * Parent will continue to loop.
+                */
+               count = COUNT_MAX;
+       }
+
+       /*
+        * If the change above does not hit the bug, it will cause a
+        * segmentation fault, since the ck structures are NULL.
+        */
+}
+
+void seg_signal_handler(int signo, siginfo_t *si, void *uc)
+{
+       if (count == COUNT_MAX) {
+               /* Return to tm_signal_context_force_tm() and exit */
+               setcontext(&main_context);
+       }
+
+       count++;
+
+       /* Reexecute the test */
+       setcontext(&init_context);
+}
+
+void tm_trap_test(void)
+{
+       struct sigaction usr_sa, seg_sa;
+       stack_t ss;
+
+       usr_sa.sa_flags = SA_SIGINFO | SA_ONSTACK;
+       usr_sa.sa_sigaction = usr_signal_handler;
+
+       seg_sa.sa_flags = SA_SIGINFO;
+       seg_sa.sa_sigaction = seg_signal_handler;
+
+       /*
+        * Set initial context. Will get back here from
+        * seg_signal_handler()
+        */
+       getcontext(&init_context);
+
+       /* Allocate an alternative signal stack area */
+       ss.ss_sp = mmap(NULL, SIGSTKSZ, PROT_READ | PROT_WRITE,
+                       MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
+       ss.ss_size = SIGSTKSZ;
+       ss.ss_flags = 0;
+
+       if (ss.ss_sp == (void *)-1) {
+               perror("mmap error\n");
+               exit(-1);
+       }
+
+       /* Force the allocation through a page fault */
+       if (madvise(ss.ss_sp, SIGSTKSZ, MADV_DONTNEED)) {
+               perror("madvise\n");
+               exit(-1);
+       }
+
+       /* Setting an alternative stack to generate a page fault when
+        * the signal is raised.
+        */
+       if (sigaltstack(&ss, NULL)) {
+               perror("sigaltstack\n");
+               exit(-1);
+       }
+
+       /* The signal handler will enable MSR_TS */
+       sigaction(SIGUSR1, &usr_sa, NULL);
+       /* If the kernel does not crash, the test will segfault; catch it to retest */
+       sigaction(SIGSEGV, &seg_sa, NULL);
+
+       raise(SIGUSR1);
+}
+
+int tm_signal_context_force_tm(void)
+{
+       SKIP_IF(!have_htm());
+       /*
+        * Skip if not running on a 64-bit system, since it does not seem
+        * possible to set the TS bits in mcontext's [MSR] there, as the
+        * register is only 32 bits wide.
+        */
+       SKIP_IF(!is_ppc64le());
+
+       /* Will get back here after COUNT_MAX iterations */
+       getcontext(&main_context);
+
+       if (!first_time++)
+               tm_trap_test();
+
+       return EXIT_SUCCESS;
+}
+
+int main(int argc, char **argv)
+{
+       return test_harness(tm_signal_context_force_tm, "tm_signal_context_force_tm");
+}
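The test's crash-and-retry loop relies on getcontext()/setcontext() to survive the expected segmentation faults. A standalone sketch of just that recovery pattern (hypothetical example, not part of the patch; like the test itself, it calls setcontext() from a signal handler, which Linux tolerates):

#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <ucontext.h>

static ucontext_t restart_point;
static volatile int attempts;

static void segv_handler(int signo)
{
	if (++attempts >= 5)
		exit(EXIT_SUCCESS);
	setcontext(&restart_point);	/* jump back to the saved context */
}

int main(void)
{
	signal(SIGSEGV, segv_handler);
	getcontext(&restart_point);	/* execution resumes here after each fault */
	printf("attempt %d\n", attempts);
	*(volatile int *)NULL = 0;	/* fault on purpose */
	return 0;
}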
diff --git a/tools/testing/selftests/safesetid/.gitignore b/tools/testing/selftests/safesetid/.gitignore
new file mode 100644 (file)
index 0000000..9c1a629
--- /dev/null
@@ -0,0 +1 @@
+safesetid-test
diff --git a/tools/testing/selftests/safesetid/Makefile b/tools/testing/selftests/safesetid/Makefile
new file mode 100644 (file)
index 0000000..98da7a5
--- /dev/null
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0
+# Makefile for safesetid selftests.
+CFLAGS = -Wall -lcap -O2
+
+TEST_PROGS := run_tests.sh
+TEST_GEN_FILES := safesetid-test
+
+include ../lib.mk
diff --git a/tools/testing/selftests/safesetid/config b/tools/testing/selftests/safesetid/config
new file mode 100644 (file)
index 0000000..9d44e5c
--- /dev/null
@@ -0,0 +1,2 @@
+CONFIG_SECURITY=y
+CONFIG_SECURITYFS=y
diff --git a/tools/testing/selftests/safesetid/safesetid-test.c b/tools/testing/selftests/safesetid/safesetid-test.c
new file mode 100644 (file)
index 0000000..892c8e8
--- /dev/null
@@ -0,0 +1,334 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <errno.h>
+#include <pwd.h>
+#include <string.h>
+#include <syscall.h>
+#include <sys/capability.h>
+#include <sys/types.h>
+#include <sys/mount.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <stdbool.h>
+#include <stdarg.h>
+
+#ifndef CLONE_NEWUSER
+# define CLONE_NEWUSER 0x10000000
+#endif
+
+#define ROOT_USER 0
+#define RESTRICTED_PARENT 1
+#define ALLOWED_CHILD1 2
+#define ALLOWED_CHILD2 3
+#define NO_POLICY_USER 4
+
+char* add_whitelist_policy_file = "/sys/kernel/security/safesetid/add_whitelist_policy";
+
+static void die(char *fmt, ...)
+{
+       va_list ap;
+       va_start(ap, fmt);
+       vfprintf(stderr, fmt, ap);
+       va_end(ap);
+       exit(EXIT_FAILURE);
+}
+
+static bool vmaybe_write_file(bool enoent_ok, char *filename, char *fmt, va_list ap)
+{
+       char buf[4096];
+       int fd;
+       ssize_t written;
+       int buf_len;
+
+       buf_len = vsnprintf(buf, sizeof(buf), fmt, ap);
+       if (buf_len < 0) {
+               printf("vsnprintf failed: %s\n",
+                   strerror(errno));
+               return false;
+       }
+       if (buf_len >= sizeof(buf)) {
+               printf("vsnprintf output truncated\n");
+               return false;
+       }
+
+       fd = open(filename, O_WRONLY);
+       if (fd < 0) {
+               if ((errno == ENOENT) && enoent_ok)
+                       return true;
+               return false;
+       }
+       written = write(fd, buf, buf_len);
+       if (written != buf_len) {
+               if (written >= 0) {
+                       printf("short write to %s\n", filename);
+                       return false;
+               } else {
+                       printf("write to %s failed: %s\n",
+                               filename, strerror(errno));
+                       return false;
+               }
+       }
+       if (close(fd) != 0) {
+               printf("close of %s failed: %s\n",
+                       filename, strerror(errno));
+               return false;
+       }
+       return true;
+}
+
+static bool write_file(char *filename, char *fmt, ...)
+{
+       va_list ap;
+       bool ret;
+
+       va_start(ap, fmt);
+       ret = vmaybe_write_file(false, filename, fmt, ap);
+       va_end(ap);
+
+       return ret;
+}
+
+static void ensure_user_exists(uid_t uid)
+{
+       struct passwd p;
+
+       FILE *fd;
+       char name_str[10];
+
+       if (getpwuid(uid) == NULL) {
+               memset(&p,0x00,sizeof(p));
+               fd=fopen("/etc/passwd","a");
+               if (fd == NULL)
+                       die("couldn't open file\n");
+               if (fseek(fd, 0, SEEK_END))
+                       die("couldn't fseek\n");
+               snprintf(name_str, 10, "%d", uid);
+               p.pw_name=name_str;
+               p.pw_uid=uid;
+               p.pw_gecos="Test account";
+               p.pw_dir="/dev/null";
+               p.pw_shell="/bin/false";
+               int value = putpwent(&p,fd);
+               if (value != 0)
+                       die("putpwent failed\n");
+               if (fclose(fd))
+                       die("fclose failed\n");
+       }
+}
+
+static void ensure_securityfs_mounted(void)
+{
+       int fd = open(add_whitelist_policy_file, O_WRONLY);
+       if (fd < 0) {
+               if (errno == ENOENT) {
+                       // Need to mount securityfs
+                       if (mount("securityfs", "/sys/kernel/security",
+                                               "securityfs", 0, NULL) < 0)
+                               die("mounting securityfs failed\n");
+               } else {
+                       die("couldn't find securityfs for unknown reason\n");
+               }
+       } else {
+               if (close(fd) != 0) {
+                       die("close of %s failed: %s\n",
+                               add_whitelist_policy_file, strerror(errno));
+               }
+       }
+}
+
+static void write_policies(void)
+{
+       ssize_t written;
+       int fd;
+
+       fd = open(add_whitelist_policy_file, O_WRONLY);
+       if (fd < 0)
+               die("cant open add_whitelist_policy file\n");
+       written = write(fd, "1:2", strlen("1:2"));
+       if (written != strlen("1:2")) {
+               if (written >= 0) {
+                       die("short write to %s\n", add_whitelist_policy_file);
+               } else {
+                       die("write to %s failed: %s\n",
+                               add_whitelist_policy_file, strerror(errno));
+               }
+       }
+       written = write(fd, "1:3", strlen("1:3"));
+       if (written != strlen("1:3")) {
+               if (written >= 0) {
+                       die("short write to %s\n", add_whitelist_policy_file);
+               } else {
+                       die("write to %s failed: %s\n",
+                               add_whitelist_policy_file, strerror(errno));
+               }
+       }
+       if (close(fd) != 0) {
+               die("close of %s failed: %s\n",
+                       add_whitelist_policy_file, strerror(errno));
+       }
+}
+
+static bool test_userns(bool expect_success)
+{
+       uid_t uid;
+       char map_file_name[32];
+       size_t sz = sizeof(map_file_name);
+       pid_t cpid;
+       bool success;
+
+       uid = getuid();
+
+       int clone_flags = CLONE_NEWUSER;
+       cpid = syscall(SYS_clone, clone_flags, NULL);
+       if (cpid == -1) {
+           printf("clone failed");
+           return false;
+       }
+
+       if (cpid == 0) {        /* Code executed by child */
+               // Give parent 1 second to write map file
+               sleep(1);
+               exit(EXIT_SUCCESS);
+       } else {                /* Code executed by parent */
+               if(snprintf(map_file_name, sz, "/proc/%d/uid_map", cpid) < 0) {
+                       printf("preparing file name string failed");
+                       return false;
+               }
+               success = write_file(map_file_name, "0 0 1", uid);
+               return success == expect_success;
+       }
+
+       printf("should not reach here");
+       return false;
+}
+
+static void test_setuid(uid_t child_uid, bool expect_success)
+{
+       pid_t cpid, w;
+       int wstatus;
+
+       cpid = fork();
+       if (cpid == -1) {
+               die("fork\n");
+       }
+
+       if (cpid == 0) {            /* Code executed by child */
+               setuid(child_uid);
+               if (getuid() == child_uid)
+                       exit(EXIT_SUCCESS);
+               else
+                       exit(EXIT_FAILURE);
+       } else {                 /* Code executed by parent */
+               do {
+                       w = waitpid(cpid, &wstatus, WUNTRACED | WCONTINUED);
+                       if (w == -1) {
+                               die("waitpid\n");
+                       }
+
+                       if (WIFEXITED(wstatus)) {
+                               if (WEXITSTATUS(wstatus) == EXIT_SUCCESS) {
+                                       if (expect_success) {
+                                               return;
+                                       } else {
+                                               die("unexpected success\n");
+                                       }
+                               } else {
+                                       if (expect_success) {
+                                               die("unexpected failure\n");
+                                       } else {
+                                               return;
+                                       }
+                               }
+                       } else if (WIFSIGNALED(wstatus)) {
+                               if (WTERMSIG(wstatus) == 9) {
+                                       if (expect_success)
+                                               die("killed unexpectedly\n");
+                                       else
+                                               return;
+                               } else {
+                                       die("unexpected signal: %d\n", wstatus);
+                               }
+                       } else {
+                               die("unexpected status: %d\n", wstatus);
+                       }
+               } while (!WIFEXITED(wstatus) && !WIFSIGNALED(wstatus));
+       }
+
+       die("should not reach here\n");
+}
+
+static void ensure_users_exist(void)
+{
+       ensure_user_exists(ROOT_USER);
+       ensure_user_exists(RESTRICTED_PARENT);
+       ensure_user_exists(ALLOWED_CHILD1);
+       ensure_user_exists(ALLOWED_CHILD2);
+       ensure_user_exists(NO_POLICY_USER);
+}
+
+static void drop_caps(bool setid_retained)
+{
+       cap_value_t cap_values[] = {CAP_SETUID, CAP_SETGID};
+       cap_t caps;
+
+       caps = cap_get_proc();
+       if (setid_retained)
+               cap_set_flag(caps, CAP_EFFECTIVE, 2, cap_values, CAP_SET);
+       else
+               cap_clear(caps);
+       cap_set_proc(caps);
+       cap_free(caps);
+}
+
+int main(int argc, char **argv)
+{
+       ensure_users_exist();
+       ensure_securityfs_mounted();
+       write_policies();
+
+       if (prctl(PR_SET_KEEPCAPS, 1L))
+               die("Error with set keepcaps\n");
+
+       // First test to make sure we can write userns mappings from a user
+       // that doesn't have any restrictions (as long as it has CAP_SETUID);
+       setuid(NO_POLICY_USER);
+       setgid(NO_POLICY_USER);
+
+       // Take away all but setid caps
+       drop_caps(true);
+
+       // Need PR_SET_DUMPABLE flag set so we can write /proc/[pid]/uid_map
+       // from non-root parent process.
+       if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0))
+               die("Error with set dumpable\n");
+
+       if (!test_userns(true)) {
+               die("test_userns failed when it should work\n");
+       }
+
+       setuid(RESTRICTED_PARENT);
+       setgid(RESTRICTED_PARENT);
+
+       test_setuid(ROOT_USER, false);
+       test_setuid(ALLOWED_CHILD1, true);
+       test_setuid(ALLOWED_CHILD2, true);
+       test_setuid(NO_POLICY_USER, false);
+
+       if (!test_userns(false)) {
+               die("test_userns worked when it should fail\n");
+       }
+
+       // Now take away all caps
+       drop_caps(false);
+       test_setuid(2, false);
+       test_setuid(3, false);
+       test_setuid(4, false);
+
+       // NOTE: this test doesn't clean up users that were created in
+       // /etc/passwd or flush policies that were added to the LSM.
+       return EXIT_SUCCESS;
+}
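For reference, the whitelist strings written by write_policies() pair the UID constants defined at the top of the file; an equivalent sketch using the test's own write_file() helper, assuming the securityfs file accepts one "parent_uid:child_uid" pair per write:

/* RESTRICTED_PARENT (uid 1) may setuid() only to ALLOWED_CHILD1 (uid 2)
 * and ALLOWED_CHILD2 (uid 3); every other transition should be rejected. */
write_file(add_whitelist_policy_file, "%d:%d", RESTRICTED_PARENT, ALLOWED_CHILD1);
write_file(add_whitelist_policy_file, "%d:%d", RESTRICTED_PARENT, ALLOWED_CHILD2);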
diff --git a/tools/testing/selftests/safesetid/safesetid-test.sh b/tools/testing/selftests/safesetid/safesetid-test.sh
new file mode 100755 (executable)
index 0000000..e4fdce6
--- /dev/null
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+TCID="safesetid-test.sh"
+errcode=0
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+check_root()
+{
+       uid=$(id -u)
+       if [ $uid -ne 0 ]; then
+               echo $TCID: must be run as root >&2
+               exit $ksft_skip
+       fi
+}
+
+main_function()
+{
+  check_root
+  ./safesetid-test
+}
+
+main_function
+echo "$TCID: done"
+exit $errcode
index 9b777fa95f090e5bb811e16ae90021a84927e355..5a2d7b8efc407bcbf2b6664ed5a1e596a5b2c679 100644 (file)
 #define MAP_HUGETLB 0x40000 /* arch specific */
 #endif
 
+#ifndef MAP_HUGE_SHIFT
+#define MAP_HUGE_SHIFT 26
+#endif
+
+#ifndef MAP_HUGE_MASK
+#define MAP_HUGE_MASK 0x3f
+#endif
+
 /* Only ia64 requires this */
 #ifdef __ia64__
 #define ADDR (void *)(0x8000000000000000UL)
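The MAP_HUGE_SHIFT/MAP_HUGE_MASK fallbacks above are combined with a user-supplied page-size shift in the hunk below. A standalone sketch of requesting a specific hugepage size this way (assumption: a 2 MB hugepage pool is configured on the system):

#include <stdio.h>
#include <sys/mman.h>

#ifndef MAP_HUGETLB
#define MAP_HUGETLB 0x40000
#endif
#ifndef MAP_HUGE_SHIFT
#define MAP_HUGE_SHIFT 26
#endif

int main(void)
{
	size_t len = 4UL << 20;			/* 4 MB */
	int shift = 21;				/* log2(2 MB) */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB |
		       (shift << MAP_HUGE_SHIFT), -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	printf("mapped %zu bytes with %u kB pages\n", len, 1U << (shift - 10));
	munmap(p, len);
	return 0;
}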
@@ -58,12 +66,29 @@ static int read_bytes(char *addr)
        return 0;
 }
 
-int main(void)
+int main(int argc, char **argv)
 {
        void *addr;
        int ret;
+       size_t length = LENGTH;
+       int flags = FLAGS;
+       int shift = 0;
+
+       if (argc > 1)
+               length = atol(argv[1]) << 20;
+       if (argc > 2) {
+               shift = atoi(argv[2]);
+               if (shift)
+                       flags |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT;
+       }
+
+       if (shift)
+               printf("%u kB hugepages\n", 1 << shift);
+       else
+               printf("Default size hugepages\n");
+       printf("Mapping %lu Mbytes\n", (unsigned long)length >> 20);
 
-       addr = mmap(ADDR, LENGTH, PROTECTION, FLAGS, -1, 0);
+       addr = mmap(ADDR, length, PROTECTION, flags, -1, 0);
        if (addr == MAP_FAILED) {
                perror("mmap");
                exit(1);